2016-09-21

小说《石油咽喉保卫战》爬虫

最近机缘巧合，为了跟学长做某个项目，我正在入门 Python，于是写了个小说《石油咽喉保卫战》的爬虫作为练习。它能爬取第208章《汉海军不败》以后的章节。
顺便推荐一个适合入门一些 Web 技术的好网站：
汇智网网址
上代码：

import requests
import codecs
from bs4 import  BeautifulSoup
# Address of the first page to crawl; main() starts from here.
URL = "http://www.junshishu.com/Book10429/Content1709923.html"

# Progress counter: parse_html() increments and prints it for each page on
# which it finds a next-page link.
counter = 0

# Browser-like User-Agent sent with every request, so the target site is less
# likely to treat the crawler as a bot and block it.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/47.0.2526.80 Safari/537.36'
    ),
}
def download_page(url, timeout=30):
    """Download one page and return its raw, undecoded body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block forever on an unresponsive server.

    Returns:
        The response body as bytes (``response.content``); decoding is left
        to the parser.

    Raises:
        requests.exceptions.RequestException: If the connection fails, the
            request times out, or the server answers with an HTTP error
            status.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    return response.content
def parse_html(html):
    """Extract the chapter paragraphs and the next-page link from one page.

    Args:
        html: Raw page markup (bytes); it is decoded as GBK.

    Returns:
        A ``(paragraphs, next_url)`` tuple. ``paragraphs`` is the list of
        text strings taken from the ``<p>`` tags inside
        ``<div id="mouseRight">`` (empty when that div is missing).
        ``next_url`` is the ``href`` of the third link inside
        ``<div class="p_02">``, or ``None`` when that link cannot be found,
        which tells the caller to stop crawling.

    Side effects:
        Increments the module-level ``counter`` and prints it whenever a
        next-page link is found.
    """
    global counter

    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parsers happen to be installed.
    # Debugging tip: print(soup.prettify()) shows the parsed page.
    soup = BeautifulSoup(html, 'html.parser', from_encoding="gbk")

    # The chapter text lives in the <p> tags of the "mouseRight" div.
    content_div = soup.find('div', attrs={'id': 'mouseRight'})
    if content_div is None:
        novel_list = []
    else:
        novel_list = [paragraph.getText() for paragraph in content_div.find_all('p')]

    # The third link of the "p_02" navigation div is the next-page link.
    nav_div = soup.find('div', attrs={'class': 'p_02'})
    nav_links = nav_div.find_all('a') if nav_div is not None else []
    next_href = nav_links[2].get('href') if len(nav_links) > 2 else None

    # A bs4 Tag is always truthy, so test the extracted href rather than the
    # tag itself; a missing div, link or href ends the crawl cleanly.
    if not next_href:
        return novel_list, None

    counter += 1
    print(counter)
    return novel_list, next_href
def main():
    """Crawl the novel page by page and save its text to ``novels.txt``.

    Starts at ``URL`` and repeatedly downloads a page, parses it, and appends
    its paragraphs (one per line, followed by a newline) to the UTF-8 output
    file. The loop ends when ``parse_html`` reports no next-page URL.
    """
    page_url = URL
    # codecs.open encodes the written text as UTF-8.
    with codecs.open('novels.txt', 'wb', encoding='utf-8') as out_file:
        while page_url:
            raw_page = download_page(page_url)
            # parse_html also yields the address of the page to visit next.
            paragraphs, page_url = parse_html(raw_page)
            chapter_text = '\n'.join(paragraphs)
            out_file.write(u'{novels}\n'.format(novels=chapter_text))


# Run the crawler only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()