A whole-novel crawler written in Python (saving the book to a txt file)

I have finally finished a crawler that scrapes an entire novel. Here I scrape the book 劍來 from the site 去看看小说网.

The parts of the code that extract the page content are meant as a reference; adapt them to whatever pages you want to scrape, as illustrated by the short check below.
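For example, a quick way to verify that you have the right selector for your own site is a throwaway snippet like this one. The URL, the dd/col-md-3 class, and the link layout are what 去看看小说网 uses; inspect your target page in a browser and substitute its own tag, class, and URL (this is only a sketch, not part of the final script):

# -*- coding: utf-8 -*-
import urllib2
from bs4 import BeautifulSoup

index_url = 'http://www.7kankan.la/book/1/'      # chapter-list page of the novel
html = urllib2.urlopen(index_url).read()
soup = BeautifulSoup(html)
# On 去看看小说网 every chapter entry sits inside <dd class="col-md-3">;
# change the tag/class here to match whatever your target site uses.
for dd in soup.find_all('dd', {'class': 'col-md-3'}):
    print dd.string, dd.find('a')['href']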

(The code below is the complete script and has been tested.)

# -*- coding: utf-8 -*-

import urllib2
import urlparse
import Queue
import codecs       # for writing UTF-8 text files under Python 2
import time
from bs4 import BeautifulSoup

  
# Crawl every chapter of the novel and write it into one txt file
def link_crawler(seed_url):
    firstTitle = BeautifulSoup(download(seed_url)).title.text.split('(')[0]     # novel title: text of <title> before the first '('
    filename = firstTitle+'.txt'
    file = codecs.open(filename,'w+',encoding='utf-8')      # create a txt file named after the novel (UTF-8 so Chinese text writes cleanly)
    file.write(firstTitle+'\n\n')
#     print firstTitle
    crawler_queue = Queue.deque([seed_url])     # seed the crawl queue (Queue.deque is collections.deque re-exported by the Queue module)
    seen = set(crawler_queue)       # links that have already been queued or visited
    while crawler_queue:
        url = crawler_queue.pop()   # pop the most recently added link (the deque is used as a stack)
        html = download(url)
#         soup = BeautifulSoup(html)
        soup = BeautifulSoup(html).find_all('dd',{'class':'col-md-3'})      # one <dd> per chapter on this site
#         print soup
        for link in soup:
            title = link.string     # chapter title
            file.write(title+'\n')
            link = link.find('a')['href']
            print title
#             print link
            if link not in seen:
                # The first entry is the author's foreword (新書感言); the links found inside it
                # differ from those of later chapters, so record its number separately
                first = link.split('.')[0]
                link = urlparse.urljoin(url,link)   # turn the relative link into an absolute URL
                html2 = download(link)
                content1 = BeautifulSoup(html2).find(id='htmlContent')      # chapter body text
                neilink = BeautifulSoup(html2).find(id='linkNext')          # link to the second half of a split chapter
                bb = neilink['href']
#                 print bb
                if first == '1':
                    # the foreword fits on a single page
                    content = content1.text
                    file.write(content+'\n\n')
                else:
                    # regular chapters are split across two pages: fetch and append the second half
                    html3 = download(bb)
                    content2 = BeautifulSoup(html3).find(id='htmlContent')
                    content1 = content1.text
                    content2 = content2.text
                    file.write(content1+'\n')
                    file.write(content2+'\n\n')
#                     content = content1+content2  (wrong: the Tag objects cannot be joined with +)
#                 print content.text

                seen.add(link)
                crawler_queue.append(link)
                time.sleep(1)       # pause a second between chapters to be polite to the server
    file.close()    # make sure everything is flushed to disk
 
# Fetch the raw HTML of a given URL (each line here was explained in earlier posts, so no further commentary)
def download(url,user_agent = 'wswp',proxy = None,num_retries = 2):
    print 'downloading:',url
    headers = {'User-agent':user_agent}
    request = urllib2.Request(url,headers = headers)
      
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme:proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print 'download error',e.reason
        html = None
        if num_retries > 0:
            # retry only on 5xx server errors
            if hasattr(e,'code') and 500 <= e.code < 600:
                html = download(url,user_agent,proxy,num_retries-1)
                  
    return html
 
seed_url = 'http://www.7kankan.la/book/1/'
# seed_url = 'http://www.biquge5200.com/52_52542/'
link_crawler(seed_url)
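The script above targets Python 2 (urllib2, urlparse, Queue and print statements). If you are on Python 3, the download() helper can be rewritten roughly as follows using only the standard library; this is just a sketch under that assumption, and the rest of the crawler would likewise need urllib.parse.urljoin, collections.deque and open(..., encoding='utf-8'):

import urllib.request
import urllib.error

def download(url, user_agent='wswp', num_retries=2):
    # fetch the raw HTML of a URL, retrying on 5xx server errors
    print('downloading:', url)
    request = urllib.request.Request(url, headers={'User-agent': user_agent})
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('download error:', e.reason)
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            html = download(url, user_agent, num_retries - 1)
    return html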

