I have finally finished polishing the spider that crawls a complete novel. Here it crawls the book "劍來" from the site "去看看小說網".
Readers can use the content-extraction parts of the code as a reference and adapt them to whatever pages they want to crawl.
(The code below is the complete program and has been tested.)
# -*- coding: utf-8 -*-
import urllib2
import urlparse
import time
from collections import deque
from bs4 import BeautifulSoup
# Fetch every chapter of the novel and write it to a text file
def link_crawler(seed_url):
    # the index page's <title> holds the novel's name
    firstTitle = BeautifulSoup(download(seed_url)).title.text.split('(')[0]
    filename = firstTitle + '.txt'
    file = open(filename, 'w+')  # create a txt file named after the novel
    file.write(firstTitle + '\n\n')
    crawler_queue = deque([seed_url])  # links still waiting to be crawled
    seen = set(crawler_queue)  # links that have already been visited
    while crawler_queue:
        url = crawler_queue.pop()  # take the most recently queued link
        html = download(url)
        # each chapter entry on the index page sits in a <dd class="col-md-3">
        soup = BeautifulSoup(html).find_all('dd', {'class': 'col-md-3'})
        for link in soup:
            title = link.string  # chapter title
            file.write(title + '\n')
            link = link.find('a')['href']  # relative link to the chapter page
            print title
            # The first entry is the author's foreword (新書感言); it fits on
            # a single page, while later chapters are split across two pages,
            # so its number is kept to handle it separately below
            first = link.split('.')[0]
            link = urlparse.urljoin(url, link)  # make the link absolute
            if link not in seen:
                html2 = download(link)
                content1 = BeautifulSoup(html2).find(id='htmlContent')  # chapter text
                neilink = BeautifulSoup(html2).find(id='linkNext')  # "next page" link
                bb = neilink['href']
                if first == '1':
                    content = content1.text
                    file.write(content + '\n\n')
                else:
                    # ordinary chapters continue on a second page, so fetch
                    # the "next page" half as well
                    html3 = download(bb)
                    content2 = BeautifulSoup(html3).find(id='htmlContent')
                    content1 = content1.text
                    content2 = content2.text
                    file.write(content1 + '\n')
                    file.write(content2 + '\n\n')
                    # content = content1 + content2  (wrong: Tag objects
                    # cannot be joined with +)
                seen.add(link)
                crawler_queue.append(link)
                time.sleep(1)  # pause one second between requests
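
# Adapting this crawler to another site normally means changing only the
# three site-specific selectors used above ('dd' / 'col-md-3', 'htmlContent',
# 'linkNext'). A hypothetical sketch for a site whose chapter list sits in
# <li class="chapter"> items and whose text lives in a div with id "content"
# (these selector names are illustrative, not a real site's):
#     soup = BeautifulSoup(html).find_all('li', {'class': 'chapter'})
#     content1 = BeautifulSoup(html2).find(id='content')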
# Download the page at a given URL (each line here is explained in earlier
# posts, so the details are not repeated)
def download(url, user_agent='wswp', proxy=None, num_retries=2):
    print 'downloading:', url
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print 'download error', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry only on 5xx server errors
                html = download(url, user_agent, proxy, num_retries - 1)
    return html
seed_url = 'http://www.7kankan.la/book/1/'
# seed_url = 'http://www.biquge5200.com/52_52542/'
link_crawler(seed_url)
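
The listing above targets Python 2 (urllib2, urlparse, and print statements). For readers on Python 3, the download helper maps onto the renamed standard-library modules; the following is a minimal sketch under that assumption, not part of the original program:

import urllib.request
import urllib.parse
import urllib.error

def download(url, user_agent='wswp', proxy=None, num_retries=2):
    print('downloading:', url)
    request = urllib.request.Request(url, headers={'User-agent': user_agent})
    opener = urllib.request.build_opener()
    if proxy:
        # route requests through the proxy for this URL's scheme
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        # decode assuming UTF-8 pages; adjust the encoding for other sites
        html = opener.open(request).read().decode('utf-8', errors='ignore')
    except urllib.error.URLError as e:
        print('download error', e.reason)
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # retry only on 5xx server errors
            html = download(url, user_agent, proxy, num_retries - 1)
    return html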