A web spider (web crawler) is a program or script that automatically fetches information from websites according to a set of rules.
The crawling workflow (a minimal end-to-end sketch follows the list):
1. Fetch the page and get its HTML document with requests (urllib's request module works too)
2. Open the page source in a browser and inspect the element nodes
3. Extract the data you want with Beautiful Soup or regular expressions
4. Store the data to local disk or a database (fetch, parse, store)
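Before diving into the real target, here is a minimal sketch of that four-step pipeline. The URL and the <p> selector are placeholders, not the target site's actual layout:

import requests
from bs4 import BeautifulSoup

# Step 1: fetch the HTML (placeholder URL; any page with <p> paragraphs would do)
resp = requests.get('http://example.com/')
# Step 3: parse and extract the text of every <p> node
soup = BeautifulSoup(resp.text, 'lxml')
paragraphs = [p.get_text() for p in soup.find_all('p')]
# Step 4: store to local disk
with open('page.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(paragraphs))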
Below we use crawling novels from Biqukan (筆趣網) as the example.
I'll walk through the whole logic in reverse order.
The screenshot below shows the body text of one chapter of a novel. Open the page in Chrome, right-click -> Inspect, and follow the steps shown in the figure.
As the screenshot shows, a chapter's body text sits inside the <div class="showtxt"> node. Once we know this pattern, we can start writing code:
def getContent(target):
    req = requests.get(url=target)
    # Re-encode to bytes and let lxml sniff the charset;
    # without this the Chinese text may come out garbled
    html = req.text.encode(req.encoding, 'ignore')
    bf = BeautifulSoup(html, "lxml")
    # Find every <div class='showtxt'> node (the chapter body)
    texts = bf.find_all('div', class_='showtxt')
    # Replace the 8 non-breaking spaces (\xa0) at the start of
    # each paragraph with blank lines
    texts = texts[0].text.replace('\xa0'*8, '\n\n')
    return texts
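To sanity-check it, call the function with any chapter URL (the URL below is purely a placeholder, not a link I have verified):

# Placeholder chapter URL, for illustration only
print(getContent('http://www.biqukan.com/1_1094/5403177.html'))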
The code above fetches the body of a single chapter. Next: how do we get all the chapters of a novel?
The screenshot above shows a novel's full table of contents. The pattern is that every chapter link is an <a> tag inside the <div class="listmain"> node.
def getZjUrl(bookurl):
    zjlist = []
    server = 'http://www.biqukan.com/'
    req = requests.get(url=bookurl)
    # Re-encode to bytes and let lxml sniff the charset;
    # without this the Chinese text may come out garbled
    html = req.text.encode(req.encoding, 'ignore')
    div_bf = BeautifulSoup(html, "lxml")
    # Find every <div class='listmain'> node
    div = div_bf.find_all('div', class_='listmain')
    # Then find every <a> node inside that div
    a_bf = BeautifulSoup(str(div[0]), "lxml")
    a = a_bf.find_all('a')
    for each in a:
        # Chapter hrefs are relative, so prepend the site root
        zjlist.append((each.string, server + each.get('href')))
    return zjlist
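Each entry of the returned list is a (chapter_title, chapter_url) tuple, so a quick check looks like this (the book URL is again a placeholder):

# Placeholder book URL, for illustration only
zjlist = getZjUrl('http://www.biqukan.com/1_1094/')
for title, url in zjlist[:3]:
    print(title, url)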
The last piece is writing the novel's content to disk:
def writer(name, path, text):
    # Open in append mode so successive chapters accumulate in one file;
    # 'with' guarantees the handle is closed even if a download fails
    with open(path, 'a', encoding='utf-8') as f:
        f.write(name + '\n')
        f.writelines(text)
        f.write('\n\n')
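A quick illustrative call (the arguments here are arbitrary, just to show the output format):

# Appends the chapter title, the body, and a blank line to test.txt
writer('Chapter 1', 'test.txt', 'chapter body text')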
Finally, the complete code.
Note: the complete script downloads every novel by a given author to local disk, so it is slow; I suggest not running it as-is.
from bs4 import BeautifulSoup
import requests, sys, os, threading
def getBookUrl(author):
    booklist = []
    target = 'https://so.biqusoso.com/s.php?ie=utf-8&siteid=biqukan.com&q=' + author
    req = requests.get(url=target)
    html = req.text.encode(req.encoding, 'ignore')
    div_bf = BeautifulSoup(html, "lxml")
    # The search results live in <div class='search-list'>
    div = div_bf.find_all('div', class_='search-list')
    a_bf = BeautifulSoup(str(div[0]), "lxml")
    # Book title links, plus the 's4' spans holding each book's author
    a = a_bf.find_all('a')
    zz = a_bf.find_all('span', class_='s4')
    for index in range(0, len(a)):
        each = a[index]
        # index + 1 because the first 's4' span appears to be the
        # table header row rather than a search result
        booklist.append((each.string, each.get('href'), zz[index + 1].string))
    return booklist
def getZjUrl(bookurl):
    zjlist = []
    server = 'http://www.biqukan.com/'
    req = requests.get(url=bookurl)
    # Re-encode to bytes and let lxml sniff the charset;
    # without this the Chinese text may come out garbled
    html = req.text.encode(req.encoding, 'ignore')
    div_bf = BeautifulSoup(html, "lxml")
    # Find every <div class='listmain'> node
    div = div_bf.find_all('div', class_='listmain')
    # Then find every <a> node inside that div
    a_bf = BeautifulSoup(str(div[0]), "lxml")
    a = a_bf.find_all('a')
    for each in a:
        # Chapter hrefs are relative, so prepend the site root
        zjlist.append((each.string, server + each.get('href')))
    return zjlist
def getContent(target):
    req = requests.get(url=target)
    # Re-encode to bytes and let lxml sniff the charset;
    # without this the Chinese text may come out garbled
    html = req.text.encode(req.encoding, 'ignore')
    bf = BeautifulSoup(html, "lxml")
    # Find every <div class='showtxt'> node (the chapter body)
    texts = bf.find_all('div', class_='showtxt')
    # Replace the 8 non-breaking spaces (\xa0) at the start of
    # each paragraph with blank lines
    texts = texts[0].text.replace('\xa0'*8, '\n\n')
    return texts
def writer(name, path, text):
    # Open in append mode so successive chapters accumulate in one file;
    # 'with' guarantees the handle is closed even if a download fails
    with open(path, 'a', encoding='utf-8') as f:
        f.write(name + '\n')
        f.writelines(text)
        f.write('\n\n')
# book = ('title', 'url', 'author')
def saveOneBook(book, qauthor):
    # Only download books whose author exactly matches the query
    if book[2] == qauthor:
        if not os.path.exists(qauthor):
            os.mkdir(qauthor)
        zjlist = getZjUrl(book[1])
        filename = qauthor + '/' + book[0] + '.txt'
        isFileExist = os.path.exists(filename)
        print(book[0] + ' already on disk: ' + str(isFileExist))
        if not isFileExist:
            print(filename + ' download started:')
            # Skip the first 12 <a> links, which appear to be the
            # "latest chapters" block rather than the real chapter list
            for zjIndex in range(12, len(zjlist)):
                texts = getContent(zjlist[zjIndex][1])
                writer(zjlist[zjIndex][0], filename, texts)
                sys.stdout.write(book[0] + " downloaded: %d%%" % int(zjIndex * 100 / len(zjlist)) + '\r')
                sys.stdout.flush()
            print(filename + ' download finished')
def saveAllBookBy(qauthor):
    # Single-threaded variant: download the author's books one by one
    booklist = getBookUrl(qauthor)
    for book in booklist:
        saveOneBook(book, qauthor)
# Other authors to try: '我吃西紅柿', 'zhttty', '耳根', '天蠶土豆', '忘語', '唐家三少', '辰東', '魚人二代', '蝴蝶藍', '蕭鼎', '跳舞', '貓膩', '煙雨江南', '夢入神機', '發飆的蝸牛'
# The work is mostly I/O, so threads are used to download books in parallel
if __name__ == "__main__":
    qauthorlist = ['蕭鼎']
    for qauthor in qauthorlist:
        booklist = getBookUrl(qauthor)
        for book in booklist:
            # One thread per book
            t1 = threading.Thread(target=saveOneBook, args=(book, qauthor,))
            t1.start()
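One thread per book can flood the site if an author has many titles. As a possible refinement (not part of the original script), the standard-library ThreadPoolExecutor caps the number of concurrent downloads; saveAllBooksPooled below is a hypothetical helper built on the functions above:

from concurrent.futures import ThreadPoolExecutor

def saveAllBooksPooled(qauthor, max_workers=4):
    # Cap concurrency at max_workers so the site is not flooded
    booklist = getBookUrl(qauthor)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for book in booklist:
            pool.submit(saveOneBook, book, qauthor)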