# 2.6 貼吧實戰 (Tieba scraping walkthrough)
from urllib import request
#請求網頁頁面,並返回相關內容
def loadpage(url, filename):
    """Download the page at *url* and return its raw bytes.

    *filename* is only used in the progress message; the caller is
    expected to pass the bytes to writepage() and open with "wb".
    """
    print("正在下載內容" + filename)
    # Spoof a browser User-Agent so the server does not reject the request.
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763"}
    req = request.Request(url, headers=header)
    # Fix: close the HTTP response deterministically instead of leaking it.
    # (Original called request.urlopen(req).read() and never closed.)
    with request.urlopen(req) as response:
        return response.read()  # raw bytes — no decode(); file is written in binary mode
#將爬取的數據進行存儲
def writepage(html, filename):
    """Persist the downloaded bytes *html* into the file *filename*."""
    print("正在存儲信息")
    out = open(filename, "wb")  # binary mode: html is bytes from urlopen
    try:
        out.write(html)
    finally:
        out.close()
    print(".........")
#理清頁數,編寫爬蟲
def spider(url, beginpage, endpage):
    """Crawl Tieba listing pages from *beginpage* to *endpage* inclusive.

    *url* is the base listing URL ending in "pn="; the per-page offset
    (50 threads per page, page 1 at offset 0) is appended to it.
    """
    for page_no in range(beginpage, endpage + 1):  # +1: range() excludes the stop value
        offset = (page_no - 1) * 50  # first page starts at offset 0
        # BUG FIX: build the page URL in a fresh local. The original did
        # `url = url + str(yema)`, mutating the base URL, so iteration 2
        # appended its offset onto iteration 1's full URL and fetched
        # the wrong pages.
        page_url = url + str(offset)
        filename = "第" + str(page_no) + "頁"
        html = loadpage(page_url, filename)
        writepage(html, filename)
    print("下載完成")
if __name__ == "__main__":
    # Base listing URL of the "python" Tieba; the page offset is appended
    # after "pn=" by spider().
    base_url = "https://tieba.baidu.com/f?kw=python&ie=utf-8&pn="
    spider(base_url, 4, 5)  # fetch pages 4 and 5
# 結果:產生了兩個網頁內容的文件(第4頁與第5頁)