# A simple XPath-based crawler: fetch every thread in a given Tieba forum and
# collect the image URLs posted on each floor of each thread, saving the
# results locally.
import json
from urllib.parse import unquote

import requests
from lxml import etree
class Tieba:
    """Simple XPath crawler for a Baidu Tieba forum.

    Walks the forum's thread-list pages one by one, extracts the image
    URLs posted in each thread, and appends the results as pretty-printed
    JSON records to a local text file.
    """

    def __init__(self, tieba_name, page_count=100):
        """
        :param tieba_name: forum name, used as the ``kw=`` query parameter
        :param page_count: number of thread-list pages to crawl
            (each page holds 50 threads; default 100 pages)
        """
        self.tieba_name = tieba_name
        self.page_count = page_count
        # Mobile user agent: the mobile site serves a simpler DOM to parse.
        self.headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1"}

    def get_total_url_list(self):
        """Return the URLs of all thread-list pages (``pn`` steps by 50)."""
        url = "https://tieba.baidu.com/f?kw=" + self.tieba_name + "&ie=utf-8&pn={}&"
        return [url.format(i * 50) for i in range(self.page_count)]

    def parse_url(self, url):
        """Fetch *url* and return it parsed as an lxml element tree."""
        print("parsing url:", url)
        response = requests.get(url, headers=self.headers, timeout=10)
        return etree.HTML(response.content.decode())

    def get_title_href(self, url):
        """Return ``[{"href": ..., "text": ...}, ...]`` for one thread-list
        page; fields that cannot be extracted are ``None``."""
        html = self.parse_url(url)
        total_items = []
        for li in html.xpath("//li[@class='tl_shadow']"):
            hrefs = li.xpath("./a/@href")  # query once, reuse below
            texts = li.xpath("./a/div[1]/span[1]/text()")
            total_items.append({
                "href": "https:" + hrefs[0] if hrefs else None,
                "text": texts[0] if texts else None,
            })
        return total_items

    def get_img(self, url):
        """Return all image URLs found in one thread page."""
        html = self.parse_url(url)
        img_list = html.xpath('//div[@data-class="BDE_Image"]/@data-url')
        # data-url wraps the real address after a "src=" query fragment,
        # percent-encoded; strip the wrapper and decode.
        img_list = [i.split("src=")[-1] for i in img_list]
        return [unquote(i) for i in img_list]

    def save_item(self, item):
        """Append one item to the output file as a JSON record."""
        with open("teibatupian.txt", "a", encoding="utf-8") as f:
            f.write(json.dumps(item, ensure_ascii=False, indent=2))
            f.write("\n")

    def run(self):
        """Crawl every list page, then every thread, saving items as we go."""
        for url in self.get_total_url_list():
            for item in self.get_title_href(url):
                href = item["href"]
                # Skip image fetching for threads whose link was missing
                # (the original code would crash on requests.get(None)).
                item["img"] = self.get_img(href) if href else []
                print(item)
                self.save_item(item)
if __name__ == "__main__":
tieba = Tieba("貓")
tieba.run()