import os
import queue
import threading
import time

import requests
from lxml import etree
class ImageSpider(object):
    """Multi-threaded crawler for the sc.chinaz.com background-picture pages.

    Producer threads fetch list pages and push ``{"title", "url"}`` items
    onto ``img_urls_q``; consumer threads pop those items and save each
    image as ``<title>.jpg`` under ``save_dir``.
    """

    def __init__(self):
        """Set up request settings and pre-fill the list-page URL queue."""
        self.base_url = "http://sc.chinaz.com/tupian/beijingtupian_{}.html"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }
        # Directory the downloaded images are written to.
        self.save_dir = "./data/多線程背景圖片爬蟲/"
        # Seconds to wait on any single HTTP request; without a timeout a
        # stalled connection would block a worker (and run()) forever.
        self.timeout = 10
        # List pages 2..78 are queued up front.
        # NOTE(review): page 1 presumably uses a different URL -- confirm.
        self.list_url_q = queue.Queue(80)
        for i in range(2, 79):
            self.list_url_q.put(self.base_url.format(i))
        self.img_urls_q = queue.Queue(3000)

    # Request helper
    def get_html_text(self, url):
        """Fetch *url* and return the decoded body, or None if status != 200.

        Raises:
            requests.RequestException: on connection errors or timeout.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        if response.status_code == 200:
            return response.content.decode()
        return None

    # List-page parser
    def parse_list_page(self, text):
        """Extract image title/URL pairs from a list page into ``img_urls_q``.

        Empty or missing *text* is ignored, and picture blocks lacking either
        attribute are skipped instead of raising ``IndexError``.
        """
        if not text:
            return
        html = etree.HTML(text)
        if html is None:
            return
        for block in html.xpath("//div[@class='box picblock col3']"):
            titles = block.xpath(".//img/@alt")
            # NOTE(review): "src2" presumably holds the lazy-loaded image
            # URL on this site -- confirm against the live markup.
            urls = block.xpath(".//img/@src2")
            if not titles or not urls:
                continue
            self.img_urls_q.put({"title": titles[0], "url": urls[0]})

    # Producer: request list pages and collect image URLs
    def requests_list_page(self):
        """Worker loop: fetch and parse list pages taken from ``list_url_q``.

        Runs forever (blocking in ``get()`` once the queue is drained), so it
        is meant to run in a daemon thread.
        """
        while True:
            list_url = self.list_url_q.get()
            try:
                text = self.get_html_text(list_url)
                if text is None:
                    print("failed to fetch list page: " + list_url)
                else:
                    self.parse_list_page(text)
            except Exception as exc:
                # Thread boundary: one bad page must not kill the worker.
                print("error while processing {}: {!r}".format(list_url, exc))
            finally:
                # Always acknowledge the item, otherwise Queue.join() in
                # run() would block forever after any failure.
                self.list_url_q.task_done()

    # Consumer: save images to local disk
    def save_imgs_toLocal(self):
        """Worker loop: download images taken from ``img_urls_q`` to disk.

        Runs forever (blocking in ``get()`` once the queue is drained), so it
        is meant to run in a daemon thread.
        """
        while True:
            item = self.img_urls_q.get()
            try:
                img_title = item["title"]
                img_url = item["url"]
                # Download before opening the file so that a failed request
                # does not leave an empty .jpg behind.
                response = requests.get(
                    img_url, headers=self.headers, timeout=self.timeout
                )
                response.raise_for_status()
                path = os.path.join(self.save_dir, img_title + ".jpg")
                with open(path, "wb") as fp:
                    fp.write(response.content)
                print(img_title + "save to local direction sucessfully...")
            except Exception as exc:
                # Thread boundary: report the failure and keep consuming.
                print("failed to save image {!r}: {!r}".format(item, exc))
            finally:
                # Always acknowledge the item so Queue.join() can return.
                self.img_urls_q.task_done()

    # Main entry
    def run(self):
        """Start 5 list-page workers and 10 image workers, then wait.

        Returns once every queued list page and every discovered image has
        been processed (successfully or not).
        """
        os.makedirs(self.save_dir, exist_ok=True)

        thread_list = []
        for _ in range(5):
            thread_list.append(threading.Thread(target=self.requests_list_page))
        for _ in range(10):
            thread_list.append(threading.Thread(target=self.save_imgs_toLocal))

        for t in thread_list:
            # Daemon threads end together with the main thread; the worker
            # loops never return on their own.
            t.daemon = True
            t.start()

        # Block until every item taken from each queue has been acknowledged
        # with task_done(). The list queue is joined first: a page is only
        # acknowledged after its images were enqueued, so once this returns
        # img_urls_q already contains every image still to be processed.
        self.list_url_q.join()
        self.img_urls_q.join()
if __name__ == "__main__":
    # Script entry point: build the spider and run the crawl to completion.
    ImageSpider().run()
# The crawl results are shown below: