Crawler practice -- fetching images with multiple threads

Target site: Doutula (doutula.com)

Single-threaded version

# -*- coding: utf-8 -*-
import os
import re
import time
import requests
import urllib.request


def parse_page(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
    }
    try:
        resp = requests.get(url, headers=headers)
        text = resp.text
        # Image URLs live in data-original attributes; the alt text doubles as the filename
        img_urls = re.findall(r'data-original="(.*?)"', text)
        img_names = re.findall(r' alt="(.*?)"', text)

        for i, j in zip(img_urls, img_names):
            # Strip characters that would be awkward in a filename
            j = re.sub(r"[\?\.]", "", j)
            img_name = j + os.path.splitext(i)[1]
            urllib.request.urlretrieve(i, "images/" + img_name)
            print(img_name + "     downloaded")
    except Exception as e:
        print(e)


def main():
    os.makedirs("images", exist_ok=True)  # make sure the output directory exists

    t1 = time.time()
    for i in range(5):
        url = rf"http://www.doutula.com/photo/list/?page={i}"
        parse_page(url)
    t2 = time.time()

    print(f"Total time: {t2 - t1:.2f}s")


if __name__ == '__main__':
    main()
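
A side note on the download call: urllib.request.urlretrieve is a legacy interface, and requests is already imported anyway. Below is a minimal sketch of the same save done with requests alone; the download_image helper and its arguments are my own naming, not part of the script above.

# -*- coding: utf-8 -*-
# Sketch only: save a single image with requests instead of urlretrieve.
import requests


def download_image(img_url, img_name, headers=None):
    # Fetch the image and write the raw bytes into the images/ directory
    resp = requests.get(img_url, headers=headers, timeout=10)
    resp.raise_for_status()
    with open("images/" + img_name, "wb") as f:
        f.write(resp.content)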

Multithreaded version

# -*- coding: utf-8 -*-
import os
import re
import time
import requests
import urllib.request
import threading
from queue import Queue


class Producer(threading.Thread):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
    }
    # Hard-coded HTTP proxy (see the note at the end about the IP ban)
    proxys = {
        "http": "http://60.5.254.169:8081"
    }
    def __init__(self, page_queue, image_queue, *args, **kwargs):
        threading.Thread.__init__(self, *args, **kwargs)
        self.image_queue = image_queue
        self.page_queue = page_queue
        # print("producer initialized")

    def run(self):
     #   print("run")
        while 1:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse_page(url)
            print(url)

    def parse_page(self, url):
        # print("start parsing")
        resp = requests.get(url, headers=self.headers, proxies=self.proxys)
        text = resp.text
        img_urls = re.findall(r'data-original="(.*?)"', text)
        img_names = re.findall(r' alt="(.*?)"', text)

        for i, j in zip(img_urls, img_names):
            # Strip awkward filename characters, then hand the (url, name) pair to the consumers
            j = re.sub(r"[\?\.]", "", j)
            img_name = j + os.path.splitext(i)[1]
            self.image_queue.put((i, img_name))


class Consumer(threading.Thread):
    def __init__(self, page_queue, image_queue, *args, **kwargs):
        threading.Thread.__init__(self, *args, **kwargs)
        self.image_queue = image_queue
        self.page_queue = page_queue

    def run(self) -> None:
        while 1:
            # Note: this exit check is racy; the page queue can be empty while a
            # producer is still parsing, so a consumer may quit early or block on get()
            if self.image_queue.empty() and self.page_queue.empty():
                break
            img_url, filename = self.image_queue.get()
            urllib.request.urlretrieve(img_url, "images/" + filename)
            print(filename + "   downloaded")
        

def main():
    print("Program started")
    os.makedirs("images", exist_ok=True)  # make sure the output directory exists
    page_queue = Queue(500)
    img_queue = Queue(1000)

    for i in range(2, 10):
        url = rf"http://www.doutula.com/photo/list/?page={i}"
        page_queue.put(url)

    for i in range(5):
        t1 = Producer(page_queue, img_queue)
        # print("producer")
        t1.start()

    for i in range(3):
        t = Consumer(page_queue, img_queue)
        t.start()
        # print("consumer")


if __name__ == '__main__':
    main()
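
As noted in the comment in Consumer.run, the exit check is racy. A minimal sketch of a sentinel-based shutdown that avoids it is below; the Producer class stays as written above, Consumer is rewritten to stop on a None sentinel, and the join/sentinel wiring in main is my own addition rather than part of the original script.

# -*- coding: utf-8 -*-
# Sketch only: sentinel-based shutdown for the producer/consumer pipeline above.
# Assumes the Producer class from the script above is available and unchanged.
import os
import threading
import urllib.request
from queue import Queue


class Consumer(threading.Thread):
    def __init__(self, image_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.image_queue = image_queue

    def run(self):
        while True:
            item = self.image_queue.get()
            if item is None:        # sentinel: every producer has finished
                break
            img_url, filename = item
            urllib.request.urlretrieve(img_url, "images/" + filename)
            print(filename + "   downloaded")


def main():
    os.makedirs("images", exist_ok=True)
    page_queue = Queue(500)
    img_queue = Queue(1000)
    for i in range(2, 10):
        page_queue.put(rf"http://www.doutula.com/photo/list/?page={i}")

    producers = [Producer(page_queue, img_queue) for _ in range(5)]
    consumers = [Consumer(img_queue) for _ in range(3)]
    for t in producers + consumers:
        t.start()

    for p in producers:
        p.join()                    # every page has been parsed
    for _ in consumers:
        img_queue.put(None)         # one sentinel per consumer
    for c in consumers:
        c.join()
    print("All downloads finished")


if __name__ == '__main__':
    main()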


# My IP got banned while testing. Finding a proxy was a bit of a hassle, so I skipped it.
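
If you do want to spread the requests over several proxies, the sketch below rotates randomly through a small pool; the proxy addresses are placeholders, not working proxies, and would have to come from a real proxy pool or provider.

# -*- coding: utf-8 -*-
# Sketch only: rotate requests over a small pool of HTTP proxies.
# The addresses below are placeholders and must be replaced with real proxies.
import random
import requests

PROXY_POOL = [
    {"http": "http://proxy-a.example.com:8080"},
    {"http": "http://proxy-b.example.com:8080"},
]


def get_with_random_proxy(url, headers=None):
    # Pick a proxy at random for each request to lower the chance of a ban
    proxy = random.choice(PROXY_POOL)
    return requests.get(url, headers=headers, proxies=proxy, timeout=10)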
