Python Crawler (2): Improving Crawler Efficiency

References
Concurrent Execution
How to optimize the speed of a Python crawler?

1 Problem Description

How can we keep the run time reasonable when crawling a large amount of data? The common approaches today fall into four categories: multiprocessing, multithreading, coroutines, and hybrid modes.

2 Methods

2.1 Multithreading: threading

See the official threading documentation for details.
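The demo in section 3 does not include a threading run, so here is a minimal sketch of spreading per-URL downloads across threads; fetch and the URLs below are placeholders, not part of the original code.

import threading
import requests

def fetch(url):
    # hypothetical per-URL task: download the page and report its size
    response = requests.get(url, timeout=5)
    print(url, len(response.text))

urls = ['https://www.python.org/', 'https://pypi.org/']  # placeholder URLs

threads = [threading.Thread(target=fetch, args=(url,)) for url in urls]
for t in threads:
    t.start()   # all downloads run concurrently
for t in threads:
    t.join()    # wait until every thread has finished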

2.2 Multiprocessing: multiprocessing

See the official multiprocessing documentation for details.
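A minimal process-pool sketch in the same spirit as the full demo in section 3, again with a placeholder fetch and placeholder URLs:

import multiprocessing
import requests

def fetch(url):
    # hypothetical per-URL task, executed in a worker process
    response = requests.get(url, timeout=5)
    return len(response.text)

if __name__ == '__main__':
    urls = ['https://www.python.org/', 'https://pypi.org/']  # placeholder URLs
    with multiprocessing.Pool(processes=4) as pool:
        sizes = pool.map(fetch, urls)  # blocks until every URL has been processed
    print(sizes)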

2.3 Coroutines: asyncio

See the official asyncio documentation for details.
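A minimal aiohttp plus asyncio sketch with placeholder URLs; the demo in section 3 drives the loop with get_event_loop/run_until_complete, while this sketch shows the same idea with asyncio.run.

import asyncio
import aiohttp

async def fetch(session, url):
    # non-blocking download of a single page
    async with session.get(url) as response:
        return await response.text()

async def main():
    urls = ['https://www.python.org/', 'https://pypi.org/']  # placeholder URLs
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(*(fetch(session, url) for url in urls))
    print([len(page) for page in pages])

asyncio.run(main())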

3 Hands-on Example

3.1 Complete Code

The demo crawls bid announcements from the bitbid.cn electronic bidding information platform; for each announcement it records the full page text, the project name, the link URL, and the publish date.

import requests
from lxml import etree
from bs4 import BeautifulSoup

import re
import time
import csv

import multiprocessing

import aiohttp
import asyncio


header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/81.0.4044.113 Safari/537.36'}

# write the CSV header once; the with block closes the file automatically
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    csvwriter = csv.writer(f, dialect='excel')
    csvwriter.writerow(['detail', 'name', 'url', 'publishDate'])

# --------------- Get all announcement URLs on one listing page -------------- #
def get_link_url(link_url):
    time.sleep(3)
    response = requests.get(link_url, headers=header, timeout=5)
    # print(response.text)
    table = re.findall('<table>(.*?)</table>', response.text, re.S | re.M)[0]
    urls = re.findall('(?<=href=\").*?(?=\")',table, re.S | re.M)
    # print(urls)
    return urls

# --------------- Get the details of one link -------------- #
def get_link_info(url):
    time.sleep(3)
    with open('data.csv', 'a', newline='', encoding='utf-8') as f:
        response = requests.get(url, headers=header, timeout=5)
        soup = BeautifulSoup(response.text, 'lxml')  # parse the page into a soup object

        content = etree.HTML(response.text)  # parse the page into an element tree for XPath

        html = soup.get_text()  # extract the plain text of the page

        # --------- strip whitespace from the text -------- #
        # detail = html.replace(' ', '')  # cannot handle spaces introduced by line breaks etc.
        # detail = ''.join(html.split())  # works well
        detail = re.sub('[\r\n\s]', '', html)  # remove carriage returns, line breaks, and other whitespace (spaces, tabs, form feeds, etc.)
        
        # ============================= #
        # <p style="text-align: center; font-size: 18px; text-decoration: underline;">霍州市2020年“四好農村路”窄路基路面拓寬改造工程施工招標公告</p>
        # ============================= #

        # ----------xpath---------- #
        # name = content.xpath('//h3/text()')[0]
        # publishDate = content.xpath('//p[contains(@class,"sum")]/text()')[0]

        # ----------BeautifulSoup---------- #
        # name = soup.select('h3')[0].get_text()
        # publishDate = soup.select('.sum')[0].get_text()

        # ----------re---------- #
        name = re.findall('(?<=<h3>).*?(?=<p)', response.text, re.S|re.M)[0]
        name = ''.join(re.split('\s',name))
        publishDate = re.findall('(?<=發佈時間:).*?(?=|)', html, re.S|re.M)[0]

        f.write('{},{},{},{}\n'.format(detail, name, url, publishDate))

async def get_asy_link_info(url):
    await asyncio.sleep(3)  # non-blocking delay; time.sleep() would stall the whole event loop
    with open('data.csv', 'a', newline='', encoding='utf-8') as f:
        # response = await requests.get(url, headers=header, timeout=5)  # requests does not cooperate with asyncio, so aiohttp is used instead

        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=header) as response:
                text = await response.text()

        soup = BeautifulSoup(text, 'lxml')  # parse the page into a soup object

        # content = etree.HTML(text)  # parse the page into an element tree for XPath

        html = soup.get_text()  # extract the plain text of the page

        detail = re.sub('\s', '', html)  # remove all whitespace
        # ============================= #
        # <p style="text-align: center; font-size: 18px; text-decoration: underline;">霍州市2020年“四好農村路”窄路基路面拓寬改造工程施工招標公告</p>
        # ============================= #

        # ----------xpath---------- #
        # name = content.xpath('//h3/text()')[0]
        # publishDate = content.xpath('//p[contains(@class,"sum")]/text()')[0]

        # ----------BeautifulSoup---------- #
        # name = soup.select('h3')[0].get_text()
        # publishDate = soup.select('.sum')[0].get_text()

        # ----------re---------- #
        name = re.findall('(?<=<h3>).*?(?=<p)', text, re.S|re.M)[0]
        name = ''.join(re.split('\s', name))
        publishDate = re.findall('(?<=發佈時間:).*?(?=|)', html, re.S|re.M)[0]

        f.write('{},{},{},{}\n'.format(detail, name, url, publishDate))

if __name__ == '__main__':
    # list comprehension: build the listing-page URLs for pages 1 to 5
    link_urls = ['http://www.bitbid.cn/ggWeb!zhaobiaogg.action?gongShiType=1&currentPage={}&ggName=&type=&startDate=&endDate=&shengID=0'.format(i) for i in range(1, 6)]

    url_list = []
    for link_url in link_urls:
        urls = get_link_url(link_url)
        url_list = url_list + urls  # collect the detail URLs from pages 1-5 into a single list
    print(url_list)

    # ********************** 1 Single process ************************** #
    t11 = time.time()  # plain single-process start time
    for url in url_list:
        get_link_info(url)
    t12 = time.time()  # plain single-process end time
    print('single process', t12 - t11)

    # ********************** 2 Multiprocessing ************************** #

    # ------------ Process pool ---------- #
    t21 = time.time()  # process-pool start time
    pool = multiprocessing.Pool(processes=4)  # upper bound: multiprocessing.cpu_count()

    # for url in url_list:
    #     pool.apply_async(get_link_info, args=(url,))  # non-blocking submission

    pool.map(get_link_info, url_list)  # blocks until every URL has been processed
    pool.close()
    pool.join()
    t22 = time.time()  # process-pool end time
    print('process pool', t22 - t21)

    # ------------ Child processes ---------- #
    t31 = time.time()  # child-process start time
    for url in url_list:
        p = multiprocessing.Process(target=get_link_info, args=(url,))
        p.start()
        p.join()  # joining inside the loop waits for each child before starting the next, so the URLs are still handled one at a time
    t32 = time.time()  # child-process end time
    print('child processes', t32 - t31)

    # ********************** 3 Coroutines ************************** #

    t51 = time.time()  # coroutine start time
    loop = asyncio.get_event_loop()  # obtain an event loop
    tasks = [get_asy_link_info(url) for url in url_list]
    loop.run_until_complete(asyncio.wait(tasks))  # run until every task has completed
    loop.close()  # close the event loop
    t52 = time.time()  # coroutine end time
    print('coroutines', t52 - t51)
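Multithreading is listed in section 2 but is not timed in the script above. A possible timing block, reusing get_link_info with concurrent.futures.ThreadPoolExecutor (an assumption, not part of the original script), might look like this:

    # ********************** 4 Thread pool (sketch, not in the original timings) ************************** #
    from concurrent.futures import ThreadPoolExecutor

    t41 = time.time()  # thread-pool start time
    with ThreadPoolExecutor(max_workers=4) as executor:
        executor.map(get_link_info, url_list)  # submits every URL; leaving the with block waits for completion
    t42 = time.time()  # thread-pool end time
    print('thread pool', t42 - t41)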

3.2 Results

single process 237.6622188091278
process pool 65.96817064285278
child processes 348.5716996192932
coroutines 235.63298511505127

The process pool gives the largest speedup. Spawning one child process per URL and joining it immediately still handles the URLs one at a time, so the process start-up overhead only makes that variant slower than the plain loop.