aiohttp 高併發抓取

建立一個 session 會話對象

首先建立一個 session 會話對象,利用會話對象 session 去訪問網頁

以訪問 python 官網爲例:async、await 關鍵字用來把函數聲明爲異步操作(協程),這是 aiohttp 的基本使用方式

import aiohttp
import asyncio


async def hello(URL):
    """Fetch ``URL`` with a fresh aiohttp session and print the body text.

    Args:
        URL: Address to request with HTTP GET.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(URL) as response:
            # Print the decoded body, not the ClientResponse object itself.
            body = await response.text()
            print(body)


if __name__ == '__main__':
    URl = 'http://python.org'
    # asyncio.run() creates a new event loop, runs the coroutine to
    # completion and closes the loop afterwards; calling
    # asyncio.get_event_loop() with no running loop is deprecated.
    asyncio.run(hello(URl))

 

 

請求頭,超時,cookies,代理

在第二段代碼修改

from aiohttp import ClientSession
import aiohttp
import asyncio


# Request headers sent with every call to hello()
headers = {'content-type': "application/json"}


async def hello(URL):
    """GET ``URL`` with the module-level ``headers`` and print the body text."""
    async with ClientSession() as session, \
            session.get(URL, headers=headers) as reply:
        print(await reply.text())


if __name__ == '__main__':
    URl = 'http://python.org'
    # asyncio.run() creates a new event loop, runs the coroutine to
    # completion and closes the loop afterwards; calling
    # asyncio.get_event_loop() with no running loop is deprecated.
    asyncio.run(hello(URl))
# Timeout configured on the session: applies to requests made through it
timeout = aiohttp.ClientTimeout(total=60)


async def hello(URL):
    """GET ``URL`` through a session created with ``timeout`` and print the body."""
    async with ClientSession(timeout=timeout) as session, \
            session.get(URL) as reply:
        body = await reply.text()
        print(body)
# Timeout configured on the individual request
timeout = aiohttp.ClientTimeout(total=60)


async def hello(URL):
    """GET ``URL`` passing ``timeout`` to the request itself and print the body."""
    async with ClientSession() as session:
        request = session.get(URL, timeout=timeout)
        async with request as reply:
            print(await reply.text())
# Cookies attached to the session
cookies = {'cookies': 'working'}


async def hello(URL):
    """GET ``URL`` through a session created with ``cookies`` and print the body."""
    async with ClientSession(cookies=cookies) as session, \
            session.get(URL) as reply:
        page_text = await reply.text()
        print(page_text)
# Proxy address used for the request
proxy = 'http://117.191.11.72:8080'


async def hello(URL):
    """GET ``URL`` via the module-level ``proxy`` and print the body text."""
    async with ClientSession() as session:
        async with session.get(URL, proxy=proxy) as reply:
            print(await reply.text())
# Proxy with basic authentication
async def hello(URL):
    """GET ``URL`` through an authenticated proxy and print the body text.

    Args:
        URL: Address to request. It is now actually used; previously the
            function ignored it and always fetched a hard-coded address.
    """
    async with ClientSession() as session:
        proxy_auth = aiohttp.BasicAuth('user', 'pass')
        async with session.get(URL,
                               proxy='http://proxy.com',
                               proxy_auth=proxy_auth) as response:
            body = await response.text()
            print(body)

 

 

get 請求方法

get 請求有兩種方式:不帶參數和帶參數

# Without parameters
async def hello(URL):
    """Send a plain GET request to ``URL`` and print the body text."""
    async with ClientSession() as session, session.get(URL) as reply:
        print(await reply.text())
# With parameters
# Parameters embedded directly in the URL query string
async def hello(URL='http://httpbin.org/get?key=python'):
    """GET a URL whose query string already carries the parameters.

    Args:
        URL: Address to request. Defaults to the demo URL, which was
            previously assigned to a misspelled local (``URl``) and never used.
    """
    async with ClientSession() as session:
        async with session.get(URL) as response:
            body = await response.text()
            print(body)
# Pass query parameters through the ``params`` argument
async def hello(URL='http://httpbin.org/get'):
    """GET ``URL`` with the query parameters supplied via ``params``.

    Args:
        URL: Address to request. Defaults to the demo URL, which was
            previously assigned to a misspelled local (``URl``) and never used.
    """
    params = {'wd': 'python'}
    async with ClientSession() as session:
        async with session.get(URL, params=params) as response:
            body = await response.text()
            print(body)

 

 

post 請求

# Send the payload as a dict
async def hello(URL='http://httpbin.org/post'):
    """POST a dict payload to ``URL`` and print the body text.

    Args:
        URL: Address to post to. Defaults to the demo URL, which was
            previously assigned to a misspelled local (``URl``) and never used.
    """
    data = {'wd': 'python'}
    async with ClientSession() as session:
        # This section demonstrates POST, so use session.post, not session.get.
        async with session.post(URL, data=data) as response:
            body = await response.text()
            print(body)
# Send the payload as JSON
async def hello(URL='http://httpbin.org/post'):
    """POST a dict to ``URL`` via the ``json`` argument and print the body text.

    Args:
        URL: Address to post to. Defaults to the demo URL, which was
            previously assigned to a misspelled local (``URl``) and never used.
    """
    data = {'wd': 'python'}
    async with ClientSession() as session:
        # This section demonstrates POST, so use session.post, not session.get.
        async with session.post(URL, json=data) as response:
            body = await response.text()
            print(body)
# Send the payload as a string
async def hello(URL='http://httpbin.org/post'):
    """POST a string payload to ``URL`` and print the body text.

    Args:
        URL: Address to post to. Defaults to the demo URL, which was
            previously assigned to a misspelled local (``URl``) and never used.
    """
    data = 'python'
    async with ClientSession() as session:
        # This section demonstrates POST, so use session.post, not session.get.
        async with session.post(URL, data=data) as response:
            body = await response.text()
            print(body)
# Send the payload as a byte stream (file upload)
async def hello(URL='http://httpbin.org/post'):
    """POST a bytes payload to ``URL`` and print the body text.

    Args:
        URL: Address to post to. Defaults to the demo URL, which was
            previously assigned to a misspelled local (``URl``) and never used.
    """
    # Bytes, not str: this example is about sending a byte stream.
    data = b'python'
    async with ClientSession() as session:
        # This section demonstrates POST, so use session.post, not session.get.
        async with session.post(URL, data=data) as response:
            body = await response.text()
            print(body)

 

獲取響應內容方法

# 設置編碼格式
response = await response.text(encoding='utf-8')
# 以字節流格式返回
response = await response.read()
# 以 json 格式返回
response = await response.json()
# 獲取響應狀態碼
response = await response.status
# 獲取響應的請求頭
response = await response.headers
# 獲取 url 地址
url = response.url

 

 

異步爬取小說排行榜

import asyncio
import csv
from aiohttp import ClientSession
from bs4 import BeautifulSoup


# Page fetcher: downloads one page and hands back its content
async def getData(url, headers):
    """Return the body text of a GET request to ``url`` sent with ``headers``."""
    # One session per call; the request carries the caller's headers.
    async with ClientSession() as session, \
            session.get(url, headers=headers) as reply:
        page_text = await reply.text()
    return page_text


def savaData(result):
    """Parse ranking pages and append one CSV row per book to ``data.csv``.

    Args:
        result: Iterable of HTML page sources.

    Each ``div.book-mid-info`` element contributes a row of
    ``[name, author, update]``. Rows are collected first and then written
    with a single ``open`` inside a ``with`` block, so the file is opened
    once (not once per row) and is closed even if writing fails. When no
    rows are found the file is not touched, as before.
    """
    rows = []
    for page in result:
        soup = BeautifulSoup(page, 'lxml')
        for info in soup.find_all('div', class_='book-mid-info'):
            # Book title
            name = info.find('h4').getText()
            # Author
            author = info.find('a', class_='name').getText()
            # Last update time
            update = info.find('p', class_='update').getText()
            rows.append([name, author, update])

    if not rows:
        return

    with open('data.csv', 'a', encoding='utf8', newline='') as csv_file:
        csv.writer(csv_file).writerows(rows)


def run():
    """Download ranking pages 1-25 concurrently, save them and print the count.

    Relies on the module-level ``url``, ``headers``, ``tasks`` and ``loop``.
    """
    # Schedule one getData() call per page number and record it in ``tasks``.
    tasks.extend(
        asyncio.ensure_future(getData(url.format(page), headers))
        for page in range(1, 26)
    )
    # Block until every request has finished; gather returns all bodies at once.
    pages = loop.run_until_complete(asyncio.gather(*tasks))
    savaData(pages)
    print(len(pages))


if __name__ == '__main__':
    import time

    start = time.time()
    # Globals read by run(): URL template, shared task list, request headers.
    url = 'https://www.qidian.com/rank/hotsales?page={}'
    tasks = []
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/78.0.3904.87 Safari/537.36'
        ),
    }
    # Obtain the event loop that run() drives.
    loop = asyncio.get_event_loop()
    # Fetch and save all pages.
    run()
    end = time.time()
    # Elapsed wall-clock time in seconds.
    print(end - start)

 

發佈了176 篇原創文章 · 獲贊 28 · 訪問量 10萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章