Python Web Crawler 01

Introduction

Two small crawlers built on the requests library: fetching a Baidu search results page, then saving the Douban Movie Top 250 list page by page.

A simple crawl of Baidu

"""
爬起百度網頁
"""
import requests

# The URL to crawl
url = "https://www.baidu.com/s"

# Set the request headers and add a User-Agent field to mimic a browser.
# Without it, Baidu can tell from the default User-Agent that you are fetching
# the page with Python rather than a browser.
# One gotcha: open your own browser and grab your own request headers, or the
# request falls over. Press F12 and you can read every header the browser sends
# from the dev-tools Network panel.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400",
    "Cookie": "BAIDUID=80318213050C70FA4B0C391550F555AE:FG=1; BIDUPSID=80318213050C70FA4B0C391550F555AE; PSTM=1572569970; BD_UPN=1a314753; BDUSS=lUSC1Yd09ldWZ4Sm9OV0Nlc3FiTVVNZVVLclhETFVwSnF-Nn4xMkM1ZHJTfnRkRVFBQUFBJCQAAAAAAAAAAAEAAAD64E5HAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAGu-011rvtNdbV; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ispeed_lsm=0; COOKIE_SESSION=35417_72_9_0_239_142_1_9_0_9_2_9_35536_0_3_232_1575198744_1574348164_1575198741%7C9%23452652_22_1574347932%7C5; BD_HOME=1; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; delPer=0; BD_CK_SAM=1; PSINO=7; H_PS_PSSID=1452_21102_30211_20880_29700; H_PS_645EC=00a7HVWKUJg8wftDg%2FwzzfxmJUP2dfpRwq21%2FMDNtQdM4%2FKNp9fv5hF%2BXkSwdACfD97T",
    "Host": "www.baidu.com"
}
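# (Aside, a quick check of what Baidu sees when you skip this step: requests
# exposes its default headers, and the User-Agent there names python-requests.)
# import requests
# print(requests.utils.default_headers())
# # e.g. {'User-Agent': 'python-requests/2.x.y', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}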

# Set the query-string parameters
params = {
    "wd": "中國"  # the search keyword ("China")
}
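# (Aside: requests URL-encodes the params for you, so the request actually goes
# to https://www.baidu.com/s?wd=%E4%B8%AD%E5%9C%8B. Compare print(response.url) below.)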

# Send the request and get the response
response = requests.get(url, params=params, headers=headers)


print(f"響應編碼:{response.encoding}")
print(f"響應狀態碼:{response.status_code}")

# Set the encoding explicitly, otherwise the decoded text may come back garbled
response.encoding = "utf-8"
# Print the content
print(response.text)     # body decoded to str using response.encoding
print(response.content)  # raw body as bytes
print(response.url)      # final URL, with the params encoded into the query string

# Writing it this long way is a chore
# f = open("百度.html", "w", encoding="UTF-8")
# f.write(response.text)
# f.close()


# with opens the file and closes it automatically once the block ends,
# the idiomatic way to do file I/O
with open("中國.html", "w", encoding="UTF-8") as f:
    f.write(response.text)
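
A variant worth noting (a minimal sketch, not in the original post, reusing the response object from the script above): writing response.content in binary mode sidesteps the encoding question entirely, because the raw bytes go to disk exactly as Baidu sent them.

"""
Variant: save the raw bytes instead of the decoded text
"""
# assumes `response` from the script above
with open("中國.html", "wb") as f:  # "wb" is binary mode, so no encoding= argument
    f.write(response.content)       # the body as bytes, written untouched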

Crawling Douban Movie Top 250, saving the data page by page

"""
爬取豆瓣電影TOP250,分頁保存電影數據
"""
import requests
import time

# Set the request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400",
}

# Crawl all 10 pages in a loop (25 movies per page)
for i in range(10):
    # The request URL; start= is the offset of the first movie on the page
    url = f"https://movie.douban.com/top250?start={i * 25}"
    # verify=False skips TLS certificate verification; the request failed here
    # without it (note that it triggers an InsecureRequestWarning instead)
    response = requests.get(url, headers=headers, verify=False)
    print(response.status_code)
    if response.status_code == 200:
        with open(f"page_{i+1}.txt", "w", encoding="UTF-8") as f:
            f.write(response.text)
            print(f"{url} saved")
    # sleep for two seconds so the site doesn't block us
    time.sleep(2)
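
One more note on verify=False: with certificate verification switched off, urllib3 emits an InsecureRequestWarning for every request. A minimal sketch for silencing it (an addition, not in the original post; since douban.com serves a valid certificate, simply dropping verify=False should also work):

import urllib3

# silence the InsecureRequestWarning that verify=False triggers
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)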

Over.
