requests庫的使用
一、簡介
二、發起請求
三、接收響應
四、session對象
五、練習
http://wz.sun0769.com/index.php/question/questionType?type=4
爬取投訴帖子的編號、帖子的 URL、帖子的標題和帖子裏的內容,並將每條帖子的結果寫入 JSON 文件中。
import re
import requests
import json
def request(url, headers=None, timeout=10, encoding="gbk"):
    """Fetch *url* with an HTTP GET and return the response.

    Args:
        url: Address of the page to download.
        headers: Optional mapping of HTTP request headers.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.
        encoding: Text encoding applied to the response before ``.text`` is
            read. Defaults to ``"gbk"``, the value this script has always
            forced on every response.

    Returns:
        The ``requests.Response`` object with ``encoding`` already set.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    # Override the encoding guessed by requests so that res.text decodes
    # the page body with the expected codec.
    res.encoding = encoding
    return res
# HTTP headers sent with the listing-page request: a desktop Chrome
# User-Agent string and a fixed Cookie value.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
    'Cookie': 'NSC_wt_xa.tvo0769.dpn=ffffffffc3a0145d45525d5f4f58455e445a4a423660'
}

# Listing page of complaint posts.
url_1 = "http://wz.sun0769.com/index.php/question/questionType?type=4"
res = request(url_1, headers=headers)
listing_html = res.text

# Complaint numbers: the digits inside the fixed-width table cell.
number_pattern = re.compile(
    r'<td width="53" height="30" align="center" bgcolor="#FFFFFF">(\d+?)</td>'
)
num = number_pattern.findall(listing_html)
print(num)

# (url, title) pairs taken from the link that follows each "[投訴]" anchor.
link_pattern = re.compile(r'\[投訴\]</a> <a href="(.*?)" title="(.*?)"', re.S)
url_titles = link_pattern.findall(listing_html)
print(url_titles)
# For every complaint found on the listing page, download its detail page,
# extract the complaint text and write one JSON file per complaint.

# The complaint text is read from the detail page's
# <meta name="description"> tag. The pattern is loop-invariant, so it is
# compiled once up front.
description_pattern = re.compile(
    r'<meta name="description" content="(.+?)"', re.S
)

# zip() pairs each complaint number with its (url, title) tuple and stops at
# the shorter list, so a mismatch between the two regex results can no
# longer raise IndexError part-way through the run.
for i, (number, (url, title)) in enumerate(zip(num, url_titles)):
    # NOTE(review): unlike the listing request, the detail request is sent
    # without the custom headers -- confirm that this is intended.
    content = request(url)
    # findall() returns a list; it is stored as-is so the JSON layout is
    # unchanged (an empty list when the meta tag is absent).
    text = description_pattern.findall(content.text)
    complaint = {'投訴': {'編號': number, 'url': url, '標題': title, '內容': text}}
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened with an explicit UTF-8 encoding instead of the locale default.
    with open(
        r"D:\Python學院學習環境\pachong\complaints\complaint{}.json".format(i),
        "w",
        encoding="utf-8",
    ) as f:
        json.dump(complaint, f, ensure_ascii=False, indent=4)
效果圖