需求:抓取人民網微信公衆號的文章和評論
使用工具: fiddler python3 微信pc客戶端
破解過程:
首先 使用fiddler對微信pc端抓包,需要配置https證書,另外最好加個filter方便抓取
然後操作微信客戶端獲取公衆號文章的請求。如圖下拉即可獲取更多文章
從響應中我們即可發現需要的請求是什麼
獲取評論同理,只需要點開頁面即可。綜上,我們可以獲取到兩個請求所需要的鏈接。
搜索文章
https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MjM5NzI3NDg4MA==&f=json&offset=10&count=10&is_ok=1&scene=123&uin=NDY0MzUxMzYw&key=29b06f60899ec6c867d61e173db2b6c16b3762f2e5cdb92c852efde600dbd2ed93c64d59a2e7679f0fd63d08330f4c780bc03b178bce8203793945bab8f872a7fdea5e798d91dd45d80c529a51d09c14&pass_ticket=Z32RQhfJ8U4CzAuY%2FaTrO5L34DoZwby1AfD5CEmpKDF2Yigcc1zCN9DSIurf7xSj&wxtoken=&appmsg_token=1028_Oy97owPKmKBOYmxdHjbMGArLCZ5wlxcrjyd-mQ~~&x5=0&f=json
獲取文章內容
https://mp.weixin.qq.com/s?__biz=MjM5NzI3NDg4MA==&mid=2658556895&idx=1&sn=45312818684054380d10a0618acf5c66&chksm=bd5e8fb08a2906a6517eebcc581be3a6388ebc5f8c13b81812c2c42436d6c12c6bb25613f755&scene=123&key=d7c08afaa78fc97eee4343839926433f6ee2e4f26e0159426d076475a63c92d1821c1e348a1b961db6d2016852a5c0e5be7945e114b962d1d465651687f789469edc5278369374b1a4acbd920c7052ae&ascene=7&uin=NDY0MzUxMzYw&devicetype=Windows+10&version=62060833&lang=zh_CN&pass_ticket=Z32RQhfJ8U4CzAuY%2FaTrO5L34DoZwby1AfD5CEmpKDF2Yigcc1zCN9DSIurf7xSj&winzoom=1
獲取評論
https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&scene=0&__biz=MjM5NzI3NDg4MA==&appmsgid=2658556601&idx=2&comment_id=1004536869712969731&offset=0&limit=100&uin=NDY0MzUxMzYw&key=8f9b2dec1479b72249d443377529925f3f37d34961a20f6851359bc2b11cdecff94f05ca14294b36a69fdcd5d3214cdb74a09acd6a8f981d12839a1ec21633295da3ee806104c152a76a16d7a313883d&pass_ticket=Z32RQhfJ8U4CzAuY%25252FaTrO5L34DoZwby1AfD5CEmpKDF2Yigcc1zCN9DSIurf7xSj&wxtoken=777&devicetype=Windows%26nbsp%3B10&clientversion=62060833&__biz=MjM5NzI3NDg4MA%3D%3D&appmsg_token=1028_mMmQbMJMmj1lym1%252F8oGZU3L_yWbiIYKmZNX8Ob7oNkx2WL4jybZ8rd-pgHaTszVggPSfIAcc4QTU9D3k&x5=0&f=json
裏面都有一堆參數,我們需要弄清楚這些參數是怎麼來的。首先我們可以對比不同文章的鏈接,找出其中固定的參數,然後再從前置請求的響應中尋找每篇文章各不相同的參數。通過這種方法我們可以找到大部分參數。其中,__biz,pass_ticket,appmsg_token 在不同文章之間是相同的,而 comment_id 等可以從上一個請求的響應中獲取。
這樣下來,還有 2 個必要的參數獲取不了:key 和 uin(其他的參數都不是必要的)。所以我選擇了百度,對的,你沒有看錯……百度到的結果是可以固定 uin=777&key=777,目測在安卓平臺上抓包就能抓到這樣的請求。
最終代碼修改自前輩文章,刪除了數據庫相關的代碼,方便大家測試運行。
https://blog.csdn.net/qq_28804275/article/details/82150874
# coding=utf8
import json
import re
import time
from datetime import datetime
import requests
class WxMps(object):
    """Crawler for the articles and comments of a WeChat official account.

    The crawler pages through the article-list endpoint
    (``profile_ext?action=getmsg``), downloads every article page it finds,
    extracts the comment parameters embedded in the page and then requests
    the comments (``appmsg_comment?action=getcomment``). Results are only
    printed; nothing is persisted.

    Args:
        _biz: Value of the ``__biz`` query parameter identifying the account.
        _pass_ticket: Value of the ``pass_ticket`` query parameter (not fixed).
        _app_msg_token: ``appmsg_token`` sent to the article-list endpoint
            (not fixed).
        _cookie: Raw value for the ``Cookie`` request header (not fixed).
        _offset: Offset of the first article-list page to request.
    """

    # Seconds to wait on a single HTTP request before giving up, so that a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 30

    # Patterns locating the comment parameters inside an article page.
    _COMMENT_ID_RE = re.compile(r'var comment_id = "(.*)" \|\| "(.*)" \* 1;')
    _APP_MSG_ID_RE = re.compile(r"var appmsgid = '' \|\| '(.*)'\|\|")
    _APP_MSG_TOKEN_RE = re.compile(r'window.appmsg_token = "(.*)";')

    def __init__(self, _biz, _pass_ticket, _app_msg_token, _cookie, _offset=0):
        self.offset = _offset
        self.biz = _biz  # account identifier
        self.msg_token = _app_msg_token  # ticket (not fixed)
        self.pass_ticket = _pass_ticket  # ticket (not fixed)
        self.headers = {
            'Cookie': _cookie,  # not fixed
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        }

    def start(self):
        """Page through the account's article list and crawl each article.

        The loop ends when the endpoint reports a failure, when a page
        contains no messages, or when the returned ``next_offset`` does not
        advance (the next request would be identical to the current one).
        """
        offset = self.offset
        while True:
            api = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={0}&f=json&offset={1}' \
                  '&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={2}&wxtoken=&appmsg_token' \
                  '={3}&x5=1&f=json'.format(self.biz, offset, self.pass_ticket, self.msg_token)
            resp = requests.get(api, headers=self.headers, verify=False,
                                timeout=self.REQUEST_TIMEOUT).json()
            ret, status = resp.get('ret'), resp.get('errmsg')  # status info
            if not (ret == 0 or status == 'ok'):
                print('Before break , Current offset is %d' % offset)
                break

            print('Crawl article: ' + api)
            next_offset = resp['next_offset']  # offset for the next request
            # 'general_msg_list' is itself a JSON document encoded as a string.
            msg_list = json.loads(resp['general_msg_list'])['list']
            for msg in msg_list:
                self._parse_message(msg)
            print('next offset is %d' % next_offset)

            # Without this guard an exhausted list would be requested forever.
            if not msg_list or next_offset == offset:
                print('No more articles, stop at offset %d' % next_offset)
                break
            offset = next_offset

    def _parse_message(self, msg):
        """Crawl every article contained in one entry of the message list."""
        comm_msg_info = msg['comm_msg_info']  # shared by all articles of this push
        msg_id = comm_msg_info['id']
        post_time = datetime.fromtimestamp(comm_msg_info['datetime'])
        app_msg_ext_info = msg.get('app_msg_ext_info')
        if not app_msg_ext_info:
            return

        # First article of the push.
        self._parse_articles(app_msg_ext_info, msg_id, post_time)
        # Remaining articles of the same push, if any.
        for item in app_msg_ext_info.get('multi_app_msg_item_list') or []:
            item_id = item['fileid']
            if item_id == 0:
                # Some items report id 0; substitute a millisecond timestamp
                # so that ids stay unique.
                item_id = int(time.time() * 1000)
            self._parse_articles(item, item_id, post_time)

    @staticmethod
    def _normalize_content_url(content_url):
        """Return ``content_url`` cleaned up for fetching over HTTPS.

        Removes ``amp;`` (left over from HTML-escaped ``&``) and the
        ``#wechat_redirect`` fragment, and upgrades an ``http://`` scheme to
        ``https://``. Only the scheme is rewritten, so URLs that are already
        HTTPS or that contain ``http`` elsewhere are not mangled.
        """
        url = content_url.replace('amp;', '').replace('#wechat_redirect', '')
        if url.startswith('http://'):
            url = 'https://' + url[len('http://'):]
        return url

    def _parse_articles(self, info, msg_id, post_time):
        """Crawl the article described by ``info``.

        Only ``info['content_url']`` is used. The original version also read
        ``title``, ``cover``, ``author``, ``digest`` and ``source_url`` for
        persistence, which has been removed. ``msg_id`` and ``post_time``
        are accepted for interface compatibility and are currently unused.
        Entries without a ``content_url`` are skipped.
        """
        content_url = info.get('content_url')
        if not content_url:
            return
        # article_id is a placeholder (1) because nothing is stored.
        self._parse_article_detail(self._normalize_content_url(content_url), 1)

    def _parse_article_detail(self, content_url, article_id):
        """Fetch an article page and crawl its comments.

        The page is searched for ``comment_id``, ``appmsgid`` and
        ``appmsg_token``; comments are requested only when all three are
        present and non-empty. A failed download is reported and skipped.
        """
        try:
            html = requests.get(content_url, headers=self.headers, verify=False,
                                timeout=self.REQUEST_TIMEOUT).text
        except requests.RequestException:
            print('獲取評論失敗' + content_url)
            return

        matches = [pattern.search(html) for pattern in
                   (self._COMMENT_ID_RE, self._APP_MSG_ID_RE, self._APP_MSG_TOKEN_RE)]
        if not all(matches):
            return
        comment_id, app_msg_id, appmsg_token = (m.group(1) for m in matches)
        # All three values are required by the comment endpoint.
        if appmsg_token and app_msg_id and comment_id:
            print('Crawl article comments: ' + content_url)
            self._crawl_comments(app_msg_id, comment_id, appmsg_token, article_id)

    def _crawl_comments(self, app_msg_id, comment_id, appmsg_token, article_id):
        """Request the comments of one article and print nickname and likes.

        Each entry of ``elected_comment`` is read for ``nick_name`` and
        ``like_num``. ``article_id`` is accepted for interface compatibility
        and is currently unused. A failed or non-JSON response is reported
        and skipped, so one bad article does not abort the whole crawl.
        """
        api = 'https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&scene=0&__biz={0}' \
              '&appmsgid={1}&idx=1&comment_id={2}&offset=0&limit=100&uin=777&key=777' \
              '&pass_ticket={3}&wxtoken=777&devicetype=android-26&clientversion=26060739' \
              '&appmsg_token={4}&x5=1&f=json'.format(self.biz, app_msg_id, comment_id,
                                                     self.pass_ticket, appmsg_token)
        # NOTE(review): unlike the other two requests, this one keeps TLS
        # verification enabled, exactly as before - confirm that is intended.
        try:
            resp = requests.get(api, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT).json()
        except (requests.RequestException, ValueError):
            # ValueError covers a body that is not valid JSON.
            print('Crawl comments failed: ' + api)
            return

        base_resp = resp.get('base_resp') or {}
        ret, status = base_resp.get('ret'), base_resp.get('errmsg')
        if not (ret == 0 or status == 'ok'):
            return
        for comment in resp.get('elected_comment') or []:
            print(comment.get('nick_name'), comment.get('like_num'))
if __name__ == '__main__':
    # Close Fiddler before running this script!
    #
    # Every value below has to be refreshed with a packet-capture tool for
    # each official account and for each crawl.
    crawler = WxMps(
        _biz='MjM5NzI3NDg4MA==',  # People's Daily Online official account
        _pass_ticket='從fiddler裏摳',  # placeholder: copy the real value out of Fiddler
        _app_msg_token='xxxx',
        _cookie='wap_sid2=CIDhtd0BElxiNGFKQllUQmJ4WEwtU3FJT2JiV1ZMalBFTzNNcWpmWWQ4ajNuMEpUSlE4T3VfZHczT3ZpMkxvZjJST2U1dEhERGwyWHUxdy1iZGpGZmxKSk1LSGhTUVFFQUFBfjCQm8HsBTgNQAE=',
    )
    crawler.start()  # crawl the articles and their comments