關於微信公衆號文章的評論數,網上的教程是可以用的,這裏就不另外講了。這裏要說的是我抓閱讀量的過程,太tm坎坷了,足足花了我10個小時,幹到半夜12點半有想法了又起來開機!!不過好在最終實現了全代碼運行,不需要模擬器或者手機之類的。不過初始的token參數需要fiddler抓取一下pc端微信。
首先,前面抓評論數的經驗是微信安卓端和PC端的請求是不一樣的,需要綜合兩者的請求一起抓。
然後,該帶的信息必須全部帶上,一個post十幾個參數,一個都不能少! 我就是在這個問題上浪費了太多時間。
最後,善用chrome調試,多把中間的頁面保存下來分析,上面的js你看不明白就對比下chrome發的請求和你自己拼接出來的請求一不一樣。
最後測試結果,還有部分文章沒抓出來,估計是我哪裏還有bug。然後超過10w的閱讀量是隻會顯示100001的。從搜索到獲取信息全部由python獨立完成,連selenium都沒用。不過如果不追求速度用selenium會非常簡單!
參考代碼
# coding=utf8
import json
import random
import re
import time
import urllib
import urllib.parse
from datetime import datetime

import requests
class WxMps(object):
    """Crawler for WeChat Official Account articles, comments and read counts."""

    # Desktop-WeChat embedded-browser User-Agent: makes mp.weixin.qq.com treat
    # this session as the WeChat PC client.
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36 QBCore/3.53.1159.400 QQBrowser/9.0.2524.400 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat'

    def __init__(self, _biz, _pass_ticket, _app_msg_token, _cookie, _offset=0):
        """
        :param _biz: __biz identifier of the official account (URL-encoded).
        :param _pass_ticket: pass_ticket credential sniffed from the PC client.
        :param _app_msg_token: appmsg_token credential (rotates, not fixed).
        :param _cookie: raw Cookie header string sniffed with Fiddler.
        :param _offset: article-list paging offset to start from.
        """
        self.offset = _offset
        self.biz = _biz                   # official-account identifier
        self.msg_token = _app_msg_token   # ticket (rotates)
        self.pass_ticket = _pass_ticket   # ticket (rotates)
        self.session = requests.session()
        self.headers = {'Cookie': _cookie, 'User-Agent': self._USER_AGENT}
        # BUG FIX: the original passed the *headers* dict to
        # cookiejar_from_dict, which created bogus cookies literally named
        # "Cookie" and "User-Agent".  Parse the raw cookie string into
        # name/value pairs instead.
        cookie_dict = {}
        for pair in _cookie.split(';'):
            if '=' in pair:
                name, value = pair.strip().split('=', 1)
                cookie_dict[name] = value
        self.session.cookies = requests.utils.cookiejar_from_dict(
            cookie_dict, cookiejar=None, overwrite=True)
        self.session.headers.update({'User-Agent': self._USER_AGENT})
def start(self):
"""請求獲取公衆號的文章接口"""
offset = self.offset
while True:
api = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={0}&f=json&offset={1}' \
'&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={2}&wxtoken=&appmsg_token' \
'={3}&x5=1&f=json'.format(self.biz, offset, self.pass_ticket, self.msg_token)
print(api)
resp = self.session.get(api, verify=False).json()
ret, status = resp.get('ret'), resp.get('errmsg') # 狀態信息
if ret == 0 or status == 'ok':
offset = resp['next_offset'] # 下一次請求偏移量
general_msg_list = resp['general_msg_list']
msg_list = json.loads(general_msg_list)['list'] # 獲取文章列表
for msg in msg_list:
comm_msg_info = msg['comm_msg_info'] # 該數據是本次推送多篇文章公共的
msg_id = comm_msg_info['id'] # 文章id
post_time = datetime.fromtimestamp(comm_msg_info['datetime']) # 發佈時間
# msg_type = comm_msg_info['type'] # 文章類型
# msg_data = json.dumps(comm_msg_info, ensure_ascii=False) # msg原數據
app_msg_ext_info = msg.get('app_msg_ext_info') # article原數據
if app_msg_ext_info:
# 本次推送的首條文章
self._parse_articles(app_msg_ext_info, msg_id, post_time)
# 本次推送的其餘文章
multi_app_msg_item_list = app_msg_ext_info.get('multi_app_msg_item_list')
if multi_app_msg_item_list:
for item in multi_app_msg_item_list:
msg_id = item['fileid'] # 文章id
if msg_id == 0:
msg_id = int(time.time() * 1000) # 設置唯一id,解決部分文章id=0出現唯一索引衝突的情況
self._parse_articles(item, msg_id, post_time)
print('next offset is %d' % offset)
else:
print('Before break , Current offset is %d' % offset)
break
def _parse_articles(self, info, msg_id, post_time):
"""解析嵌套文章數據並保存入庫"""
title = info.get('title') # 標題
cover = info.get('cover') # 封面圖
author = info.get('author') # 作者
digest = info.get('digest') # 關鍵字
source_url = info.get('source_url') # 原文地址
content_url = info.get('content_url') # 微信地址
# ext_data = json.dumps(info, ensure_ascii=False) # 原始數據
content_url = content_url.replace('amp;', '').replace('http', 'https')
self._parse_article_detail(content_url, 1)
def _parse_article_detail(self, content_url, article_id):
# 從文章頁提取相關參數用於獲取評論,article_id是已保存的文章id
try:
html = self.session.get(content_url,verify=False).text
with open('1.html','w',encoding="utf-8") as f:
f.write(html)
except Exception as e:
print('獲取評論失敗' + content_url)
print(e)
else:
# group(0) is current line
str_comment = re.search(r'var comment_id = "(.*)" \|\| "(.*)" \* 1;', html)
str_msg = re.search(r"var appmsgid = '' \|\| '(.*)'\|\|", html)
str_token = re.search(r'window.appmsg_token = "(.*)";', html)
mid = re.search(r'mid=(\d*)', content_url).group(1)
sn = re.search(r'sn=(\w*)', content_url).group(1)
ct = re.search(r'ct = "(.*)";', html).group(1)
title = re.search(r'var msg_title = "(.*?)"', html).group(1)
req_id = re.search(r'var req_id = \'(.*?)\'', html).group(1)
devicetype = re.search(r'var devicetype = "(.*?)"', html).group(1)
scene = re.search(r'scene=(\d*)', content_url).group(1)
if str_comment and str_msg and str_token:
comment_id = str_comment.group(1) # 評論id(固定)
app_msg_id = str_msg.group(1) # 票據id(非固定)
appmsg_token = str_token.group(1) # 票據token(非固定)
# 缺一不可
if appmsg_token and app_msg_id and comment_id:
print('Crawl article comments: ' + content_url)
# self._crawl_comments(app_msg_id, comment_id, appmsg_token, article_id)
self._crawl_yuedu(self.pass_ticket, appmsg_token,mid,comment_id,sn,ct,title,req_id,devicetype,scene)
def _crawl_comments(self, app_msg_id, comment_id, appmsg_token, article_id):
"""抓取文章的評論"""
api = 'https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&scene=0&__biz={0}' \
'&appmsgid={1}&idx=1&comment_id={2}&offset=0&limit=100&uin=777&key=777' \
'&pass_ticket={3}&wxtoken=777&devicetype=android-26&clientversion=26060739' \
'&appmsg_token={4}&x5=1&f=json'.format(self.biz, app_msg_id, comment_id,
self.pass_ticket, appmsg_token)
resp = self.session.get(api, verify=False).json()
ret, status = resp['base_resp']['ret'], resp['base_resp']['errmsg']
if ret == 0 or status == 'ok':
elected_comment = resp['elected_comment']
for comment in elected_comment:
nick_name = comment.get('nick_name') # 暱稱
logo_url = comment.get('logo_url') # 頭像
comment_time = datetime.fromtimestamp(comment.get('create_time')) # 評論時間
content = comment.get('content') # 評論內容
content_id = comment.get('content_id') # id
like_num = comment.get('like_num') # 點贊數
# reply_list = comment.get('reply')['reply_list'] # 回覆數據
print(nick_name,like_num)
def _crawl_yuedu(self,pass_ticket, appmsg_token,mid,comment_id,sn,ct,title,req_id,devicetype,scene):
api = 'https://mp.weixin.qq.com/mp/getappmsgext?f=json&mock=&uin=777&key=777&pass_ticket=&wxtoken=777&devicetype={0}&clientversion=62060833&__biz={1}&appmsg_token={2}&f=json&x5=0'.format(
urllib.parse.quote(devicetype),self.biz,appmsg_token)
data = {
'r':random.random(),
'__biz': 'MjM5NzI3NDg4MA==',
'appmsg_type': 9,
'mid': mid,
'sn': sn,
'idx': 1,
'scene': 27,
'title': urllib.parse.quote(title),
'ct': 1569809116,
'abtest_cookie':None,
'devicetype': 'Windows10',
'version': 62060833,
'is_need_ticket': 1,
'is_need_ad': 0,
'comment_id': comment_id,
'is_need_reward': 0,
'both_ad': 0,
'reward_uin_count': 0,
'send_time':None,
'msg_daily_idx': 1,
'is_original': 0,
'is_only_read': 1,
'req_id': req_id,
'pass_ticket':None,
'is_temp_url': 0,
'item_show_type': 0,
'tmp_version': 1,
'more_read_type': 0,
'appmsg_like_type': 2,
'related_video_sn':None,
'vid':None
}
time.sleep(2)
resp = self.session.post(api, verify=False,data=data).json() #爲啥這裏做着做着就變成了python的header?
print(resp)
if __name__ == '__main__':
    # Credentials below must be re-captured with a packet sniffer (Fiddler)
    # for each official account, and refreshed whenever they expire.
    # Close Fiddler before running!
    biz = 'MjM5NzI3NDg4MA%3D%3D'  # "People's Daily Online"
    pass_ticket = 'nu+69QK2jsmCmAKjjqp998SuOZpRzA6cyxmu6F7xHCw/P/IoHxF8WWrrCwyBi8sI'
    app_msg_token = '1028_XeSDhMAkMLG3l5xAQax7E9jA7h-o1-KWlAPWVIuxzgtokiEOFK89u8_U6RsoI9OdXdBBLcgfxvc156Mz'
    cookie = 'pgv_pvid=7670766920; pgv_pvi=6227352576; RK=uBrUka7cTI; ptcz=366b89eceeb512317f19bb9082a8579afc611ad8f3546e3ab65722bbd9f2ece8; wxuin=464351360; lang=zh_CN; rewardsn=; wxtokenkey=777; devicetype=Windows10; version=62060833; pass_ticket=nu+69QK2jsmCmAKjjqp998SuOZpRzA6cyxmu6F7xHCw/P/IoHxF8WWrrCwyBi8sI; wap_sid2=CIDhtd0BElxQRHJSQ1JBV2ZRbEo2djhoWVA4UjRTOVZjai0zR3Z1YUlDYjJ6VURUb1RGY2hJOXgtejlxakQtem94ZGdrb0ItYlZSTl8yY2pWbmVIaE9nNXFHMUpNUVFFQUFBfjCrt8XsBTgNQAE='
    wxMps = WxMps(biz, pass_ticket, app_msg_token, cookie)
    wxMps.start()  # start crawling articles, comments and read counts