最近做了幾個關於微博搜索的需求,總結經驗如下:
1、需要登錄,但是採集了近10萬的數據也沒有碰到被封賬號的情況
2、單條博文最多可以採集500左右的評論
3、採集評論容易封ip
最後我使用的是 python3+selenium+chrome 進行微博高級搜索採集:不加評論的話速度還算快;在加上每條微博前10條評論、並按天切分搜索條件的情況下,一個關鍵詞一個月大概可採集 2 萬多條博文,用時 4 小時以上。可採集的字段包括:博文、評論、用戶id、博文id、評論數、轉發數、點贊數。
採集評論需要用代理,不然採集幾千條就封ip了。
部分代碼
# -*- coding: utf-8 -*-
"""
File Name: main
Description :
Author : meng_zhihao
mail : [email protected]
date: 2020/2/4
"""
import datetime
import time
from selenium_operate import ChromeOperate #個人工具類
from crawl_tool_for_py3_v6 import crawlerTool as ct #個人工具類
import random
import json
def dateRange(beginDate, endDate):
    """Return every calendar date from beginDate to endDate, inclusive.

    Args:
        beginDate: start date as an ISO "YYYY-MM-DD" string.
        endDate: end date as an ISO "YYYY-MM-DD" string (inclusive).

    Returns:
        list[str]: consecutive dates formatted "YYYY-MM-DD". Empty list
        when beginDate is after endDate.

    Raises:
        ValueError: if either argument is not a valid "YYYY-MM-DD" date.

    Note: the original compared raw strings lexicographically and never
    validated endDate; parsing both endpoints makes the comparison
    date-based and catches malformed input early.
    """
    day = datetime.datetime.strptime(beginDate, "%Y-%m-%d").date()
    last = datetime.datetime.strptime(endDate, "%Y-%m-%d").date()
    one_day = datetime.timedelta(days=1)
    dates = []
    while day <= last:
        dates.append(day.strftime("%Y-%m-%d"))
        day += one_day
    return dates
def weibo_search():
    """Generator: scrape Weibo advanced-search result pages via Selenium.

    Drives a Chrome instance through s.weibo.com search results for a
    hard-coded keyword/date-range URL, walking up to 50 result pages and
    yielding one list per post: [date, mid, nick, text] (+ up to ~10
    comment texts when comment fetching is enabled).

    NOTE(review): relies on the personal helpers ChromeOperate and
    crawlerTool (ct) and on an already-logged-in browser session — cannot
    be run standalone. Comment fetching is switched off (need_comment = 0)
    because hitting the m.weibo.cn comments API gets the IP banned quickly.
    """
    # start_date = '2020-01-10' # 10-12
    start_date = '2020-01-22'
    # end_date = '2020-02-12'
    end_date = '2020-02-10'
    # Path to a local chromedriver binary — machine-specific, adjust as needed.
    cop = ChromeOperate(executable_path=r'F:\github\py3\amazon_craw\chromedriver.exe')
    date_list = dateRange(start_date, end_date)  # currently only printed; per-day URL splitting is commented out below
    print(date_list)
    # search_url_template = 'https://s.weibo.com/weibo/%25E6%2594%25BF%25E5%25BA%259C?q=zyzf&scope=ori&suball=1&timescope=custom:{0}-0:{1}-0&Refer=g'
    #
    # for i in range(len(date_list)-1):
    # start_date = date_list[i]
    # end_date = date_list[i+1]
    # search_url = search_url_template.format(start_date,end_date)
    if 1:
        # Fixed advanced-search URL: keyword "電子競技", original posts only,
        # custom time scope 2019-01-01 .. 2020-02-01.
        search_url = 'https://s.weibo.com/weibo/%25E7%2594%25B5%25E7%25AB%259E%25E8%25A1%258C%25E4%25B8%259A?q=電子競技&scope=ori&suball=1&timescope=custom:2019-01-01:2020-02-01&Refer=g'
        cop.open(search_url)
        # Weibo search caps results at 50 pages per query.
        for page_num in range(50):
            try:
                need_comment = 0  # 0 = skip per-post comment fetching (avoids IP ban)
                page_buf = cop.open_source()
                # Each search hit is wrapped in a div.card-wrap element.
                posts = ct.getXpath('//div[@class="card-wrap"]',page_buf)
                if need_comment:
                    # Fresh short-lived proxy for the comments API.
                    proxy = ct.get_new_1min_proxy()
                    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
                for post in posts:
                    # Prefer the expanded full text; fall back to the truncated one.
                    texts = ct.getXpath('//p[@node-type="feed_list_content_full"]//text()',post)
                    if not texts:
                        texts = ct.getXpath('//p[@node-type="feed_list_content"]//text()', post)
                    texts = ''.join(texts)
                    if not texts:
                        continue  # non-post card (ads, notices) — skip
                    date = ""
                    from_source = ct.getXpath('//p[@class="from"]',post)
                    if from_source:
                        # Post timestamp is the link text of the last "from" line.
                        date = ct.getXpath1('//a/text()', from_source[-1])
                        date = date.strip()
                    nick = ct.getXpath1('//a/@nick-name',post)
                    mid = ct.getXpath1('//div/@mid',post)  # weibo post id
                    ''  # NOTE(review): stray no-op string literal — left as-is
                    # Comments
                    comments_button = ct.getXpath1('//a[@action-type="feed_list_comment"]/text()',post)
                    get_comments = []
                    if need_comment:
                        # Only fetch when the button shows a non-zero comment count.
                        if ct.getRegex('評論 (\d+.*)',comments_button):
                            try:
                                try:
                                    # Fetching comments gets the IP banned; also many comments are
                                    # invisible (sensitive words, user settings) even when the
                                    # comment count is non-zero.
                                    comments_page = ct.get('https://m.weibo.cn/api/comments/show?id='+mid,proxies=proxies)
                                    json_data = json.loads(comments_page)
                                    comments = json_data['data']['data']
                                    for comment in comments:
                                        comment_text = comment['text']
                                        get_comments.append(comment_text)
                                except:
                                    # Best-effort: on any failure wait and rotate the proxy,
                                    # then drop this post's comments.
                                    time.sleep(2)
                                    proxy = ct.get_new_1min_proxy()
                                    # # time.sleep(2)
                                    # comments_page = ct.get('https://m.weibo.cn/api/comments/show?id=' + mid)
                                    # json_data = json.loads(comments_page)
                                    # comments = json_data['data']['data']
                                    # for comment in comments:
                                    # comment_text = comment['text']
                                    # get_comments.append(comment_text)
                            except Exception as e:
                                print(e,mid)
                    line = [date,mid,nick,texts]+get_comments
                    yield line
                # Paginate via the "next" button; stop at the last page.
                next_button = cop.find_elements_by_xpath('//a[@class="next"]')
                if next_button:
                    # Small randomized delay to look less bot-like.
                    time.sleep(random.randint(1, 2)*0.6)
                    next_button[0].click()
                else:
                    break
            except Exception as e:
                # Keep crawling remaining pages even if one page blows up.
                print(e)
if __name__ == '__main__':
    # Lazily stream scraped rows straight into the output CSV.
    rows = weibo_search()
    # whzf: no data for the preceding few days (author's note)
    ct.writer_to_csv(rows, '電子競技.csv')
代碼地址:github(原文中為超鏈接)
另在github上放了個打包好的windows用的桌面程序,使用請務必留言反饋問題或者評價
百度網盤地址 /s/1jwr25SydQxUoYV0CN7aYfg 提取碼:ynxs