Weibo advanced search crawler

I recently worked on a few Weibo-search crawling tasks; my takeaways:
1. You need to be logged in, but I collected close to 100,000 items without ever getting an account banned (a minimal cookie-reuse sketch follows this list).
2. A single post yields at most roughly 500 comments.
3. Collecting comments gets your IP banned quickly.
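
Since the search pages only work with a logged-in session, the simplest approach I know of is to reuse a Chrome profile in which you have already logged in to Weibo, rather than automating the login itself. A minimal sketch, assuming chromedriver is on the PATH; the profile path is a placeholder you replace with your own:

# Minimal sketch: start Chrome with an existing profile so the crawler already has Weibo login cookies.
# The profile path is hypothetical; point it at a profile where you have logged in to weibo.com before.
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument(r'--user-data-dir=C:\weibo_profile')  # hypothetical profile directory
driver = webdriver.Chrome(options=options)
driver.get('https://s.weibo.com/')
# ... run searches with the logged-in session ...
driver.quit()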

In the end I used Python 3 + Selenium + Chrome to crawl Weibo advanced search. Without comments it is reasonably fast; with the first 10 comments per post and the search window split into one-day slices, one keyword over one month yields roughly 20k+ posts and takes over 4 hours. The crawler collects comments, post text, user IDs, post IDs, and comment, repost, and like counts.
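
Splitting the search window into one-day slices just means generating one advanced-search URL per day through the timescope=custom parameter, which is what the commented-out template in the code below does. A rough sketch of building those URLs (the keyword encoding in the path may differ from what Weibo actually expects; treat it as illustrative):

# Sketch: turn one keyword plus a date range into one advanced-search URL per day.
from urllib.parse import quote
import datetime

def daily_search_urls(keyword, begin, end):
    template = ('https://s.weibo.com/weibo/{kw}?q={kw}&scope=ori&suball=1'
                '&timescope=custom:{start}-0:{stop}-0&Refer=g')
    day = datetime.datetime.strptime(begin, '%Y-%m-%d')
    last = datetime.datetime.strptime(end, '%Y-%m-%d')
    while day < last:
        nxt = day + datetime.timedelta(days=1)
        yield template.format(kw=quote(keyword),
                              start=day.strftime('%Y-%m-%d'),
                              stop=nxt.strftime('%Y-%m-%d'))
        day = nxt

# Usage: for url in daily_search_urls('電子競技', '2020-01-22', '2020-02-10'): ...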

Collecting comments requires a proxy; without one the IP gets banned after only a few thousand requests.
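
The comment data itself comes from the mobile API endpoint used in the code below (https://m.weibo.cn/api/comments/show?id=<mid>). A stand-alone sketch of that request with requests and a throwaway proxy (the proxy address is a placeholder; in my code a personal helper, get_new_1min_proxy(), supplies one):

# Sketch: fetch the first page of comments for one post through a proxy.
# 'mid' is the post id scraped from the search page; the proxy address is a placeholder.
import requests

def fetch_comments(mid, proxy='1.2.3.4:8888'):
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    url = 'https://m.weibo.cn/api/comments/show?id=' + mid
    resp = requests.get(url, proxies=proxies, timeout=10)
    data = resp.json()
    return [c['text'] for c in data['data']['data']]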

Partial code

# -*- coding: utf-8 -*-
"""
File Name:     main
Description :
Author :       meng_zhihao
mail :       [email protected]
date:          2020/2/4
"""

import datetime
import time
from selenium_operate import ChromeOperate  # personal utility class
from crawl_tool_for_py3_v6 import crawlerTool as ct  # personal utility class
import random
import json

def dateRange(beginDate, endDate):
    dates = []
    dt = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    date = beginDate[:]
    while date <= endDate:
        dates.append(date)
        dt = dt + datetime.timedelta(1)
        date = dt.strftime("%Y-%m-%d")
    return dates

def weibo_search():
    # start_date = '2020-01-10' # 10-12
    start_date = '2020-01-22'
    # end_date = '2020-02-12'
    end_date = '2020-02-10'

    cop = ChromeOperate(executable_path=r'F:\github\py3\amazon_craw\chromedriver.exe')
    date_list = dateRange(start_date, end_date)
    print(date_list)
    # search_url_template = 'https://s.weibo.com/weibo/%25E6%2594%25BF%25E5%25BA%259C?q=zyzf&scope=ori&suball=1&timescope=custom:{0}-0:{1}-0&Refer=g'
    #
    # for i in range(len(date_list)-1):
    #     start_date = date_list[i]
    #     end_date = date_list[i+1]
    #     search_url = search_url_template.format(start_date,end_date)
    if 1:  # single hard-coded query; swap in the commented-out per-day loop above for long date ranges
        search_url = 'https://s.weibo.com/weibo/%25E7%2594%25B5%25E7%25AB%259E%25E8%25A1%258C%25E4%25B8%259A?q=電子競技&scope=ori&suball=1&timescope=custom:2019-01-01:2020-02-01&Refer=g'
        cop.open(search_url)
        for page_num in range(50):
            try:
                need_comment = 0  # set to 1 to also fetch the first page of comments per post (needs proxies)
                page_buf = cop.open_source()
                posts = ct.getXpath('//div[@class="card-wrap"]',page_buf)
                if need_comment:
                    proxy = ct.get_new_1min_proxy()
                    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
                for post in posts:
                    texts = ct.getXpath('//p[@node-type="feed_list_content_full"]//text()',post)
                    if not texts:
                        texts = ct.getXpath('//p[@node-type="feed_list_content"]//text()', post)
                    texts = ''.join(texts)
                    if not texts:
                        continue
                    date = ""
                    from_source = ct.getXpath('//p[@class="from"]',post)
                    if from_source:
                        date = ct.getXpath1('//a/text()', from_source[-1])
                        date = date.strip()
                    nick = ct.getXpath1('//a/@nick-name',post)
                    mid = ct.getXpath1('//div/@mid',post)
                    # comments
                    comments_button = ct.getXpath1('//a[@action-type="feed_list_comment"]/text()',post)
                    get_comments = []

                    if need_comment:
                        if ct.getRegex('評論 (\d+.*)', comments_button):  # only fetch when the post shows a comment count
                            try:
                                try:
                                    # Fetching comments can get the IP banned; many comments are also
                                    # invisible (sensitive words, user settings) even when the count is non-zero.
                                    comments_page = ct.get('https://m.weibo.cn/api/comments/show?id=' + mid, proxies=proxies)
                                    json_data = json.loads(comments_page)
                                    comments = json_data['data']['data']
                                    for comment in comments:
                                        comment_text = comment['text']
                                        get_comments.append(comment_text)
                                except:
                                    # Back off and switch to a fresh proxy before the next post.
                                    time.sleep(2)
                                    proxy = ct.get_new_1min_proxy()
                                    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
                            except Exception as e:
                                print(e, mid)

                    line = [date,mid,nick,texts]+get_comments
                    yield line

                next_button = cop.find_elements_by_xpath('//a[@class="next"]')
                if next_button:
                    time.sleep(random.randint(1, 2)*0.6)
                    next_button[0].click()
                else:
                    break
            except Exception as e:
                print(e)


if __name__ == '__main__':
    data = weibo_search()
    ct.writer_to_csv(data, '電子競技.csv')  # whzf: no data for the first few days
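
writer_to_csv comes from my personal toolkit and is not shown here; a rough stand-in with the standard library, not the original helper, would look like this:

# Sketch stand-in for ct.writer_to_csv: write each yielded row out as one CSV line.
import csv

def write_rows_to_csv(rows, path):
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        for row in rows:
            writer.writerow(row)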


Code

GitHub
There is also a packaged Windows desktop program on GitHub; if you use it, please leave a comment with any problems or feedback.

Baidu netdisk: /s/1jwr25SydQxUoYV0CN7aYfg  (extraction code: ynxs)
