Python crawler beginner's road to mastery, part 8 ----- a first look at scrapy-redis distributed crawling

A scrapy-redis distributed crawler that scrapes the followee lists of Zhihu users and, for each followee found, crawls that user's own followee list in turn.

Workflow

1. After logging in to Zhihu and opening a user's profile page, you can find the URL of the followee-list request

 The response is JSON; each page contains 20 user records.
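
The spider below only reads a handful of fields from each record, so one page of the response can be pictured roughly like this (a sketch with placeholder values, not real data; only the fields the spider actually uses are shown):

# rough shape of one page of the followees response (placeholder values)
page = {
    "data": [
        {
            "name": "some user",
            "url_token": "some-user-1",
            "headline": "bio text",
            "follower_count": 123,
            "articles_count": 4,
        },
        # ... up to 20 entries per page
    ],
}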

2. Create the Scrapy project
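
Assuming the project and spider names that appear in the code below (zhihu and userinfo), the usual Scrapy commands are:

scrapy startproject zhihu
cd zhihu
scrapy genspider userinfo zhihu.com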

3. Source code

3.1 userinfo.py

       Parse the user info from the JSON response into a dict. If a page holds exactly 20 records, rewrite the offset in the URL to request the next page. In addition, every record is saved to an item, and a new URL is built from its url_token so that followee's own followee list gets crawled as well.

# -*- coding: utf-8 -*-

import scrapy
from scrapy import Request
import json
from zhihu.items import ZhihuItem
import re


class UserinfoSpider(scrapy.Spider):
    name = 'userinfo'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/api/v4/members/wang-shun-61-24/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20']

    def parse(self, response):
        # parse the JSON response; the user records live under the "data" key
        userData = json.loads(response.body.decode("utf-8"))["data"]
        count = len(userData)
        if count == 20:
            # a full page (20 records) means there may be a next page:
            # bump the offset in the URL by 20 and request it
            # (re.findall returns a list of all substrings matching the pattern)
            offset = int(re.findall("&offset=(.*?)&", response.url)[0])
            next_offset = offset + 20
            next_page_url = response.url.replace("&offset=" + str(offset) + "&",
                                                 "&offset=" + str(next_offset) + "&")
            yield Request(url=next_page_url, callback=self.parse)

        for data in userData:
            # save each followee's fields to an item
            item = ZhihuItem()
            item["name"] = data["name"]
            item["url_token"] = data["url_token"]
            item["headline"] = data["headline"]
            item["follower_count"] = data["follower_count"]
            item["articles_count"] = data["articles_count"]
            yield item

            # also crawl this followee's own followee list, built from its url_token
            next_id_url="https://www.zhihu.com/api/v4/members/"+data["url_token"]+"/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20"
            yield Request(url=next_id_url, callback=self.parse)
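
As a quick standalone check of the offset-based pagination above (a minimal sketch using a shortened URL, not part of the project code):

import re

url = "https://www.zhihu.com/api/v4/members/wang-shun-61-24/followees?include=...&offset=20&limit=20"
# re.findall returns a list of every substring matched by the capture group
offset = int(re.findall("&offset=(.*?)&", url)[0])            # 20
next_page_url = url.replace("&offset=" + str(offset) + "&",
                            "&offset=" + str(offset + 20) + "&")
print(next_page_url)                                          # ...&offset=40&limit=20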




3.2 items.py

import scrapy


class ZhihuItem(scrapy.Item):
    # display name
    name = scrapy.Field()
    # url token (the user slug used in API URLs)
    url_token = scrapy.Field()
    # headline / bio
    headline = scrapy.Field()
    # number of followers
    follower_count = scrapy.Field()
    # number of published articles
    articles_count = scrapy.Field()

3.3 settings.py

# -*- coding: utf-8 -*-

BOT_NAME = 'zhihu'

SPIDER_MODULES = ['zhihu.spiders']
NEWSPIDER_MODULE = 'zhihu.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Time (in seconds) the downloader waits between consecutive requests to the same site.
# Used to throttle the crawl so the server is not hit too hard.
DOWNLOAD_DELAY = 3

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "Host": "www.zhihu.com",
    "cookie": '_zap=bf320ca4-f377-4f5d-a294-86a285cc1b07; _xsrf=lglbJpe1HZ2uYEmRfjc0RwFcNv3GsXVs; d_c0="ABBctvmj1RCPTnoZaMXWutUjIcug2t2mG4w=|1581995080"; capsion_ticket="2|1:0|10:1581995084|14:capsion_ticket|44:NzAzYWMxYjcwYzFlNDY1MWE0ZmFkNzIxODUzN2RjODE=|61dcd9dc1da5b58a1280caec5aac54ca0bd64b6506ea946ba455e990e4ad41ff"; z_c0="2|1:0|10:1581995119|4:z_c0|92:Mi4xOTAwdUFnQUFBQUFBRUZ5Mi1hUFZFQ1lBQUFCZ0FsVk5iNkk0WHdCMHFyRHBiaHhxX0M1OUppcEtPTzFVejNQcWxn|d8b66099962700028671a523b3f058dec5b1dd8d302d299edc616b8e41452d43"; q_c1=22c717de53034a299aadad488196c6c9|1581995151000|1581995151000; tshl=; tst=r; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1581995076,1582079852; KLBRSID=b33d76655747159914ef8c32323d16fd|1582105883|1582104976; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1582105885',
    "referer": "https://www.zhihu.com/"
}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    #'zhihu.middlewares.ZhihuDownloaderMiddleware': 543,
    'zhihu.middlewares.RandomUserAgentMiddleware': 542,
}


# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'zhihu.pipelines.ZhihuPipeline': 300,
}

USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]

# MongoDB settings: host / port / database name / collection name
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'zhihu'
MONGODB_DOCNAME = 'zhihu_collection'

3.4 middlewares.py

import random
from scrapy.utils.project import get_project_settings
import logging

# downloader middleware that picks a random User-Agent for every request
class RandomUserAgentMiddleware(object):
    def process_request(self, request, spider):
        # choose a random User-Agent from the USER_AGENT_LIST defined in settings
        settings = get_project_settings()
        user_agent = random.choice(settings["USER_AGENT_LIST"])
        request.headers["User-Agent"] = user_agent
        return None

    def process_response(self, request, response, spider):
        # log the request's User-Agent to verify the middleware took effect
        logger = logging.getLogger(__name__)
        logger.info("headers ::> User-Agent = " + str(request.headers['User-Agent'], encoding="utf8"))
        return response

3.5 pipelines.py

      Store the scraped data in MongoDB.

# -*- coding: utf-8 -*-

from scrapy.utils.project import get_project_settings  # access settings.py
import pymongo
from zhihu.items import ZhihuItem

class ZhihuPipeline(object):
    settings = get_project_settings()
    host = settings['MONGODB_HOST']
    port = settings['MONGODB_PORT']
    dbName = settings['MONGODB_DBNAME']
    collectionName = settings['MONGODB_DOCNAME']

    # connect to MongoDB before item processing starts
    def open_spider(self, spider):
        # create the client connection
        self.client = pymongo.MongoClient(host=self.host, port=self.port)
        # select the database
        self.db = self.client[self.dbName]
        # select the collection
        self.collection = self.db[self.collectionName]

    def process_item(self, item, spider):
        if isinstance(item, ZhihuItem):
            # upsert by url_token: update the fields if the user already exists,
            # otherwise insert a new document
            self.collection.update_one({"url_token": item["url_token"]},
                                       {"$set": dict(item)},
                                       upsert=True)
        return item

    # close the connection once the spider has finished
    def close_spider(self, spider):
        self.client.close()

4. Results

     The scraped data displayed in Robo 3T
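
If you prefer checking from Python instead of Robo 3T, here is a minimal sketch using pymongo and the database/collection names configured in settings.py:

import pymongo

client = pymongo.MongoClient("127.0.0.1", 27017)
collection = client["zhihu"]["zhihu_collection"]

print(collection.count_documents({}))   # number of users stored so far
print(collection.find_one())            # peek at one stored document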

5. Converting to a scrapy-redis distributed crawler

    First install Redis and the scrapy-redis package.
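
The Python-side dependencies can be installed with pip (the Redis server itself is installed separately for your OS):

pip install redis scrapy-redis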

    a) Change the spider to inherit from the scrapy-redis spider class, add a redis_key, and comment out the original start_urls

from scrapy_redis.spiders import RedisCrawlSpider


class UserinfoSpider(RedisCrawlSpider):
    name = 'userinfo'
    redis_key = "myspider:start_urls"
    allowed_domains = ['zhihu.com']
    #start_urls = ['https://www.zhihu.com/api/v4/members/wang-shun-61-24/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20']

    b) Add the Redis-related settings to settings.py

# Scrapy-Redis configuration
# Use the Scrapy-Redis scheduler, which stores the request queue in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Use the Redis-based dupefilter for request deduplication
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Do not clear the Redis queues on close, i.e. keep the scheduler queue and
# dedup records (True = keep, False = clear). This allows pausing/resuming the crawl.
SCHEDULER_PERSIST = True
# Schedule requests with a priority queue (the default)
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'

# Other available queues: PriorityQueue (sorted set), FifoQueue (list), LifoQueue (list)
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'  # breadth-first
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'  # depth-first

# Redis connection
REDIS_HOST = '127.0.0.1'               # host
REDIS_PORT = 6379                      # port
#REDIS_PARAMS  = {'password':'xxx'}     # extra Redis connection parameters
REDIS_ENCODING = "utf-8"               # Redis encoding, default: 'utf-8'
# Alternatively:
#REDIS_URL = 'redis://user:pass@hostname:9001' # connection URL (takes precedence over the settings above)

  5.1 Start the Redis server and a Redis client, then push the starting URL onto the redis_key list, as shown below
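
For example, using redis-cli; the key must match the spider's redis_key (myspider:start_urls) and the URL is the original spider's start URL:

redis-cli lpush myspider:start_urls "https://www.zhihu.com/api/v4/members/wang-shun-61-24/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20"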

5.2 Run several spider instances at the same time to simulate distributed crawling
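
For example, open several terminals (or several machines pointing at the same Redis instance) and start the same spider in each; the instances share the Redis request queue and dupefilter:

scrapy crawl userinfo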

 

References:

https://www.bilibili.com/video/av20220465?from=search&seid=5965231367183075581

https://cuiqingcai.com/8465.html

