scrapy-redis分佈式爬蟲 爬取知乎用戶的關注列表信息及關注者的粉絲
流程
1. 登陸知乎後,進入個人主頁,可以發現請求的url
響應爲json格式,一頁有20個用戶信息
2. 創建scrapy項目
3. 源碼
3.1 userinfo.py
爬取用戶信息,轉爲字典,若一頁剛好20條信息,則更改url來切換下一頁;否則信息保存到item,更換url來爬取另一個用戶的關注列表信息
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
import json
from zhihu.items import ZhihuItem
import re
class UserinfoSpider(scrapy.Spider):
    """Crawl Zhihu followee lists breadth-first.

    Each API response is a JSON page of up to 20 followees.  A full page
    (exactly 20 entries) means more pages may follow, so the next offset is
    requested.  Every user found is emitted as a ZhihuItem, and that user's
    own followee list is scheduled for crawling in turn.
    """

    name = 'userinfo'
    allowed_domains = ['zhihu.com']

    # Followee-list API endpoint; {} is the user's url_token.
    # NOTE(review): offset=20 skips the first page of followees — this
    # mirrors the original URLs, but offset=0 may be intended; confirm.
    FOLLOWEES_URL = (
        "https://www.zhihu.com/api/v4/members/{}/followees"
        "?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender"
        "%2Cfollower_count%2Cis_followed%2Cis_following"
        "%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20"
    )

    start_urls = [FOLLOWEES_URL.format('wang-shun-61-24')]

    # Pre-compiled pattern extracting the current offset from a page URL
    # (offset always sits between '&offset=' and the following '&limit').
    _OFFSET_RE = re.compile(r"&offset=(\d+)&")

    def parse(self, response):
        """Parse one followee page: paginate, emit items, fan out.

        :param response: scrapy Response whose body is the API's JSON.
        :yields: ZhihuItem per user, plus follow-up Requests.
        """
        users = json.loads(response.text)["data"]

        # A full page (20 entries) implies another page may exist;
        # a short page is the last one, so no pagination request is made.
        if len(users) == 20:
            offset = int(self._OFFSET_RE.search(response.url).group(1))
            next_page_url = response.url.replace(
                "&offset=%d&" % offset, "&offset=%d&" % (offset + 20))
            yield Request(url=next_page_url, callback=self.parse)

        for data in users:
            item = ZhihuItem()
            item["name"] = data["name"]
            item["url_token"] = data["url_token"]
            item["headline"] = data["headline"]
            item["follower_count"] = data["follower_count"]
            item["articles_count"] = data["articles_count"]
            yield item
            # Schedule this user's own followee list for crawling.
            yield Request(url=self.FOLLOWEES_URL.format(data["url_token"]),
                          callback=self.parse)
3.2 items.py
import scrapy
class ZhihuItem(scrapy.Item):
    """Container for the Zhihu user fields scraped by the spider."""

    # display name
    name = scrapy.Field()
    # unique URL slug identifying the user (also the upsert key)
    url_token = scrapy.Field()
    # one-line bio / headline
    headline = scrapy.Field()
    # number of followers
    follower_count = scrapy.Field()
    # number of published articles
    articles_count = scrapy.Field()
3.3 settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for the zhihu project.
BOT_NAME = 'zhihu'
SPIDER_MODULES = ['zhihu.spiders']
NEWSPIDER_MODULE = 'zhihu.spiders'
# Obey robots.txt rules (disabled so the JSON API endpoints can be fetched)
ROBOTSTXT_OBEY = False
# Seconds the downloader waits between consecutive requests to the same
# site; throttles the crawl to avoid hammering the server.
DOWNLOAD_DELAY = 3
# Override the default request headers:
# NOTE(review): the cookie below is a captured login session and will
# expire — replace it with a fresh one before running.
DEFAULT_REQUEST_HEADERS = {
"Host": "www.zhihu.com",
"cookie": '_zap=bf320ca4-f377-4f5d-a294-86a285cc1b07; _xsrf=lglbJpe1HZ2uYEmRfjc0RwFcNv3GsXVs; d_c0="ABBctvmj1RCPTnoZaMXWutUjIcug2t2mG4w=|1581995080"; capsion_ticket="2|1:0|10:1581995084|14:capsion_ticket|44:NzAzYWMxYjcwYzFlNDY1MWE0ZmFkNzIxODUzN2RjODE=|61dcd9dc1da5b58a1280caec5aac54ca0bd64b6506ea946ba455e990e4ad41ff"; z_c0="2|1:0|10:1581995119|4:z_c0|92:Mi4xOTAwdUFnQUFBQUFBRUZ5Mi1hUFZFQ1lBQUFCZ0FsVk5iNkk0WHdCMHFyRHBiaHhxX0M1OUppcEtPTzFVejNQcWxn|d8b66099962700028671a523b3f058dec5b1dd8d302d299edc616b8e41452d43"; q_c1=22c717de53034a299aadad488196c6c9|1581995151000|1581995151000; tshl=; tst=r; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1581995076,1582079852; KLBRSID=b33d76655747159914ef8c32323d16fd|1582105883|1582104976; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1582105885',
"referer": "https://www.zhihu.com/"
}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
#'zhihu.middlewares.ZhihuDownloaderMiddleware': 543,
'zhihu.middlewares.RandomUserAgentMiddleware': 542,
}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'zhihu.pipelines.ZhihuPipeline': 300,
}
# Pool of User-Agent strings that RandomUserAgentMiddleware picks from.
USER_AGENT_LIST = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# MongoDB connection settings: host / port / database name / collection name
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'zhihu'
MONGODB_DOCNAME = 'zhihu_collection'
3.4 middlewares.py
import random
from scrapy.utils.project import get_project_settings
import logging
# Downloader middleware that assigns a random User-Agent to every request.
class RandomUserAgentMiddleware(object):
    """Rotate the User-Agent header using settings' USER_AGENT_LIST."""

    def __init__(self):
        # Load the UA pool and logger once at construction instead of
        # re-reading project settings on every single request.
        self.user_agent_list = get_project_settings()["USER_AGENT_LIST"]
        self.logger = logging.getLogger(__name__)

    def process_request(self, request, spider):
        """Pick a User-Agent at random for this request."""
        request.headers["User-Agent"] = random.choice(self.user_agent_list)
        # None -> let normal downloader processing continue.
        return None

    def process_response(self, request, response, spider):
        """Log the User-Agent actually sent, to verify the rotation works."""
        self.logger.info("headers ::> User-Agent = " +
                         str(request.headers['User-Agent'], encoding="utf8"))
        return response
3.5 pipelines.py
數據存放到mongodb數據庫中
# -*- coding: utf-8 -*-
from scrapy.utils.project import get_project_settings # 獲取settings.py
import pymongo
from zhihu.items import ZhihuItem
class ZhihuPipeline(object):
    """Persist ZhihuItem records into MongoDB, upserting on url_token."""

    # Connection parameters, read once from settings.py at import time.
    settings = get_project_settings()
    host = settings['MONGODB_HOST']
    port = settings['MONGODB_PORT']
    dbName = settings['MONGODB_DBNAME']
    collectionName = settings['MONGODB_DOCNAME']

    def open_spider(self, spider):
        """Connect to MongoDB before any items are processed."""
        self.client = pymongo.MongoClient(host=self.host, port=self.port)
        # Database and collection are created lazily on first write.
        self.db = self.client[self.dbName]
        self.collection = self.db[self.collectionName]

    def process_item(self, item, spider):
        """Upsert the item keyed by url_token; return it unchanged."""
        if isinstance(item, ZhihuItem):
            # Collection.update() was deprecated in pymongo 3 and removed
            # in pymongo 4; update_one(..., upsert=True) is the supported
            # equivalent.  dict(item) is needed because pymongo cannot
            # BSON-encode a scrapy Item directly.
            self.collection.update_one(
                {"url_token": item["url_token"]},
                {"$set": dict(item)},
                upsert=True,
            )
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection after the spider finishes."""
        self.client.close()
4. 運行結果
Robo 3T顯示數據
5. scrapy-redis 分佈式修改配置
先下載 redis 和 scrapy-redis
a) 修改spider繼承類,添加redis_key, 註釋掉原有的起始url:start_urls
from scrapy_redis.spiders import RedisCrawlSpider


class UserinfoSpider(RedisCrawlSpider):
    """Redis-backed spider: seed URLs are popped from the Redis list
    named by redis_key instead of being hard-coded in start_urls."""

    name = 'userinfo'
    # Redis list to LPUSH the first crawl URL into.
    redis_key = "myspider:start_urls"
    allowed_domains = ['zhihu.com']
    # start_urls is deliberately commented out — Redis supplies the seeds.
    #start_urls = ['https://www.zhihu.com/api/v4/members/wang-shun-61-24/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20']
b) settings.py中添加 redis的相關配置
# Scrapy-Redis configuration
# Use the Scrapy-Redis scheduler so the request queue is stored in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Class handling request-fingerprint deduplication (also Redis-backed)
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Do not flush the Redis queues on close, i.e. keep the scheduler state
# and dedup records. True = keep, False = flush.
# This makes it possible to pause and resume the crawl.
SCHEDULER_PERSIST = True
# Schedule requests from a priority queue (this is the default)
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Alternative queues: PriorityQueue (sorted set), FifoQueue (list), LifoQueue (list)
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue' # breadth-first
#SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue' # depth-first
# Redis connection
REDIS_HOST = '127.0.0.1' # host name
REDIS_PORT = 6379 # port
#REDIS_PARAMS = {'password':'xxx'} # extra Redis connection parameters
REDIS_ENCODING = "utf-8" # Redis encoding; default: 'utf-8'
# Alternatively:
#REDIS_URL = 'redis://user:pass@hostname:9001' # connection URL (takes precedence over the settings above)
5.1 打開redis服務端和客戶端,並向redis_key列表裏添加url爲起始爬取的url
5.2 同時運行幾個來模擬分佈式爬取
參考學習:
https://www.bilibili.com/video/av20220465?from=search&seid=5965231367183075581
https://cuiqingcai.com/8465.html