源碼閱讀如下:
"""Set User-Agent header per spider or use a default value from settings"""
from scrapy import signals
class UserAgentMiddleware(object):
    """Downloader middleware that sets the User-Agent request header.

    The value is taken from the USER_AGENT setting; an individual spider
    can override it by defining its own ``user_agent`` attribute.
    """

    def __init__(self, user_agent='Scrapy'):
        # Fallback value used when no USER_AGENT setting is provided.
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        # Build the middleware from the USER_AGENT setting, then hook the
        # spider_opened signal so a per-spider user_agent can take over.
        middleware = cls(crawler.settings['USER_AGENT'])
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def spider_opened(self, spider):
        # A ``user_agent`` attribute on the spider wins over the setting;
        # otherwise keep whatever value we already have.
        self.user_agent = getattr(spider, 'user_agent', self.user_agent)

    def process_request(self, request, spider):
        # Only act when a user agent is configured, and never overwrite a
        # User-Agent header the request already carries.
        if self.user_agent:
            request.headers.setdefault(b'User-Agent', self.user_agent)
值得注意的是,開啓此中間件但不主動配置USER_AGENT時,請求並不會缺少User-Agent頭,而是使用項目中USER_AGENT配置的默認值,該默認值是這樣的:
USER_AGENT = 'Scrapy/%s (+https://scrapy.org)' % import_module('scrapy').__version__
實際上我們需要的可能是能夠切換user_agent的中間件,所以可以這樣去實現:
1.在配置中將所有的user_agent組成一個列表
2.在中間件中用choice隨機獲取
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
from random import choice
class MyUserAgentMiddleware(UserAgentMiddleware):
    """User-agent middleware that picks a random value for each request.

    Expects the USER_AGENT setting (or the spider's ``user_agent``
    attribute) to be a list/tuple of user-agent strings. A plain string
    is still accepted and used as-is, matching the base class behavior.
    """

    def process_request(self, request, spider):
        if self.user_agent:
            ua = self.user_agent
            # Bug fix: choice() on a plain string (e.g. the inherited
            # default 'Scrapy') would return a single random character.
            # Only sample when given an actual sequence of candidates.
            if isinstance(ua, (list, tuple)):
                ua = choice(ua)
            # Never overwrite a User-Agent the request already carries.
            request.headers.setdefault(b'User-Agent', ua)