值得糾正的理解是:scrapy產生重試請求時,請求會重新走一遍調度器和下載中間件(RETRY_PRIORITY_ADJUST默認值爲-1,重試請求的優先級會比原請求低,不一定會馬上被消費)。所以中間件鏈上對應的邏輯會再執行一遍,不用擔心代理和USER_AGENT等等沒有切換。
"""
An extension to retry failed requests that are potentially caused by temporary
problems such as a connection timeout or HTTP 500 error.
You can change the behaviour of this middleware by modifying the scraping settings:
RETRY_TIMES - how many times to retry a failed page
RETRY_HTTP_CODES - which HTTP response codes to retry
Failed pages are collected on the scraping process and rescheduled at the end,
once the spider has finished crawling all regular (non failed) pages. Once
there are no more failed pages to retry this middleware sends a signal
(retry_complete), so other extensions could connect to that signal.
NOTE: the RetryMiddleware class in this file does not itself emit a
retry_complete signal.
"""
import logging
from twisted.internet import defer
from twisted.internet.error import TimeoutError, DNSLookupError, \
ConnectionRefusedError, ConnectionDone, ConnectError, \
ConnectionLost, TCPTimedOutError
from twisted.web.client import ResponseFailed
from scrapy.exceptions import NotConfigured
from scrapy.utils.response import response_status_message
from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.utils.python import global_object_name
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class RetryMiddleware(object):
    """Downloader middleware that re-issues requests which failed transiently.

    A request is retried when its response status is one of the configured
    retry codes, or when the download raised one of ``EXCEPTIONS_TO_RETRY``,
    as long as the attempt limit has not been exceeded. Requests whose meta
    carries a truthy ``dont_retry`` value are never retried.
    """

    # The 11 exception types that process_exception() treats as retryable.
    # IOError is raised by the HttpCompression middleware when trying to
    # decompress an empty response.
    EXCEPTIONS_TO_RETRY = (
        defer.TimeoutError,
        TimeoutError,
        DNSLookupError,
        ConnectionRefusedError,
        ConnectionDone,
        ConnectError,
        ConnectionLost,
        TCPTimedOutError,
        ResponseFailed,
        IOError,
        TunnelError,
    )

    def __init__(self, settings):
        """Read the retry configuration from ``settings``.

        Raises:
            NotConfigured: if the RETRY_ENABLED setting is false.
        """
        # RETRY_ENABLED switches this middleware on or off. Original author's
        # note: if a higher-priority middleware intercepts every response with
        # an error status, no retry happens even when this is True.
        if not settings.getbool('RETRY_ENABLED'):
            raise NotConfigured

        # RETRY_TIMES: maximum number of retries per request (the original
        # note gives the default as 2).
        self.max_retry_times = settings.getint('RETRY_TIMES')

        # RETRY_HTTP_CODES: response statuses that trigger a retry (the
        # original note lists the default as
        # [500, 502, 503, 504, 522, 524, 408]; may differ by Scrapy version).
        self.retry_http_codes = {
            int(code) for code in settings.getlist('RETRY_HTTP_CODES')
        }

        # RETRY_PRIORITY_ADJUST: value added to the original request's priority
        # when building the retry (the original note gives the default as -1).
        self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from ``crawler.settings``."""
        return cls(crawler.settings)

    def process_response(self, request, response, spider):
        """Return a retry request for a retryable status, else the response.

        The response is returned unchanged when the request opts out through
        the ``dont_retry`` meta key, when its status is not in
        ``self.retry_http_codes``, or when ``_retry`` yields nothing because
        the retry limit has been exceeded.
        """
        opted_out = request.meta.get('dont_retry', False)
        if opted_out or response.status not in self.retry_http_codes:
            return response

        reason = response_status_message(response.status)
        retry_request = self._retry(request, reason, spider)
        # A falsy result means no retry was produced, so the response itself
        # is handed on.
        return retry_request or response

    def process_exception(self, request, exception, spider):
        """Retry requests that failed with one of ``EXCEPTIONS_TO_RETRY``.

        Returns None for any other exception type, or when the request opts
        out through the ``dont_retry`` meta key.
        """
        if not isinstance(exception, self.EXCEPTIONS_TO_RETRY):
            return None
        if request.meta.get('dont_retry', False):
            return None
        return self._retry(request, exception, spider)

    def _retry(self, request, reason, spider):
        """Build the retry copy of ``request`` or give up.

        Args:
            request: the request that failed.
            reason: a status message or the exception that caused the failure.
            spider: the running spider; its ``crawler.stats`` receives the
                retry counters.

        Returns:
            A copy of ``request`` with an updated ``retry_times`` meta value,
            ``dont_filter`` set and an adjusted priority, or None once the
            attempt count exceeds the limit.
        """
        attempt = request.meta.get('retry_times', 0) + 1
        # A per-request 'max_retry_times' meta value overrides the limit taken
        # from the settings.
        limit = request.meta.get('max_retry_times', self.max_retry_times)
        stats = spider.crawler.stats

        log_args = {'request': request, 'retries': attempt, 'reason': reason}
        log_extra = {'spider': spider}

        if attempt <= limit:
            logger.debug(
                "Retrying %(request)s (failed %(retries)d times): %(reason)s",
                log_args,
                extra=log_extra,
            )
            retry_request = request.copy()
            retry_request.meta['retry_times'] = attempt
            retry_request.dont_filter = True
            retry_request.priority = request.priority + self.priority_adjust

            # Exceptions are counted under their class name rather than
            # their message.
            if isinstance(reason, Exception):
                reason_label = global_object_name(reason.__class__)
            else:
                reason_label = reason
            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason_label)
            return retry_request

        stats.inc_value('retry/max_reached')
        logger.debug(
            "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
            log_args,
            extra=log_extra,
        )
        return None