Scrapy is a powerful scraping framework, and it has a few basic configurations that everyone should master. Below, Maoge walks through them one by one.
- Edit the settings.py file

# Upper bound (in seconds) for the random download delay
RANDOM_DELAY = 1

# Middleware settings
DOWNLOADER_MIDDLEWARES = {
    'DemoProjects.middlewares.RandomDelayMiddleware': 150,
}
- Edit the middlewares.py file

import logging
import random
import time

# Random delay middleware
class RandomDelayMiddleware(object):
    def __init__(self, delay):
        self.delay = delay

    @classmethod
    def from_crawler(cls, crawler):
        delay = crawler.settings.get("RANDOM_DELAY", 10)
        if not isinstance(delay, int):
            raise ValueError("RANDOM_DELAY must be an int")
        return cls(delay)

    def process_request(self, request, spider):
        # delay = random.randint(0, self.delay)  # integer-second variant
        delay = round(random.uniform(0, self.delay), 1)
        logging.debug("### random delay: %s s ###" % delay)
        # Note: time.sleep() blocks, so this throttles the whole crawl
        time.sleep(delay)
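
Worth knowing: Scrapy also ships a randomized delay of its own, so for simple cases you may not need a custom middleware at all. With RANDOMIZE_DOWNLOAD_DELAY left at its default of True, Scrapy waits between 0.5x and 1.5x DOWNLOAD_DELAY between requests to the same site. A minimal settings.py sketch of that built-in alternative:

# Built-in alternative to the custom delay middleware
DOWNLOAD_DELAY = 2               # base delay in seconds
RANDOMIZE_DOWNLOAD_DELAY = True  # the default; waits 0.5x-1.5x DOWNLOAD_DELAY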
- Edit the settings.py file

# Random User-Agent settings
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36"
RANDOM_UA_TYPE = "random"

# Middleware settings
DOWNLOADER_MIDDLEWARES = {
    'DemoProjects.middlewares.RandomUserAgentMiddleware': 100,
}
- Edit the middlewares.py file

# pip install fake_useragent
from fake_useragent import UserAgent

# Middleware that swaps in a random User-Agent for each request
class RandomUserAgentMiddleware(object):
    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            # print(request.headers)
            return getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', get_ua())
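
What RANDOM_UA_TYPE actually does: getattr(self.ua, self.ua_type) simply reads an attribute off the UserAgent object, so "random" picks any browser family while values like "chrome" or "firefox" pin one. A quick standalone sketch of that lookup, outside Scrapy:

from fake_useragent import UserAgent

ua = UserAgent()
print(ua.random)              # any browser's UA string
print(ua.chrome)              # a Chrome UA string
print(getattr(ua, "random"))  # exactly what the middleware does with RANDOM_UA_TYPE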
- Edit the settings.py file

# Middleware settings
DOWNLOADER_MIDDLEWARES = {
    'DemoProjects.middlewares.RandomProxyMiddleware': 50,
}
- Edit the middlewares.py file

# GetIP is a self-defined helper class that returns a proxy URL
# in the form https://58.218.92.167:2303 (a sketch of it follows below)
from get_ip import GetIP  # adjust the import path to wherever your helper lives

# Dynamically set a proxy IP for each request
class RandomProxyMiddleware(object):
    def process_request(self, request, spider):
        get_ip = GetIP()
        # Maoge keeps a pool of IPs and randomly draws one from it per request
        proxy_ip = get_ip.get_random_ip()
        # print("Current proxy IP: " + proxy_ip)
        request.meta["proxy"] = proxy_ip
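
The original leaves GetIP to the reader, so here is one hypothetical sketch, assuming a hard-coded pool: a class that stores proxy URLs in a list and hands back a random one. The module name (get_ip.py) and the second address below are made up; in practice the pool would come from a proxy provider or a database.

# get_ip.py — hypothetical proxy pool (module name and addresses are placeholders)
import random

class GetIP(object):
    def __init__(self):
        # replace these with real proxies from your own source
        self.ip_pool = [
            "https://58.218.92.167:2303",  # format from the example above
            "https://59.110.12.34:8080",   # made-up placeholder
        ]

    def get_random_ip(self):
        # draw one proxy URL at random from the pool
        return random.choice(self.ip_pool)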
Done!
⚠️ If anything is unclear, feel free to leave a comment below~