# Settings (設置)
# -*- coding: utf-8 -*-
# Scrapy settings for yangguang project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# redis增量爬蟲設置
# 指定那個去重方法給request對象去重
# 調度器 指定scheduler隊列
# 讓redis持續保存數據 設置False,會在關閉redis的時候清空redis
# 設置路徑操作redis
# Deduplicate requests via the redis-backed fingerprint filter.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the redis-backed scheduler queue (enables distributed crawling).
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the request queue / dupefilter in redis between runs;
# False would flush redis when the spider closes.
SCHEDULER_PERSIST = True
# Redis connection for the scheduler and pipeline.
REDIS_URL = "redis://127.0.0.1:6379"
# scrapy_redis pipeline that saves scraped items into redis.
ITEM_PIPELINES = {
    # ... project-specific pipelines go here ...
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Equivalently, the connection can be given as host/port (use ASCII quotes):
# REDIS_HOST = "192.168.207.124"
# REDIS_PORT = 6379
# -----------------------------------------------------------------------------
# Project name.
BOT_NAME = 'yangguang'
# Where existing spiders live.
SPIDER_MODULES = ['yangguang.spiders']
# Where `scrapy genspider` creates new spiders.
NEWSPIDER_MODULE = 'yangguang.spiders'
# Log how cookies are sent/received on each request.
COOKIES_DEBUG = True
# Minimum log level shown in the console.
LOG_LEVEL = "WARNING"
# Redirect log output to a file instead of the terminal.
# LOG_FILE = "./log.log"
# User-Agent sent with every request (browser identity string).
#USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36'
# Respect the site's robots.txt (limits what may be crawled).
ROBOTSTXT_OBEY = True
# MongoDB host for item storage.
# Fixed: "local_host" is not a resolvable hostname; "localhost" is.
MONGO_HOST = "localhost"
# 配置Scrapy執行的最大併發請求(默認值:16)
#CONCURRENT_REQUESTS = 32
#配置對同一網站要求延遲(默認值:0秒)
#DOWNLOAD_DELAY = 3
# 每個域名請求併發數
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
# 每個ip請求併發數
#CONCURRENT_REQUESTS_PER_IP = 16
# 禁用cookie(默認啓用)
#COOKIES_ENABLED = False
# 禁用Telnet控制檯(默認啓用)
#TELNETCONSOLE_ENABLED = False
# 覆蓋默認請求頭 (注:User-Agent不能寫到這裏)
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# 啓用或禁用spider中間件
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'yangguang.middlewares.YangguangSpiderMiddleware': 543,
#}
# 啓用或禁用downloader中間件
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'yangguang.middlewares.YangguangDownloaderMiddleware': 543,
#}
# 啓用或禁用擴展
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
#配置項目管道
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# 開啓ITEM_PIPELINES,yield 才能接收item返回到pipelines.py中調用,存入mongodb數據庫。 (300是權重值,越小越先執行)
# Item pipelines, keyed by dotted path; the number is the priority
# (lower runs first). This assignment previously overwrote the redis
# ITEM_PIPELINES defined above, silently dropping RedisPipeline —
# both entries are kept here so items go to MongoDB AND redis.
ITEM_PIPELINES = {
    'yangguang.pipelines.YangguangPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# 啓用並配置自動節流閥擴展(默認禁用) 防止請求過快,將服務器抓崩。
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# 啓用和配置HTTP緩存(默認禁用)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'