Scrapy 中 settings 配置

Setting設置

# -*- coding: utf-8 -*-

 

# Scrapy settings for yangguang project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#

#     https://doc.scrapy.org/en/latest/topics/settings.html

#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

#  redis增量爬蟲設置

# 指定那個去重方法給request對象去重

# 調度器 指定scheduler隊列

# 讓redis持續保存數據  設置False,會在關閉redis的時候清空redis

# 設置路徑操作redis

# --- scrapy_redis incremental-crawl settings ---

# Dedup filter class: use scrapy_redis's fingerprint-based filter so
# Request dedup state lives in Redis and survives restarts.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Scheduler class: scrapy_redis's scheduler keeps the request queue in Redis.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Keep Redis queues/fingerprints when the spider closes;
# set to False to flush them from Redis on close.
SCHEDULER_PERSIST = True

# Connection URL of the Redis server used by the scheduler/dupefilter.
REDIS_URL = "redis://127.0.0.1:6379"

# Pipeline provided by scrapy_redis that stores scraped items in Redis.
ITEM_PIPELINES = {
    # ... project-specific pipelines go here ...
    'scrapy_redis.pipelines.RedisPipeline': 400,
}

# 也可以寫成

# REDIS_HOST = "192.168.207.124"

# REDIS_PORT = 6379

-----------------------------------------------------------------------------------------------

 

# 項目名

# Project name (used by Scrapy for logging and bot identification).
BOT_NAME = 'yangguang'

# Module(s) where Scrapy looks for existing spiders.
SPIDER_MODULES = ['yangguang.spiders']

# Module where `scrapy genspider` places newly created spiders.
NEWSPIDER_MODULE = 'yangguang.spiders'

# Log the cookies sent and received with each request (debugging aid).
COOKIES_DEBUG = True

# Minimum severity level that gets logged.
LOG_LEVEL = "WARNING"

# Redirect log output to a file (no longer shown in the terminal).
# LOG_FILE = "./log.log"

# User-Agent header sent with requests (browser identity string).
#USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36'

# Obey the target site's robots.txt rules (restricts what may be crawled).
ROBOTSTXT_OBEY = True

# MongoDB host used by the item pipeline — this machine.
# Fixed: the original "local_host" is not a resolvable hostname;
# the intended value is "localhost".
MONGO_HOST = "localhost"

 

# 配置Scrapy執行的最大併發請求(默認值:16)

#CONCURRENT_REQUESTS = 32

 

#配置對同一網站要求延遲(默認值:0秒) 

#DOWNLOAD_DELAY = 3

 

# 每個域名請求併發數

#CONCURRENT_REQUESTS_PER_DOMAIN = 16

# 每個ip請求併發數

#CONCURRENT_REQUESTS_PER_IP = 16

 

# 禁用cookie(默認啓用)

#COOKIES_ENABLED = False

 

# 禁用Telnet控制檯(默認啓用)

#TELNETCONSOLE_ENABLED = False

 

# 覆蓋默認請求頭 (注:User-Agent不能寫到這裏)

#DEFAULT_REQUEST_HEADERS = {

#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

#   'Accept-Language': 'en',

#}

 

# 啓用或禁用spider中間件

# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html

#SPIDER_MIDDLEWARES = {

#    'yangguang.middlewares.YangguangSpiderMiddleware': 543,

#}

 

# 啓用或禁用downloader中間件

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

#DOWNLOADER_MIDDLEWARES = {

#    'yangguang.middlewares.YangguangDownloaderMiddleware': 543,

#}

 

# 啓用或禁用擴展

# See https://doc.scrapy.org/en/latest/topics/extensions.html

#EXTENSIONS = {

#    'scrapy.extensions.telnet.TelnetConsole': None,

#}

 

#配置項目管道

# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# 開啓ITEM_PIPELINES,yield 才能接收item返回到pipelines.py中調用,存入mongodb數據庫。 (300是權重值,越小越先執行)

# Enable the project pipeline so yielded items are passed to
# YangguangPipeline.process_item for storage in MongoDB.
# The number is the execution order: lower runs earlier.
ITEM_PIPELINES = {'yangguang.pipelines.YangguangPipeline': 300}

 

# 啓用並配置自動節流閥擴展(默認禁用) 防止請求過快,將服務器抓崩。

# See https://doc.scrapy.org/en/latest/topics/autothrottle.html

#AUTOTHROTTLE_ENABLED = True

# The initial download delay

#AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies

#AUTOTHROTTLE_MAX_DELAY = 60

# The average number of requests Scrapy should be sending in parallel to

# each remote server

#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:

#AUTOTHROTTLE_DEBUG = False

 

# 啓用和配置HTTP緩存(默認禁用)

# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

#HTTPCACHE_ENABLED = True

#HTTPCACHE_EXPIRATION_SECS = 0

#HTTPCACHE_DIR = 'httpcache'

#HTTPCACHE_IGNORE_HTTP_CODES = []

#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章