# Settings (設置)
# -*- coding: utf-8 -*-
# Scrapy settings for yangguang project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# redis增量爬蟲設置
# 指定那個去重方法給request對象去重
# 調度器 指定scheduler隊列
# 讓redis持續保存數據 設置False,會在關閉redis的時候清空redis
# 設置路徑操作redis
# Deduplicate requests via the redis-backed fingerprint filter.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the redis-backed scheduler queue (enables distributed crawling).
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the request queue / dupefilter in redis between runs;
# False would flush redis when the spider closes.
SCHEDULER_PERSIST = True
# Redis connection for the scheduler and pipeline.
REDIS_URL = "redis://127.0.0.1:6379"
# scrapy_redis pipeline that saves scraped items into redis.
ITEM_PIPELINES = {
    # ... project-specific pipelines go here ...
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Equivalently, the connection can be given as host/port (use ASCII quotes):
# REDIS_HOST = "192.168.207.124"
# REDIS_PORT = 6379
# -----------------------------------------------------------------------------
# Project name.
BOT_NAME = 'yangguang'
# Where existing spiders live.
SPIDER_MODULES = ['yangguang.spiders']
# Where `scrapy genspider` creates new spiders.
NEWSPIDER_MODULE = 'yangguang.spiders'
# Log how cookies are sent/received on each request.
COOKIES_DEBUG = True
# Minimum log level shown in the console.
LOG_LEVEL = "WARNING"
# Redirect log output to a file instead of the terminal.
# LOG_FILE = "./log.log"
# User-Agent sent with every request (browser identity string).
#USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3278.0 Safari/537.36'
# Respect the site's robots.txt (limits what may be crawled).
ROBOTSTXT_OBEY = True
# MongoDB host for item storage.
# Fixed: "local_host" is not a resolvable hostname; "localhost" is.
MONGO_HOST = "localhost"
# 配置Scrapy執行的最大併發請求(默認值:16)
#CONCURRENT_REQUESTS = 32
#配置對同一網站要求延遲(默認值:0秒)
#DOWNLOAD_DELAY = 3
# 每個域名請求併發數
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
# 每個ip請求併發數
#CONCURRENT_REQUESTS_PER_IP = 16
# 禁用cookie(默認啓用)
#COOKIES_ENABLED = False
# 禁用Telnet控制檯(默認啓用)
#TELNETCONSOLE_ENABLED = False
# 覆蓋默認請求頭 (注:User-Agent不能寫到這裏)
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# 啓用或禁用spider中間件
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'yangguang.middlewares.YangguangSpiderMiddleware': 543,
#}
# 啓用或禁用downloader中間件
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'yangguang.middlewares.YangguangDownloaderMiddleware': 543,
#}
# 啓用或禁用擴展
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
#配置項目管道
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# 開啓ITEM_PIPELINES,yield 才能接收item返回到pipelines.py中調用,存入mongodb數據庫。 (300是權重值,越小越先執行)
# Item pipelines, keyed by dotted path; the number is the priority
# (lower runs first). This assignment previously overwrote the redis
# ITEM_PIPELINES defined above, silently dropping RedisPipeline —
# both entries are kept here so items go to MongoDB AND redis.
ITEM_PIPELINES = {
    'yangguang.pipelines.YangguangPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# 啓用並配置自動節流閥擴展(默認禁用) 防止請求過快,將服務器抓崩。
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# 啓用和配置HTTP緩存(默認禁用)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'