糗事百科段子之scrapy爬蟲
前期工作
- 創建項目:進入cmd中,切換路徑到某個工作目錄下,創建項目
scrapy startproject 項目名稱
- 創建爬蟲文件:首先到上述創建項目的目錄下
cd 項目名稱
,scrapy genspider 爬蟲的名稱 網站域名
- 對於出現的多個py文件內容如下
qsbk_spider.py
# -*- coding: utf-8 -*-
'''
response是一個‘scrapy.http.response.html.HtmlResponse’對象,可以執行‘xpath’,‘css’語法提取數據
提取出的數據,是‘Selector’或‘SelectorList’對象。如果想獲取其中的字符串。則執行get或getall方法
getall獲取selector的所有文本,返回一個列表
get獲取selector的第一個文本,返回一個str
pipeline:保存數據,其中三個方法常用到:open_spider(self,spider)、process_item(self,item,spider)、close_spider(self,spider)
在setting.py中激活pipline
'''
import scrapy
from scrapy.selector.unified import SelectorList
from qsbk.items import QsbkItem
class QsbkSpiderSpider(scrapy.Spider):
name = 'qsbk_spider'
allowed_domains = ['qiushibaike.com']
start_urls = ['https://www.qiushibaike.com/text/page/1/']
base_domin = "https://www.qiushibaike.com"
def parse(self, response):
# SelectorList
duanzis=response.xpath("//div[@id='content-left']/div")
for duanzi in duanzis:
# selector
author = duanzi.xpath(".//h2/text()").get().strip()
content = duanzi.xpath(".//div[@class='content']/span//text()").getall()#返回列表
content = "".join(content).strip()
item = QsbkItem(author=author,content=content)
yield item
next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
if not next_url:#當next_url爲False時,return
return
else:
yield scrapy.Request(self.base_domin+next_url,callback=self.parse)
import scrapy
class QsbkItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
author = scrapy.Field()
content = scrapy.Field()
# -*- coding: utf-8 -*-
'''
保存json數據時,使用以下兩個類,操作更簡單
1.JsonItemExporter:每次把數據添加到內存中,最後統一寫入磁盤中。優點是存儲的數據滿足json規則,缺點是不適合數據量較大的數據
2.JsonLinesItemExporter:每次調用export_item時把item存儲到硬盤中。優點是適合處理數據量大的情況,數據也比較安全,缺點是每一個字典一行,整個文件不滿足json格式
'''
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# =============================================================================
# import json
# class QsbkPipeline(object):
# def __init__(self):
# #也可以在open_spider中
# self.fp = open("duanzi.json","w",encoding='utf-8')
#
# def open_spider(self,spider):
# print('爬蟲開始。。。')
# def process_item(self, item, spider):
# item_json=json.dumps(dict(item),ensure_ascii=False)
# self.fp.write(item_json+'\n')
# return item
# def close_spider(self,spider):
# self.fp.close()
# print('爬蟲結束。。。')
#
# =============================================================================
# =============================================================================
# from scrapy.exporters import JsonItemExporter
#
# class QsbkPipeline(object):
# def __init__(self):
# #也可以在open_spider中
# self.fp = open("duanzi.json","wb")
# self.exporter = JsonItemExporter(self.fp,ensure_ascii=False,encoding='utf-8')
# self.exporter.start_exporting()
#
# def open_spider(self,spider):
# print('爬蟲開始。。。')
#
# def process_item(self, item, spider):
# self.exporter.export_item(item)
# return item
#
# def close_spider(self,spider):
# self.exporter.finish_exporting()
# self.fp.close()
# print('爬蟲結束。。。')
#
# =============================================================================
from scrapy.exporters import JsonLinesItemExporter
class QsbkPipeline(object):
def __init__(self):
#也可以在open_spider中
self.fp = open("duanzi.json","wb")
self.exporter = JsonLinesItemExporter(self.fp,ensure_ascii=False,encoding='utf-8')
def open_spider(self,spider):
print('爬蟲開始。。。')
def process_item(self, item, spider):
self.exporter.export_item(item)
return item
def close_spider(self,spider):
self.fp.close()
print('爬蟲結束。。。')
# -*- coding: utf-8 -*-
# Scrapy settings for qsbk project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'qsbk'
SPIDER_MODULES = ['qsbk.spiders']
NEWSPIDER_MODULE = 'qsbk.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'qsbk (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'qsbk.middlewares.QsbkSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'qsbk.middlewares.QsbkDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'qsbk.pipelines.QsbkPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
微信小程序爬蟲之CrawlSpider
wxapp_spider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from wxapp.items import WxappItem
class WxappSpiderSpider(CrawlSpider):
name = 'wxapp_spider'
allowed_domains = ['wxapp-union.com']
start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']
rules = (
Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True),
Rule(LinkExtractor(allow=r'.+article-.+\.html'),callback='parse_item',follow=False)
)
def parse_item(self, response):
title=response.xpath("//h1[@class='ph']/text()").get()
author_p = response.xpath("//p[@class='authors']")
author = author_p.xpath(".//a/text()").get()
pub_time = author_p.xpath(".//span/text()").get()
content = response.xpath("//td[@id='article_content']//text()").getall()
content = "".join(content).strip()
item = WxappItem(title=title,author=author,pub_time=pub_time,content=content)
yield item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class WxappItem(scrapy.Item):
# define the fields for your item here like:
title = scrapy.Field()
author = scrapy.Field()
pub_time = scrapy.Field()
content = scrapy.Field()
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter
class WxappPipeline(object):
def __init__(self):
self.fp = open('wxjc.json','wb')
self.exporter = JsonLinesItemExporter(self.fp,ensure_ascii=False,encoding='utf-8')
def process_item(self, item, spider):
self.exporter.export_item(item)
return item
def close_spider(self,spider):
self.fp.close()
# -*- coding: utf-8 -*-
# Scrapy settings for wxapp project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'wxapp'
SPIDER_MODULES = ['wxapp.spiders']
NEWSPIDER_MODULE = 'wxapp.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wxapp (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'wxapp.middlewares.WxappSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'wxapp.middlewares.WxappDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'wxapp.pipelines.WxappPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
51jobpython崗位爬蟲之CrawlSpider
wyjob_spider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from wyjob.items import WyjobItem
class WyjobSpiderSpider(CrawlSpider):
name = 'wyjob_spider'
allowed_domains = ['51job.com']
start_urls = ['https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html']
rules = (
Rule(LinkExtractor(allow=r'.+list/000000,000000,0000,00,9,99,python,2,\d\.html'), follow=True),
Rule(LinkExtractor(allow=r'https://jobs.51job.com/.+\d\.html\?s=01&t=0'),callback='parse_item',follow=False)
)
def parse_item(self, response):
title = response.xpath("//div[@class='cn']/h1/text()").get().strip()
salary = response.xpath("//div[@class='cn']/strong/text()").get()
company = response.xpath("//p[@class='cname']/a/text()").get().strip()
base_info = response.xpath("//p[@class='msg ltype']//text()").getall()#列表
base_info ="".join(base_info)#列表轉字符串
location = base_info.split("|")[0].strip()
workyears = base_info.split("|")[1].strip()
education = base_info.split("|")[2].strip()
position_info = response.xpath("//div[@class='bmsg job_msg inbox']/p//text()").getall()
position_info = "".join(position_info).strip()
item = WyjobItem(title=title,salary=salary,company=company,location=location,workyears=workyears,education=education,position_info=position_info)
yield item
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class WyjobItem(scrapy.Item):
title = scrapy.Field()
salary = scrapy.Field()
company = scrapy.Field()
location = scrapy.Field()
workyears = scrapy.Field()
education = scrapy.Field()
position_info = scrapy.Field()
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter
class WyjobPipeline(object):
def __init__(self):
self.fp =open('wyjob.json',"wb")
self.exporter = JsonLinesItemExporter(self.fp,ensure_ascii=False,encoding='utf-8')
def process_item(self, item, spider):
self.exporter.export_item(item)
return item
def close_item(self,spider):
self.fp.close()
# -*- coding: utf-8 -*-
# Scrapy settings for wyjob project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'wyjob'
SPIDER_MODULES = ['wyjob.spiders']
NEWSPIDER_MODULE = 'wyjob.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wyjob (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'wyjob.middlewares.WyjobSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'wyjob.middlewares.WyjobDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'wyjob.pipelines.WyjobPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'