1、代碼jobbole.py寫爬取策略,
2、settings.py 配置pipelines、配置圖片下載、配置是否遵循robote協議、數據庫配置等
3、pipelines.py 主要是配置數據存儲操作
4、本來用 xpath 對網站解析,但在迴圈中逐條解析時發現每次取到的都是第一條。原因是以 // 開頭的 xpath 是絕對路徑,會從整個文件的根節點開始匹配,而不是從當前節點開始;應改用 .// 相對路徑,本文最後把這部分代碼換成 css 選擇器解決了。
一、jobbole.py(主要寫爬取策略)
# -*- coding: utf-8 -*-
import json
import os
import re
import sys
from urllib import parse
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.utils import Util
from items import JobbolespiderItem
import scrapy
from scrapy import Request
class JobboleSpider(scrapy.Spider):
    """Crawl the news listing on news.cnblogs.com, follow each entry to its
    detail page, then fetch the site's AJAX endpoint for the per-article
    statistics (views, comments, diggs, burys) before emitting the item."""

    name = 'jobbole'
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['http://news.cnblogs.com/']

    def parse(self, response):
        """Yield one detail-page Request per news entry, then follow the
        'Next >' pagination link (if present) back into this callback."""
        # NOTE: relative CSS selectors are used inside the loop on purpose.
        # An xpath starting with '//' is absolute (matches from the document
        # root), so in a per-entry loop it always returned the FIRST entry.
        for entry in response.css('#news_list .news_block'):
            front_image_url = entry.css('.entry_summary a img::attr(src)').extract_first('')
            if front_image_url.startswith('//'):
                # Protocol-relative URL -> make it absolute.
                front_image_url = 'https:' + front_image_url
            url = entry.css('h2 a::attr(href)').extract_first('')
            yield Request(parse.urljoin(response.url, url),
                          meta={"front_image_url": front_image_url},
                          callback=self.parse_detail)
        # Pagination: the last pager link reads 'Next >' while more pages exist.
        last_text = response.xpath('//div[@class="pager"]/a[last()]/text()').extract_first('')
        if last_text == 'Next >':
            next_url = response.xpath('//div[@class="pager"]/a[last()]/@href').extract_first('')
            yield Request(parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        """Parse one article page into a JobbolespiderItem, then request the
        AJAX statistics endpoint with the partially-filled item in meta."""
        item = JobbolespiderItem()
        front_image_url = response.meta.get('front_image_url')
        # Image URL is optional; keep the field a (possibly empty) list so the
        # ImagesPipeline can consume it.
        if front_image_url:
            item['front_image_url'] = [parse.urljoin(response.url, front_image_url)]
        else:
            item['front_image_url'] = []
        item['title'] = response.xpath('//div[@id="news_title"]/a/text()').extract_first('')
        create_date_content = response.xpath(
            '//div[@id="news_info"]/span[@class="time"]/text()').extract_first('')
        # The time span looks like '發佈於 2020-01-01 ...'; keep the date part.
        # Fix: the original left create_date UNSET when the regex failed
        # (the try/except swallowed the AttributeError), which later broke
        # the DB insert -- always assign a value now.
        match = re.match(r'發佈於 (.*)', create_date_content)
        item['create_date'] = match.group(1) if match else '1970-01-01'
        item['content'] = response.xpath(
            '//div[@id="news_content"]/div[@id="news_body"]').extract_first('')
        item['tags'] = ','.join(response.xpath('//div[@class="news_tags"]/a/text()').extract())
        # The numeric article id is embedded in the detail URL, e.g.
        # https://news.cnblogs.com/n/665930/ -> 665930
        article_id = re.match(r'.*?(\d+)', response.url).group(1)
        stats_url = parse.urljoin(response.url,
                                  '/NewsAjax/GetAjaxNewsInfo?contentId={}'.format(article_id))
        yield Request(stats_url, callback=self.parse_nums,
                      meta={'jobbolespiderItem': item})

    def parse_nums(self, response):
        """Merge the JSON statistics payload into the item and emit it."""
        item = response.meta.get('jobbolespiderItem')
        # Fix: guard against an empty image list -- the original indexed [0]
        # unconditionally and raised IndexError for entries with no image.
        if item.get('front_image_url'):
            item['image_url_id'] = Util().trans_md5(item['front_image_url'][0])
        else:
            item['image_url_id'] = ''
        stats = json.loads(response.text)
        item['content_id'] = stats.get('ContentID')
        item['comment_count'] = stats.get('CommentCount')
        item['total_view'] = stats.get('TotalView')
        item['digg_count'] = stats.get('DiggCount')
        item['bury_count'] = stats.get('BuryCount')
        yield item
二、settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for AricleSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os

BOT_NAME = 'AricleSpider'

SPIDER_MODULES = ['AricleSpider.spiders']
NEWSPIDER_MODULE = 'AricleSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'AricleSpider (+http://www.yourdomain.com)'

# Do NOT obey robots.txt rules (the target site disallows the crawler otherwise)
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'AricleSpider.middlewares.AriclespiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'AricleSpider.middlewares.AriclespiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Item pipelines, ordered by priority (lower runs first):
# images are downloaded before the item reaches the MySQL pipeline.
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'AricleSpider.pipelines.AricleImagePipeline': 1,
    # 'AricleSpider.pipelines.AricleSaveJsonPipeline': 2,
    # 'AricleSpider.pipelines.AricleSaveDBPipeline': 3,
    'AricleSpider.pipelines.MysqlTwistedPipeline': 4,
    # 'AricleSpider.pipelines.AriclespiderPipeline': 300
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
#AUTOTHROTTLE_START_DELAY = 5
#AUTOTHROTTLE_MAX_DELAY = 60
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Image download configuration: which item field holds the image URL list,
# and the local directory images are stored in.
# Fix: removed the debug print(img_path) leftover.
IMAGES_URLS_FIELD = 'front_image_url'
IMAGES_STORE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'images')

# MySQL connection settings consumed by MysqlTwistedPipeline.from_settings
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"
3、pipelines.py 數據存儲
# -*- coding: utf-8 -*-
import json
import os
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from scrapy.pipelines.images import ImagesPipeline
from twisted.enterprise import adbapi
class AriclespiderPipeline(object):
    """Default no-op pipeline generated by Scrapy (currently disabled in
    ITEM_PIPELINES); hands every item through unchanged."""

    def process_item(self, item, spider):
        # Nothing to do -- pass the item to the next pipeline stage.
        return item
class AricleImagePipeline(ImagesPipeline):
    """ImagesPipeline subclass that records the local storage path of the
    downloaded cover image on the item (field 'front_image_path')."""

    def item_completed(self, results, item, info):
        """Store the local path of the downloaded image on the item.

        `results` is a list of (success, value) tuples.  Fix: the original
        read value["path"] for every tuple, including FAILED downloads where
        `value` is a Failure, not a dict -- only successful results are used
        now, and the field defaults to '' when nothing was downloaded.
        """
        if 'front_image_url' in item:
            item["front_image_path"] = ''
            for ok, value in results:
                if ok:
                    # value holds the remote image url and the local path
                    item["front_image_path"] = value["path"]
        return item
class AricleSaveJsonPipeline(object):
    """Append every item to 'ceshi.json' next to this file, one JSON object
    per line (JSON Lines format)."""

    def __init__(self):
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'ceshi.json')
        self.f = open(path, "a", encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize the item as one JSON line and pass the item on."""
        item_json = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.f.write(item_json)
        return item

    def close_spider(self, spider):
        # Fix: the original never closed the file handle (resource leak,
        # possible lost buffered data).  Scrapy calls this hook on shutdown.
        self.f.close()
class AricleSaveDBPipeline(object):
    """Synchronous MySQL pipeline: inserts each item into jobbole_article."""

    def __init__(self):
        # NOTE(review): connection parameters duplicate the MYSQL_* values in
        # settings.py; kept inline for compatibility with the original.
        self.conn = pymysql.connect(
            host="127.0.0.1",
            user="root", password="root",
            database="article_spider",
            charset="utf8")
        # Cursor returns result rows as tuples by default.
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert (or upsert) the item; errors are logged, never raised."""
        # Fix: use a parameterized query instead of str.format -- the original
        # broke on values containing quotes (article content/titles routinely
        # do) and was SQL-injectable; it also had a stray double ';;'.
        sql = '''
            INSERT INTO jobbole_article
            (front_image_url, create_date, image_url_id, title, content, tags,
             content_id, comment_count, total_view, digg_count, bury_count)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE parise_nums=VALUES(bury_count)
        '''
        # NOTE(review): 'parise_nums=VALUES(bury_count)' updates a column not
        # in the insert list -- looks like a typo for digg/bury counts; kept
        # as-is, TODO confirm against the table schema.
        params = (','.join(item.get('front_image_url')),
                  item.get('create_date'),
                  item.get('image_url_id'),
                  item.get('title'),
                  item.get('content'),
                  item.get('tags'),
                  item.get('content_id'),
                  item.get('comment_count'),
                  item.get('total_view'),
                  item.get('digg_count'),
                  item.get('bury_count'))
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
        except Exception as e:
            # Best-effort logging, matching the original behaviour.
            print('=====error:{}'.format(e))
        return item
class MysqlTwistedPipeline:
    """Asynchronous MySQL pipeline using twisted's adbapi connection pool, so
    slow inserts do not block the crawl."""

    def __init__(self, dbpool):
        # dbpool: an adbapi.ConnectionPool created by from_settings.
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the MYSQL_* values in settings.py."""
        # Fix: use pymysql (already used by this module) instead of MySQLdb,
        # so both DB pipelines rely on the same driver.
        from pymysql.cursors import DictCursor
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert on the pool and pass the item on."""
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Fix: the errback takes (failure, item, spider); the original
        # registered it without the extra args, so any DB error raised
        # TypeError instead of being logged.
        query.addErrback(self.handler_error, item, spider)
        # Fix: return the item so later pipeline stages still receive it
        # (the original returned None, silently dropping the item).
        return item

    def do_insert(self, cursor, item):
        # Runs in a pool thread.  Fix: parameterized query instead of
        # str.format (quote-safe, not injectable, stray ';;' removed).
        sql = '''
            INSERT INTO jobbole_article
            (front_image_url, create_date, image_url_id, title, content, tags,
             content_id, comment_count, total_view, digg_count, bury_count)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE parise_nums=VALUES(bury_count)
        '''
        # NOTE(review): 'parise_nums' is not in the insert column list --
        # looks like a schema typo inherited from the original; confirm.
        params = (','.join(item.get('front_image_url')),
                  item.get('create_date'),
                  item.get('image_url_id'),
                  item.get('title'),
                  item.get('content'),
                  item.get('tags'),
                  item.get('content_id'),
                  item.get('comment_count'),
                  item.get('total_view'),
                  item.get('digg_count'),
                  item.get('bury_count'))
        cursor.execute(sql, params)

    def handler_error(self, failure, item, spider):
        # Best-effort logging, matching the original behaviour.
        print(failure)