Getting started with the Scrapy crawler framework in Python: crawling cnblogs news (with code)

1. jobbole.py defines the crawl strategy.

2. settings.py configures the pipelines, image downloading, whether to obey the robots.txt protocol, the database connection, and so on.

3. pipelines.py implements the data-storage operations.

4. I originally parsed the pages with XPath, but inside the loop every iteration returned the first entry: an XPath that starts with // is evaluated against the whole document rather than the current selector. Switching this part to CSS selectors fixed it; a relative-XPath alternative is sketched right after this list.
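As a side note, the XPath approach also works if the expressions stay relative to the current selector. A minimal sketch of that alternative, reusing the commented-out expressions from parse() below with a leading "." added:

# Sketch: relative XPath inside the loop (same markup as the commented-out expressions in parse())
for item_selecter in response.xpath('//div[@id="news_list"]/div[@class="news_block"]'):
    # ".//" searches only inside the current news_block div;
    # a bare "//" searches the whole document and always returns the first match
    front_image_url = item_selecter.xpath('.//div[@class="entry_summary"]/a/img/@src').extract_first('')
    url = item_selecter.xpath('.//div[@class="content"]/h2/a/@href').extract_first('')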

1. jobbole.py (the crawl strategy)

# -*- coding: utf-8 -*-
import json
import os
import re
import sys
from urllib import parse

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.utils import Util
from items import JobbolespiderItem
import scrapy
from scrapy import Request


class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['http://news.cnblogs.com/']

    # def parse(self, response):
    #     jobbolespiderItem = JobbolespiderItem()
    #     jobbolespiderItem['front_image_url'] = ['https://images2018.cnblogs.com/news_topic/20180515154619133-1755088138.png']
    #     yield jobbolespiderItem

    def parse(self, response):
        item_selecters = response.css('#news_list .news_block')
        # item_selecters = response.xpath('//div[@id="news_list"]/div[@class="news_block"]')
        for item_selecter in item_selecters:
            # an absolute XPath (//...) here would match against the whole document, so CSS selectors are used instead
            print(item_selecter.extract())
            front_image_url = item_selecter.css('.entry_summary a img::attr(src)').extract_first('')
            if front_image_url.startswith('//'):
                front_image_url = 'https:'+front_image_url
            url = item_selecter.css('h2 a::attr(href)').extract_first("")
            # front_image_url = item_selecter.xpath('//div[@class="entry_summary"]/a/img/@src').extract_first('')
            # url = item_selecter.xpath('//div[@class="content"]/h2/a/@href').extract_first('')
            # request the detail page
            print(url)
            yield Request(parse.urljoin(response.url, url), meta={"front_image_url": front_image_url},
                          callback=self.parse_detail)
        last_text = response.xpath('//div[@class="pager"]/a[last()]/text()').extract_first('')
        if last_text == 'Next >':
            # request the next page
            next_url = response.xpath('//div[@class="pager"]/a[last()]/@href').extract_first('')
            yield Request(parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        jobbolespiderItem = JobbolespiderItem()
        if response.meta.get('front_image_url'):
            jobbolespiderItem['front_image_url'] = [parse.urljoin(response.url, response.meta.get('front_image_url'))]
        else:
            jobbolespiderItem['front_image_url'] = []
        if response.xpath('//div[@id="news_title"]/a/text()').extract_first(''):
            jobbolespiderItem['title'] = response.xpath('//div[@id="news_title"]/a/text()').extract_first('')
        else:
            jobbolespiderItem['title'] = ''
        if response.xpath('//div[@id="news_info"]/span[@class="time"]/text()').extract_first(''):
            create_date_content = response.xpath('//div[@id="news_info"]/span[@class="time"]/text()').extract_first('')
        else:
            create_date_content = ''
        print(create_date_content)
        # extract the date that follows the "發佈於" prefix; fall back to a placeholder if the format does not match
        date_match = re.match(r'發佈於 (.*)', create_date_content)
        if date_match:
            jobbolespiderItem['create_date'] = date_match.group(1)
        else:
            jobbolespiderItem['create_date'] = '1970-01-01'
        jobbolespiderItem['content'] = response.xpath('//div[@id="news_content"]/div[@id="news_body"]').extract_first(
            '')
        tag_list = response.xpath('//div[@class="news_tags"]/a/text()').extract()
        jobbolespiderItem['tags'] = ','.join(tag_list)
        # https://news.cnblogs.com/NewsAjax/GetPreNewsById?contentId=665930
        print(response.url)
        news_id = re.match(r'.*?(\d+)', response.url).group(1)
        url_new = parse.urljoin(response.url, '/NewsAjax/GetAjaxNewsInfo?contentId={}'.format(news_id))

        yield Request(url_new, callback=self.parse_nums, meta={'jobbolespiderItem': jobbolespiderItem})

    def parse_nums(self, response):
        jobbolespiderItem = response.meta.get('jobbolespiderItem')
        jobbolespiderItem['image_url_id'] = Util().trans_md5(jobbolespiderItem.get('front_image_url')[0])
        r_json = json.loads(response.text)
        jobbolespiderItem['content_id'] = r_json.get('ContentID')
        jobbolespiderItem['comment_count'] = r_json.get('CommentCount')
        jobbolespiderItem['total_view'] = r_json.get('TotalView')
        jobbolespiderItem['digg_count'] = r_json.get('DiggCount')
        jobbolespiderItem['bury_count'] = r_json.get('BuryCount')
        yield jobbolespiderItem
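The JobbolespiderItem class and the Util helper imported at the top (from items.py and utils/utils.py) are not listed in this post. A minimal sketch of what they could look like, assuming one scrapy.Field per attribute assigned in the spider and pipelines, and assuming trans_md5 simply hashes a URL string; the field names come from the code above, everything else is illustrative:

# items.py (sketch): one Field per attribute used by jobbole.py and pipelines.py
import scrapy


class JobbolespiderItem(scrapy.Item):
    title = scrapy.Field()
    create_date = scrapy.Field()
    content = scrapy.Field()
    tags = scrapy.Field()
    front_image_url = scrapy.Field()   # list of image URLs consumed by the ImagesPipeline
    front_image_path = scrapy.Field()  # local path filled in by AricleImagePipeline
    image_url_id = scrapy.Field()      # md5 of the first image URL
    content_id = scrapy.Field()
    comment_count = scrapy.Field()
    total_view = scrapy.Field()
    digg_count = scrapy.Field()
    bury_count = scrapy.Field()


# utils/utils.py (sketch): trans_md5 is assumed to turn a URL into a fixed-length key
import hashlib


class Util:
    def trans_md5(self, value):
        if isinstance(value, str):
            value = value.encode('utf-8')
        return hashlib.md5(value).hexdigest()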

2. settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for AricleSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os

BOT_NAME = 'AricleSpider'

SPIDER_MODULES = ['AricleSpider.spiders']
NEWSPIDER_MODULE = 'AricleSpider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'AricleSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'AricleSpider.middlewares.AriclespiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'AricleSpider.middlewares.AriclespiderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
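# Lower numbers mean higher priority: the image pipeline (1) processes each item before the MySQL pipeline (4)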
ITEM_PIPELINES = {
    'AricleSpider.pipelines.AricleImagePipeline': 1,
    # 'AricleSpider.pipelines.AricleSaveJsonPipeline': 2,
    # 'AricleSpider.pipelines.AricleSaveDBPipeline': 3,
    'AricleSpider.pipelines.MysqlTwistedPipeline': 4,
    # 'AricleSpider.pipelines.AriclespiderPipeline': 300
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# image download configuration
IMAGES_URLS_FIELD = 'front_image_url'  # item field that holds the image URLs
img_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),'images')
print(img_path)
IMAGES_STORE = img_path



MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "article_spider"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"

3. pipelines.py (data storage)

# -*- coding: utf-8 -*-
import json
import os

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from scrapy.pipelines.images import ImagesPipeline
from twisted.enterprise import adbapi


class AriclespiderPipeline(object):
    def process_item(self, item, spider):
        return item


class AricleImagePipeline(ImagesPipeline):
    # record the local path of the downloaded image on the item
    def item_completed(self, results, item, info):
        if 'front_image_url' in item:
            image_file_path = ''
            for ok, value in results:
                # value holds the image url and the local storage path; skip failed downloads
                if ok:
                    image_file_path = value["path"]
            item["front_image_path"] = image_file_path
        return item


class AricleSaveJsonPipeline(object):
    # store items in a json file
    def __init__(self):
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'ceshi.json')
        self.f = open(path, "a", encoding='utf-8')

    def process_item(self, item, spider):
        item_json = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.f.write(item_json)
        return item


class AricleSaveDBPipeline(object):
    # store items in MySQL (synchronous, blocking)
    def __init__(self):
        # connect to the database
        self.conn = pymysql.connect(
            host="127.0.0.1",
            user="root", password="root",
            database="article_spider",
            charset="utf8")
        # cursor used to execute SQL statements
        self.cursor = self.conn.cursor()  # result sets are returned as tuples by default

    def process_item(self, item, spider):
        sql = '''
        INSERT INTO jobbole_article
        (front_image_url, create_date, image_url_id, title, content, tags, content_id, comment_count, total_view, digg_count, bury_count)
        VALUES
        ("{}","{}","{}","{}",'{}',"{}",{},{},{},{},{}) ON DUPLICATE KEY UPDATE comment_count=VALUES(comment_count), total_view=VALUES(total_view), digg_count=VALUES(digg_count), bury_count=VALUES(bury_count);
        '''
        sql = sql.format(','.join(item.get('front_image_url')),
                         item.get('create_date'),
                         item.get('image_url_id'),
                         item.get('title'),
                         item.get('content'),
                         item.get('tags'),
                         item.get('content_id'),
                         item.get('comment_count'),
                         item.get('total_view'),
                         item.get('digg_count'),
                         item.get('bury_count'))
        print(sql)
        try:
            self.cursor.execute(sql)
            self.conn.commit()
        except Exception as e:
            print('=====error:{}'.format(e))
        return item
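One caveat with building the SQL via str.format: a quote character inside the article content breaks the statement, and the interpolation is open to SQL injection. A sketch of a drop-in replacement for process_item above that uses pymysql's parameterized execute instead (the same idea applies to the Twisted pipeline below, whose cursor has the same execute signature):

    # Sketch: parameterized insert; the driver handles quoting and escaping
    def process_item(self, item, spider):
        sql = '''
        INSERT INTO jobbole_article
        (front_image_url, create_date, image_url_id, title, content, tags,
         content_id, comment_count, total_view, digg_count, bury_count)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE comment_count=VALUES(comment_count), total_view=VALUES(total_view),
                                digg_count=VALUES(digg_count), bury_count=VALUES(bury_count)
        '''
        params = (','.join(item.get('front_image_url')), item.get('create_date'),
                  item.get('image_url_id'), item.get('title'), item.get('content'),
                  item.get('tags'), item.get('content_id'), item.get('comment_count'),
                  item.get('total_view'), item.get('digg_count'), item.get('bury_count'))
        self.cursor.execute(sql, params)
        self.conn.commit()
        return item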


class MysqlTwistedPipeline:
    # execute the SQL asynchronously through Twisted's adbapi connection pool
    def __init__(self, dbpool):
        self.dbpool = dbpool

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handler_error, item, spider)
        return item

    def do_insert(self, cursor, item):
        sql = '''
                INSERT INTO jobbole_article
                (front_image_url, create_date, image_url_id, title, content, tags, content_id, comment_count, total_view, digg_count, bury_count)
                VALUES
                ("{}","{}","{}","{}",'{}',"{}",{},{},{},{},{}) ON DUPLICATE KEY UPDATE comment_count=VALUES(comment_count), total_view=VALUES(total_view), digg_count=VALUES(digg_count), bury_count=VALUES(bury_count);
                '''
        sql = sql.format(','.join(item.get('front_image_url')),
                         item.get('create_date'),
                         item.get('image_url_id'),
                         item.get('title'),
                         item.get('content'),
                         item.get('tags'),
                         item.get('content_id'),
                         item.get('comment_count'),
                         item.get('total_view'),
                         item.get('digg_count'),
                         item.get('bury_count'))
        print(sql)

        cursor.execute(sql)

    def handler_error(self, failure, item, spider):
        print(failure)

    @classmethod
    def from_settings(cls, settings):
        from MySQLdb.cursors import DictCursor
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)
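adbapi.ConnectionPool("MySQLdb", ...) requires the MySQLdb/mysqlclient driver. Since this file already imports pymysql, the pool can also be built on that driver; a sketch of from_settings with only the driver swapped (assuming pymysql is acceptable here):

    # Sketch: same connection pool, built on pymysql instead of MySQLdb
    @classmethod
    def from_settings(cls, settings):
        from pymysql.cursors import DictCursor
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
        return cls(dbpool)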

 
