pyspider網頁爬蟲示例

爬取v2ex爲例, 從首頁進入一級標籤“Apple”,再進二級標籤“iPhone”後爲相關的所有帖子,查看元素可以找到對應的鏈接
先要下載第一個網頁,在第一個網頁找能鏈接出去的網頁,再下載別的網頁, 將讀取的數據入庫
（此處原文有多張頁面截圖，展示從 v2ex 首頁依次進入一級標籤與二級標籤的元素定位；圖片在純文字版中省略。）
該鏈接爲一個具體帖子
（此處原文有該帖子頁面的截圖；圖片在純文字版中省略。）

from pyspider.libs.base_handler import *
import random
import pymysql

class Handler(BaseHandler):
    """pyspider crawler for v2ex.com.

    Crawl path: home page -> tab pages ("?tab=...") -> node boards
    ("/go/...") -> topic threads ("/t/..."), storing each topic as a
    `question` row and each reply as a `comment` row in MySQL.
    """

    crawl_config = {
        'headers': {
            'User-Agent': 'GoogleBot',
            'Host': 'v2ex.com'
        }
    }

    def __init__(self):
        # Keyword arguments make pymysql's positional order
        # (host, user, password, database) explicit.
        self.db = pymysql.connect(host="localhost", user="name",
                                  password="password", database="dbname",
                                  charset='utf8')

    def add_question(self, title, content, commentCount):
        """Insert one question row.

        Returns the auto-increment id of the new row, or None if the
        insert failed (the transaction is rolled back in that case).
        """
        try:
            # Parameterized query: the driver escapes the values, which
            # prevents SQL injection from crawled page content and makes
            # manual quote-escaping by callers unnecessary.
            sql = ('insert into question(title, content, user_id, '
                   'created_date, comment_count) '
                   'values (%s, %s, %s, now(), %s)')
            with self.db.cursor() as cursor:
                cursor.execute(
                    sql, (title, content, random.randint(1, 10), commentCount))
                qid = cursor.lastrowid
            self.db.commit()
            return qid
        except Exception as e:
            print(e)
            self.db.rollback()
            return None

    def add_comment(self, comment, qid):
        """Insert one comment row attached to question `qid`
        (entity_type 1 marks a question in this schema)."""
        try:
            sql = ('insert into comment(content, user_id, created_date, '
                   'entity_id, entity_type) values (%s, %s, now(), %s, 1)')
            with self.db.cursor() as cursor:
                cursor.execute(sql, (comment, random.randint(1, 10), qid))
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: start from the v2ex home page once a day."""
        self.crawl('https://v2ex.com/', callback=self.index_page,
                   validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow the first-level tab links found on the home page."""
        for each in response.doc('a[href^="https://v2ex.com/?tab="]').items():
            self.crawl(each.attr.href, callback=self.tab_page,
                       validate_cert=False)

    @config(priority=2)
    def tab_page(self, response):
        """Follow the second-level node links on a tab page."""
        for each in response.doc('a[href^="https://v2ex.com/go/"]').items():
            self.crawl(each.attr.href, callback=self.board_page,
                       validate_cert=False)

    @config(priority=2)
    def board_page(self, response):
        """Queue every topic on a board, plus the board's other pages.

        The '#reply<N>' fragment is stripped so the same topic is not
        crawled once per reply-count variant.
        """
        for each in response.doc('a[href^="https://v2ex.com/t/"]').items():
            url = each.attr.href
            if url.find('#reply') > 0:
                url = url[0:url.find('#')]
            self.crawl(url, callback=self.detail_page, validate_cert=False)
        for each in response.doc('a.page_normal').items():
            self.crawl(each.attr.href, callback=self.board_page,
                       validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Parse one topic page: store the question and all its replies.

        No manual quote-escaping is needed here any more, because the
        insert helpers use parameterized queries.
        """
        title = response.doc("h1").text()
        # A topic may have an empty body; store '' rather than NULL.
        content = response.doc("div.topic_content").html() or ''
        replies = [each.html() for each in
                   response.doc("div.reply_content").items()]
        qid = self.add_question(title, content, len(replies))
        # If the question insert failed there is no valid entity_id to
        # attach comments to, so skip them instead of inserting NULLs.
        if qid is not None:
            for reply_html in replies:
                self.add_comment(reply_html, qid)
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章