# 爬取v2ex爲例, 從首頁進入一級標籤“Apple”,再進二級標籤“iPhone”後爲相關的所有帖子,查看元素可以找到對應的鏈接
# 先要下載第一個網頁,在第一個網頁找能鏈接出去的網頁,再下載別的網頁, 將讀取的數據入庫
# 該鏈接爲一個具體帖子
from pyspider.libs.base_handler import *
import random
import pymysql
class Handler(BaseHandler):
    """Crawl v2ex.com and persist topics plus their replies into MySQL.

    Flow: home page -> first-level tab pages (?tab=...) -> second-level
    node pages (/go/...) -> individual topic pages (/t/...), following
    pagination and de-duplicating '#reply' anchor variants of topic URLs.
    """

    crawl_config = {
        'headers': {
            'User-Agent': 'GoogleBot',
            'Host': 'v2ex.com'
        }
    }

    def __init__(self):
        # Keyword args make the connection parameters explicit
        # (positional order host/user/password/db is easy to get wrong).
        self.db = pymysql.connect(host="localhost",
                                  user="name",
                                  password="password",
                                  database="dbname",
                                  charset='utf8')

    def add_question(self, title, content, commentCount):
        """Insert one topic into `question`; return its auto-increment id.

        Uses a parameterized query so scraped text (quotes, backslashes,
        SQL metacharacters) cannot break or inject into the statement —
        the previous %-interpolated SQL was injectable from page content.
        Rolls back and returns None on failure.
        """
        try:
            cursor = self.db.cursor()
            sql = ('insert into question(title, content, user_id, '
                   'created_date, comment_count) values (%s, %s, %s, now(), %s)')
            # user_id is randomized — presumably maps to pre-seeded demo
            # users 1..10; TODO confirm against the schema.
            params = (title, content, random.randint(1, 10), commentCount)
            print(sql, params)
            cursor.execute(sql, params)
            qid = cursor.lastrowid
            self.db.commit()
            return qid
        except Exception as e:
            print(e)
            self.db.rollback()

    def add_comment(self, comment, qid):
        """Insert one reply into `comment`, linked to question `qid`.

        Parameterized for the same injection-safety reason as
        add_question(). entity_type is fixed at 1 (question entity).
        """
        try:
            cursor = self.db.cursor()
            sql = ('insert into comment(content, user_id, created_date, '
                   'entity_id, entity_type) values (%s, %s, now(), %s, 1)')
            params = (comment, random.randint(1, 10), qid)
            print(sql, params)
            cursor.execute(sql, params)
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: fetch the v2ex home page once a day."""
        self.crawl('https://v2ex.com/', callback=self.index_page,
                   validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Home page -> first-level tab pages (e.g. ?tab=apple)."""
        for each in response.doc('a[href^="https://v2ex.com/?tab="]').items():
            self.crawl(each.attr.href, callback=self.tab_page,
                       validate_cert=False)

    @config(priority=2)
    def tab_page(self, response):
        """Tab page -> second-level node pages (e.g. /go/iphone)."""
        for each in response.doc('a[href^="https://v2ex.com/go/"]').items():
            self.crawl(each.attr.href, callback=self.board_page,
                       validate_cert=False)

    @config(priority=2)
    def board_page(self, response):
        """Node page -> topic pages, plus follow pagination links.

        '#replyN' anchors all point at the same topic; the fragment is
        stripped so URL-based de-duplication avoids re-crawling it.
        """
        for each in response.doc('a[href^="https://v2ex.com/t/"]').items():
            url = each.attr.href
            if url.find('#reply') > 0:
                url = url[0:url.find('#')]
            self.crawl(url, callback=self.detail_page, validate_cert=False)
        for each in response.doc('a.page_normal').items():
            self.crawl(each.attr.href, callback=self.board_page,
                       validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Topic page: store the question and every reply, return a summary.

        With parameterized inserts there is no need to pre-escape quotes
        in the scraped HTML; raw markup is stored as-is.
        """
        title = response.doc("h1").text()
        # .html() returns None for a missing/empty topic body.
        content = response.doc("div.topic_content").html() or ''
        # Materialize the replies once instead of iterating the selector
        # twice (once to count, once to insert).
        replies = [each.html() for each in
                   response.doc("div.reply_content").items()]
        qid = self.add_question(title, content, len(replies))
        for reply in replies:
            self.add_comment(reply, qid)
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }