爬蟲實例6:使用pyspider爬取鏈家網長春市南關區的最近二手房更新信息

1-首先下載pyspider

pip install pyspider

2-在任意一個文件夾下執行命令 啓動pyspider

pyspider 或者 pyspider all

3-執行命令後 創建項目

4-創建項目後 進入項目 編寫代碼

from pyspider.libs.base_handler import *
from fake_useragent import UserAgent
ua=UserAgent()
from pymongo import MongoClient




class Handler(BaseHandler):
    """Crawl the newest second-hand-house listings of the Nanguan district
    of Changchun from Lianjia and persist them into MongoDB.

    Pipeline: on_start -> index_page (discover pages) -> page (list items)
    -> detail (scrape one listing and store it).
    """

    # NOTE(review): ua.random is evaluated once at class-definition time,
    # so the same User-Agent is reused for every request of this run.
    crawl_config = {
        'headers': {
            'User-Agent': ua.random,
        }
    }

    @every(minutes=4 * 60)
    def on_start(self):
        # Entry point, re-run every 4 hours. Only the newest listings of
        # the Nanguan district are fetched (the co32 path segment sorts
        # the listing by publish time).
        self.crawl('https://cc.lianjia.com/ershoufang/nanguanqu/co32/',
                   fetch_type='js', callback=self.index_page)

    @config(age=60)
    def index_page(self, response):
        """Read the pager on the first result page and enqueue every page."""
        # The second-to-last <a> of the pager box holds the max page number.
        maxpage = int(response.etree.xpath(
            '//div[@class="page-box house-lst-page-box"]/a[last()-1]/text()')[0])
        for page_no in range(1, maxpage + 1):
            # e.g. https://cc.lianjia.com/ershoufang/nanguanqu/pg3co32/
            page_url = response.url.replace('co32', 'pg%dco32' % page_no)
            self.crawl(page_url, callback=self.page)

    @config(priority=4)
    def page(self, response):
        """Extract the ~30 listings of one result page and enqueue details."""
        for ele in response.etree.xpath('//ul[@class="sellListContent"]/li'):
            messages = ele.xpath('./div[1]/div[@class="followInfo"]/text()')[0]
            # Listings are sorted newest-first (co32); once an entry is no
            # longer marked "just published" every following one is older,
            # so stop processing this page entirely.
            if '剛剛發佈' not in messages:
                return
            # Follow each listing link to its detail page.
            for url in ele.xpath('./a/@href'):
                self.crawl(url, callback=self.detail)

    @config(priority=2)
    def detail(self, response):
        """Scrape one listing's detail page and store the record in MongoDB.

        Returns the scraped item wrapped in a dict for pyspider's result
        store, or None when the listing is already in the database.
        """
        print(response.url)
        item = {}
        item['url'] = response.url
        item['title'] = response.etree.xpath('//h1/text()')[0]
        item['totalprice'] = response.etree.xpath('//span[@class="total"]/text()')[0] + "萬"
        item['area'] = response.etree.xpath('//div[@class="area"]/div[@class="mainInfo"]/text()')[0]
        item['rooms'] = response.etree.xpath('//div[@class="room"]/div[@class="mainInfo"]/text()')[0]
        item['direction'] = response.etree.xpath('//div[@class="type"]/div[@class="mainInfo"]/text()')[0]
        item['averageprice'] = response.etree.xpath('//div[@class="unitPrice"]/span/text()')[0] + '元/平米'
        item['onsaleTime'] = response.etree.xpath("//div[@class='transaction']//li[1]/span[2]/text()")[0]
        print(item)

        # Persist into MongoDB. Deduplicate on the listing URL (a stable
        # key) rather than on the whole document, so a price change does
        # not create a second record for the same house.
        col = MongoClient()['LianJia']['ChangChun']
        if col.find_one({'url': item['url']}):
            return
        col.insert_one(item)
        return {'item': item}

5-保存後 運行沒問題

6-檢查之後 發現數據也能正常存入數據庫 就是在results中找不到數據 而且powershell中還出現中文亂碼 這是爲什麼呢?

在這裏插入圖片描述
在這裏插入圖片描述
在這裏插入圖片描述

7-經過調試之後 發現是優先級的問題:@config(priority=N) 中的 priority 數字越大 權重越高 任務越先執行 所以最後我修改了代碼

from pyspider.libs.base_handler import *
from fake_useragent import UserAgent
ua=UserAgent()
from pymongo import MongoClient




class Handler(BaseHandler):
    """Second revision of the Lianjia crawler for the Nanguan district of
    Changchun.

    Differences from the first version: the User-Agent header is built per
    instance instead of at class-definition time, every fetch disables TLS
    certificate validation, and the detail handler returns the record to
    pyspider's result store instead of writing to MongoDB directly.
    """

    crawl_config = {

    }

    def __init__(self):
        # NOTE(review): BaseHandler.__init__ is not invoked here; pyspider
        # appears to tolerate that, but confirm against the installed version.
        self.headers = {
            'User-Agent': ua.random,
        }

    @every(minutes=4 * 60)
    def on_start(self):
        # Entry point, re-run every 4 hours. Only the newest listings of
        # the Nanguan district (co32 sorts by publish time).
        self.crawl('https://cc.lianjia.com/ershoufang/nanguanqu/co32/',
                   headers=self.headers, fetch_type='js',
                   validate_cert=False, callback=self.index_page)

    @config(age=3 * 60)
    def index_page(self, response):
        """Read the pager on the first result page and enqueue every page."""
        # The second-to-last <a> of the pager box holds the max page number.
        maxpage = int(response.etree.xpath(
            '//div[@class="page-box house-lst-page-box"]/a[last()-1]/text()')[0])
        for page_no in range(1, maxpage + 1):
            # e.g. https://cc.lianjia.com/ershoufang/nanguanqu/pg3co32/
            page_url = response.url.replace('co32', 'pg%dco32' % page_no)
            self.crawl(page_url, validate_cert=False, callback=self.page)

    @config(priority=2)
    def page(self, response):
        """Extract the listings of one result page and enqueue the details."""
        for ele in response.etree.xpath('//ul[@class="sellListContent"]/li'):
            messages = ele.xpath('./div[1]/div[@class="followInfo"]/text()')[0]
            # Listings are sorted newest-first; stop at the first entry that
            # is not marked "just published" — the rest are all older.
            if '剛剛發佈' not in messages:
                return
            # Follow each listing link to its detail page.
            for url in ele.xpath('./a/@href'):
                self.crawl(url, validate_cert=False, callback=self.detail)

    @config(priority=2)
    def detail(self, response):
        """Scrape one listing's detail page and return the record.

        The returned dict is picked up by pyspider and shown in results.
        """
        print(response.url)

        item = {}
        item['url'] = response.url
        item['title'] = response.etree.xpath('//h1/text()')[0]
        item['totalprice'] = response.etree.xpath('//span[@class="total"]/text()')[0] + "萬"
        item['area'] = response.etree.xpath('//div[@class="area"]/div[@class="mainInfo"]/text()')[0]
        item['rooms'] = response.etree.xpath('//div[@class="room"]/div[@class="mainInfo"]/text()')[0]
        item['direction'] = response.etree.xpath('//div[@class="type"]/div[@class="mainInfo"]/text()')[0]
        item['averageprice'] = response.etree.xpath('//div[@class="unitPrice"]/span/text()')[0] + '元/平米'
        item['onsaleTime'] = response.etree.xpath("//div[@class='transaction']//li[1]/span[2]/text()")[0]
        return item
        

8-結果演示 獲取最新二手房發佈信息

在這裏插入圖片描述

9-事實證明 數據能夠在web中的控制檯顯示 說明代碼沒啥問題 之前不能在results中顯示數據 是因爲各回調的 priority 設置不當 導致detail任務排不上隊 設置priority時需要注意各回調之間的相對大小

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章