1-首先下載pyspider
pip install pyspider
2-在任意一個文件夾下執行命令 啓動pyspider
pyspider 或者 pyspider all
3-執行命令後 創建項目
4-創建項目後 進入項目 編寫代碼
# pyspider handler base class and the @every/@config decorators.
from pyspider.libs.base_handler import *
# fake_useragent supplies randomized User-Agent strings to look less bot-like.
from fake_useragent import UserAgent
# Module-level singleton; ua.random is sampled once when the class body runs.
ua=UserAgent()
from pymongo import MongoClient
class Handler(BaseHandler):
    """Crawl newly listed second-hand houses in Nanguan district, Changchun,
    from Lianjia (co32 = "sort by listing time, newest first") and store each
    listing into the MongoDB collection LianJia.ChangChun.
    """

    # Applied to every request: a randomized User-Agent header.
    # NOTE(review): ua.random is evaluated once at class-definition time, so
    # all requests share the same UA until the project is reloaded.
    crawl_config = {
        'headers': {
            'User-Agent': ua.random,
        }
    }

    @every(minutes=4 * 60)
    def on_start(self):
        """Entry point, re-triggered every 4 hours.

        Only fetches the newest listings for the Nanguan district.
        """
        self.crawl('https://cc.lianjia.com/ershoufang/nanguanqu/co32/',
                   fetch_type='js', callback=self.index_page)

    @config(age=60)
    def index_page(self, response):
        """Read the pager to find the last page number, then schedule every
        result page pg1..pgN for :meth:`page`.
        """
        # The second-to-last <a> in the pager box holds the max page number.
        maxpage = int(response.etree.xpath(
            '//div[@class="page-box house-lst-page-box"]/a[last()-1]/text()')[0])
        for index in range(1, maxpage + 1):
            # Insert the page selector before the sort filter, e.g.
            # .../ershoufang/nanguanqu/pg3co32/
            page_url = response.url.replace('co32', 'pg%dco32' % index)
            self.crawl(page_url, callback=self.page)

    @config(priority=4)
    def page(self, response):
        """Extract the ~30 listings on one result page and schedule each
        listing's detail page for :meth:`detail`.
        """
        for ele in response.etree.xpath('//ul[@class="sellListContent"]/li'):
            info = ele.xpath('./div[1]/div[@class="followInfo"]/text()')
            if not info:
                # Skip ad/irregular <li> entries that carry no followInfo
                # (the original code raised IndexError here).
                continue
            # Pages are sorted newest-first (co32): once an entry is no longer
            # "just published", every later entry is older too, so stop early.
            if '剛剛發佈' not in info[0]:
                return
            # Schedule the listing's detail URL(s).
            for url in ele.xpath('./a/@href'):
                self.crawl(url, callback=self.detail)

    @config(priority=2)
    def detail(self, response):
        """Scrape one listing's detail page, persist it to MongoDB, and
        return it so pyspider's result worker can record it as well.
        """
        print(response.url)
        item = {}
        item['url'] = response.url
        item['title'] = response.etree.xpath('//h1/text()')[0]
        item['totalprice'] = response.etree.xpath('//span[@class="total"]/text()')[0] + "萬"
        item['area'] = response.etree.xpath('//div[@class="area"]/div[@class="mainInfo"]/text()')[0]
        item['rooms'] = response.etree.xpath('//div[@class="room"]/div[@class="mainInfo"]/text()')[0]
        item['direction'] = response.etree.xpath('//div[@class="type"]/div[@class="mainInfo"]/text()')[0]
        item['averageprice'] = response.etree.xpath('//div[@class="unitPrice"]/span/text()')[0] + '元/平米'
        item['onsaleTime'] = response.etree.xpath("//div[@class='transaction']//li[1]/span[2]/text()")[0]
        print(item)
        # Store into MongoDB, deduplicating on the full document.
        col = MongoClient()['LianJia']['ChangChun']
        if col.find_one(item):
            return
        # Insert a COPY: insert_one mutates its argument by adding an '_id'
        # ObjectId, which is not JSON-serializable and would break pyspider's
        # resultdb when the item is returned below.
        col.insert_one(dict(item))
        return {'item': item}
5-保存後 運行沒問題
6-檢查之後 發現數據也能正常存入數據庫 就是在results中找不到數據 而且powershell中還出現中文亂碼 這是爲什麼呢?
7-經過調試之後，發現是優先級的問題：@config(priority=2) 中的 priority 數字越大，權重越高，任務越先被執行。所以我最後修改了代碼：
# pyspider handler base class and the @every/@config decorators.
from pyspider.libs.base_handler import *
# fake_useragent supplies randomized User-Agent strings to look less bot-like.
from fake_useragent import UserAgent
# Module-level singleton used when building per-instance headers below.
ua=UserAgent()
from pymongo import MongoClient
class Handler(BaseHandler):
    """Second (fixed) version: crawl newly listed second-hand houses in
    Nanguan district, Changchun, from Lianjia and return each listing so
    pyspider's result worker stores it (no direct MongoDB write here).
    """

    crawl_config = {
    }

    def __init__(self):
        # Randomized User-Agent built per handler instance.
        # NOTE(review): these headers are only passed to the first crawl in
        # on_start; page/detail requests go out without them — confirm whether
        # that is intended.
        self.headers = {
            'User-Agent': ua.random,
        }

    @every(minutes=4 * 60)
    def on_start(self):
        """Entry point, re-triggered every 4 hours.

        Only fetches the newest listings (co32 filter) for Nanguan district.
        """
        self.crawl('https://cc.lianjia.com/ershoufang/nanguanqu/co32/',
                   headers=self.headers, fetch_type='js',
                   validate_cert=False, callback=self.index_page)

    @config(age=3 * 60)
    def index_page(self, response):
        """Read the pager to find the last page number, then schedule every
        result page pg1..pgN for :meth:`page`.
        """
        # The second-to-last <a> in the pager box holds the max page number.
        maxpage = int(response.etree.xpath(
            '//div[@class="page-box house-lst-page-box"]/a[last()-1]/text()')[0])
        for index in range(1, maxpage + 1):
            # Insert the page selector before the sort filter, e.g.
            # .../ershoufang/nanguanqu/pg3co32/
            page_url = response.url.replace('co32', 'pg%dco32' % index)
            self.crawl(page_url, validate_cert=False, callback=self.page)

    @config(priority=2)
    def page(self, response):
        """Extract the ~30 listings on one result page and schedule each
        listing's detail page for :meth:`detail`.
        """
        for ele in response.etree.xpath('//ul[@class="sellListContent"]/li'):
            info = ele.xpath('./div[1]/div[@class="followInfo"]/text()')
            if not info:
                # Skip ad/irregular <li> entries that carry no followInfo
                # (the original code raised IndexError here).
                continue
            # Pages are sorted newest-first (co32): once an entry is no longer
            # "just published", every later entry is older too, so stop early.
            if '剛剛發佈' not in info[0]:
                return
            # Schedule the listing's detail URL(s).
            for url in ele.xpath('./a/@href'):
                self.crawl(url, validate_cert=False, callback=self.detail)

    @config(priority=2)
    def detail(self, response):
        """Scrape one listing's detail page and return the item dict; the
        returned value is what pyspider records in its resultdb.
        """
        print(response.url)
        item = {}
        item['url'] = response.url
        item['title'] = response.etree.xpath('//h1/text()')[0]
        item['totalprice'] = response.etree.xpath('//span[@class="total"]/text()')[0] + "萬"
        item['area'] = response.etree.xpath('//div[@class="area"]/div[@class="mainInfo"]/text()')[0]
        item['rooms'] = response.etree.xpath('//div[@class="room"]/div[@class="mainInfo"]/text()')[0]
        item['direction'] = response.etree.xpath('//div[@class="type"]/div[@class="mainInfo"]/text()')[0]
        item['averageprice'] = response.etree.xpath('//div[@class="unitPrice"]/span/text()')[0] + '元/平米'
        item['onsaleTime'] = response.etree.xpath("//div[@class='transaction']//li[1]/span[2]/text()")[0]
        return item