This example crawls used-car listings from Taoche (taoche.com) and stores the extracted data in a MongoDB database.
First, create the directory structure shown in the figure.
Open a command line in that directory.
Create the project: scrapy startproject day0514
Then cd day0514 to enter the project directory.
Create the spider: scrapy genspider <spider_name> <domain>
scrapy genspider TaoChe taoche.com
Run the spider (note that scrapy crawl takes the spider name, not the project name): scrapy crawl <spider_name>
scrapy crawl TaoChe
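For reference, this is the standard layout that scrapy startproject generates, standing in for the original figure; the spider file appears under spiders/ after the genspider step:

day0514/
├── scrapy.cfg
└── day0514/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── TaoChe.py    # created by scrapy genspider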
First, define the fields to be stored in items.py:
import scrapy

class Day0514Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Listing-page fields
    name = scrapy.Field()             # Title
    registered_date = scrapy.Field()  # Registration date
    mileage = scrapy.Field()          # Mileage
    city = scrapy.Field()             # City
    price = scrapy.Field()            # Original price
    new_price = scrapy.Field()        # Sale price
    detail_url = scrapy.Field()       # Detail-page link
    # Detail-page fields
    displacement = scrapy.Field()     # Engine displacement
    transmission = scrapy.Field()     # Transmission
    brand_type = scrapy.Field()       # Brand and model
    loc_of_lic = scrapy.Field()       # License-plate location
    oil_wear = scrapy.Field()         # Fuel consumption
    engine = scrapy.Field()           # Engine
    three_high = scrapy.Field()       # Length/width/height
    drive_type = scrapy.Field()       # Drive type
    body_way = scrapy.Field()         # Body type
    che_level = scrapy.Field()        # Vehicle class
    trunk_cap = scrapy.Field()        # Trunk capacity
    p_time = scrapy.Field()           # Crawl time
    ip = scrapy.Field()               # Crawler host IP
The spider, TaoChe.py, is shown below:
# -*- coding: utf-8 -*-
import scrapy
from scrapy_redis.spiders import RedisSpider
import time
from day0514.items import Day0514Item

class TaocheSpider(RedisSpider):
    name = 'TaoChe'
    # allowed_domains = ['taoche.com']
    # start_urls = ['http://taoche.com/']
    redis_key = "taoche:start_urls"

    # def parse(self, response):
    #     with open('taoCheUrl.txt', 'a', encoding='utf-8') as fp:
    #         fp.write(response.url + '\n')

    def parse(self, response):
        # Read the maximum page number from the pager on the first listing page
        max_page = response.xpath('//div[@class="paging-box the-pages"]/div/a[last()-1]/text()').extract()
        max_page = self.get_value(max_page)
        # Request every listing page
        for i in range(1, int(max_page) + 1):
            url = response.url + '?page=%d#pagetag' % i
            yield scrapy.Request(url=url, callback=self.parse_1)

    # Listing-page parser
    def parse_1(self, response):
        # Each <li> holds one car listing
        car_info_list = response.xpath('//ul[@class="gongge_ul"]/li')
        for car in car_info_list:
            # Title
            name = car.xpath('./div[@class="gongge_main"]/a/span/text()').extract()
            name = self.get_value(name)
            # Mileage
            mileage = car.xpath('./div[2]/p/i[2]/text()').extract()
            mileage = self.get_value(mileage)
            # City
            city = car.xpath('./div[2]/p/i[3]/span/text()').extract()
            city = self.get_value([i.strip() for i in city])
            # Original price
            price = car.xpath('./div[2]/div[1]/i[3]/text()').extract()
            price = self.get_value(price)
            # Sale price
            new_price = car.xpath('./div[2]/div[1]/i[2]//text()').extract()
            new_price = ''.join(new_price)
            # Registration date
            registered_date = car.xpath('./div[2]/p/i[1]/text()').extract()
            registered_date = self.get_value(registered_date)
            # Detail-page link
            detail_url = car.xpath('./div[2]/a/@href').extract()
            detail_url = 'https:' + self.get_value(detail_url)
            # Crawl time
            p_time = time.strftime("%Y-%m-%d %X", time.localtime())
            # Populate the item
            item = Day0514Item()
            item['name'] = name
            item['mileage'] = mileage
            item['city'] = city
            item['price'] = price
            item['new_price'] = new_price
            item['registered_date'] = registered_date
            item['detail_url'] = detail_url
            item['p_time'] = p_time
            # Follow the detail page, carrying the partly-filled item in meta
            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={'data': item},
                encoding='utf-8',
                dont_filter=True
            )

    # Detail-page parser
    def parse_detail(self, response):
        print(response.url)
        li_box = response.xpath('//div[@class="row parameter-configure"]//div[2]/ul')[0]
        # Engine displacement
        displacement = li_box.xpath('./li[1]//text()').extract()
        displacement = ''.join([i.strip() for i in displacement])
        # Fuel consumption
        oil_wear = li_box.xpath('./li[2]//text()').extract()
        oil_wear = ''.join(oil_wear)
        # Length/width/height
        three_high = li_box.xpath('./li[3]//text()').extract()
        three_high = ''.join(three_high)
        # Body type
        body_way = li_box.xpath('./li[4]//text()').extract()
        body_way = ''.join(body_way)
        # Trunk capacity
        trunk_cap = li_box.xpath('./li[5]//text()').extract()
        trunk_cap = ''.join(trunk_cap)
        ul = response.xpath('//div[@class="row parameter-configure"]//div[1]/ul')[0]
        # Brand and model
        brand_type = ul.xpath('./li[1]/span//text()').extract()
        brand_type = ''.join(brand_type)
        # License-plate location
        loc_of_lic = ul.xpath('./li[2]//text()').extract()
        loc_of_lic = ''.join([i.strip() for i in loc_of_lic])
        # Engine
        engine = ul.xpath('./li[3]//text()').extract()
        engine = ''.join(engine)
        # Drive type
        drive_type = ul.xpath('./li[4]//text()').extract()
        drive_type = ''.join(drive_type)
        # Vehicle class
        che_level = ul.xpath('./li[5]//text()').extract()
        che_level = ''.join([i.strip() for i in che_level])
        # Transmission
        transmission = response.xpath('//div[@class="summary-attrs"]/dl[3]//text()').extract()
        transmission = ''.join([i.strip() for i in transmission])
        # Crawl time
        p_time = time.strftime("%Y-%m-%d %X", time.localtime())
        item = response.meta['data']
        item['displacement'] = displacement
        item['oil_wear'] = oil_wear
        item['three_high'] = three_high
        item['body_way'] = body_way
        item['trunk_cap'] = trunk_cap
        item['brand_type'] = brand_type
        item['loc_of_lic'] = loc_of_lic
        item['engine'] = engine
        item['drive_type'] = drive_type
        item['che_level'] = che_level
        item['transmission'] = transmission
        item['p_time'] = p_time
        item['ip'] = '10.10.65.168'
        yield item

    # Return the first element of an extract() result, falling back to 1
    # so int(max_page) stays valid when the XPath matches nothing
    def get_value(self, value):
        if value:
            value = value[0]
        else:
            value = 1
        return value
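Because TaocheSpider is a RedisSpider, scrapy crawl TaoChe starts up and then idles until a start URL is pushed onto the Redis list named by redis_key. A minimal seeding sketch using redis-py; the Redis host and the listing URL are assumptions, so substitute a real Taoche listing page:

import redis

r = redis.Redis(host='127.0.0.1', port=6379)  # assumption: Redis runs locally
# Push a start URL onto the list the spider polls (matches redis_key above)
r.lpush('taoche:start_urls', 'https://changsha.taoche.com/bmw/')  # hypothetical listing URL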
Don't forget the settings in settings.py:
Uncomment the pipeline class in ITEM_PIPELINES so the storage step actually runs; the relevant entries are sketched below.
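A minimal sketch of the relevant settings, using scrapy_redis's documented scheduler and dupefilter settings; the pipeline priority (300) is the conventional default and the Redis address is an assumption:

# Enable the MongoDB pipeline defined in pipelines.py
ITEM_PIPELINES = {
    'day0514.pipelines.Day0514Pipeline': 300,
}
# scrapy_redis plumbing: shared scheduler and request dupefilter,
# required for RedisSpider to read start URLs from Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True    # keep the request queue between runs
REDIS_HOST = '127.0.0.1'    # assumption: local Redis
REDIS_PORT = 6379
ROBOTSTXT_OBEY = False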
Finally, pipelines.py connects to the MongoDB database and performs the storage:
import pymongo

class Day0514Pipeline(object):
    # def process_item(self, item, spider):
    #     return item
    def __init__(self):
        # Connect to the MongoDB server
        self.client = pymongo.MongoClient(host='10.10.65.227', port=27017)
        # Database (created on first write)
        self.db = self.client['TaoCheWang1']
        # Collection (created on first write)
        self.table = self.db['cars']

    def process_item(self, item, spider):
        # with open('taoche.txt', 'a', encoding='utf-8') as fp:
        #     fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        # Insert the item as a document (insert_one replaces the
        # collection.insert call deprecated in pymongo 3 and removed in 4)
        self.table.insert_one(dict(item))
        return item
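The connection can also be released when the crawl ends; a small optional addition to the same pipeline class, using Scrapy's close_spider hook:

    def close_spider(self, spider):
        # Close the MongoDB connection once the spider finishes
        self.client.close()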
Before running the project, make sure the MongoDB service is running.
On the Windows desktop, right-click "Computer", choose "Manage", then "Services and Applications", as shown below.
The figure shows the MongoDB service in the running state.
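To confirm connectivity without the GUI, a quick check from Python against the same host as the pipeline (the timeout value is just an illustrative choice):

import pymongo

client = pymongo.MongoClient(host='10.10.65.227', port=27017,
                             serverSelectionTimeoutMS=3000)
client.admin.command('ping')  # raises ServerSelectionTimeoutError if the server is down
print('MongoDB is up')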
Run the project.
Inspect the data with the MongoDB GUI client Robo 3T 1.2.1:
Click File --> Connect --> Connect to create a connection to the MongoDB database.
You can then browse the stored data.
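The stored documents can also be inspected from Python; a minimal sketch against the database and collection names used in the pipeline:

import pymongo

client = pymongo.MongoClient(host='10.10.65.227', port=27017)
cars = client['TaoCheWang1']['cars']
print(cars.count_documents({}))      # number of stored listings
for doc in cars.find().limit(3):     # peek at a few documents
    print(doc['name'], doc['new_price'])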