前言:
使用多進程爬蟲方法爬取轉轉網二手市場商品信息,並將爬取的數據存儲於MongoDB數據庫中
本文爲整理代碼,梳理思路,驗證代碼有效性——2020.1.18
環境:
Python3(Anaconda3)
PyCharm
Chrome瀏覽器
主要模塊: 後跟括號內的爲在cmd窗口安裝的指令
requests(pip install requests)
lxml(pip install lxml)
pymongo(pip install pymongo)
multiprocessing
步驟
轉轉網即58同城二手市場
1
首先爬取各類別的URL
那麼打開開發者工具F12,對網頁結構進行分析
由截圖即可判斷,它們均有一個共同的外標籤,這裏用Xpath表達式爲'//div[@class="lbsear"]/div/ul/li'
,我們在此基礎上再進一步對網頁進行解析,解析用Xpath表達式爲'ul/li/b/a/@href'
,並將結果打印在控制檯上作爲channel_list,該步驟最終代碼詳見完整代碼中的channel_extract.py。
2
通過上步得到的url進一步獲取詳情頁的url
# Function that extracts item-detail URLs from one listing page.
def get_links(channel, pages):
    """Fetch page *pages* of the category *channel* and insert every
    item-detail URL found into the ``tongcheng_url`` Mongo collection.
    Relies on the module-level ``headers`` and ``tongcheng_url`` objects."""
    list_view = '{}pn{}/'.format(channel, str(pages))
    try:
        html = requests.get(list_view, headers=headers)
        time.sleep(2)  # throttle so the site does not block the crawler
        selector = etree.HTML(html.text)
        if selector.xpath('//tr'):
            infos = selector.xpath('//tr')
            for info in infos:
                if info.xpath('td[2]/a/@href'):
                    url = info.xpath('td[2]/a/@href')[0]
                    print("url:", url)
                    tongcheng_url.insert_one({'url': url})  # insert into MongoDB
                else:
                    pass
        else:
            pass
    except requests.exceptions.ConnectionError:
        pass  # deliberately ignore connection errors (best-effort crawl)
3
點擊跳轉至詳情頁,並分析網頁結構獲取具體的信息(標題,價格,區域,瀏覽量)
4
分第一部分和第二部分代碼分別運行main.py文件,通過多線程方法將數據爬取下來並保存到MongoDB數據庫中。
5
注意:
在後期運行過程中出現了一些問題,具體原因及處理方法如下:
圖一爲其他正常詳情頁,圖二爲手機的詳情頁,而圖二中沒有對應的我們所需求的信息,所以在第一步獲得的channel_list手動剔除手機相關的url,另做處理。
完整代碼
1.channel_extract.py
# 導入庫文件
import requests
from lxml import etree
start_url = 'http://cs.58.com/sale.shtml'  # entry page listing all second-hand categories
url_host = 'http://cs.58.com'  # host prefix joined with the relative category paths
# Collect the category (channel) URLs from the second-hand market front page.
def get_channel_urls(url):
    """Fetch *url*, extract every category link in the ``lbsear`` menu,
    print each absolute URL (original behavior) and also return them as a
    list so callers no longer have to copy them from stdout.

    :param url: front page to parse, e.g. ``start_url``.
    :return: list of absolute category URLs (``url_host`` + relative path).
    """
    html = requests.get(url)
    selector = etree.HTML(html.text)
    channel_urls = []
    # Each <li> under the search menu holds one group of category links.
    for info in selector.xpath('//div[@class="lbsear"]/div/ul/li'):
        for class_url in info.xpath('ul/li/b/a/@href'):
            full_url = url_host + class_url
            print(full_url)  # kept: the article copies these lines into channel_list
            channel_urls.append(full_url)
    return channel_urls
# get_channel_urls(start_url)
# Running the line above prints the category URLs that were copied into
# channel_list below.
# NOTE(review): some entries are duplicated (chengren x2, tiaozao x3) and the
# phone channels (shouji/tongxunyw) are still listed even though the article
# says phone pages need separate handling — confirm before crawling.
channel_list = '''
http://cs.58.com/shouji/
http://cs.58.com/tongxunyw/
http://cs.58.com/danche/
http://cs.58.com/diandongche/
http://cs.58.com/fzixingche/
http://cs.58.com/sanlunche/
http://cs.58.com/peijianzhuangbei/
http://cs.58.com/diannao/
http://cs.58.com/bijiben/
http://cs.58.com/pbdn/
http://cs.58.com/diannaopeijian/
http://cs.58.com/zhoubianshebei/
http://cs.58.com/shuma/
http://cs.58.com/shumaxiangji/
http://cs.58.com/mpsanmpsi/
http://cs.58.com/youxiji/
http://cs.58.com/ershoukongtiao/
http://cs.58.com/dianshiji/
http://cs.58.com/xiyiji/
http://cs.58.com/bingxiang/
http://cs.58.com/jiadian/
http://cs.58.com/binggui/
http://cs.58.com/chuang/
http://cs.58.com/ershoujiaju/
http://cs.58.com/yingyou/
http://cs.58.com/yingeryongpin/
http://cs.58.com/muyingweiyang/
http://cs.58.com/muyingtongchuang/
http://cs.58.com/yunfuyongpin/
http://cs.58.com/fushi/
http://cs.58.com/nanzhuang/
http://cs.58.com/fsxiemao/
http://cs.58.com/xiangbao/
http://cs.58.com/meirong/
http://cs.58.com/yishu/
http://cs.58.com/shufahuihua/
http://cs.58.com/zhubaoshipin/
http://cs.58.com/yuqi/
http://cs.58.com/tushu/
http://cs.58.com/tushubook/
http://cs.58.com/wenti/
http://cs.58.com/yundongfushi/
http://cs.58.com/jianshenqixie/
http://cs.58.com/huju/
http://cs.58.com/qiulei/
http://cs.58.com/yueqi/
http://cs.58.com/kaquan/
http://cs.58.com/bangongshebei/
http://cs.58.com/diannaohaocai/
http://cs.58.com/bangongjiaju/
http://cs.58.com/ershoushebei/
http://cs.58.com/chengren/
http://cs.58.com/nvyongpin/
http://cs.58.com/qinglvqingqu/
http://cs.58.com/qingquneiyi/
http://cs.58.com/chengren/
http://cs.58.com/xiaoyuan/
http://cs.58.com/ershouqiugou/
http://cs.58.com/tiaozao/
http://cs.58.com/tiaozao/
http://cs.58.com/tiaozao/
'''
2.page_spider.py
# 導入庫
import requests
from lxml import etree
import time
import pymongo
# Connect to the local MongoDB server.
client = pymongo.MongoClient('localhost', 27017)
# Create the database and its two collections.
mydb = client['mydb']
tongcheng_url = mydb['tongcheng_url']    # item-detail URLs collected in part 1
tongcheng_info = mydb['tongcheng_info']  # scraped item details from part 2
# Request headers sent with every HTTP request.
# NOTE(review): "537.36(KHTML" is missing a space vs the canonical UA string —
# kept byte-identical; confirm whether the site cares before changing it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36(KHTML, '
                  'like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Connection': 'keep-alive'
}
# Collect the item-detail URLs from one paginated listing page.
def get_links(channel, pages):
    """Fetch page *pages* of the category *channel* and insert every
    item-detail URL found into the ``tongcheng_url`` Mongo collection.

    :param channel: category base URL ending in '/', e.g. 'http://cs.58.com/danche/'.
    :param pages: 1-based page number, appended as 'pn<pages>/'.
    """
    list_view = '{}pn{}/'.format(channel, pages)  # format() stringifies ints itself
    try:
        html = requests.get(list_view, headers=headers)
        time.sleep(2)  # throttle so the site does not block the crawler
        selector = etree.HTML(html.text)
        # Each listing row is a <tr>; the item link sits in its second cell.
        # Iterating an empty result list is a no-op, so no outer guard is needed.
        for info in selector.xpath('//tr'):
            hrefs = info.xpath('td[2]/a/@href')  # evaluate the XPath once, not twice
            if hrefs:
                url = hrefs[0]
                print("url:", url)
                tongcheng_url.insert_one({'url': url})  # insert into MongoDB
    except requests.exceptions.ConnectionError:
        pass  # best-effort crawl: skip pages that fail to connect
# Scrape the detail page of one item and store its fields in MongoDB.
def get_info(url):
    """Fetch the item detail page at *url*, extract title, price, area and
    view count, and insert the record into the ``tongcheng_info`` collection.

    Pages missing a mandatory field (title or view counter) raise IndexError
    inside the ``try`` and are skipped silently — e.g. the phone-category
    layout the article excludes.
    """
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    print(url, html.status_code)

    def first_or(xpath, default):
        # Evaluate the XPath once and fall back to *default* when it matches nothing.
        values = selector.xpath(xpath)
        return values[0] if values else default

    try:
        # Title is mandatory: a missing node raises IndexError and skips the item.
        title = selector.xpath('//*[@id="basicinfo"]/div[1]/h1/text()')[0]
        price = first_or('//*[@id="basicinfo"]/div[3]/div[1]/div[2]/span/text()', "無")
        area = first_or('//*[@id="basicinfo"]/div[3]/div[3]/div[2]/a[1]/text()', "無")
        # View counter is mandatory as well.
        view = selector.xpath('//*[@id="totalcount"]/text()')[0]
        info = {
            # NOTE(review): key is misspelled ('tittle') but kept byte-identical —
            # existing documents and any consumers use this spelling.
            'tittle': title,
            'price': price,
            'area': area,
            'view': view,
            'url': url
        }
        print(info)
        tongcheng_info.insert_one(info)  # insert into MongoDB
    except IndexError:
        pass  # page lacks the expected structure; skip it
3.main.py
# 第一部分,爬取url地址
# import sys
# sys.path.append("..")
#
# from multiprocessing import Pool
# from channel_extract import channel_list
# from page_spider import get_links # 導入庫文件和同一文件下的程序
#
#
# def get_all_links_from(channel):
# for num in range(1,101):
# get_links(channel, num) # 構造urls
#
#
# if __name__ == '__main__': # 程序主入口
# pool = Pool(processes=4) # 創建進程池
# pool.map(get_all_links_from, channel_list.split()) # 調用進程池爬蟲
# Part 2: scrape the item details for every URL collected in part 1.
import sys
sys.path.append("..")  # make the sibling modules importable
from multiprocessing import Pool
from page_spider import get_info
from page_spider import tongcheng_url
from page_spider import tongcheng_info


def _pending_urls():
    """Return the URLs collected in part 1 that have no scraped record yet,
    so an interrupted run can be resumed without re-scraping."""
    db_urls = [item['url'] for item in tongcheng_url.find()]
    db_infos = [item['url'] for item in tongcheng_info.find()]
    return set(db_urls) - set(db_infos)


if __name__ == '__main__':  # program entry point
    # Query MongoDB only under the main guard: worker processes re-import this
    # module (spawn start method on Windows) and would otherwise re-run the
    # queries once per worker.
    rest_urls = _pending_urls()
    pool = Pool(processes=4)  # create the process pool
    pool.map(get_info, rest_urls)  # fan the remaining URLs out to the workers