Preface
I recently pointed a crawler at Lianjia and pulled the data for every second-hand home in Chengdu (ordinary residences only, excluding apartments, villas, and the like), a little over 50,000 listings in total. A few things came up while crawling that are worth noting:
- Each combination of filter conditions displays at most 100 pages (30 listings per page, so at most 3,000 listings per filter). To cover a whole city you have to split the filters; I split by district, though other splits would work too. The sketch after this list shows one way to detect that a filter has hit the cap.
- Only the central urban districts have second-hand listings; the outlying districts and counties have almost none, or their listings are lumped into a neighboring area (it seems Lianjia's business has contracted to the city center).
- The filter's area divisions left me confused: the same business district shows up under several districts, and some are assigned to the wrong district outright.
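To make the first point concrete, below is a minimal sketch of how to detect that a filter combination has hit the 100-page cap. It is not part of the original crawler (the helper name needs_split is my own), but it reads the same page-data attribute that the crawler's parse_page method uses to get the page count:

import json
from bs4 import BeautifulSoup as bs

MAX_PAGES = 100  # Lianjia shows at most 100 pages (30 listings each) per filter

def needs_split(html):
    '''Return True when a filter combination has hit the 100-page cap,
    meaning listings beyond 3,000 are unreachable and the filter needs
    to be narrowed further (e.g. by district).'''
    soup = bs(html, 'lxml')
    box = soup.select_one('div.page-box > div.house-lst-page-box')
    total_page = json.loads(box['page-data'])['totalPage']
    return total_page >= MAX_PAGES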
I built an analysis and visualization from this data, which you can see here.
Code
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup as bs
import re
import json
import pymongo


class LianJia:
    def __init__(self):
        '''
        Define the attributes used by the methods below.
        '''
        self.start_url = 'https://cd.lianjia.com/ershoufang/rs/'
        self.ua = UserAgent()
        self.user_agent = self.ua.random
        self.headers = {'User-Agent': self.user_agent,
                        'Host': 'cd.lianjia.com',
                        'Referer': 'https://cd.lianjia.com/'}
        self.max_count = 5  # maximum number of attempts per request
        self.client = pymongo.MongoClient('localhost')
        self.db = self.client.spider

    def get_resp(self, url, count=1):
        '''
        Generic request method, reused by everything below.
        Retries on failure, up to self.max_count attempts.
        '''
        if count >= self.max_count:
            print('Tried Too Many Times')
            return None
        try:
            resp = requests.get(url, headers=self.headers, timeout=30)
            return resp.content.decode()
        except Exception as e:
            print('Error Occurred', e.args)
            count += 1
            return self.get_resp(url, count)

    def get_zonelist(self, content):
        '''
        Collect the URL of every district under the city.
        '''
        if content:
            soup = bs(content, 'lxml')
            results = soup.select_one('div.position > dl > dd > div > div').select('a')
            result_dict = {result.get_text(): 'https://cd.lianjia.com' + result['href'] for result in results}
            return result_dict

    def get_loclist(self, content):
        '''
        Collect the URL of every business district under a district.
        '''
        if content:
            soup = bs(content, 'lxml')
            results = soup.select('div.position > dl > dd > div > div')[1].select('a')
            result_list = ['https://cd.lianjia.com' + result['href'] for result in results]
            return result_list

    def parse_page(self, content):
        '''
        Parse a listing page and pull out every field we need
        (without visiting the detail pages, for now).
        '''
        if content:
            soup = bs(content, 'lxml')
            found_num = int(soup.select_one('div.leftContent > div.resultDes > h2.total > span').get_text())
            if found_num > 0:
                total_page = json.loads(soup.select_one('div.page-box > div.house-lst-page-box')['page-data'])['totalPage']
                lists = soup.select('.sellListContent .LOGCLICKDATA .info')
                # The selected breadcrumb is the same for every listing on the page.
                zone = soup.select_one('div.position > dl > dd > div > div > a.selected').get_text()
                tables = []
                for l in lists:
                    table = {'title': l.select('div a')[0].get_text(), 'link': l.select('div a')[0]['href']}
                    house = l.select('.houseInfo')[0].get_text().split('|')
                    table['building'] = house[0].strip()
                    table['layout'] = house[1].strip()
                    table['size'] = float(house[2].strip().replace('平米', ''))
                    table['orientation'] = house[3].strip().split(' ')
                    table['decoration'] = house[4].strip()
                    if len(house) == 6:  # the elevator field is not always present
                        table['elevator'] = house[5].strip()
                    table['zone'] = zone
                    location = l.select('.positionInfo')[0].get_text().replace(' ', '')
                    table['location'] = location.split('-')[1]
                    floor = re.search(r'(.*?)楼层', location)
                    if floor:
                        table['floor'] = floor.group(1)
                    num_of_floor = re.search(r'(\d+)层', location)
                    if num_of_floor:
                        table['num_of_floor'] = int(num_of_floor.group(1))
                    year = re.search(r'(\d+?)年', location)
                    if year:
                        table['year'] = year.group(1)
                    types = re.search(r'(板楼|塔楼|板塔结合)', location)
                    if types:
                        table['type'] = types.group(1)
                    follow = l.select('.followInfo')[0].get_text().replace(' ', '').split('/')
                    table['follow'] = int(follow[0].replace('人关注', ''))
                    table['watch'] = int(re.search(r'共(\d+)次', follow[1]).group(1))
                    table['how_long_since_release'] = re.search(r'(.*?天|.*?个月|.*?年|刚刚)', follow[2]).group(1)
                    tags = l.select('.tag span')
                    table['tags'] = [tag.get_text() for tag in tags]
                    table['total'] = float(l.select('.priceInfo .totalPrice')[0].get_text().replace('万', ''))
                    table['unit'] = int(l.select('.priceInfo .unitPrice')[0].get_text().replace('单价', '').replace('元/平米', ''))
                    tables.append(table)
                return tables, total_page
        return None

    def on_save(self, result):
        '''
        Save a scraped record. MongoDB's update with upsert
        filters out duplicates, keyed on the listing link.
        '''
        if result:
            self.db.lianjia.update_one({'link': result['link']}, {'$set': result}, upsert=True)
            # with open('lianjia.txt', 'a', encoding='utf-8') as f:
            #     f.write(json.dumps(result, ensure_ascii=False))
            #     f.write('\n')

    def run(self):
        '''
        Main flow:
        1. collect the district links
        2. collect the business-district links under each district
        3. visit every page of each business district and scrape the data
        '''
        resp = self.get_resp(self.start_url)
        content = self.get_zonelist(resp)
        if not content:
            return
        for key, value in content.items():
            r = self.get_resp(value)
            cons = self.get_loclist(r) or []
            for con in cons:
                new_con = con + 'sf1'
                print(key, new_con)
                response = self.get_resp(new_con)
                parsed = self.parse_page(response)  # parse once and reuse the result
                if parsed:
                    results, total = parsed
                    for i in results:
                        self.on_save(i)
                    page = 2
                    while page <= total:
                        next_page = con + 'pg{}sf1'.format(page)
                        print(key, next_page)
                        response = self.get_resp(next_page)
                        parsed = self.parse_page(response)
                        if parsed:  # a failed page should not kill the whole run
                            results, _ = parsed
                            for i in results:
                                self.on_save(i)
                        page += 1


if __name__ == '__main__':
    lianjia = LianJia()
    lianjia.run()
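One note on storage: on_save relies on update_one with upsert to keep the collection deduplicated by listing link. With 50,000+ records it is worth creating a unique index on link beforehand, so each upsert becomes an index lookup rather than a collection scan. This one-time setup is my own addition, not part of the original script:

import pymongo

# One-time setup (my addition, not in the original crawler): a unique
# index on `link` speeds up the upserts in on_save and enforces, at the
# database level, that no listing is ever stored twice.
client = pymongo.MongoClient('localhost')
client.spider.lianjia.create_index([('link', pymongo.ASCENDING)], unique=True)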