Python Scraper, Part 7: Lianjia Second-Hand Homes

Preface

I recently ran a scraper over Lianjia and collected data on all second-hand homes in Chengdu (ordinary residences only, excluding serviced apartments, villas, etc.), a little over 50,000 listings in total. A few issues worth noting came up while crawling:

  • Each combination of filter conditions displays at most 100 pages (30 listings per page, so at most 3,000 listings per filter). To get all of a city's data you have to split the filters; I split by area, but other splits work too (see the URL sketch after this list).
  • Only the central urban districts have second-hand listings; the slightly farther-out districts and counties have almost nothing, or their listings are folded into a neighbouring area (it looks like Lianjia's business has contracted to the core districts).
  • The region filters are a bit confusing: the same sub-district (商圈) can show up under several districts, and some are simply assigned to the wrong one.
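
To make the split concrete: the crawler below pages through each sub-district with URLs of the form https://cd.lianjia.com/ershoufang/<sub-district>/pg{n}sf1, where pg{n} selects the page and sf1 is the filter suffix the code appends. A minimal sketch of that paging scheme (the sub-district slug here is only a placeholder, not a real Lianjia path):

# Minimal sketch of the page-URL scheme the crawler below relies on.
# 'some-subdistrict' is a placeholder slug, not a real Lianjia path.
BASE = 'https://cd.lianjia.com'

def page_urls(subdistrict_path, total_pages):
    # the first page uses just the sf1 suffix; later pages add a pg{n} segment
    yield BASE + subdistrict_path + 'sf1'
    for page in range(2, total_pages + 1):
        yield BASE + subdistrict_path + 'pg{}sf1'.format(page)

# for url in page_urls('/ershoufang/some-subdistrict/', 3): print(url)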

Based on this data I put together an analysis and visualisation, which you can see here.

Code

import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup as bs
import re
import json
import pymongo


class LianJia:
   def __init__(self):
       '''
       Define the attributes used later.
       '''
       self.start_url = 'https://cd.lianjia.com/ershoufang/rs/'
       self.ua = UserAgent()
       self.user_agent = self.ua.random
       self.headers = {'User-Agent': self.ua.random,
                       'Host': 'cd.lianjia.com',
                       'Referer': 'https://cd.lianjia.com/'}
       self.max_count = 5
       self.client = pymongo.MongoClient('localhost')
       self.db = self.client.spider
   
   def get_resp(self, url, count=1):
       '''
       Generic request helper that the other methods can reuse.
       '''
       if count >= self.max_count:
           print('Tried Too Many Times')
           return None        
       try:
           resp = requests.get(url, headers=self.headers, timeout=30)
           if resp.status_code == 200:
               return resp.content.decode()
           # treat non-200 responses like request errors and retry
           count += 1
           return self.get_resp(url, count)
       except Exception as e:
           print('Error Occurred', e.args)
           count += 1
           return self.get_resp(url, count)
   
   def get_zonelist(self, content):
       '''
       Get the URLs of every district under the city.
       '''
       if content:
           soup = bs(content, 'lxml')
           results = soup.select_one('div.position > dl > dd > div > div').select('a')
           result_dict = {result.get_text(): 'https://cd.lianjia.com'+result['href'] for result in results}
           return result_dict
   
   def get_loclist(self, content):
       '''
       Get the URLs of every sub-district (商圈) under one district.
       '''
       if content:
           soup = bs(content, 'lxml')
           results = soup.select('div.position > dl > dd > div > div')[1].select('a')
           result_list = ['https://cd.lianjia.com'+result['href'] for result in results]
           return result_list
   
   def parse_page(self, content):
       '''
       Parse a listing page and extract all required fields (detail pages are not visited for now).
       '''
       if content:
           soup = bs(content, 'lxml')
           found_num = int(soup.select_one('div.leftContent > div.resultDes > h2.total > span').get_text())
           if found_num > 0:
               total_page = json.loads(soup.select_one('div.page-box > div.house-lst-page-box')['page-data'])['totalPage']
               lists = soup.select('.sellListContent .LOGCLICKDATA .info')
               tables = []
               for l in lists:
                   table = {'title': l.select('div a')[0].get_text(), 'link': l.select('div a')[0]['href']}
                   
                   house = l.select('.houseInfo')[0].get_text().split('|')
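                   # houseInfo is '|'-separated: building, layout, size (平米),
                   # orientation, decoration and, when present, elevator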
                   table['building'] = house[0].strip()
                   table['layout'] = house[1].strip()
                   table['size'] = float(house[2].strip().replace('平米', ''))
                   table['orientation'] = house[3].strip().split(' ')
                   table['decoration'] = house[4].strip()
                   if len(house) == 6:
                       table['elevator'] = house[5].strip()
                   
                   table['zone'] = soup.select_one('div.position > dl > dd > div > div > a.selected').get_text()
                   location = l.select('.positionInfo')[0].get_text().replace(' ', '')
                   table['location'] = location.split('-')[1]
                   floor = re.search(r'(.*?)樓層', location)
                   if floor:
                       table['floor'] = floor.group(1)                    
                   num_of_floor = re.search(r'(\d+)層', location)
                   if num_of_floor:
                       table['num_of_floor'] = int(num_of_floor.group(1))         
                   year = re.search(r'(\d+?)年', location)
                   if year:
                       table['year'] = year.group(1)
                   types = re.search(r'(板樓|塔樓|板塔結合)', location)
                   if types:
                       table['type'] = types.group(1)
                   
                   follow = l.select('.followInfo')[0].get_text().replace(' ', '').split('/')
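                   # followInfo is '/'-separated: followers, viewing count,
                   # and time since the listing was posted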
                   table['follow'] = int(follow[0].replace('人關注', ''))
                   table['watch'] = int(re.search(r'共(\d+)次', follow[1]).group(1))
                   table['how_long_since_release'] = re.search(r'(.*?天|.*?個月|.*?年|剛剛)', follow[2]).group(1)
                   
                   tags = l.select('.tag span')
                   table['tags'] = [tag.get_text() for tag in tags]
                   
                   table['total'] = float(l.select('.priceInfo .totalPrice')[0].get_text().replace('萬', ''))
                   table['unit'] = int(l.select('.priceInfo .unitPrice')[0].get_text().replace('單價', '').replace('元/平米', ''))                
                   tables.append(table)
               return tables, total_page
           else:
               return None   
   
   def on_save(self, result):
       '''
       Save a scraped record; MongoDB's upsert (keyed on the listing link) filters out duplicates.
       '''
       if result:
           self.db.lianjia.update_one({'link': result['link']}, {'$set': result}, True)
#            with open('lianjia.txt', 'a', encoding='utf-8') as f:
#                f.write(json.dumps(result, ensure_ascii=False))
#                f.write('\n')        
   
   def run(self):
       '''
       Main flow:
       1. get the district links
       2. get the sub-district links for each district
       3. visit every page of each sub-district and scrape the data
       '''
       resp = self.get_resp(self.start_url)
       content = self.get_zonelist(resp)
       for key, value in content.items():
           r = self.get_resp(value)
           cons = self.get_loclist(r)
           for con in cons:
               new_con = con+'sf1'
               print(key, new_con)
               response = self.get_resp(new_con)
               parsed = self.parse_page(response)
               if parsed:
                   results, total = parsed
                   for i in results:
                       self.on_save(i)
                   # walk the remaining pages of this sub-district
                   page = 2
                   while page <= total:
                       next_page = con+'pg{}sf1'.format(page)
                       print(key, next_page)
                       response = self.get_resp(next_page)
                       parsed = self.parse_page(response)
                       if parsed:
                           results, _ = parsed
                           for i in results:
                               self.on_save(i)
                       page += 1
       

if __name__ == '__main__':
   lianjia = LianJia()
   lianjia.run()
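
Once the run finishes, all listings sit in the spider.lianjia collection. A minimal sketch of pulling the data back out for the analysis step (pandas is my assumption here; it is not used by the scraper itself):

# Minimal sketch: load the scraped listings from MongoDB for analysis.
# The database/collection names match the scraper above; pandas is an assumption.
import pymongo
import pandas as pd

client = pymongo.MongoClient('localhost')
df = pd.DataFrame(list(client.spider.lianjia.find({}, {'_id': 0})))
print(df.groupby('zone')['unit'].median())  # e.g. median unit price (元/平米) per district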