Python3 爬蟲——人人貸散標用戶信息爬取

前提

本文基於此篇博文的實現思路以及給出的代碼。不得不說,此博主心真的細,這個數據隱藏點都找到了,從而使得此爬蟲的實現簡單許多。

但我在使用博主代碼時,發現人人貸網站已有些許改變,於是對代碼做了相應修改:主要是加入了 Cookie 請求頭(不加的話,部分數據會爲空),並更新了部分數據字段。

代碼如下

# coding=utf-8

from requests.exceptions import RequestException
import requests
import json
import csv
import re
import os

class Spider(object):
    """Crawl renrendai.com loan listings and save each loan's details to CSV.

    The crawl fetches the paginated loan list, then opens every loan's detail
    page, extracts the JSON blob embedded in that page's JavaScript
    (``var info = '...'``) and appends one CSV row per loan.

    Attributes:
        headers: HTTP request headers sent with every request.
        count: Number of loans successfully parsed and saved so far.
        csv_path: Path of the CSV file rows are appended to.
        timeout: Per-request timeout in seconds passed to ``requests.get``.
    """

    # Cookie captured from a logged-in browser session. It is account- and
    # time-specific, so pass your own via ``Spider(cookie=...)``.
    DEFAULT_COOKIE = 'rrdid=ccc75785-2c07-4dc4-b020-d846e1400e61; __jsluid_s=4c6ad9d4d4049fd05ad106c261fda012; Qs_lvt_181814=1584388964; Hm_lvt_a00f46563afb7c779eef47b5de48fcde=1584388968; gr_user_id=029f0129-ffae-41ed-8142-696b9cfc4616; grwng_uid=0008e890-1b28-4f43-9501-d22dc42f303a; _ga=GA1.2.1950582141.1584388974; _gid=GA1.2.1843462544.1584388974; renrendaiUsername=15580241130; utmSource=pc_pz_baidu; utm_medium=2075513; utm_campaign=716885827; utm_content=864625; utm_term=831733554_15058797666725; promotion_source=pc_pz_baidu; mediav=%7B%22eid%22%3A%22301358%22%2C%22ep%22%3A%22%22%2C%22vid%22%3A%22%22%2C%22ctn%22%3A%22%22%2C%22vvid%22%3A%22%22%7D; loginMethod=password; IS_MOBLIE_IDPASS=true-false; jforumUserInfo=eiSmTE3oI809bABL60b2VNQ6XE%2FegqCwFJN6FcAwPsE%3D%0A; _gat=1; activeTimestamp=17971412; we_token=LXY5Z0NXSzVmMHBIN1FVUmhFQW5pdTJZUS1SaDBxdFI6MTc5NzE0MTI6MjNjNWM5ZjljZWYwNzQyNWQ2ODA4MmQ0NzI1ZTBjMDRjNmY2N2E4ZQ%3D%3D; we_sid=s%3AkvTAOQE0ZgUL4tKzSTBlhqZYF-E-J2QG.mOqona1ez021fYXhK0kBadT9xkwlp1LtTI%2FdK3xJ2XU; JSESSIONID=2C91F95436A01AF4DA78482A3EA0292A; bf0acacc0a738790_gr_last_sent_sid_with_cs1=027d3626-985a-4c76-8c70-aa3e73a19965; bf0acacc0a738790_gr_last_sent_cs1=17971412; bf0acacc0a738790_gr_cs1=17971412; bf0acacc0a738790_gr_session_id=027d3626-985a-4c76-8c70-aa3e73a19965; bf0acacc0a738790_gr_session_id_027d3626-985a-4c76-8c70-aa3e73a19965=true; Qs_pv_181814=692552440239038100%2C3904391899668128000%2C3858476856754299000%2C3922951718213370000%2C2429641774634917000; Hm_lpvt_a00f46563afb7c779eef47b5de48fcde=1584393613'

    # Column titles written as the first CSV row, in the same order as the
    # values produced by ``_build_record``.
    CSV_HEADER = (
        '序號', '貸款類型', '標的總額', '年利率', '還款期限', '風險等級', '還款方式', '借貸方式', '還款來源',
        '姓名', '性別', '年齡', '婚姻', '學歷', '收入', '房貸', '車貸', '公司行業', '其他負債',
        '申請借款', '成功借款', '還清筆數', '信用額度', '借款總額', '待還本息', '逾期金額', '逾期次數', '嚴重逾期',
    )

    # The detail page carries its data as a JS string literal:
    #   var info = '{...}'
    # Compiled once here instead of on every parsed page.
    _INFO_PATTERN = re.compile(r"var info = '(\{.*?\})'", re.S)

    def __init__(self, cookie=None, csv_path='人人貸.csv', timeout=10):
        """Set up request headers and crawl state.

        Args:
            cookie: Value for the ``Cookie`` header; ``None`` uses
                ``DEFAULT_COOKIE``.
            csv_path: Output CSV file path.
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        # Header names/values must not contain the stray spaces the original
        # had ('User - Agent', 'https: // ...'); with those, the intended
        # headers were never actually sent.
        self.headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            # 'br' is deliberately not advertised: decoding Brotli responses
            # needs an optional extra package that may not be installed.
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'www.renrendai.com',
            'Referer': 'https://www.renrendai.com/loan.html',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
            'Cookie': self.DEFAULT_COOKIE if cookie is None else cookie,
        }
        self.count = 0  # number of loans successfully parsed and saved
        self.csv_path = csv_path
        self.timeout = timeout

    def get_sanbiao(self, pages=10, limit=100):
        """Fetch the loan list page by page and crawl every listed loan.

        Args:
            pages: How many list requests to issue.
            limit: Value of the ``limit`` query parameter on each request.
        """
        for page in range(pages):
            # NOTE(review): ``startNum`` receives the page index (0, 1, 2, ...)
            # exactly as the original code did. If the API treats it as an
            # item offset it should be ``page * limit`` -- confirm against the
            # live endpoint before changing.
            url = 'https://www.renrendai.com/loan/list/loanList?startNum={}&limit={}'.format(page, limit)
            try:
                response = requests.get(url, headers=self.headers, timeout=self.timeout)
            except RequestException as e:
                print(e)
                continue
            if response.status_code == 200:
                self.parse_sanbian(response.text)
            else:
                print('failure: {}'.format(url))

    def parse_sanbian(self, data):
        """Parse one loan-list JSON response and crawl each loan's detail page.

        Args:
            data: Response body text; expected to be JSON shaped like
                ``{"data": {"list": [{"loanId": ...}, ...]}}``.
        """
        try:
            loans = json.loads(data)['data']['list']
        except (ValueError, KeyError, TypeError) as e:
            # Non-JSON body or an unexpected shape: skip this page instead of
            # aborting the whole crawl.
            print('failure: unexpected loan list response: {!r}'.format(e))
            return
        for item in loans or []:
            url = 'https://www.renrendai.com/loan-{}.html'.format(item['loanId'])
            self.get_detailinfo(url)

    def get_detailinfo(self, url):
        """Download one loan detail page and hand it to ``parse_detailinfo``.

        ``self.count`` is only incremented once the page has been parsed and
        its row saved.

        Args:
            url: Detail page URL.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
        except RequestException as e:
            print(e)
            return
        if response.status_code != 200:
            print('failure: {}'.format(url))
            return
        if self.parse_detailinfo(response.text) is None:
            print('failure: {}'.format(url))
            return
        self.count += 1
        print('成功爬取第 {} 條'.format(self.count))

    def parse_detailinfo(self, data):
        """Extract the embedded loan JSON from a detail page and save it.

        Args:
            data: HTML text of the detail page.

        Returns:
            The list of values appended to the CSV, or ``None`` when the page
            has no ``var info`` blob or the blob lacks an expected field.
        """
        # The blob escapes quotes and backslashes as \u0022 / \u005C; undo
        # that so it becomes valid JSON. The copyright sign is dropped as the
        # original code did (it could not be encoded under GBK).
        data = data.replace(u'\xa9', u'').replace('\\u0022', '"').replace('\\u005C', '\\')
        found = self._INFO_PATTERN.search(data)
        if found is None:
            return None
        try:
            row = list(self._build_record(json.loads(found.group(1))).values())
        except (KeyError, TypeError, ValueError) as e:
            print('failure: malformed loan detail: {!r}'.format(e))
            return None
        self.save_excel(row)
        return row

    @staticmethod
    def _build_record(info):
        """Map the decoded ``info`` dict to an ordered field -> value dict."""
        # Imported here so the file's import block does not need to change.
        from datetime import date

        loan = info['loan']
        borrower = info['borrower']
        record = info['userLoanRecord']

        result = {}
        # Loan summary
        result['loanId'] = loan['loanId']  # loan number
        result['borrowType'] = loan['borrowType']  # loan type
        result['amount'] = loan['amount']  # total amount
        result['interest'] = loan['interest']  # annual interest rate
        result['months'] = loan['months']  # repayment term
        result['creditLevel'] = borrower['creditLevel']  # risk level
        # Non-zero repayType is reported as quarterly, zero as monthly.
        result['repayType'] = '按季還款' if int(loan['repayType']) else '按月還款'
        result['loanType'] = '等額本息' if loan['loanType'] == 'DEBX' else '付息還本'
        result['repaySource'] = info['repaySource']  # source of repayment
        # Borrower
        result['realName'] = borrower['realName']
        result['gender'] = borrower['gender']
        # Age relative to the current year (was hard-coded to 2019); the
        # first four characters of birthDay are read as the birth year.
        result['age'] = date.today().year - int(borrower['birthDay'][:4])
        # NOTE(review): plain truthiness test -- if the site sends a non-empty
        # string for every marital status this is always '已婚'; confirm.
        result['marriage'] = '已婚' if borrower['marriage'] else '未婚'
        result['graduation'] = borrower['graduation']  # education
        result['salary'] = borrower['salary']  # income
        result['houseLoan'] = '有' if borrower['houseLoan'] else '無'  # mortgage
        result['carLoan'] = '有' if borrower['carLoan'] else '無'  # car loan
        result['officeDomain'] = borrower['officeDomain']  # industry
        result['hasOthDebt'] = info['hasOthDebt']  # other debts
        # Credit history
        result['totalCount'] = record['totalCount']  # loans applied for
        result['successCount'] = record['successCount']  # loans granted
        result['alreadyPayCount'] = record['alreadyPayCount']  # loans paid off
        result['availableCredits'] = borrower['availableCredits']  # credit line
        result['borrowAmount'] = record['borrowAmount']  # total borrowed
        # Outstanding principal plus outstanding interest
        result['notPayTotalAmount'] = record['notPayPrincipal'] + record['notPayInterest']
        result['overdueTotalAmount'] = record['overdueTotalAmount']  # overdue amount
        result['overdueCount'] = record['overdueCount']  # times overdue
        result['failedCount'] = record['failedCount']  # serious overdue count
        return result

    def save_excel(self, data):
        """Append one row to the CSV file at ``self.csv_path``.

        Args:
            data: Sequence of cell values for the row.
        """
        # ``with`` closes the file after every row (the original leaked the
        # handle). 'utf-8-sig' makes the output independent of the platform
        # default encoding and lets Excel detect UTF-8; the BOM is only
        # written when the file is still empty.
        with open(self.csv_path, 'a', newline='', encoding='utf-8-sig') as out:
            csv.writer(out, dialect='excel').writerow(data)

    def run(self):
        """Start a fresh crawl: reset the CSV, write the header, crawl all pages."""
        if os.path.exists(self.csv_path):
            os.remove(self.csv_path)
        self.save_excel(list(self.CSV_HEADER))
        self.get_sanbiao()


if __name__ == '__main__':
    # Script entry point: build the crawler and run the full crawl.
    Spider().run()

注意:大家在使用時記得將 cookie 修改成自己的;其次,由於網站結構隨時可能會變,此爬蟲也隨時可能失效,具體還需大家自己調試。

順便附上一份自己當時爬的數據

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章