Scraping images from a used-truck listing site

# -*- coding: utf-8 -*-
# Author      : ZhangRong z00520111
# Created     : 2020/3/28 10:09
# File        : catchhuoche.py
# IDE         : PyCharm
# Description : Scrape truck images and listing details from hcj198.com
# Copyright @ Huawei Technologies Co., Ltd. 2019-2020. All rights reserved.

import re
import requests
from pyquery import PyQuery as pq
# getcookie.excuteScript is only needed by the disabled antipas helpers below;
# leave this import commented out unless that module is available.
# from getcookie import excuteScript
import time
import random
import json
import os

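# Silence the InsecureRequestWarning raised by the verify=False requests below.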
requests.packages.urllib3.disable_warnings()

# Truck brands recognized in listing titles: Dongfeng, FAW, JAC, Sanhuan,
# JMC, Sinotruk, Foton, Shacman, SAIC, Kama, Changan.
carbrandlist = ['東風', '一汽', '江淮', '三環', '江鈴', '重汽', '福田', '陝汽', '上汽', '凱馬', '長安']
image_num = 0  # running index of the downloaded image
car_num = 0  # running index of the current car
prepath = 'E:/pictures/'  # root directory for all output; adjust per machine
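# Browser-like request headers, attached to the crawler session in __init__.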
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}


class HuoCheCrawler:

    def __init__(self):
        proxy_list = [
            # proxy settings go here
        ]
        # Fall back to a direct connection when no proxies are configured;
        # random.choice() on an empty list would raise IndexError.
        proxies = {
            "http": random.choice(proxy_list),
            "https": random.choice(proxy_list)
        } if proxy_list else None
        self.baseurl = 'https://www.hcj198.com'
        self.sess = requests.Session()
        self.sess.headers = headers
        self.sess.proxies = proxies
        self.start_url = 'https://www.hcj198.com/car.html'

    # def anti_value(self):
    #     '''
    #     Get the key and value needed for the antipas anti-crawler cookie.
    #     :return:
    #     '''
    #     content = self.sess.get(self.baseurl).text.encode('ISO-8859-1').decode('utf-8')
    #     params = re.findall(r"value=anti\('(.*?)','(.*?)'\)", content)[0]
    #     return params

    # def caculate_antipas(self):
    #     '''
    #     Compute the antipas cookie via getcookie.excuteScript and set it on the session.
    #     :return:
    #     '''
    #     params = self.anti_value()
    #     antipas = excuteScript(params[0], params[1])
    #     self.sess.cookies.set('antipas', antipas)

    def page_url(self):
        '''
        Collect the URLs of all listing pages.
        :return: list of page URLs
        '''
        # self.caculate_antipas()
        content = pq(self.sess.get(self.start_url, verify=False).text)
        page_num_max = max(
            int(each.text()) for each in
            content('div[class="page-center search_list_one"] ul[class="pagination"] > li > a').items()
            if re.match(r'\d+', each.text()))
        page_url_list = []
        for i in range(1, page_num_max + 1):
            page_url_list.append('https://www.hcj198.com/car.html?page={}'.format(i))

        return page_url_list

    def index_page(self, start_url):
        '''
        Extract detail-page links from one listing page.
        :param start_url: URL of the listing page
        :return: generator of absolute detail-page URLs
        '''
        content = pq(self.sess.get(start_url).text)
        for each in content('ul[class="car-ul"] > li > a').items():
            url = each.attr.href
            # Relative links need the site root prepended; absolute links pass through.
            if not url.startswith('http'):
                url = self.baseurl + url
            yield url

    def detail_page(self, detail_url):
        '''
        Scrape the details of one listing.
        :param detail_url: URL of the detail page
        :return: (data_dict, list of image URL paths)
        '''
        content = pq(self.sess.get(detail_url).text, parser="html")

        detail = content('ul[class="tages-param"] li div').text()
        eachDetail = detail.split(' ')
        tem = content('div[class="tages-img-list"]')
        tem1 = str(tem('div'))
        # Pull the image paths out of the inline background-image styles
        # in the serialized gallery markup.
        pattern = r'url\("(.*?)&quot'
        result = re.findall(pattern, tem1)
        name = content('div[class="pro-title-cmodel"]').text().strip()
        carbrand = ''
        for brand in carbrandlist:
            if name.find(brand) != -1:
                carbrand = brand
                break
        price = content('div[class="detail-left-dprice"] div[class="dprice-left"]').text()
        data_dict = {
            'name': name,
            'carbrand': carbrand,
            'bordingdate': eachDetail[0],
            'km': eachDetail[3],
            'displacement': eachDetail[4],
            'carstyle': eachDetail[1],
            'price': price[price.index('¥') + 1:],
            'image': result
        }
        if not data_dict['name']:
            # Re-decode the raw page to debug mojibake when the name is missing.
            print(str(content).encode('ISO-8859-1').decode('utf-8'))

        return data_dict, result

    def request_download(self, image_url, carbrand):
        global car_num
        global image_num
        proxy_list = [
            # proxy settings go here
        ]
        # Fall back to a direct connection when no proxies are configured;
        # random.choice() on an empty list would raise IndexError.
        proxies = {
            "http": random.choice(proxy_list),
            "https": random.choice(proxy_list)
        } if proxy_list else None
        r = requests.get(image_url, proxies=proxies, verify=False)
        filename = carbrand + str(car_num - 1)
        with open(prepath + carbrand + '/' + filename + '/' + filename + '_' + str(image_num) + '.png', 'wb') as f:
            f.write(r.content)
        image_num = image_num + 1

    def run(self):
        global car_num
        for pageurl in self.page_url():
            for detail_url in self.index_page(pageurl):
                listout, result = self.detail_page(detail_url)
                data_string = json.dumps(listout, ensure_ascii=False)
                carbrand = listout['carbrand']
                filename = carbrand + str(car_num)
                # Create the output directory if it does not exist yet.
                os.makedirs(prepath + carbrand + '/' + filename + '/', exist_ok=True)
                with open(prepath + carbrand + '/' + filename + '/' + filename + ".txt", "a+",
                          encoding='utf-8') as file:
                    file.write(data_string)
                car_num = car_num + 1
                print("list is ", listout)
                # Download at most 7 images per car.
                stop = 0
                for image_url in result:
                    if stop == 7:
                        break
                    self.request_download(self.baseurl + image_url, carbrand)
                    stop = stop + 1
                print("Sleeping 5-15 seconds to avoid getting banned")
                time.sleep(random.randint(5, 15))
            print('*' * 200)


if __name__ == '__main__':
    hccrawler = HuoCheCrawler()
    hccrawler.run()
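
A quick way to sanity-check the selectors before launching a full crawl is a dry run that only prints the detail-page URLs from the first listing page. This is a minimal sketch of my own, assuming the script above is saved as catchhuoche.py on the import path; it is not part of the original script:

# Hypothetical smoke test: exercises page_url() and index_page() only,
# so nothing is downloaded or written to disk.
from catchhuoche import HuoCheCrawler

crawler = HuoCheCrawler()
first_page = crawler.page_url()[0]  # i.e. https://www.hcj198.com/car.html?page=1
for detail_url in crawler.index_page(first_page):
    print(detail_url)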
