實戰各大平臺商品比價--Python 爬取過客網商品歷史價格(30天)

#!/usr/bin/env python
# -*- coding:utf-8 -*-

__author__="JUNHAN"

環境: Python3.6.5

1.導入第三方庫

import functools
import execjs
import traceback
from urllib.parse import quote_plus
import requests, json, time, datetime, random, re
from urllib.parse import quote
from user_check_proxy import Proxy_start
from logs import logDebug, logInfo
#代理自己加上,或者不加代理
from user_check_proxy import get_proxy2

#過客網支持淘寶、天貓、京東、蘇寧、噹噹、網易考拉、亞馬遜等商品網址

import warnings
warnings.filterwarnings('ignore')

2.手機端UA


def random_h5_ua():
    """Return one randomly chosen mobile (WeChat in-app browser) User-Agent."""
    mobile_agents = [
        'Mozilla/5.0 (Linux; Android 5.1; OPPO A37m Build/LMY47I; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.5.23.1180 NetType/4G Language/zh_CN',
        'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R11 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.5.23.1180 NetType/4G Language/zh_CN',
        'Mozilla/5.0 (Linux; Android 5.1.1; OPPO R9 Plusm A Build/LMY47V; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.5.23.1180 NetType/WIFI Language/zh_CN',
        'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R11 Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.5.23.1180 NetType/WIFI Language/zh_CN',
        'Mozilla/5.0 (Linux; Android 7.1.1; OPPO R11 Pluskt Build/NMF26X; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.49 Mobile MQQBrowser/6.2 TBS/043632 Safari/537.36 MicroMessenger/6.5.23.1180 NetType/WIFI Language/zh_CN',
    ]
    picked = random.choice(mobile_agents)
    return picked

3.PC端UA


def random_web_ua():
    """Return one randomly chosen desktop (Windows 10 browser) User-Agent."""
    desktop_agents = [
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
    ]
    picked = random.choice(desktop_agents)
    return picked

 

4.先獲取30天時間的方法

    獲取當前日期前30天日期, 不算當天, 返回 list


def days_ago():
    """Return the 30 calendar dates before today, oldest first.

    Today itself is not included. Every entry is a ``YYYY-MM-DD`` string
    derived from the local date.
    """
    year, month, day = time.localtime()[0:3]
    midnight_today = datetime.datetime(year, month, day)
    return [
        (midnight_today - datetime.timedelta(offset)).date().isoformat()
        for offset in range(30, 0, -1)
    ]

5.時間戳轉換


def get_timestamp_str(timestamp):
    """Format a Unix timestamp as a local-time ``YYYY-MM-DD`` date string."""
    local_struct = time.localtime(timestamp)
    date_text = time.strftime('%Y-%m-%d', local_struct)
    return date_text


def get_guoke_price_web(item_url):
    """Fetch the raw price-history payload for a product URL from tool168.cn.

    The site is queried in three steps:

    1. GET the search page and scrape the hidden ``checkCodeId`` value.
    2. POST that value together with the product URL to ``/dm/ptinfo.php``
       and read the ``code`` field of the JSON reply.
    3. POST ``code`` to ``/dm/history.php`` to obtain the history payload.

    Args:
        item_url: URL of the product page to look up.

    Returns:
        The stripped text of the third response, or ``None`` when that text
        contains the "not found" marker.

    Raises:
        AttributeError: If the search page contains no ``checkCodeId`` value.
        Exception: Errors raised by ``requests`` or ``json.loads`` are not
            handled here and propagate to the caller.
    """
    # Plug in your own proxy pool / cloud proxy here. To run without a proxy,
    # drop this line and the ``proxies=proxies`` argument of the three
    # requests below.
    proxies = get_proxy2()

    ua = random_web_ua()
    # NOTE(review): k and btnSearch are percent-encoded here and then passed
    # through ``params=``, which presumably encodes them a second time --
    # confirm the site really expects the double-encoded form.
    k = quote_plus(item_url)
    btnSearch = quote_plus('搜索')

    # Step 1: load the search page to obtain the per-request check code.
    url_01 = 'http://www.tool168.cn/?'
    search_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'www.tool168.cn',
        'Referer': 'http://www.tool168.cn/history/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ua,
    }
    search_params = {
        'm': 'history',
        'a': 'view',
        'k': k,
        'btnSearch': btnSearch,
    }
    response_html_01 = requests.get(
        url=url_01, headers=search_headers, params=search_params,
        proxies=proxies, verify=False, timeout=20,
    )
    result_html_01 = response_html_01.text
    # Raises AttributeError when the page does not contain the hidden field.
    checkCode = re.search('id="checkCodeId" value="(.*?)"', result_html_01).group(1)

    # Step 2: exchange the check code and product URL for a history code.
    url_02 = "http://www.tool168.cn/dm/ptinfo.php"
    info_headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        # No hard-coded Content-Length: the body size depends on item_url and
        # requests fills in the correct value itself.
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'www.tool168.cn',
        'Origin': 'http://www.tool168.cn',
        'Referer': 'http://www.tool168.cn/?m=history&a=view&k={}&btnSearch={}'.format(k, btnSearch),
        'User-Agent': ua,
        'X-Requested-With': 'XMLHttpRequest',
    }
    form_data = {
        'checkCode': checkCode,
        'con': item_url,
    }
    response_html_02 = requests.post(
        url=url_02, headers=info_headers, data=form_data,
        proxies=proxies, verify=False, timeout=20,
    )
    code = json.loads(response_html_02.text).get("code")

    # Step 3: request the price history that belongs to the history code.
    url_03 = "http://www.tool168.cn/dm/history.php?"
    history_headers = {
        'Accept': 'text/plain, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Host': 'www.tool168.cn',
        'Origin': 'http://www.tool168.cn',
        'Referer': 'http://www.tool168.cn/?m=history&a=view&k={}'.format(item_url),
        'User-Agent': ua,
        'X-Requested-With': 'XMLHttpRequest',
    }
    history_params = {
        "code": code,
        't': '',
    }
    response_html_03 = requests.post(
        url=url_03, headers=history_headers, params=history_params,
        proxies=proxies, verify=False, timeout=20,
    )
    response_html_03.encoding = "utf-8"
    result_response = response_html_03.text.strip()

    # NOTE(review): this marker must match the server's wording character for
    # character -- confirm it against a live "not found" response.
    if "對不起,沒有找到。" in result_response:
        return None
    return result_response

7.解析日期,歷史價格


def parse(result_history_price, thirty_date=None):
    """Turn raw history records into a ``{date: price_in_cents}`` mapping.

    Only the most recent 30 days are kept. When the price did not change at
    the very start of that window, the last price seen before the window is
    carried forward and reported under the window's first date.

    Args:
        result_history_price: Iterable of record strings, each containing a
            ``(year,month,day)`` group followed by ``,price]``, for example
            ``"[(2018,11,20),19.99]"``. The month is zero-based (1 is added).
            Records are assumed to be in ascending date order.
        thirty_date: First date of the window as ``YYYY-MM-DD``. Defaults to
            ``days_ago()[0]``, i.e. 30 days before today.

    Returns:
        dict mapping ``YYYY-MM-DD`` strings to integer prices in cents
        (price * 100, rounded). Empty when no records are given.

    Raises:
        AttributeError: If a record does not contain the expected groups.
        ValueError: If a date field or price is not numeric.
    """
    if thirty_date is None:
        thirty_date = days_ago()[0]

    # Step 7: extract (date, price) from every record.
    history_price_list = []
    for record in result_history_price:
        date_fields = re.search(r'\((.*?)\)', record).group(1).split(",")
        price = re.search(r'\),(.*?)]', record).group(1)
        year, month, day = date_fields[0], date_fields[1], date_fields[2]
        # The source month is zero-based; zero-pad month and day so the
        # result is a proper YYYY-MM-DD string.
        shop_history_time = f"{year}-{int(month) + 1:02d}-{int(day):02d}"
        history_price_list.append([shop_history_time, price])

    if not history_price_list:
        return {}

    # Step 8: keep only the prices that fall inside the last 30 days.
    window_start = int(thirty_date.replace('-', ''))
    history_keys = [int(entry[0].replace('-', '')) for entry in history_price_list]
    thirty_days_price = [
        entry
        for key, entry in zip(history_keys, history_price_list)
        if key >= window_start
    ]

    if not thirty_days_price:
        # No change within the window: the latest known price still applies.
        thirty_days_price = [[thirty_date, history_price_list[-1][1]]]
    elif len(history_price_list) > len(thirty_days_price):
        window_keys = [int(entry[0].replace('-', '')) for entry in thirty_days_price]
        if window_start not in window_keys:
            # Find the record that straddles the window start and carry its
            # price forward to the first day of the window. If none is found,
            # the item was first recorded inside the window and nothing is
            # prepended.
            for index in range(len(history_keys) - 1):
                if history_keys[index] < window_start < history_keys[index + 1]:
                    thirty_days_price.insert(
                        0, [thirty_date, history_price_list[index][1]]
                    )
                    break

    # round() instead of int(): float(19.99) * 100 is 1998.9999999999998 and
    # truncation would lose a cent.
    return {
        entry[0]: round(float(entry[1]) * 100)
        for entry in thirty_days_price
    }


def gkw_history_prices(item_url):
    """Return the last 30 days of price history for a product URL.

    Args:
        item_url: URL of the product page to look up.

    Returns:
        The dict produced by ``parse`` (date string -> price in cents), or
        ``None`` when the lookup fails, the item is not found, or the
        response cannot be parsed.
    """
    try:
        result = get_guoke_price_web(item_url)
    except Exception:
        # Best effort: any fetch failure is reported as "no data".
        return None
    if not result:
        return None

    try:
        # The first string argument of chart("...") holds the price series.
        series = re.search(r'chart\("(.*?)".*\);', result, re.S).group(1)
        # Cut the series into whole "[(y,m,d),price]" records. Splitting on
        # every comma would break each record into unusable fragments.
        # NOTE(review): record layout is inferred from the patterns parse()
        # looks for -- confirm against a live response.
        result_history_price = re.findall(
            r'\[[^\[\]]+\]', series.replace("Date.UTC", "")
        )
        if not result_history_price:
            return None
        return parse(result_history_price)
    except Exception:
        # Best effort: an unparseable response is reported as "no data".
        return None


if __name__ == '__main__':
    # Demo run: replace this with a product URL from any supported platform.
    item_url = "https://item.jd.com/5475614.html"
    history = gkw_history_prices(item_url)
    print(history)
   
 


 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章