python 爬取小米商城的應用apk文件

本文在他人基礎之上改良而成  

本文僅做個人技術學習使用,不得用以危害他人或羣體權益!!!!

請合法合規使用!!!

需在python 3.6下運行

# coding=utf-8 chengqiang

import random
import re
import ssl
import time
import urllib
# Explicit submodule imports: `import urllib` alone does not guarantee
# urllib.request / urllib.error are loaded (the script previously relied
# on `requests` importing them as a side effect).
import urllib.error
import urllib.request

import requests
from bs4 import BeautifulSoup

def parser_apks(count=300, category=1):
    """Scrape app.mi.com category list pages and collect apk download URLs.

    Parameters:
        count: maximum number of distinct apks to collect.
        category: numeric category id used in the catTopList URL.

    Returns:
        dict mapping package name -> direct download URL.
    """
    _root_url = "http://app.mi.com"  # store home page; used to absolutize relative hrefs
    res_parser = {}
    # NOTE(review): the original comment claimed crawling starts at page 1,
    # but the code actually starts at page 8 — confirm the intended start page.
    page_num = 8

    while count:
        print("進入循環 count=" + str(count))
        # Fetch one page of the category's app list.
        wbdata = requests.get("http://app.mi.com/catTopList/" + str(category) + "?page=" + str(page_num)).text
        print("開始爬取第" + str(category) + "類的" + str(page_num) + "頁")

        # Parse the list page and pick out anchors pointing at detail pages.
        soup = BeautifulSoup(wbdata, "html.parser")
        # BUGFIX: the '?' must be escaped — unescaped it is a regex quantifier
        # ("/detail" followed by an optional 's'), which also matched plain
        # "/detail" paths.  We want links of the form "/details?id=...".
        links = soup.find_all("a", href=re.compile(r"/details\?"), class_="", alt="")

        if len(links) == 0:
            print("==============links空了=================")
            break

        for link in links:
            # Absolute URL of the app's detail page.
            detail_link = urllib.parse.urljoin(_root_url, str(link["href"]))
            # href looks like "/details?id=<package>"; the id value is the package name.
            package_name = detail_link.split("=")[1]
            download_page = requests.get(detail_link).text
            # Parse the detail page for the download button link.
            soup1 = BeautifulSoup(download_page, "html.parser")
            download_link = soup1.find(class_="download")["href"]
            # Direct download URL for the apk file.
            download_url = urllib.parse.urljoin(_root_url, str(download_link))
            # Results repeat across pages; dedupe by package name.
            if package_name not in res_parser:
                if count > 0:
                    res_parser[package_name] = download_url
                    count -= 1
                else:
                    break
            if count == 0:
                break

        if count > 0:
            page_num += 1

    print("爬取apk數量爲: " + str(len(res_parser)))
    return res_parser

 

# Pool of desktop-browser User-Agent strings.  auto_down() picks one at
# random per download so requests do not all carry the same fingerprint.
user_agent_list = [

        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "

        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",

        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "

        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",

        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "

        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",

        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "

        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",

        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "

        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",

        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "

        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",

        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "

        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",

        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",

        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",

        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",

        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",

        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",

        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",

        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",

        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",

        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "

        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",

        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "

        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",

        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "

        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"

    ]

 

def auto_down(url, filename):
    """Download *url* to *filename* using a random browser User-Agent.

    Retries recursively on a truncated download; HTTP and URL errors are
    printed and swallowed (best-effort download, matching original intent).
    """
    try:
        opener = urllib.request.build_opener()
        # Spoof a desktop browser UA (the first hard-coded assignment in the
        # original was dead code — it was immediately overwritten).
        opener.addheaders = [('User-Agent', random.choice(user_agent_list))]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(url, filename)
    except urllib.error.ContentTooShortError:
        print('Network conditions is not good.Reloading.')
        # NOTE(review): unbounded recursion if the network stays bad.
        auto_down(url, filename)
    except urllib.error.HTTPError as e:
        # BUGFIX: the original printed the exception CLASS, not the instance.
        print(str(e))
    except urllib.error.URLError as e:
        # BUGFIX: same class-vs-instance fix as above.
        print("sssssss oh ssssssss" + str(e))

 

 

def craw_apks(count = 300,category=1, save_path = "/Users/chengqiang/Desktop/python_spider/"):
    """Crawl up to *count* apks of *category* and save them under *save_path*.

    Also appends each resolved download URL to 'apkList.txt' in the CWD.
    """
    print("craw_apks count="+str(count))
    res_dic = parser_apks(count,category)

    for apk in res_dic.keys():
        print("正在下載應用: " + apk)
        time.sleep(2)  # throttle between downloads to avoid hammering the server
        print("下載位置: " + str(res_dic[apk]))
        # Resolve redirects first so we record and fetch the final file URL.
        # (The original assigned str(res_dic[apk]) here and immediately
        # overwrote it — dead code, removed.)
        fileUrl = get_redirect_url(res_dic[apk])
        save_to_file('apkList.txt', fileUrl)
        print("+++++++++++" + save_path + apk + ".apk")
        urllib.request.urlretrieve(fileUrl, save_path + apk + ".apk")
        print("下載完成: "+ fileUrl)
    print(str(category)+"=category====finish=====")

 

 

def save_to_file(file_name, contents):
    """Append *contents* plus a trailing newline to *file_name*.

    Uses a context manager so the handle is closed even if write() raises
    (the original open/write/close sequence could leak the handle on error).
    """
    with open(file_name, 'a') as fh:
        fh.write(contents + '\n')

 

 

def get_redirect_url(downurl):
    """Follow any redirects for *downurl* and return the final URL."""
    # Browser-like User-Agent so the server serves the request normally.
    ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    resp = requests.get(downurl, headers={'User-Agent': ua})
    print(resp.status_code)  # status code of the final response
    print(resp.url)          # URL after redirects were followed
    return resp.url

 

 

if __name__ == "__main__":

    category = 27

    # Python 3 verifies HTTPS certificates by default; disable verification
    # globally so urlretrieve() can fetch from hosts with bad certs.

    ssl._create_default_https_context = ssl._create_unverified_context

    #while category<=10:

    craw_apks(30, category,"/Users/chengqiang/Desktop/python_spider/")

    # craw_apks(50,category,"H:\\miApk\\")

    #category = category + 1

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章