嗶哩嗶哩視頻下載(Python 3 + asyncio + 斷點續傳,附註釋)

最近需要學點東西,想把 B 站上的視頻下載到本地觀看,於是寫了下面這個腳本。有不懂的地方可以掃我頭像加我微信問我,私信一般不看。


import json
import os
from functools import partial
from tqdm import tqdm
import aiofiles
from aiohttp import ClientSession
import re
import asyncio

# Decide whether the page is a multi-part series or a single video
async def fast(url, headers, file_headers):
    """Download everything reachable from one video page.

    The page at ``url`` is fetched once with ``headers``. When its HTML
    contains the episode-list marker text, one download task is scheduled
    per counted part (addressed as ``url?p=N``) and all tasks are awaited
    together. Otherwise the page is handled as a single video.

    :param url: address of the video page
    :param headers: request headers used for the HTML pages
    :param file_headers: request headers handed to ``download`` for the media files
    """
    async with ClientSession() as session:
        async with session.get(url, headers=headers) as response:
            page_html = await response.text()
            pending = []

            if not re.search('視頻選集', page_html):
                # Single video: resolve its streams and download them directly.
                video_src, audio_src, title = await get_baseurl(url, session, headers)
                await download(video_src, audio_src, session, title, headers=file_headers)
                return

            # The number of parts is taken to be the number of 'part' matches in the HTML.
            part_total = len(re.findall('part', page_html))
            page_no = 1
            while page_no <= part_total:
                part_url = f'{url}?p={page_no}'
                video_src, audio_src, title = await get_baseurl(part_url, session, headers)
                # Schedule the download right away so it runs while later pages are resolved.
                job = asyncio.ensure_future(
                    download(video_src, audio_src, session, title, headers=file_headers)
                )
                pending.append(job)
                page_no += 1
            await asyncio.gather(*pending)
# Resolve the media base URLs and the title of one video page
async def get_baseurl(url, session, headers):
    """Fetch a video page and extract its stream URLs and title.

    The JSON assigned to ``window.__playinfo__`` in the page is parsed; the
    first video entry and the first audio entry of its ``dash`` section
    supply the URLs. The title comes from the ``<h1 class="video-title">`` tag.

    :param url: address of the video page
    :param session: session object used to issue the GET request
    :param headers: request headers for the page request
    :return: tuple ``(video_url, audio_url, title)``
    """
    async with session.get(url, headers=headers) as page:
        markup = await page.text()

        playinfo_matches = re.findall('<script>window.__playinfo__=(.*?)</script>', markup, re.M)
        playinfo = json.loads(playinfo_matches[0])

        video_src = playinfo['data']['dash']['video'][0]['baseUrl']
        audio_src = playinfo['data']['dash']['audio'][0]['baseUrl']

        title_matches = re.findall('<h1 title="(.*?)" class="video-title">', markup, re.M)
        return video_src, audio_src, title_matches[0]

async def download(videourl, audiourl, session, name, headers):
    """Download the video and audio streams of one title, resuming where possible.

    The two streams are handled independently. For each of them:

    * if the final file already exists, it is skipped;
    * if both its ``.tem`` data file and its ``.cfg`` progress file exist,
      the download is resumed from the recorded progress;
    * otherwise a fresh download is started.

    All required downloads run concurrently.

    :param videourl: address of the video stream (saved as ``<name>.m4s``)
    :param audiourl: address of the audio stream (saved as ``<name>.mp3``)
    :param session: shared HTTP session passed on to the stream downloaders
    :param name: title used as the base file name
    :param headers: request headers for the media requests
    """
    video_official_filename = name + '.m4s'
    video_temp_filename = video_official_filename + '.tem'
    video_config_filename = video_official_filename + '.cfg'

    audio_official_filename = name + '.mp3'
    audio_temp_filename = audio_official_filename + '.tem'
    audio_config_filename = audio_official_filename + '.cfg'

    video_done = os.path.exists(video_official_filename)
    audio_done = os.path.exists(audio_official_filename)

    if video_done and audio_done:
        print(f'{name}整體文件已下載')
        return

    if audio_done:
        print(f'{audio_official_filename}已下載,即將下載{name}視頻文件')
    elif video_done:
        print(f'{video_official_filename}已下載,即將下載{name}音頻文件')

    # Resuming needs the progress file as well as the partial data file;
    # with only one of them present the stream is downloaded from scratch.
    video_resumable = os.path.exists(video_temp_filename) and os.path.exists(video_config_filename)
    audio_resumable = os.path.exists(audio_temp_filename) and os.path.exists(audio_config_filename)

    if not (video_done or audio_done or video_resumable or audio_resumable):
        print('未下載,啓動下載程序')

    jobs = []
    if not video_done:
        if video_resumable:
            jobs.append(get_videopart(videourl, session, headers, video_temp_filename,
                                      video_config_filename, video_official_filename))
        else:
            jobs.append(get_video(videourl, session, name, headers))
    if not audio_done:
        if audio_resumable:
            jobs.append(get_audiopart(audiourl, session, headers, audio_temp_filename,
                                      audio_config_filename, audio_official_filename))
        else:
            # The audio stream must be fetched from the audio URL, not the video URL.
            jobs.append(get_audio(audiourl, session, name, headers))

    await asyncio.gather(*jobs)

async def get_videopart(videourl, session, headers, video_temp_filename, video_config_filename,
                        video_official_filename):
    """Resume a partially downloaded video stream.

    Reads the JSON progress file, works out which part numbers are still
    missing and how many bytes are already stored, then hands the remaining
    work to ``get_file``.

    :param videourl: address of the video stream
    :param session: shared HTTP session
    :param headers: request headers for the media requests
    :param video_temp_filename: partial data file
    :param video_config_filename: JSON progress file
    :param video_official_filename: final file name
    """
    async with aiofiles.open(video_config_filename, 'r') as cfg_file:
        raw_cfg = await cfg_file.read()
    state = json.loads(raw_cfg)

    finished = state['successparts']
    done_numbers = set(entry['PartNumber'] for entry in finished)  # parts stored by earlier runs
    done_bytes = sum(entry['Size'] for entry in finished)  # bytes stored by earlier runs
    remaining = set(state['partnums']).difference(done_numbers)  # parts to fetch in this run
    total_size = state['size']
    total_parts = state['parts_count']

    await get_file(videourl, session, headers, total_parts, video_temp_filename,
                   video_config_filename, video_official_filename, total_size,
                   remaining, done_bytes)

async def get_audiopart(audiourl, session, headers, audio_temp_filename, audio_config_filename,
                        audio_official_filename):
    """Resume a partially downloaded audio stream.

    Loads the JSON progress file, derives the set of part numbers that are
    still missing plus the byte count already stored, and delegates the
    remaining work to ``get_file``.

    :param audiourl: address of the audio stream
    :param session: shared HTTP session
    :param headers: request headers for the media requests
    :param audio_temp_filename: partial data file
    :param audio_config_filename: JSON progress file
    :param audio_official_filename: final file name
    """
    async with aiofiles.open(audio_config_filename, 'r') as cfg_file:
        progress = json.loads(await cfg_file.read())

    stored = progress['successparts']
    stored_numbers = set(item['PartNumber'] for item in stored)
    stored_bytes = sum(item['Size'] for item in stored)
    missing = set(progress['partnums']).difference(stored_numbers)
    total_bytes = progress['size']
    total_parts = progress['parts_count']

    await get_file(
        url=audiourl,
        session=session,
        headers=headers,
        parts_count=total_parts,
        temp_filename=audio_temp_filename,
        config_filename=audio_config_filename,
        official_filename=audio_official_filename,
        size=total_bytes,
        parts=missing,
        succeed_parts_size=stored_bytes,
    )

async def get_file(url, session, headers, parts_count, temp_filename, config_filename, official_filename, size,
                   parts, succeed_parts_size, multipart_chunksize=2 * 1024 * 1024):
    """Download the requested parts of one file and finalise it on success.

    One task per part number is scheduled; a semaphore of 3 is passed to
    every task. A progress bar tracks the completed bytes. When no part
    failed, the temporary file is renamed to its final name and the
    progress file is removed; otherwise a failure summary is printed.

    :param url: address of the file to request
    :param session: shared HTTP session used for every request
    :param headers: request headers required for the file requests
    :param parts_count: total number of parts the whole file is split into
    :param temp_filename: name of the temporary data file
    :param config_filename: progress file recording basic information about the download
    :param official_filename: final file name
    :param size: total size of the file in bytes
    :param parts: part numbers to download in this call
    :param succeed_parts_size: number of bytes already downloaded earlier
    :param multipart_chunksize: size in bytes of each requested part
    :return: None
    """
    limiter = asyncio.Semaphore(3)
    final_index = parts_count - 1

    def span_of(index):
        # Every part covers a full chunk except the last, which ends at the final byte.
        first = index * multipart_chunksize
        last = size - 1 if index == final_index else first + multipart_chunksize - 1
        return first, last

    pending = [
        asyncio.ensure_future(
            _fetchByRange(limiter, session, url, headers, temp_filename, config_filename,
                          index, *span_of(index))
        )
        for index in parts
    ]

    failures = 0  # number of parts that failed to download
    # Progress bar showing downloaded bytes and the current speed.
    with tqdm(total=size, initial=succeed_parts_size, unit='B', unit_scale=True, unit_divisor=1024,
              desc=official_filename) as progress:
        for finished in asyncio.as_completed(pending):
            outcome = await finished
            if outcome.get('failed'):
                failures += 1
                continue
            progress.update(outcome.get('part')['Size'])

    if failures > 0:
        print(
            'Failed to download {}, failed parts: {}, successful parts: {}'.format(official_filename, failures,
                                                                                   parts_count - failures))
        return

    # Everything arrived: promote the temporary file and drop the progress file.
    os.rename(temp_filename, official_filename)
    if os.path.exists(config_filename):
        os.remove(config_filename)
    print('{} downloaded'.format(official_filename))

async def get_video(videourl, session, name, headers, multipart_chunksize=2 * 1024 * 1024):
    """Start a fresh download of the video stream into ``<name>.m4s``.

    A HEAD request supplies the total size. An empty ``.tem`` data file and
    a ``.cfg`` JSON progress file are created, then every part is handed to
    ``get_file``.

    :param videourl: address of the video stream
    :param session: shared HTTP session
    :param name: title used as the base file name
    :param headers: request headers for the media requests
    :param multipart_chunksize: size in bytes of each requested part
    :raises ValueError: if the HEAD response carries no ``Content-Length`` header
    """
    official_filename = name + '.m4s'
    temp_filename = official_filename + '.tem'
    config_filename = official_filename + '.cfg'

    # Only the headers are needed, so the HEAD response is released before downloading.
    async with session.head(videourl, headers=headers) as head_response:
        content_length = head_response.headers.get('Content-Length')
    if content_length is None:
        raise ValueError(
            'No Content-Length header in the HEAD response for {}'.format(official_filename))
    video_size = int(content_length)

    # Number of parts, rounding up so a trailing partial chunk gets its own part.
    div, mod = divmod(video_size, multipart_chunksize)
    parts_count = div if mod == 0 else div + 1

    # Create (or truncate) the temporary file that the part writers open in 'rb+' mode.
    async with aiofiles.open(temp_filename, 'wb'):
        print(f'創建了{temp_filename}')

    cfg = {
        'successparts': [],  # parts downloaded so far
        'parts_count': parts_count,  # total number of parts
        'partnums': list(range(parts_count)),  # every part index
        'size': video_size,  # total file size in bytes
    }
    with open(config_filename, 'w') as fp:  # create the progress file
        json.dump(cfg, fp)

    # Forward the chunk size so the byte ranges match the part count computed above.
    await get_file(videourl, session, headers, parts_count, temp_filename, config_filename,
                   official_filename, video_size, range(parts_count), 0,
                   multipart_chunksize=multipart_chunksize)

async def get_audio(audiourl, session, name, headers, multipart_chunksize=2 * 1024 * 1024):
    """Start a fresh download of the audio stream into ``<name>.mp3``.

    A HEAD request supplies the total size. An empty ``.tem`` data file and
    a ``.cfg`` JSON progress file are created, then every part is handed to
    ``get_file``.

    :param audiourl: address of the audio stream
    :param session: shared HTTP session
    :param name: title used as the base file name
    :param headers: request headers for the media requests
    :param multipart_chunksize: size in bytes of each requested part
    :raises ValueError: if the HEAD response carries no ``Content-Length`` header
    """
    official_filename = name + '.mp3'
    temp_filename = official_filename + '.tem'
    config_filename = official_filename + '.cfg'

    # Only the headers are needed, so the HEAD response is released before downloading.
    async with session.head(audiourl, headers=headers) as head_response:
        content_length = head_response.headers.get('Content-Length')
    if content_length is None:
        raise ValueError(
            'No Content-Length header in the HEAD response for {}'.format(official_filename))
    audio_size = int(content_length)

    # Number of parts, rounding up so a trailing partial chunk gets its own part.
    div, mod = divmod(audio_size, multipart_chunksize)
    parts_count = div if mod == 0 else div + 1

    # Create (or truncate) the temporary file that the part writers open in 'rb+' mode.
    async with aiofiles.open(temp_filename, 'wb'):
        pass

    cfg = {
        'successparts': [],  # parts downloaded so far
        'parts_count': parts_count,  # total number of parts
        'partnums': list(range(parts_count)),  # every part index
        'size': audio_size,  # total file size in bytes
    }
    with open(config_filename, 'w') as fp:  # create the progress file
        json.dump(cfg, fp)

    # Forward the chunk size so the byte ranges match the part count computed above.
    await get_file(audiourl, session, headers, parts_count, temp_filename, config_filename,
                   official_filename, audio_size, range(parts_count), 0,
                   multipart_chunksize=multipart_chunksize)


async def _fetchByRange(semaphore, session, url,headers, temp_filename, config_filename, part_number, start, stop):
    '''根據 HTTP headers 中的 Range 只下載一個塊 (rb+ 模式)
    semaphore: 限制併發的協程數
    session: aiohttp 會話
    url: 遠程目標文件的 URL 地址
    temp_filename: 臨時文件
    config_filename: 配置文件
    part_number: 塊編號(從 0 開始)
    start: 塊的起始位置
    stop: 塊的結束位置
    '''
    part_length = stop - start + 1
    range_head = {'range': 'bytes=%d-%d' % (start, stop)}
    headers.update(range_head)  #此片段的請求頭

    try:
        async with semaphore:
            async with session.get(url, headers=headers) as r:
                # 此分塊的信息
                part = {
                    # 'ETag': r.headers['ETag'],
                    # 'Last-Modified': r.headers['Last-Modified'],
                    'PartNumber': part_number,
                    'Size': part_length
                }

                async with aiofiles.open(temp_filename, 'rb+') as fp:  # 注意: 不能用 a 模式哦,那樣的話就算用 seek(0, 0) 移動指針到文件開頭後,還是會從文件末尾處追加
                    await fp.seek(start)  # 移動文件指針
                    print('[{}] File point: {}'.format(temp_filename.strip('.swp'), fp.tell()))
                    binary_content = await r.read()  # Binary Response Content: access the response body as bytes, for non-text requests
                    await fp.write(binary_content)  # 寫入已下載的字節
                    print('寫入成功')

                # 讀取原配置文件中的內容
                f = open(config_filename, 'r')
                cfg = json.load(f)
                f.close()
                # 更新配置文件,寫入此分塊的信息
                f = open(config_filename, 'w')
                cfg['successparts'].append(part)
                json.dump(cfg, f)
                f.close()

                print('[{}] Part Number {} [Range: bytes={}-{}] downloaded'.format(temp_filename.strip('.swp'), part_number, start, stop))
                return {
                    'part': part,
                    'failed': False  # 用於告知 _fetchByRange() 的調用方,此 Range 成功下載
                }
    except Exception as e:
        print('[{}] Part Number {} [Range: bytes={}-{}] download failed, the reason is that {}'.format(temp_filename.strip('.swp'), part_number, start, stop, e))
        return {
            'failed': True  # 用於告知 _fetchByRange() 的調用方,此 Range 下載失敗了
        }

if __name__ == '__main__':
    # Headers used when fetching the HTML video pages.
    headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0',
        }

    # Headers used when requesting the media files themselves.
    file_headers = {
            'accept': '*/*',
            'accept-encoding': 'identity',
            'accept-language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6,zh-HK;q=0.5',
            'origin': 'https://www.bilibili.com',
            'referer': 'https://www.bilibili.com/video/',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'cross-site',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        }
    url = 'https://www.bilibili.com/video/BV1Qz411v7Qg'
    # asyncio.run creates and closes the event loop itself; calling
    # get_event_loop() with no running loop is deprecated.
    asyncio.run(fast(url, headers, file_headers))
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章