最近需要學點東西,把b站上的視頻下載到本地觀看,有不懂得掃我頭像加我微信問我,私信一般不看
import json
import os
from functools import partial
from tqdm import tqdm
import aiofiles
from aiohttp import ClientSession
import re
import asyncio
# 判斷是多集還是單集
async def fast(url, headers, file_headers):
async with ClientSession() as session:
async with session.get(url, headers=headers) as response:
html = await response.text()
p = '視頻選集'
tasks = []
if re.search(p, html):
parten = 'part'
result_parten = re.findall(parten, html)
for num in range(1, len(result_parten) +1):
every_pageurl = url + '?p=' + str(num)
videouUrl, audioUrl, name = await get_baseurl(every_pageurl, session, headers)
task = asyncio.ensure_future( download(videouUrl, audioUrl, session, name, headers = file_headers))
tasks.append(task)
await asyncio.gather(*tasks)
else:
videouUrl, audioUrl, name = await get_baseurl(url, session, headers)
await download(videouUrl, audioUrl, session, name, headers = file_headers)
# 獲取資源baseurl
async def get_baseurl(url, session, headers):
async with session.get(url, headers = headers) as response:
html = await response.text()
urlData = json.loads(re.findall('<script>window.__playinfo__=(.*?)</script>', html, re.M)[0])
videoUrl = urlData['data']['dash']['video'][0]['baseUrl']
audioUrl = urlData['data']['dash']['audio'][0]['baseUrl']
name = re.findall('<h1 title="(.*?)" class="video-title">', html, re.M)[0]
return videoUrl, audioUrl, name
async def download(videourl, audiourl, session, name, headers):
video_official_filename = name + '.m4s'
video_temp_filename = video_official_filename + '.tem'
video_config_filename = video_official_filename + '.cfg'
audio_official_filename = name + '.mp3'
audio_temp_filename = audio_official_filename + '.tem'
audio_config_filename = audio_official_filename + '.cfg'
if os.path.exists(video_official_filename) and os.path.exists(audio_official_filename):
print(f'{name}整體文件已下載')
elif not os.path.exists(video_official_filename) and os.path.exists(audio_official_filename):
print(f'{audio_official_filename}' + f'已下載,即將下載{name}視頻文件')
if os.path.exists(video_temp_filename):
await get_videopart(videourl, session, headers, video_temp_filename, video_config_filename,
video_official_filename)
else:
await get_video( videourl, session, name, headers)
elif not os.path.exists(audio_official_filename) and os.path.exists(video_official_filename):
print(f'{video_official_filename}' + f'已下載,即將下載{name}音頻文件')
if os.path.exists(audio_temp_filename):
await get_audiopart(audiourl, session, headers, audio_temp_filename, audio_config_filename,
audio_official_filename)
else:
await get_audio(videourl, session, name, headers)
elif (not os.path.exists(audio_temp_filename)) and (not os.path.exists(video_temp_filename)):
print('未下載,啓動下載程序')
print(headers)
# await get_video( videourl, session, name, headers)
#
# await get_audio(audiourl, session, name, headers)
await asyncio.gather(get_video( videourl, session, name, headers),get_audio(audiourl, session, name, headers))
elif os.path.exists(video_temp_filename) and os.path.exists(audio_temp_filename):
await asyncio.gather(get_videopart(videourl, session, headers, video_temp_filename, video_config_filename,
video_official_filename), get_audiopart(audiourl, session, headers, audio_temp_filename, audio_config_filename,
audio_official_filename))
async def get_videopart(videourl, session, headers, video_temp_filename, video_config_filename,
video_official_filename):
async with aiofiles.open(video_config_filename, 'r') as fp:
all_fp = await fp.read()
cfg = json.loads(all_fp)
succeed_parts = {part['PartNumber'] for part in cfg['successparts']} # 之前已下載好的分塊號集合
succeed_parts_size = sum([part['Size'] for part in cfg['successparts']]) # 已下載的塊的總大小
parts = set(cfg['partnums']) - succeed_parts # 本次需要下載的分塊號集合
video_size = cfg['size']
parts_count = cfg['parts_count']
await get_file(videourl, session, headers, parts_count, video_temp_filename, video_config_filename,
video_official_filename, video_size, parts, succeed_parts_size)
async def get_audiopart(audiourl, session, headers, audio_temp_filename, audio_config_filename,
audio_official_filename):
async with aiofiles.open(audio_config_filename, 'r') as f:
all_f = await f.read()
audio_cfg = json.loads(all_f)
audio_succeed_parts = {audio_part['PartNumber'] for audio_part in audio_cfg['successparts']}
audio_succeed_parts_size = sum([audio_part['Size'] for audio_part in audio_cfg['successparts']])
audio_parts = set(audio_cfg['partnums']) - audio_succeed_parts
audio_audio_size = audio_cfg['size']
audio_parts_count = audio_cfg['parts_count']
await get_file(audiourl, session, headers, audio_parts_count, audio_temp_filename, audio_config_filename,
audio_official_filename, audio_audio_size, audio_parts, audio_succeed_parts_size)
async def get_file(url, session, headers,parts_count,temp_filename,config_filename,official_filename,size,
parts,succeed_parts_size, multipart_chunksize = 2*1024*1024):
'''
:param url: 請求文件所需的地址
:param session: 請求所需的共同session
:param headers: 請求文件所必須的請求頭
:param parts_count: 整體文件共分爲多少份
:param temp_filename: 緩存文件名
:param config_filename: 文件配置,記錄所請求的文件的基本信息
:param official_filename: 正式文件名
:param size: 文件具體大小
:param parts: 本次請求的所有文件塊信息
:param succeed_parts_size: 已請求成功的文件塊信息
:param multipart_chunksize: 每次請求文件塊的大小
:return:
'''
sem = asyncio.Semaphore(3)
_fetchByRange_partial = partial(_fetchByRange, sem, session, url,headers, temp_filename, config_filename)
to_do = [] # 保存所有任務的列表
for part_number in parts:
if part_number != parts_count - 1:
start = part_number * multipart_chunksize
stop = (part_number + 1) * multipart_chunksize - 1
else:
start = part_number * multipart_chunksize
stop = size - 1
task = asyncio.ensure_future(_fetchByRange_partial(part_number, start, stop))
to_do.append(task)
to_do_iter = asyncio.as_completed(to_do)
# to_do_iter = await asyncio.gather(*to_do)
failed_parts = 0 # 下載失敗的分塊數目
with tqdm(total=size, initial=succeed_parts_size, unit='B', unit_scale=True, unit_divisor=1024,
desc=official_filename) as bar: # 打印下載時的進度條,並動態顯示下載速度
for future in to_do_iter:
result = await future
# result = future
if result.get('failed'):
failed_parts += 1
else:
bar.update(result.get('part')['Size'])
if failed_parts > 0:
print(
'Failed to download {}, failed parts: {}, successful parts: {}'.format(official_filename, failed_parts,
parts_count - failed_parts))
else:
# pass
# 整個文件內容被成功下載後,將臨時文件名修改回正式文件名、刪除配置文件
os.rename(temp_filename, official_filename)
if os.path.exists(config_filename):
os.remove(config_filename)
print('{} downloaded'.format(official_filename))
async def get_video( videourl, session, name, headers, multipart_chunksize=2 * 1024 * 1024):
async with session.head(videourl, headers=headers) as audio_response:
official_filename = name + '.m4s'
temp_filename = official_filename + '.tem'
config_filename = official_filename + '.cfg'
video_size = int(audio_response.headers.get('Content-Length'))
# 獲取文件的總塊數
div, mod = divmod(video_size, multipart_chunksize)
parts_count = div if mod == 0 else div + 1 # 計算出多少個分塊
succeed_parts_size = 0
parts = range(parts_count)
set_parts = list(parts)
async with aiofiles.open(temp_filename, 'wb') as fp:
print(f'創建了{temp_filename}')
with open(config_filename, 'w') as fp: # 創建配置文件
cfg = {
'successparts': [], # 已請求成功的文件塊信息
'parts_count': parts_count, # 總文件塊數
'partnums': set_parts, # 總文件塊索引的集合
'size': video_size # 文件總大小
}
json.dump(cfg, fp)
await get_file(videourl, session, headers, parts_count, temp_filename, config_filename, official_filename, video_size,
parts, succeed_parts_size)
async def get_audio( audiourl, session, name, headers, multipart_chunksize=2 * 1024 * 1024):
async with session.head(audiourl, headers = headers) as audio_response:
official_filename = name + '.mp3'
temp_filename = official_filename + '.tem'
config_filename = official_filename + '.cfg'
audio_size = int(audio_response.headers.get('Content-Length'))
# 獲取文件的總塊數
div, mod = divmod(audio_size, multipart_chunksize)
parts_count = div if mod == 0 else div + 1 # 計算出多少個分塊
succeed_parts_size = 0
parts = range(parts_count)
set_parts = list(parts)
async with aiofiles.open(temp_filename, 'wb') as fp:
pass
with open(config_filename, 'w') as fp: # 創建配置文件
cfg = {
'successparts': [], # 已請求成功的文件塊信息
'parts_count': parts_count, # 總文件塊數
'partnums': set_parts, # 總文件塊索引的集合
'size': audio_size # 文件總大小
}
json.dump(cfg, fp)
await get_file(audiourl, session, headers, parts_count, temp_filename, config_filename, official_filename, audio_size,
parts, succeed_parts_size)
async def _fetchByRange(semaphore, session, url,headers, temp_filename, config_filename, part_number, start, stop):
'''根據 HTTP headers 中的 Range 只下載一個塊 (rb+ 模式)
semaphore: 限制併發的協程數
session: aiohttp 會話
url: 遠程目標文件的 URL 地址
temp_filename: 臨時文件
config_filename: 配置文件
part_number: 塊編號(從 0 開始)
start: 塊的起始位置
stop: 塊的結束位置
'''
part_length = stop - start + 1
range_head = {'range': 'bytes=%d-%d' % (start, stop)}
headers.update(range_head) #此片段的請求頭
try:
async with semaphore:
async with session.get(url, headers=headers) as r:
# 此分塊的信息
part = {
# 'ETag': r.headers['ETag'],
# 'Last-Modified': r.headers['Last-Modified'],
'PartNumber': part_number,
'Size': part_length
}
async with aiofiles.open(temp_filename, 'rb+') as fp: # 注意: 不能用 a 模式哦,那樣的話就算用 seek(0, 0) 移動指針到文件開頭後,還是會從文件末尾處追加
await fp.seek(start) # 移動文件指針
print('[{}] File point: {}'.format(temp_filename.strip('.swp'), fp.tell()))
binary_content = await r.read() # Binary Response Content: access the response body as bytes, for non-text requests
await fp.write(binary_content) # 寫入已下載的字節
print('寫入成功')
# 讀取原配置文件中的內容
f = open(config_filename, 'r')
cfg = json.load(f)
f.close()
# 更新配置文件,寫入此分塊的信息
f = open(config_filename, 'w')
cfg['successparts'].append(part)
json.dump(cfg, f)
f.close()
print('[{}] Part Number {} [Range: bytes={}-{}] downloaded'.format(temp_filename.strip('.swp'), part_number, start, stop))
return {
'part': part,
'failed': False # 用於告知 _fetchByRange() 的調用方,此 Range 成功下載
}
except Exception as e:
print('[{}] Part Number {} [Range: bytes={}-{}] download failed, the reason is that {}'.format(temp_filename.strip('.swp'), part_number, start, stop, e))
return {
'failed': True # 用於告知 _fetchByRange() 的調用方,此 Range 下載失敗了
}
if __name__ == '__main__':
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0',
}
file_headers = {
'accept': '*/*',
'accept-encoding': 'identity',
'accept-language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6,zh-HK;q=0.5',
'origin': 'https://www.bilibili.com',
'referer': 'https://www.bilibili.com/video/',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'cross-site',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
}
url = 'https://www.bilibili.com/video/BV1Qz411v7Qg'
loop = asyncio.get_event_loop()
loop.run_until_complete(fast(url, headers, file_headers))