1. Related URLs and Libraries
Reference article: "Manga Download: Dynamic Loading and Anti-Scraping Are No Big Deal!"
Target website
https://www.dmzj.com/
Required libraries
requests, beautifulsoup4 (imported as bs4), lxml (the parser handed to BeautifulSoup below), tqdm
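These are all third-party packages; assuming a Python 3 environment with pip available, one way to install them is:

pip install requests beautifulsoup4 lxml tqdm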
2. Code Implementation
import os
import re
import time
from contextlib import closing

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


class MangaSpider(object):
    def __init__(self):
        # Save directory, named after the series (妖神記)
        self.save_dir = '妖神記'
        if not os.path.exists(self.save_dir):
            os.mkdir(self.save_dir)
        self.target_url = 'https://www.dmzj.com/info/yaoshenji.html'
        self.chapter_list = []

    # 1. Send a request and return the decoded page source
    def send_request(self, url):
        response = requests.get(url)
        data = response.content.decode('utf-8')
        return data

    # 2. Parse the chapter list from the series page
    def parse_list_data(self, data):
        bs = BeautifulSoup(data, 'lxml')
        list_con_li = bs.find('ul', attrs={'class': 'list_con_li'})
        manga_list = list_con_li.find_all('a')
        for manga in manga_list:
            chapter_dict = {}
            chapter_dict['chapter'] = manga.text
            chapter_dict['url'] = manga.get('href')
            self.chapter_list.append(chapter_dict)

    # Extract the image URLs hidden in the chapter page's <script> block
    def parse_pic_data(self, pics_data):
        bs_pics = BeautifulSoup(pics_data, 'lxml')
        script_info = bs_pics.script  # the image list is embedded in a <script> tag
        # Each image is keyed by a 13- or 14-digit millisecond timestamp
        pics = re.findall(r'\d{13,14}', str(script_info))
        # Sort chronologically: 13-digit values are padded to 14 digits inside
        # the sort key only, so the original strings are never altered
        pics = sorted(pics, key=lambda x: int(x) if len(x) == 14 else int(x + '0'))
        # The two directory segments of the image URL are delimited by '|'
        chapterpic_hou = re.findall(r'\|(\d{5})\|', str(script_info))[0]
        chapterpic_qian = re.findall(r'\|(\d{4})\|', str(script_info))[0]
        pics_url_sorted = []
        for pic in pics:
            url = ('https://images.dmzj.com/img/chapterpic/' + chapterpic_qian
                   + '/' + chapterpic_hou + '/' + pic + '.jpg')
            pics_url_sorted.append(url)
        return pics_url_sorted

    # 3. Download all images of one chapter
    def download_pics(self, data):
        # The image server rejects requests whose Referer is not a dmzj page
        download_header = {
            'Referer': data['url']
        }
        # str.replace already removes every '.', turning the chapter name
        # into a safe directory name in one call
        name = data['chapter'].replace('.', '')
        chapter_save_dir = os.path.join(self.save_dir, name)
        if not os.path.exists(chapter_save_dir):
            os.mkdir(chapter_save_dir)
        pics_data = self.send_request(url=data['url'])
        pics_url = self.parse_pic_data(pics_data)
        # Download and save the images
        for idx, pic_url in enumerate(pics_url):
            pic_name = '%03d.jpg' % (idx + 1)
            pic_save_path = os.path.join(chapter_save_dir, pic_name)
            with closing(requests.get(pic_url, headers=download_header, stream=True)) as response:
                chunk_size = 1024
                content_size = int(response.headers['content-length'])  # total bytes, available for a per-image progress bar
                if response.status_code == 200:
                    with open(pic_save_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=chunk_size):
                            f.write(chunk)
                else:
                    print('Bad link:', pic_url)
        # Pause between chapters so the requests do not hammer the server
        time.sleep(5)

    def run(self):
        data = self.send_request(url=self.target_url)
        self.parse_list_data(data)
        for chapter in tqdm(self.chapter_list):
            self.download_pics(chapter)


if __name__ == '__main__':
    MangaSpider().run()
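To see why parse_pic_data pads 13-digit timestamps inside the sort key, here is a minimal, self-contained sketch with made-up values (the script payload below is hypothetical, not real dmzj data): without padding, every 13-digit key compares as a smaller integer than every 14-digit key, regardless of chronology.

import re

# Hypothetical excerpt of a chapter page's <script> payload (values made up)
script_info = '"15417951390993","1541795139110","15417951392218"'
pics = re.findall(r'\d{13,14}', script_info)

# Naive integer sort: the 13-digit key jumps to the front
print(sorted(pics, key=int))
# ['1541795139110', '15417951390993', '15417951392218']

# Padded sort key, as used in parse_pic_data: chronological order
print(sorted(pics, key=lambda x: int(x) if len(x) == 14 else int(x + '0')))
# ['15417951390993', '1541795139110', '15417951392218']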
3. Summary
- Defeating the site's Referer-based anti-scraping check (see the sketch below)
- Parsing dynamically loaded data embedded in the page's <script> block
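As a concrete illustration of the Referer trick, the sketch below reproduces what download_pics sends; both URLs are hypothetical placeholders, for illustration only:

import requests

# Hypothetical chapter page and image URL (placeholders)
chapter_url = 'https://www.dmzj.com/view/yaoshenji/41917.html'
img_url = 'https://images.dmzj.com/img/chapterpic/4969/14237/1541795139000.jpg'

# Without a Referer header the image server typically refuses the request;
# sending the chapter page as Referer makes it look like an in-site page view
resp = requests.get(img_url, headers={'Referer': chapter_url}, stream=True)
print(resp.status_code)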