最近發現一個手機在線看小說的網站,裏面沒有任何廣告,先上鏈接地址:https://m.lread.net
然後在線看起來也是挺累,還是下載txt用掌閱書城app看單機小說有趣,想辦法爬下來。
爬個小說本來就不是什麼難事,現在的問題是這個網站的小說頁面做了分段處理,一個網頁是沒法拿到整篇小說的源碼的。人家把一篇小說分成多個html頁面儲存了,而且你還得不到這篇小說的所有html頁面,主頁面上還沒有提示。
不過,經過一通html分析,寫了幾段正則表達式,還是成功把小說爬了下來,就是單線程,非分佈式,速度很慢,將就用吧
代碼發上來以供參考,重點是網頁結構分析,分析懂了才知道怎麼爬。
#!/usr/bin/env python3
# _*_ coding: utf-8 _*_
# File : 爬小說.py
# Author: DaShenHan&道長-----先苦後甜,任憑晚風拂柳顏------
# Date : 2019/11/13
import requests
import re
def getCode(url, timeout=10):  # fetch the HTML source of a page
    """Download ``url`` with browser-like headers and return its HTML text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run forever.

    Returns:
        The decoded response body when the server answers with status 200,
        otherwise ``False``.

    Raises:
        requests.RequestException: On connection errors or when ``timeout``
            expires (not swallowed, so the caller can see the cause).
    """
    headers = {
        "Accept": "*/*",
        # "br" is intentionally not advertised: requests only decodes Brotli
        # when an optional Brotli package is installed, otherwise the body
        # would come back as undecoded bytes.
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        # NOTE(review): the cookie names below (tracknick, _tb_token_, ali_ab)
        # look like a session copied from a different site -- confirm the
        # target site needs them, otherwise this header should be dropped.
        "Cookie": "l=AurqcPuigwQdnQv7WvAfCoR1OlrRQW7h; isg=BHp6mNB79CHqYXpVEiRteXyyyKNcg8YEwjgLqoRvCI3ddxqxbLtOFUBGwwOrZ3ad; thw=cn; cna=VsJQERAypn0CATrXFEIahcz8; t=0eed37629fe7ef5ec0b8ecb6cd3a3577; tracknick=tb830309_22; _cc_=UtASsssmfA%3D%3D; tg=0; ubn=p; ucn=unzbyun; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; miid=981798063989731689; hng=CN%7Czh-CN%7CCNY%7C156; um=0712F33290AB8A6D01951C8161A2DF2CDC7C5278664EE3E02F8F6195B27229B88A7470FD7B89F7FACD43AD3E795C914CC2A8BEB1FA88729A3A74257D8EE4FBBC; enc=1UeyOeN0l7Fkx0yPu7l6BuiPkT%2BdSxE0EqUM26jcSMdi1LtYaZbjQCMj5dKU3P0qfGwJn8QqYXc6oJugH%2FhFRA%3D%3D; ali_ab=58.215.20.66.1516409089271.6; mt=ci%3D-1_1; cookie2=104f8fc9c13eb24c296768a50cabdd6e; _tb_token_=ee7e1e1e7dbe7; v=0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    if resp.status_code != requests.codes.ok:
        return False
    # Use the encoding detected from the body rather than the header default.
    resp.encoding = resp.apparent_encoding
    return resp.text
# A chapter entry on the index page looks like: <p><a href='/path'>title</a></p>
_CHAPTER_LINK_RE = re.compile(r"<p><a href='(.*?)'>(.*?)</a></p>")


def getNovelList(page_source, reurl="https://m.lread.net"):
    """Extract the chapter list from the HTML of a novel's index page.

    Args:
        page_source: HTML text of the index page. A falsy value (empty string,
            or the ``False`` a failed download yields) is treated as a page
            with no chapters.
        reurl: Prefix put in front of every ``href`` found on the page.

    Returns:
        A list of ``(title, url)`` tuples in page order; empty when nothing
        matches.
    """
    if not page_source:
        return []
    return [
        (title, reurl + href)
        for href, title in _CHAPTER_LINK_RE.findall(page_source)
    ]
# Body text of one page; DOTALL lets the paragraph span several source lines.
_NOVEL_CONTENT_RE = re.compile(
    r'<div id="novelcontent" class="novelcontent"><p>(.*?)</p>', re.S
)


def getOneNovel(pagecode):
    """Return the plain text of the novel fragment contained in one page.

    Args:
        pagecode: HTML text of a single content page. A falsy value (empty
            string or ``False`` from a failed download) yields ``""``.

    Returns:
        The text of the ``novelcontent`` paragraph with line breaks, tabs and
        plain spaces removed, non-breaking spaces turned into ordinary spaces
        and every ``<br/><br/>`` pair turned into a newline. ``""`` when the
        page has no such paragraph.
    """
    if not pagecode:
        return ""
    found = _NOVEL_CONTENT_RE.search(pagecode)
    if found is None:
        return ""
    text = found.group(1)
    # Layout whitespace from the HTML source carries no meaning in the novel.
    for layout_char in ("\n", "\t", " "):
        text = text.replace(layout_char, "")
    # NOTE(review): the original replaced a blank with a blank here, which is
    # a no-op after the step above; presumably the literal was an "&nbsp;"
    # entity that got decoded when the code was pasted -- confirm.
    for nbsp in ("&nbsp;", "\xa0"):
        text = text.replace(nbsp, " ")
    return text.replace("<br/><br/>", "\n")
# Chapter heading ends with a page counter such as "(1/3)"; capture the total.
# The parentheses are literal characters (ASCII or full-width), not groups.
_PAGE_COUNT_RE = re.compile(
    r'<h1 id="chaptertitle">[^<]*[((]1/(\d+)[))]\s*</h1>'
)


def oneCapter(url):
    """Download every page of one chapter and return the joined text.

    A chapter stored at ``xxx.html`` is split over ``xxx-1.html`` ..
    ``xxx-N.html``; the total ``N`` is read from the heading of the first page.

    Args:
        url: Chapter address ending in ``.html``.

    Returns:
        The text of all pages concatenated in order. Pages that cannot be
        fetched contribute nothing; ``""`` when the first page is unavailable.
        When the heading carries no page counter the chapter is treated as a
        single page.
    """
    first_page = getCode(url.replace(".html", "-1.html"))
    if not first_page:
        return ""
    counter = _PAGE_COUNT_RE.search(first_page)
    page = int(counter.group(1)) if counter else 1
    # Page 1 is already in hand, so parse it instead of downloading it again.
    parts = [getOneNovel(first_page)]
    for i in range(2, page + 1):
        page_source = getCode(url.replace(".html", f"-{i}.html"))
        if page_source:
            parts.append(getOneNovel(page_source))
    return "".join(parts)
def download_novel(url="https://m.lread.net/read/208/", name="修仙狂少.txt"):
    """Download a whole novel chapter by chapter into a UTF-8 text file.

    Args:
        url: Address of the novel's index page (the chapter list).
        name: Path of the text file to create or overwrite.

    Raises:
        RuntimeError: If the index page cannot be fetched; nothing is written
            in that case.
    """
    page_source = getCode(url=url)
    if not page_source:
        raise RuntimeError(f"could not fetch the chapter index: {url}")
    novelList = getNovelList(page_source)
    print(novelList)
    total = len(novelList)
    # The context manager closes the file even if a chapter download raises,
    # so everything written so far reaches the disk.
    with open(name, encoding="utf-8", mode="w") as f:
        for index, (title, page_url) in enumerate(novelList, start=1):
            all_content = title + "\n\n" + oneCapter(page_url) + "\n\n"
            # "\r" redraws the progress on a single console line.
            print(f"\r正在寫入: {title} {index}/{total}", end="")
            f.write(all_content)
if __name__ == '__main__':
    # Run the download with the default novel only when executed as a script.
    download_novel()