網頁解析之xpath
一、簡介
二、語法
三、xpath練習:
爬取全書網玄幻魔法分類中的完本小說
import requests
from lxml import etree
import re
import time
from threading import Thread
from multiprocessing.pool import ThreadPool
def my_session(url, headers=None):
    """Fetch *url* in a fresh requests session and return the gbk-decoded body.

    :param url: page URL to request
    :param headers: optional dict of HTTP headers. Bug fix: the original
        accepted this parameter but never used it — it is now forwarded
        to the request.
    :return: response body as text, decoded as gbk (quanshuwang.com serves
        gbk pages that may not declare their charset)
    """
    session = requests.session()
    html = session.get(url, headers=headers)
    # Force gbk decoding regardless of what the server (mis)declares.
    html.encoding = "gbk"
    return html.text
def save_story(chapter_urls, titles, number):
    """
    Download every chapter of one book and append it to the book's text file.

    :param chapter_urls: list of chapter-page URLs for one book
    :param titles: list of book titles scraped from the index page
    :param number: index into *titles* selecting this book's filename
    :return: None
    """
    for url in chapter_urls:
        chapter_content = my_session(url)  # request each chapter page
        selector_chapter = etree.HTML(chapter_content)
        # etree.HTML returns None for an empty/unparsable body. Retry a
        # bounded number of times — the original retried forever, which
        # could hang a worker thread on a persistently bad response.
        retries = 0
        while selector_chapter is None and retries < 3:
            chapter_content = my_session(url)
            selector_chapter = etree.HTML(chapter_content)
            retries += 1
        if selector_chapter is None:
            continue  # give up on this chapter instead of spinning forever
        chapter_title = selector_chapter.xpath('//h1/strong/text()')  # chapter name
        chapter_content = selector_chapter.xpath('//div[@id="content"]/text()')  # chapter body fragments
        print(chapter_title)
        chapter_content_msg = "".join(chapter_content)  # join fragments into one string
        # Strip special characters (\xa0 is &nbsp;). NOTE(review): the
        # "\.*?d" alternative also deletes every literal 'd' — this looks
        # like a typo in the original pattern, but it is kept unchanged so
        # the output files stay byte-compatible; confirm intent before fixing.
        chapter_content_res = re.sub(r"\.*?\xa0|\.*?d", "", chapter_content_msg)
        # Collapse CRLF to CR, as in the original output format.
        chapter_content_result = chapter_content_res.replace("\r\n", "\r")
        with open(r"D:\Python學院學習環境\pachong\story\{}.txt".format(titles[number]), "a") as f:
            f.write("".join(chapter_title) + "\n")  # write the chapter name
            f.write(chapter_content_result + "\n")  # write the chapter body
if __name__ == "__main__":
pool = ThreadPool(5) # 實例一個線程池,線程數爲5
start_time = time.time()
index_url = "http://www.quanshuwang.com/all/allvisit_1_0_0_0_1_0_1.html"
html = my_session(index_url)
# print(html)
selector = etree.HTML(html)
title = selector.xpath('//div[@class="yd-book-item yd-book-item-pull-left"]/a/h2/text()') # 獲取每本書的標題
author = selector.xpath('//div[@class="yd-book-item yd-book-item-pull-left"]/div[@class="author-container"]/dl/dd/p/text()') # 獲取每本書的作者
story_url = selector.xpath('//div[@class="yd-book-item yd-book-item-pull-left"]/a/@href') # 每本書的url
print(title)
t_list = []
# 循環下載
for i in range(5, 10):
html = my_session(story_url[i]) # 對每本書的url發起請求
selector_url = etree.HTML(html) # 每發起一次請求,都要實例一個選擇器對象
synopsis = selector_url.xpath('//div[@id="waa"]/text()') # 通過選擇器對象匹配該書的簡介
read_url = selector_url.xpath('//div[@class="detail"]/a/@href') # 通過選擇器對象匹配該書的閱讀url
html_content = my_session(read_url[0]) # 對該書發起閱讀請求
selector_content_url = etree.HTML(html_content)
chapter_url = selector_content_url.xpath('//div[@class="clearfix dirconone"]/li/a/@href') # 獲取該書每個章節的url
# print(synopsis)
# print(html_content)
print(chapter_url, len(chapter_url))
synopsis_str = "".join(synopsis) # 將列表拼接成字符串
print(synopsis_str)
synopsis_res = synopsis_str.replace("\xa0\xa0\xa0\xa0", " ") # 替換特殊字符
# 初始化文件
with open(r"D:\Python學院學習環境\pachong\story\{}.txt".format(title[i]), "w")as file: # 新建文本文件
file.write(author[i]+"\n")
file.write(synopsis_res) # 寫入小說簡介
# 不使用多線程
# save_story(chapter_url)
# 使用多線程
# th = Thread(target=save_story, args=(chapter_url, title, i))
# t_list.append(th)
# for t in t_list:
# t.start()
# for t in t_list:
# t.join()
pool.apply_async(save_story, args=(chapter_url, title, i))
pool.close()
pool.join()
print("耗時:{}".format(time.time()-start_time))
"不使用線程耗時:514.7513475418091"
"一個for循環啓動線程(相當於單線程,多個線程順序執行)耗時:526.5542323589325"
"雙線程耗時:291.0101172924042"
效果圖展示