網頁解析之xpath
一、簡介
二、語法
三、xpath練習:
爬取全書網玄幻魔法分類中的完本小說
import requests
from lxml import etree
import re
import time
from threading import Thread
from multiprocessing.pool import ThreadPool
def my_session(url, headers=None):
    """Fetch *url* in a fresh requests session and return the gbk-decoded body.

    :param url: page URL to request
    :param headers: optional dict of HTTP headers. Bug fix: the original
        accepted this parameter but never used it — it is now forwarded
        to the request.
    :return: response body as text, decoded as gbk (quanshuwang.com serves
        gbk pages that may not declare their charset)
    """
    session = requests.session()
    html = session.get(url, headers=headers)
    # Force gbk decoding regardless of what the server (mis)declares.
    html.encoding = "gbk"
    return html.text
def save_story(chapter_urls, titles, number):
    """
    Download every chapter of one book and append it to the book's text file.

    :param chapter_urls: list of chapter-page URLs for one book
    :param titles: list of book titles scraped from the index page
    :param number: index into *titles* selecting this book's filename
    :return: None
    """
    for url in chapter_urls:
        chapter_content = my_session(url)  # request each chapter page
        selector_chapter = etree.HTML(chapter_content)
        # etree.HTML returns None for an empty/unparsable body. Retry a
        # bounded number of times — the original retried forever, which
        # could hang a worker thread on a persistently bad response.
        retries = 0
        while selector_chapter is None and retries < 3:
            chapter_content = my_session(url)
            selector_chapter = etree.HTML(chapter_content)
            retries += 1
        if selector_chapter is None:
            continue  # give up on this chapter instead of spinning forever
        chapter_title = selector_chapter.xpath('//h1/strong/text()')  # chapter name
        chapter_content = selector_chapter.xpath('//div[@id="content"]/text()')  # chapter body fragments
        print(chapter_title)
        chapter_content_msg = "".join(chapter_content)  # join fragments into one string
        # Strip special characters (\xa0 is &nbsp;). NOTE(review): the
        # "\.*?d" alternative also deletes every literal 'd' — this looks
        # like a typo in the original pattern, but it is kept unchanged so
        # the output files stay byte-compatible; confirm intent before fixing.
        chapter_content_res = re.sub(r"\.*?\xa0|\.*?d", "", chapter_content_msg)
        # Collapse CRLF to CR, as in the original output format.
        chapter_content_result = chapter_content_res.replace("\r\n", "\r")
        with open(r"D:\Python學院學習環境\pachong\story\{}.txt".format(titles[number]), "a") as f:
            f.write("".join(chapter_title) + "\n")  # write the chapter name
            f.write(chapter_content_result + "\n")  # write the chapter body
if __name__ == "__main__":
pool = ThreadPool(5) # 實例一個線程池,線程數爲5
start_time = time.time()
index_url = "http://www.quanshuwang.com/all/allvisit_1_0_0_0_1_0_1.html"
html = my_session(index_url)
# print(html)
selector = etree.HTML(html)
title = selector.xpath('//div[@class="yd-book-item yd-book-item-pull-left"]/a/h2/text()') # 獲取每本書的標題
author = selector.xpath('//div[@class="yd-book-item yd-book-item-pull-left"]/div[@class="author-container"]/dl/dd/p/text()') # 獲取每本書的作者
story_url = selector.xpath('//div[@class="yd-book-item yd-book-item-pull-left"]/a/@href') # 每本書的url
print(title)
t_list = []
# 循環下載
for i in range(5, 10):
html = my_session(story_url[i]) # 對每本書的url發起請求
selector_url = etree.HTML(html) # 每發起一次請求,都要實例一個選擇器對象
synopsis = selector_url.xpath('//div[@id="waa"]/text()') # 通過選擇器對象匹配該書的簡介
read_url = selector_url.xpath('//div[@class="detail"]/a/@href') # 通過選擇器對象匹配該書的閱讀url
html_content = my_session(read_url[0]) # 對該書發起閱讀請求
selector_content_url = etree.HTML(html_content)
chapter_url = selector_content_url.xpath('//div[@class="clearfix dirconone"]/li/a/@href') # 獲取該書每個章節的url
# print(synopsis)
# print(html_content)
print(chapter_url, len(chapter_url))
synopsis_str = "".join(synopsis) # 將列表拼接成字符串
print(synopsis_str)
synopsis_res = synopsis_str.replace("\xa0\xa0\xa0\xa0", " ") # 替換特殊字符
# 初始化文件
with open(r"D:\Python學院學習環境\pachong\story\{}.txt".format(title[i]), "w")as file: # 新建文本文件
file.write(author[i]+"\n")
file.write(synopsis_res) # 寫入小說簡介
# 不使用多線程
# save_story(chapter_url)
# 使用多線程
# th = Thread(target=save_story, args=(chapter_url, title, i))
# t_list.append(th)
# for t in t_list:
# t.start()
# for t in t_list:
# t.join()
pool.apply_async(save_story, args=(chapter_url, title, i))
pool.close()
pool.join()
print("耗時:{}".format(time.time()-start_time))
"不使用線程耗時:514.7513475418091"
"一個for循環啓動線程(相當於單線程,多個線程順序執行)耗時:526.5542323589325"
"雙線程耗時:291.0101172924042"
效果圖展示