Multi-threaded novel scraping in Python

I've been writing JS scripts these past few days and suddenly felt like writing a script to scrape novels. After some stumbling I gradually put together a fairly complete script, and reinforced what I've been learning along the way.

1. Environment

  • Python version: Python 3.7.3
  • Editor: VS Code
  • Python extension: ms-python.python
  • Operating system: macOS

settings.json configuration:

{
  "python.pythonPath": "/usr/local/bin/python3",
  "python.formatting.provider": "black"
}

launch.json配置:

{
  // Use IntelliSense to learn about possible attributes.
  // Hover to view descriptions of existing attributes.
  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
  "version": "0.2.0",
  "configurations": [
    {
      "name": "Python: file",
      "type": "python",
      "request": "launch",
      "program": "${file}",
      "args": ["-g", "5", "-b"],
      "console": "integratedTerminal"
    }
  ]
}

2. Installing the Python dependencies

# BeautifulSoup 4 and the lxml parser
pip3 install bs4
pip3 install lxml

# network requests
pip3 install requests

# Note: json, fnmatch, hashlib, and threading are all part of the Python
# standard library, so they do not need to be installed with pip.

3. Custom library - common

Directory structure:

- common
  - __init__.py
  - util.py # utility class
  - thread_manager.py # thread manager

3.1. util.py utility class

Method overview:

class Util # utility class
  def write_file(self, filename, data, mode="w+") # write data to a file
  def append_file(self, filename, data) # append data to a file
  def read_file(self, filename, mode="r") # read data from a file
  def soup(self, url) # fetch and parse a web page
  def list_files(self, path, fnexp) # search for files
  def now(self)  # current time, in milliseconds
  def now_s(self)  # current time, in seconds
  def recode_begin(self, group="default") # record a start time
  def recode_end(self, group="default") # print the time elapsed since the matching recode_begin
  def time_format(self, timestamp, pattern="%Y-%m-%d %H:%M:%S") # format a timestamp
  def md5(self, content) # MD5 hash of a string
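
For a quick feel of the API, here is a minimal usage sketch of cm_util (the URL, selector, and file name are placeholders, not from the original script):

from common import cm_util

# fetch a page and save its <h1> text (placeholder URL and selector)
doc = cm_util.soup("https://example.com/book/1/")
titles = doc.select("h1")
if len(titles) > 0:
    cm_util.write_file("title.txt", titles[0].text)

# timing helpers: prints the elapsed seconds for group "demo"
cm_util.recode_begin("demo")
print(cm_util.time_format(cm_util.now_s()))
cm_util.recode_end("demo")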

3.2. thread_manager.py thread manager class

Method overview:

class ThreadManager:
    def __init__(self, max_size=10) # initialize; max_size is the queue capacity
    def exit(self) # tell the threads to exit
    def lock(self) # acquire the lock
    def unlock(self) # release the lock
    def put_data(self, data) # add data to the queue
    def put_thread(self, thread) # add and start a thread
    def put_cbk_thread(self, thread_name, cbk, repeat=1) # add a callback to be run on its own thread
    def join(self) # wait for all threads to finish
    def wait(self) # wait until all data has been consumed
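
A minimal usage sketch (the worker body is purely illustrative):

from common import ThreadManager

def worker(data, thread_id, thread_name):
    # illustrative worker: just print each queued item
    print("%s handled %s" % (thread_name, data))

manager = ThreadManager(5)
manager.put_data([1, 2, 3, 4, 5])
manager.put_cbk_thread("demo", worker, repeat=2)  # two threads share the queue
manager.wait()  # block until the queue is drained
manager.exit()  # tell the worker threads to stop
manager.join()  # wait for them to finish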
   

3.3. __init__.py file

# coding=utf-8

__version__ = "1.0.0"
__all__ = ["cm_util", "ThreadManager"]

__author__ = "yszm"

from .util import *
from .thread_manager import *

4. Scraping a novel

Scraping a novel breaks down into three parts: the title, the table of contents, and the chapter content.

All three parts are scraped in much the same way: select the relevant elements with a selector, filter out the unneeded ones, read the matching attributes and text, and then clean the text up with indentation fixes, replacements, and so on.

Here we take the novel 《第一序列》 on 67書吧 as an example, at: https://www.67shu.com/111/111473

import time
import json
import sys
import os

if __name__ == "__main__":
    from common import *
else:
    from .common import *


URL1 = "https://www.67shu.com/111/111473/"
URL2 = "https://www.67shu.com/40/40190/"

URL = "story/result/{}.txt"


def get_cpt(url):
    doc = cm_util.soup(url)

    data = {"name": "unknow"}
    # 獲取標題
    h1 = doc.select(".book_info .xiaoshuo h1")
    if len(h1) > 0:
        data["name"] = h1[0].contents[0]

    # get all chapter links
    links = doc.select(".novel_list .book_article_listtext dd a")
    cp_arr = []
    for item in links:
        cp_arr.append(
            {"url": (url + "{}").format(item.get("href")), "name": item.string}
        )
    data["cp"] = cp_arr
    return data


def get_content(data):
    dest_file = URL.format(data["name"])
    cm_util.write_file(dest_file, "")

    for item in data["cp"]:
        doc = cm_util.soup(item["url"])
        con = doc.select(".yuedu_page .yuedu_zhengwen")
        if len(con) > 0:
            c = con[0].text
            txt = (
                c.replace("\\n", "\n")
                .replace("\\r", "")
                .replace("\xa0", "")
                .replace("一秒記住【67書吧 www.67shu.com】,", "")
                .replace("精彩小說無彈窗免費閱讀!", "")
                .replace("            ", "  ")
                .replace("        ", "")
            )
            print("get data: %s" % item["name"])
            cm_util.write_file(dest_file, "\n\n%s\n\n" % item["name"], "a+")
            cm_util.write_file(dest_file, txt, "a+")


if __name__ == "__main__":
    get_content(get_cpt(URL2))

See? Pretty simple, isn't it?

But scraping chapter by chapter like this is slow, especially for prolific authors with thousands of chapters. That's where multi-threaded scraping comes in.

5. Multi-threaded novel scraping

This uses the custom thread manager class ThreadManager.

You need to implement a callback of the form: def handle_data(data, thread_id, thread_name)
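
A minimal sketch of such a callback (the body here is purely illustrative):

def handle_data(data, thread_id, thread_name):
    # data is one item taken off the work queue
    print("[%s:%d] processing %s" % (thread_name, thread_id, data["name"]))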

Here we take the novel 《英雄聯盟我的時代》 on 全本小說網 as an example:


import time
import json
import sys
import os

if __name__ == "__main__":
    from common import *
else:
    from .common import *

URL1 = "http://www.126shu.com/99596/"
URL_CONTENT = "http://www.126shu.com/{}"

URL_RESULT = "story/result/{}.txt"
URL_DATA = "story/data/{}.txt"

def get_cpt(url):
    doc = cm_util.soup(url)

    data = {"name": "unknow"}
    # 獲取標題
    h1 = doc.select("#info .hh")
    if len(h1) > 0:
        data["name"] = h1[0].string

    # get all chapter links
    links = doc.select("#headlink #list dl dd a")
    cp_arr = []
    for item in links:
        cp_arr.append(
            {"url": URL_CONTENT.format(item.get("href")), "name": item.string}
        )
    data["cp"] = cp_arr
    return data


def get_text(item):
    dest_file = URL_DATA.format(item["name"])
    if os.path.exists(dest_file):
        print("exist file, so we will use cache: %s " % dest_file)
        return dest_file
    doc = cm_util.soup(item["url"])
    con = doc.select("#content")

    if len(con) > 0:
        con_l = con[0].select(".zjtj")
        if len(con_l) > 0:
            con_l[0].extract()
        con_l = con[0].select(".zjxs")
        if len(con_l) > 0:
            con_l[0].extract()
        c = con[0].text
        txt = (
            c.replace("www.126shu.com", "")
            .replace("\r", "")
            .replace("請百度搜索()", "")
            .replace("\xa0", "\n")
            .replace("\n\n\n\n", "\n\n")
            .replace("\n\n\n\n", "\n\n")
        )
        print("get data: %s" % item["name"])
        cm_util.write_file(dest_file, ("\n\n%s" % item["name"]) + txt, "a+")
        return dest_file
    return None


# saved chapter file paths, keyed by chapter name
text_path = {}

def get_text_thread(item, id, name):
    path = get_text(item)
    if path:
        text_path[item["name"]] = path
    else:
        print("[warn]: cannot find content: %s,%s" % (item["url"], item["name"]))


def get_content(data):
    # output file named after the novel
    dest_file = URL_RESULT.format(data["name"])
    cm_util.write_file(dest_file, "")

    manager = ThreadManager(len(data["cp"]))
    thread_names = [
        "thread_a",
        "thread_b",
        "thread_c",
        "thread_d"
    ]
    manager.put_data(data["cp"])
    manager.put_cbk_thread(thread_names, get_text_thread)
    # wait for the queue to drain
    manager.wait()
    # tell the threads it is time to exit
    manager.exit()
    # wait for all the threads to finish
    manager.join()

    # merge the chapters in order
    for item in data["cp"]:
        path = text_path.get(item["name"], None)
        if path:
            txt = cm_util.read_file(path)
            cm_util.append_file(dest_file, txt)

if __name__ == "__main__":
    cm_util.recode_begin()
    get_content(get_cpt(URL1))
    cm_util.recode_end()


6. Taking it further: implementing a factory

To make the script more general, the common parts should be factored out, with site-specific methods injected dynamically; this makes the script more extensible.

Without further ado, the source:

# coding=utf-8

import os

if __name__ == "__main__":
    from ..common import *
    from .parser import *
else:
    from common import *
    from story.parser import *


URL_RESULT = "python/story/result/{}.txt"
URL_DATA = "python/story/data/{}"
URL_TXT_CHAPTER = "python/story/data/{}/{}.txt"
CONFIG_DATA = "python/story/data/{}/config.json"

class Parser:
    def __init__(self, base_url=""):
        self.base_url = base_url

    def get_chapter(self, url):
        return None

    def get_text(self, item):
        return None

class StoryFactory:
    def __init__(self):
        self.url_matcher = {}
        self.max_thread_size = 10

    # register the handlers for a site
    def registe(self, base_url, get_chapter, get_text):
        self.url_matcher[base_url] = {
            "base_url": base_url,
            "get_cpt": get_chapter,
            "get_text": get_text,
        }

    def registe_paser(self, p):
        self.registe(p.base_url, p.get_chapter, p.get_text)

    def match(self, url):
        for base_url in self.url_matcher:
            if url.startswith(base_url):
                return base_url
        return None

    def get_text_thread(self, item, id, name):
        conf_path = CONFIG_DATA.format(item["book_key"])
        chap_data = cm_util.read_file(conf_path)

        get_text = self.url_matcher[chap_data["base_url"]].get(
            "get_text", None
        )  # (item)
        if not get_text:
            print("[warn] not match url: %s" % item["url"])
            return
        txt = get_text(item)
        if txt:
            cm_util.write_file(
                URL_TXT_CHAPTER.format(item["book_key"], cm_util.md5(item["name"])), txt
            )
        else:
            print("[warn]: cannot find content: %s,%s %s" % (item["url"], item["name"]))

    def run(self, url):
        key = cm_util.md5(url)
        cm_util.recode_begin(key)

        base_url = self.match(url)
        if not base_url:
            print("[warn] not match url: %s" % url)
            return

        print("[info] url:[%s] %s - %s" % (key, url, base_url))
        if not os.path.exists(URL_DATA.format(key)):
            os.makedirs(URL_DATA.format(key))

        matcher = self.url_matcher[base_url].get("get_cpt", None)
        if not matcher:
            print("[warn] not match url: %s" % url)
            return
        chap_data = matcher(url)

        conf_path = CONFIG_DATA.format(key)
        if os.path.exists(conf_path):
            chap_data = cm_util.read_file(conf_path)
        else:
            chap_data["base_url"] = base_url
            for item in chap_data["chapter"]:
                name = item.get("name", None)
                if name:
                    item["key"] = cm_util.md5(name)
                item["book_key"] = key
            cm_util.write_file(conf_path, chap_data)

        manager = ThreadManager(len(chap_data["chapter"]))
        thread_names = []
        for ch in range(self.max_thread_size):
            thread_names.append("thread_%d" % ch)

        manager.put_data(chap_data["chapter"])
        manager.put_cbk_thread(thread_names, self.get_text_thread)
        # wait for the queue to drain
        manager.wait()
        # tell the threads it is time to exit
        manager.exit()
        # wait for all the threads to finish
        manager.join()

        # output file named after the novel title
        dest_file = URL_RESULT.format(chap_data["title"])
        cm_util.write_file(dest_file, "")
        # merge the chapters in order
        for item in chap_data["chapter"]:
            ch_path = URL_TXT_CHAPTER.format(key, cm_util.md5(item["name"]))
            txt = cm_util.read_file(ch_path)
            if txt:
                cm_util.append_file(dest_file, txt)
        cm_util.recode_end(key)


story_factory = StoryFactory()

The story package's __init__.py file:

# coding=utf-8

__version__ = "1.0.0"
__all__ = ["story_factory", "Parser"]

__author__ = "yszm"

from .story import *

Usage:


if __name__ == "__main__":
    from story import *
else:
    from .story import *

class P67shu(Parser):
    def __init__(self):
        super().__init__("https://www.67shu.com")

    def get_chapter(self, url):
        doc = cm_util.soup(url)

        data = {"title": "unknow"}
        # 獲取標題
        h1 = doc.select(".book_info .xiaoshuo h1")
        if len(h1) > 0:
            data["title"] = h1[0].contents[0]

        # get all chapter links
        links = doc.select(".novel_list .book_article_listtext dd a")
        cp_arr = []
        for item in links:
            cp_arr.append(
                {"url": (url + "{}").format(item.get("href")), "name": item.string}
            )
        data["chapter"] = cp_arr
        return data
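
    # Note: P67shu above only implements get_chapter; without a get_text the
    # factory will log "[warn]: cannot find content" for every chapter. Below
    # is a sketch of a matching get_text, assuming the same page structure as
    # the single-threaded 67shu example in section 4:
    def get_text(self, item):
        doc = cm_util.soup(item["url"])
        con = doc.select(".yuedu_page .yuedu_zhengwen")
        if len(con) > 0:
            txt = (
                con[0].text.replace("\xa0", "")
                .replace("一秒記住【67書吧 www.67shu.com】,", "")
                .replace("精彩小說無彈窗免費閱讀!", "")
            )
            return "\n\n%s\n\n%s" % (item["name"], txt)
        return None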

story_factory.registe_paser(P67shu())

if __name__ == "__main__":
    url = "https://www.67shu.com/112/112336/"
    story_factory.run(url)

7. common source code

The source is as follows:

common/util.py

# coding=utf-8

import json
import os
import requests
from bs4 import BeautifulSoup
import fnmatch
import time
import hashlib

# request header configuration
HEADER_CONFIG = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
}

class Util:
    def __init__(self):
        self.time_map = None

    # write data to a file
    def write_file(self, filename, data, mode="w+"):
        f = open(filename, mode)
        try:
            if isinstance(data, dict):
                data = json.dumps(data, ensure_ascii=False)
            f.write(data)
        finally:
            f.close()
        print("write data to file: %s" % filename)

    # append data to a file
    def append_file(self, filename, data):
        if not os.path.exists(filename):
            self.write_file(filename, data)
        else:
            self.write_file(filename, data, "a+")

    # read data from a file
    def read_file(self, filename, mode="r"):
        data = None
        if not os.path.exists(filename):
            return data
        f = open(filename, mode)
        try:
            data = f.read()
            if filename.endswith(".json"):
                data = json.loads(data)
        finally:
            f.close()
        print("read data from file: %s" % filename)
        return data

    # fetch a page and parse it
    def soup(self, url):
        s = requests.session()
        s.keep_alive = False
        txt = s.get(url, headers=HEADER_CONFIG, timeout=120).content
        return BeautifulSoup(txt, "lxml")

    # search for files
    def list_files(self, path, fnexp):
        for root, dirs, files in os.walk(path):
            for filename in fnmatch.filter(files, fnexp):
                yield os.path.join(root, filename)

    # current time, in milliseconds
    def now(self):
        return int(round(time.time() * 1000))

    # current time, in seconds
    def now_s(self):
        return int(time.time())

    # record a start timestamp
    def recode_begin(self, group="default"):
        if not self.time_map:
            self.time_map = {}
        self.time_map[group] = self.now()
        return self.time_map[group]

    # print and return the elapsed time, in seconds, for a group
    def recode_end(self, group="default"):
        t = (self.now() - self.time_map[group]) / 1000
        print("[%s]: elapsed: %s s" % (group, t))
        self.time_map.pop(group)
        return t

    # format a timestamp
    def time_format(self, timestamp, pattern="%Y-%m-%d %H:%M:%S"):
        time_local = time.localtime(timestamp)
        return time.strftime(pattern, time_local)

    # MD5 hash of a string
    def md5(self, content):
        return hashlib.md5(content.encode("utf8")).hexdigest()


cm_util = Util()

common/thread_manager.py

# -*- coding: UTF-8 -*-

import queue
import threading
import time


class ThreadManager:
    def __init__(self, max_size=10):
        self.exit_flag = 0
        self.work_queue = queue.Queue(max_size)
        self.queue_lock = threading.Lock()
        self.threads = []
        self.cbk_group = {}
        self.thread_id = 100000

    # tell the threads it is time to exit
    def exit(self):
        self.exit_flag = 1

    # acquire the lock
    def lock(self):
        self.queue_lock.acquire()

    # release the lock
    def unlock(self):
        self.queue_lock.release()

    # add data to the work queue
    def put_data(self, data):
        self.lock()
        for item in data:
            self.work_queue.put(item)
        self.unlock()

    # add and start a thread
    def put_thread(self, thread):
        thread.start()
        self.threads.append(thread)
        self.thread_id = self.thread_id + 1

    # add a callback-style thread; cbk: def cbk(data, thread_id, thread_name)
    def put_cbk_thread(self, thread_name, cbk, repeat=1):
        if isinstance(thread_name, list):
            repeat = len(thread_name)
        if repeat == 1:
            thread = CBThread(self.thread_id, thread_name, self.process_data)
            self.cbk_group[self.thread_id] = cbk
            self.put_thread(thread)
        else:
            for i in range(repeat):
                name = thread_name
                if isinstance(thread_name, list):
                    name = thread_name[i]
                else:
                    name = "%s(%d)" % (thread_name, i + 1)
                thread = CBThread(self.thread_id, name, self.process_data)
                self.cbk_group[self.thread_id] = cbk
                self.put_thread(thread)

    # wait for all threads to finish
    def join(self):
        for t in self.threads:
            t.join()

    # wait until the queue has been drained
    def wait(self):
        while not self.work_queue.empty():
            pass

    # number of queued items
    def data_size(self):
        return self.work_queue.qsize()

    # number of threads
    def thread_size(self):
        return len(self.threads)

    # worker loop: take items off the queue and hand them to the callback
    def process_data(self, thread_id, thread_name):
        while not self.exit_flag:
            data = None
            self.lock()
            if not self.work_queue.empty():
                try:
                    data = self.work_queue.get()
                finally:
                    self.unlock()
                # if a callback is registered for this thread, invoke it
                cbk = self.cbk_group.get(thread_id, None)
                if data and cbk:
                    cbk(data, thread_id, thread_name)
                print("%s[%d] processing" % (thread_name, thread_id))
            else:
                self.unlock()
                time.sleep(1)


# generic thread with a callback; designed to be used with the thread manager
class CBThread(threading.Thread):
    def __init__(self, thread_id, name, cbk):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.thread_name = name
        self.cbk = cbk

    def run(self):
        print("Starting %s" % self.thread_name)
        self.cbk(self.thread_id, self.thread_name)
        print("Exiting %s" % self.thread_name)