貼吧帖子標題 + 回覆內容 + 回覆圖片爬蟲

原創

cyj5201314

2020-06-21 13:16

import requests
from lxml import etree
import re
import json
import os
import time

class TieBaSpider():
    """Crawl a Baidu Tieba forum: thread titles, reply text and reply images.

    Each thread is appended as one JSON line to ``<data_dir>/tieba.json`` and
    its images are downloaded into ``<data_dir>/tieba_imgs/``.
    """

    def __init__(self, data_dir="./data", timeout=10):
        """Configure the spider.

        Args:
            data_dir: Directory that receives ``tieba.json`` and the
                ``tieba_imgs`` sub-directory. Created lazily on first write.
            timeout: Per-request timeout in seconds.
        """
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }
        self.data_dir = data_dir
        self.img_dir = os.path.join(data_dir, "tieba_imgs")
        self.timeout = timeout

    def get_html_str(self, url):
        """Fetch *url* and return the decoded body, or ``None`` on any failure.

        A network error is reported the same way as a non-200 status (``None``)
        so that one bad request cannot abort the whole crawl.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
        except requests.RequestException as exc:
            print("請求失敗: {} ({})".format(url, exc))
            return None
        if response.status_code != 200:
            return None
        # Undecodable bytes are replaced instead of raising UnicodeDecodeError.
        return response.content.decode("utf-8", errors="replace")

    def parse_list_page(self, html_str):
        """Extract ``{"title", "url"}`` dicts for every thread on a list page.

        Returns an empty list when *html_str* is ``None`` or empty. Entries
        whose title or link cannot be found are skipped.
        """
        if not html_str:
            return []
        li_bodies = re.findall(r'<li class=" j_thread_list clearfix"(.*?)</li>', html_str, re.S)
        posts_data = []
        for body in li_bodies:
            # The regex captures only the inside of the <li>; rebuild the
            # element so it can be parsed as a standalone fragment.
            li = etree.HTML('<li class=" j_thread_list clearfix"' + body + "</li>")
            titles = li.xpath(".//a[@class='j_th_tit ']/text()")
            hrefs = li.xpath(".//a[@class='j_th_tit ']/@href")
            if not titles or not hrefs:
                continue
            posts_data.append({
                "title": titles[0],
                "url": "https://tieba.baidu.com" + hrefs[0],
            })
        return posts_data

    def parse_detail_page(self, html_str):
        """Return ``(texts, image_srcs)`` for all replies on a thread page.

        Returns ``([], [])`` when *html_str* is ``None`` or empty.
        """
        if not html_str:
            return [], []
        html = etree.HTML(html_str)
        if html is None:
            return [], []
        post_contents = html.xpath("//div[@class='d_post_content j_d_post_content  clearfix']/text()")
        post_contents = [text.strip() for text in post_contents]
        post_imgs_src = html.xpath("//div[@class='d_post_content j_d_post_content  clearfix']/img/@src")
        return post_contents, post_imgs_src

    def save_post_detail(self, item):
        """Append *item* as one JSON line to ``<data_dir>/tieba.json``."""
        os.makedirs(self.data_dir, exist_ok=True)
        with open(os.path.join(self.data_dir, "tieba.json"), "a", encoding="utf-8") as fp:
            json.dump(item, fp, ensure_ascii=False)
            fp.write("\n")
        print(item["title"] + "\t詳情寫入成功...")

    def save_post_img(self, src_list, title):
        """Download every image in *src_list*, naming files ``<title><index><ext>``.

        Images that cannot be downloaded are skipped. Always returns ``None``.
        """
        if not src_list:
            print("當前帖子無圖片內容...")
            return None

        os.makedirs(self.img_dir, exist_ok=True)
        # Drop characters that are invalid in file names on common platforms.
        title = re.sub(r'[\\/:*?"<>|\r\n\t]', "", title)
        for index, src in enumerate(src_list):
            # Remove any query string / fragment before reading the extension.
            clean_src = src.split("?", 1)[0].split("#", 1)[0]
            end_name = os.path.splitext(clean_src)[1]
            file_name = title + str(index) + end_name
            # Download first so a failed request does not leave an empty file.
            try:
                response = requests.get(src, headers=self.headers, timeout=self.timeout)
            except requests.RequestException as exc:
                print("圖片下載失敗: {} ({})".format(src, exc))
                continue
            if response.status_code != 200:
                print("圖片下載失敗: {} ({})".format(src, response.status_code))
                continue
            with open(os.path.join(self.img_dir, file_name), "wb") as fp:
                fp.write(response.content)
            time.sleep(1)  # be polite to the image server
            print(file_name + "寫入成功...")
        return None

    def run(self, kw="天津工業大學", pages=20):
        """Crawl *pages* list pages of forum *kw* and save every thread found.

        Args:
            kw: Forum name passed as the ``kw`` query parameter.
            pages: Number of list pages to crawl; ``pn`` advances by 50 per page.
        """
        for i in range(pages):
            pn = i * 50
            list_url = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}".format(kw, pn)
            print("當前爬取第{}頁...".format(i+1))
            html_str = self.get_html_str(list_url)
            if html_str is None:
                print("列表頁獲取失敗，跳過: " + list_url)
                continue
            for post in self.parse_list_page(html_str):
                detail_html = self.get_html_str(post["url"])
                if detail_html is None:
                    print("帖子獲取失敗，跳過: " + post["url"])
                    continue
                post_contents, post_imgs_src = self.parse_detail_page(detail_html)
                item = {
                    "title": post["title"],
                    "url": post["url"],
                    "content": post_contents,
                    "imgs_src": post_imgs_src
                }
                self.save_post_detail(item)
                self.save_post_img(item["imgs_src"], item["title"])

if __name__ == '__main__':
    # Script entry point: build a spider and start crawling immediately.
    TieBaSpider().run()

爬取詳情結果如下：

爬取圖片結果如下：

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼？請在上方評論欄輸入並且點擊發布。

貼吧帖子標題 + 回覆內容 + 回覆圖片爬蟲

使用neovim打造go ide(支持代碼跳轉, 代碼補全, 實時語法檢查)

挑戰程序設計競賽 2.3章習題 poj 3046 Ant Counting

Shell/Python中的用戶名獲取

scrapy利用下載器中間件給request對象修改User-Agent

scrapy利用登陸後的cookie請求人人網個人主頁

scrapy利用FormRequest.from_response模擬登陸

scrapy框架爬取起點小說分類

pandas讀取分析保險數據

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結