簡介

首先看下我們的小白鼠站點: 殼蛋網，這是一個博客，主要是發佈個人文章或者收錄網上優質文章, 包含SEO、網賺、編程技術等內容。

接下來就通過實踐一步一步的將整站文章抓取下來。

頁面分析

觀察這張圖片:

可以發現其實導航欄就是一個分類, 所以我們要做的就是將右側分類目錄下的分類先抓取下來。

查看該圖可以發現，分類是包含在一個class="widget_categories"的標籤裏面的。

裏面的li標籤對應的a標籤鏈接，就是分類目錄了，先將這些分類鏈接抓取下來。

隨便點擊一個文章多一點的分類：

可以看到尾頁的頁碼和鏈接包裹在<div class="pagination">標籤裏面，拿到尾頁頁碼即可構造出所有的分頁下的鏈接。

文章列表均在article標籤裏面。

隨便點擊一篇文章:
可以發現標題包裹在: <h1 class="article-title">中, 文章內容包裹在<article class="article-content">中。

接下來就可以實戰寫代碼了。

實戰

爬取分類

通過訪問首頁去解析分類div塊內容，獲取所有分類的名稱和url。

import requests
from bs4 import BeautifulSoup


def fetch_html(url, timeout=10):
    """
    Download a page and return its HTML source.

    :param url: address of the page to download
    :param timeout: seconds to wait for the server before giving up
    :return: decoded response body, or None when the request fails
    """
    try:
        # Without a timeout requests waits indefinitely on a stalled server,
        # which would hang the whole crawl.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(e.args)
        return None
    return response.text


def get_category(url='https://www.fenlanli.com/'):
    """
    Collect the blog categories listed in the page's category widget.

    :param url: page that contains the ``widget_categories`` block
    :return: list of ``{'title': ..., 'url': ...}`` dicts, one per category
        link; empty when the page cannot be fetched or has no such block
    """
    html = fetch_html(url)
    if html is None:  # download failed; nothing to parse
        return []
    soup = BeautifulSoup(html, 'html.parser')
    category_div = soup.find('div', {'class': 'widget_categories'})
    if category_div is None:  # page layout has no category widget
        return []
    # href=True skips anchors without a link, which would otherwise raise
    # KeyError when the href attribute is read.
    return [
        {
            'title': a_tag.text,  # category name
            'url': a_tag.attrs['href'],  # category link
        }
        for a_tag in category_div.find_all('a', href=True)
    ]


if __name__ == '__main__':
    # Manual check: print the categories scraped from the default URL.
    print(get_category())

爬取文章列表

def get_article_list(category: dict):
    """
    Collect the links of every article listed under one category.

    :param category: dict holding the category's 'title' and 'url'
    :return: list of ``{'title': ..., 'url': ...}`` dicts, one per article
        found; pages that fail to download are skipped
    """
    articles = []
    url = category['url']
    title = category['title']
    print('正在獲取分類:{} 下的文章列表...'.format(title))
    html = fetch_html(url)
    # Look up the last page number in the pagination block. A failed download
    # (None) is searched as an empty string, so it falls back to one page.
    search_page = re.findall('尾頁</a></li><li><span>共 (.*?) 頁</span></li></ul></div>', html or '')
    if not search_page:  # no pagination block means a single page
        max_page = 1
    else:
        max_page = int(search_page[0])

    # Drop a trailing slash so the page URLs built below do not contain '//'.
    base_url = url.rstrip('/')
    for page in range(1, max_page + 1):
        cate_page_url = base_url + '/page/' + str(page)
        cate_html = fetch_html(cate_page_url)
        if cate_html is None:  # this page could not be downloaded
            continue
        soup = BeautifulSoup(cate_html, 'html.parser')
        for art in soup.find_all('article', {'class': 'excerpt'}):
            # Skip excerpts that lack the expected <h2><a href=...> heading
            # instead of failing on a missing tag or attribute.
            link = art.h2.a if art.h2 is not None else None
            if link is None or not link.has_attr('href'):
                continue
            articles.append({
                'title': link.text,
                'url': link.attrs['href']
            })
    print('正在獲取分類:{} 下的文章列表完成, 共{}篇...'.format(title, len(articles)))
    return articles

爬取文章詳情

def save_article(q):
    """
    Download every queued article and store its body as an HTML file.

    Each queue item is a dict with 'cate_name', 'title' and 'url'. The body is
    written to ``<current dir>/<cate_name>/<title>.html``. Articles that
    cannot be downloaded, or whose page has no article body, are skipped.

    :param q: queue of article dicts; consumed until it is empty
    :return: None
    """
    while not q.empty():
        data = q.get()
        save_dir = os.path.join(os.getcwd(), data['cate_name'])
        os.makedirs(save_dir, exist_ok=True)
        title = data['title']
        # A path separator inside the title would be treated as a directory
        # level and make open() fail, so replace it in the file name.
        file_name = title.replace('/', '_').replace(os.sep, '_') + '.html'
        save_path = os.path.join(save_dir, file_name)
        print('正在獲取文章: 《{}》內容...'.format(title))
        text = fetch_html(data['url'])
        if text is None:  # download failed; nothing to save
            continue
        soup = BeautifulSoup(text, 'html.parser')
        # Only the article body is stored; the title is already known from
        # the article list.
        article_content = soup.find('article', {'class': 'article-content'})
        if article_content is None:  # avoid writing the literal text "None"
            continue
        # Explicit UTF-8 so the text is writable regardless of the platform's
        # default encoding.
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(str(article_content))

完整代碼

# _*_coding:utf8_*_
# Project: kdw_spider
# File: main.py
# Author: ClassmateLin
# Email: [email protected]
# Time: 2020/3/30 4:55 下午
# DESC:
import requests
from bs4 import BeautifulSoup
import re
import os
import queue


def fetch_html(url, timeout=10):
    """
    Download a page and return its HTML source.

    :param url: address of the page to download
    :param timeout: seconds to wait for the server before giving up
    :return: decoded response body, or None when the request fails
    """
    try:
        # Without a timeout requests waits indefinitely on a stalled server,
        # which would hang the whole crawl.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(e.args)
        return None
    return response.text


def get_category(url='https://www.fenlanli.com/'):
    """
    Collect the blog categories listed in the page's category widget.

    :param url: page that contains the ``widget_categories`` block
    :return: list of ``{'title': ..., 'url': ...}`` dicts, one per category
        link; empty when the page cannot be fetched or has no such block
    """
    html = fetch_html(url)
    if html is None:  # download failed; nothing to parse
        return []
    soup = BeautifulSoup(html, 'html.parser')
    category_div = soup.find('div', {'class': 'widget_categories'})
    if category_div is None:  # page layout has no category widget
        return []
    # href=True skips anchors without a link, which would otherwise raise
    # KeyError when the href attribute is read.
    return [
        {
            'title': a_tag.text,  # category name
            'url': a_tag.attrs['href'],  # category link
        }
        for a_tag in category_div.find_all('a', href=True)
    ]


def get_article_list(category: dict):
    """
    Collect the links of every article listed under one category.

    :param category: dict holding the category's 'title' and 'url'
    :return: list of ``{'title': ..., 'url': ...}`` dicts, one per article
        found; pages that fail to download are skipped
    """
    articles = []
    url = category['url']
    title = category['title']
    print('正在獲取分類:{} 下的文章列表...'.format(title))
    html = fetch_html(url)
    # Look up the last page number in the pagination block. A failed download
    # (None) is searched as an empty string, so it falls back to one page.
    search_page = re.findall('尾頁</a></li><li><span>共 (.*?) 頁</span></li></ul></div>', html or '')
    if not search_page:  # no pagination block means a single page
        max_page = 1
    else:
        max_page = int(search_page[0])

    # Drop a trailing slash so the page URLs built below do not contain '//'.
    base_url = url.rstrip('/')
    for page in range(1, max_page + 1):
        cate_page_url = base_url + '/page/' + str(page)
        cate_html = fetch_html(cate_page_url)
        if cate_html is None:  # this page could not be downloaded
            continue
        soup = BeautifulSoup(cate_html, 'html.parser')
        for art in soup.find_all('article', {'class': 'excerpt'}):
            # Skip excerpts that lack the expected <h2><a href=...> heading
            # instead of failing on a missing tag or attribute.
            link = art.h2.a if art.h2 is not None else None
            if link is None or not link.has_attr('href'):
                continue
            articles.append({
                'title': link.text,
                'url': link.attrs['href']
            })
    print('正在獲取分類:{} 下的文章列表完成, 共{}篇...'.format(title, len(articles)))
    return articles


def save_article(q):
    """
    Download every queued article and store its body as an HTML file.

    Each queue item is a dict with 'cate_name', 'title' and 'url'. The body is
    written to ``<current dir>/<cate_name>/<title>.html``. Articles that
    cannot be downloaded, or whose page has no article body, are skipped.

    :param q: queue of article dicts; consumed until it is empty
    :return: None
    """
    while not q.empty():
        data = q.get()
        save_dir = os.path.join(os.getcwd(), data['cate_name'])
        os.makedirs(save_dir, exist_ok=True)
        title = data['title']
        # A path separator inside the title would be treated as a directory
        # level and make open() fail, so replace it in the file name.
        file_name = title.replace('/', '_').replace(os.sep, '_') + '.html'
        save_path = os.path.join(save_dir, file_name)
        print('正在獲取文章: 《{}》內容...'.format(title))
        text = fetch_html(data['url'])
        if text is None:  # download failed; nothing to save
            continue
        soup = BeautifulSoup(text, 'html.parser')
        # Only the article body is stored; the title is already known from
        # the article list.
        article_content = soup.find('article', {'class': 'article-content'})
        if article_content is None:  # avoid writing the literal text "None"
            continue
        # Explicit UTF-8 so the text is writable regardless of the platform's
        # default encoding.
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(str(article_content))


if __name__ == '__main__':
    # Full crawl: queue every article of every category, tagged with its
    # category name, then download and save them all.
    pending = queue.Queue()
    for category in get_category():
        for article in get_article_list(category):
            article['cate_name'] = category['title']
            pending.put(article)
    save_article(pending)

新手福利, 正經爬蟲教學, 手把手抓取殼蛋網所有文章!

簡介

頁面分析

實戰

爬取分類

爬取文章列表

爬取文章詳情

完整代碼

關於遊戲付費的一點想法

我通過CKA和CKS啦！

Python 文件類型詳解及生成使用 so/pyd文件

詳細介紹如何使用Gitbook + Github 發佈你的電子書。

django2 channels2 通過http請求推送消息至Websocket客戶端

mac brew 安裝慢替換清華源

Go Gin框架自定義路由包

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結