Python Crawler Practice 3: Scraping the Douban Movie Top 250

Preface:

Practice entry-level Python crawler projects: the Dangdang and Douban websites.


Implementation code:

Knowledge points covered:

  • Using BeautifulSoup to parse HTML
  • Using xlwt to save results to Excel (the legacy .xls format, not XML despite the similar name)
  • Throttling requests with time.sleep
  • Writing text files
  • Fetching page content from a website
import json
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import re
import time
import xlwt

def get_one_page(url):
    # Fetch one page of results; return the HTML, or None on a bad status or request error
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

book = xlwt.Workbook(encoding='utf-8', style_compression=0)

sheet = book.add_sheet('Douban Movie Top 250', cell_overwrite_ok=True)
sheet.write(0, 0, 'Title')
sheet.write(0, 1, 'Image')
sheet.write(0, 2, 'Rank')
sheet.write(0, 3, 'Score')
sheet.write(0, 4, 'Director/Cast')
sheet.write(0, 5, 'Quote')

n = 1  # next sheet row to write
def save_to_excel(soup):
    global n
    movie_list = soup.find(class_='grid_view').find_all('li')

    for item in movie_list:
        item_name = item.find(class_='title').string
        item_img = item.find('a').find('img').get('src')
        item_index = item.find('em').string  # rank, rendered as <em class="">1</em>
        item_score = item.find(class_='rating_num').string
        item_author = item.find('p').text  # director / cast line
        # Not every entry has a one-line quote; default to '' so the variable is always bound
        if item.find(class_='inq') is not None:
            item_intr = item.find(class_='inq').string
        else:
            item_intr = ''
        print('Scraping movie: ' + item_index + ' | ' + item_name + ' | ' + item_score + ' | ' + item_intr)
        sheet.write(n, 0, item_name)
        sheet.write(n, 1, item_img)
        sheet.write(n, 2, item_index)
        sheet.write(n, 3, item_score)
        sheet.write(n, 4, item_author)
        sheet.write(n, 5, item_intr)
        n = n + 1

def parse_one_page(html):
    # Regex over the raw HTML: rank, title, score and one-line quote per <li>
    pattern = re.compile(
        '<li>.*?<em class="">(.*?)</em>.*?title.*?>(.*?)</span>.*?<span class="rating_num" property="v:average">(.*?)</span>.*?<span class="inq">(.*?)</span>',
        re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {'index': item[0],
               'title': item[1],
               'score': item[2],
               'comment': item[3]
               }
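
One caveat: the pattern above requires the one-line <span class="inq"> quote, so the few Top 250 entries without a quote never reach douban250.txt. A minimal variant (my own adjustment, not from the original post) that keeps quote-less movies by capturing the rest of each <li> and searching it separately:

def parse_one_page_all(html):
    # Capture rank, title, score, plus everything up to the item's closing </li>
    pattern = re.compile(
        '<li>.*?<em class="">(.*?)</em>.*?title.*?>(.*?)</span>'
        '.*?<span class="rating_num" property="v:average">(.*?)</span>(.*?)</li>',
        re.S)
    for index, title, score, tail in re.findall(pattern, html):
        # The quote is optional; look for it only inside this item's tail
        quote = re.search('<span class="inq">(.*?)</span>', tail, re.S)
        yield {'index': index,
               'title': title,
               'score': score,
               'comment': quote.group(1) if quote else ''}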


def write_to_file(content):
    # Append each movie as one JSON object per line (JSON Lines)
    with open('douban250.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
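
Because each record is one JSON object per line, the file is easy to load back later; a small sketch, assuming douban250.txt was produced by a previous run:

def read_records(path='douban250.txt'):
    # Yield each stored movie dict, one per line
    with open(path, encoding='utf-8') as f:
        for line in f:
            yield json.loads(line)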


def main(offset):
    url = 'https://movie.douban.com/top250?start=' + str(offset) + '&filter='
    html = get_one_page(url)
    if html is None:
        return  # request failed; skip this page
    soup = BeautifulSoup(html, 'lxml')
    save_to_excel(soup)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    for i in range(10):  # 10 pages x 25 movies
        main(offset=i * 25)
        time.sleep(1)  # throttle requests

book.save(u'豆瓣最受歡迎的250部電影.xls')  # xlwt writes the legacy .xls format, so don't name the file .xlsx
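
Note that xlwt can only produce the legacy .xls format. For a true .xlsx file, openpyxl is the usual choice; a minimal sketch (openpyxl is not used in the original script, so this is only an illustration of how it could be swapped in):

from openpyxl import Workbook

wb = Workbook()
ws = wb.active
ws.title = 'Douban Movie Top 250'
# Header row; data rows collected by the crawler would follow via ws.append([...])
ws.append(['Title', 'Image', 'Rank', 'Score', 'Director/Cast', 'Quote'])
wb.save('douban_top250.xlsx')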


Scraping Bilibili with Selenium: collecting video information for 請回答1988 (Reply 1988)

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import xlwt

# Configure the Chrome driver
chrome_driver = r'D:\pythonEdit\chromedriver_win32\chromedriver.exe'  # chromedriver location (raw string avoids backslash escapes)
browser = webdriver.Chrome(executable_path=chrome_driver)
# Explicit wait: block up to 10 seconds until a target element is ready
WAIT = WebDriverWait(browser, 10)
# Browser window size
browser.set_window_size(1400, 900)

# Excel workbook
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
# Sheet name and header row
sheet = book.add_sheet('請回答1988', cell_overwrite_ok=True)
sheet.write(0, 0, 'Title')
sheet.write(0, 1, 'Link')
sheet.write(0, 2, 'Description')
sheet.write(0, 3, 'Views')
sheet.write(0, 4, 'Danmaku count')
sheet.write(0, 5, 'Upload date')
# Global counter for the next Excel row
n = 1

# Search for the show and return the total number of result pages
def search():
    try:
        print('Opening bilibili.com ...')
        browser.get("https://www.bilibili.com/")

        # The home menu link is covered by the login popup, so it is skipped
        # index = WAIT.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#primary_menu > ul > li.home > a")))
        # index.click()

        # Wait until the search box is present before typing
        search_input = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#nav_searchform > input")))
        # Grab the submit button once it is clickable
        submit = WAIT.until(EC.element_to_be_clickable(
            (By.XPATH, '/html/body/div[2]/div/div[1]/div[1]/div/div[2]/div/form/div/button')))

        search_input.send_keys('請回答1988')
        submit.click()

        # The results open in a new window; switch to it
        print('Switching to the results window')
        all_h = browser.window_handles
        browser.switch_to.window(all_h[1])
        get_source()

        # Read the total page count from the last pagination button
        total = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR,
                                                           "#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.last > button")))
        return int(total.text)
    except TimeoutException:
        return search()  # retry on timeout
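
One caveat: the bare "return search()" in the except branch retries forever if Bilibili never loads. A bounded alternative (my own sketch; search_once is a hypothetical refactor holding the try-body above):

def search_with_retry(max_retries=3):
    for attempt in range(max_retries):
        try:
            return search_once()  # hypothetical: the body of search() without the except clause
        except TimeoutException:
            print('Timed out, retrying ({}/{})'.format(attempt + 1, max_retries))
    raise TimeoutException('Search failed after {} retries'.format(max_retries))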


def next_page(page_num):
    try:
        print('Fetching the next page')
        # The "next page" button
        next_btn = WAIT.until(EC.element_to_be_clickable((By.CSS_SELECTOR,
                                                          '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.next > button')))
        next_btn.click()
        # Wait until the active page button shows the expected page number
        WAIT.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,
                                                     '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.active > button'),
                                                    str(page_num)))
        get_source()
    except TimeoutException:
        browser.refresh()
        return next_page(page_num)

# Save the current result page's videos to the Excel sheet
def save_to_excel(soup):
    global n
    video_list = soup.find(class_='video-list clearfix').find_all(class_='video-item matrix')

    for item in video_list:
        item_title = item.find('a').get('title')
        item_link = item.find('a').get('href')
        item_dec = item.find(class_='des hide').text
        item_view = item.find(class_='so-icon watch-num').text
        item_biubiu = item.find(class_='so-icon hide').text  # danmaku (bullet-comment) count
        item_date = item.find(class_='so-icon time').text

        print('Scraping: ' + item_title)

        sheet.write(n, 0, item_title)
        sheet.write(n, 1, item_link)
        sheet.write(n, 2, item_dec)
        sheet.write(n, 3, item_view)
        sheet.write(n, 4, item_biubiu)
        sheet.write(n, 5, item_date)

        n = n + 1


def get_source():
    # Wait until the filter bar is present, i.e. the result list has rendered
    WAIT.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#all-list > div.flow-loader > div.filter-wrap')))
    # Then grab the full page source and parse it
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')
    print('Result page loaded')

    save_to_excel(soup)


def main():
    try:
        total = search()
        print(total)

        for i in range(2, total + 1):  # page 1 was already saved inside search()
            next_page(i)

    finally:
        browser.quit()  # quit() closes every window and ends the driver session


if __name__ == '__main__':
    main()
    book.save('請回答1988.xls')  # xlwt writes the legacy .xls format
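
A compatibility note: the executable_path argument used above works with Selenium 3 but was removed in Selenium 4. A sketch of the equivalent Selenium 4 setup (and from 4.6 on, Selenium Manager can locate a driver with no path at all):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: wrap the driver path in a Service object
service = Service(r'D:\pythonEdit\chromedriver_win32\chromedriver.exe')
browser = webdriver.Chrome(service=service)
# With Selenium 4.6+, Selenium Manager can resolve the driver automatically:
# browser = webdriver.Chrome()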

 
