爬取imdb資料庫

原創

珂鸣玉

2019-10-25 22:41

寫一個簡單的python爬蟲程序，爬取imdb資料庫，將爬取到的電影信息，存儲到一個excel表格中

因爲imdb資料庫電影網沒有反爬措施，直接爬取即可

代碼如下：

import requests
from lxml import etree
import pandas as pd
import numpy as np

# 第一頁：'http://www.imdb.cn/IMDB250/'
# 第二頁：'http://www.imdb.cn/imdb250/2'
# 第三頁：'http://www.imdb.cn/imdb250/3'

class IMDB:
    """Scrape the IMDB top-250 listing on www.imdb.cn and export it to Excel.

    Instantiating the class runs the whole pipeline straight away: every
    listing page is fetched, six fields are extracted per movie, and the
    result is written to ``OUTPUT_FILE`` in the current working directory.

    The six lists handed to the constructor are appended to in place while
    scraping; afterwards the matching instance attributes are rebound to
    ``(n, 1)`` NumPy arrays.
    """

    # Headers sent with every page request.
    HEADERS = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'Host': 'www.imdb.cn'
    }
    # Listing pages to visit (1..9).
    PAGE_NUMBERS = range(1, 10)
    # Seconds to wait for the server before giving up, so a stalled
    # connection cannot hang the script forever.
    REQUEST_TIMEOUT = 10
    # One <a> element per movie on a listing page.
    ENTRY_XPATH = '//div[@class="ss-3 clear"]/a'
    # (attribute holding the column, XPath relative to a movie's <a> element),
    # in the same order as COLUMNS: name, score, alias, English name,
    # director, year.
    FIELD_XPATHS = (
        ('move_name_list', './/div[@class="honghe-3"]/p/text()'),
        ('move_score_list', './/div[@class="honghe-2"]/span/i/text()'),
        ('move_bieming_list', './/div[@class="honghe-4 clear"]/p[1]/i/text()'),
        ('move_ywm_list', './/div[@class="honghe-4 clear"]/p[2]/text()'),
        ('move_dir_list', './/div[@class="honghe-4 clear"]/p[3]/span/text()'),
        ('move_time_list', './/div[@class="honghe-4 clear"]/p[4]/i[3]/text()'),
    )
    # Spreadsheet column headers.
    COLUMNS = ['電影名稱','評分','別名','英文名','導演','上映時間']
    # Workbook written next to the working directory.
    OUTPUT_FILE = 'IMDBtop250電影信息1.xlsx'

    def __init__(self,move_name_list,move_score_list,move_bieming_list,move_ywm_list,move_dir_list,move_time_list):
        """Store the six accumulator lists and start scraping immediately.

        :param move_name_list: list receiving movie names
        :param move_score_list: list receiving ratings
        :param move_bieming_list: list receiving aliases
        :param move_ywm_list: list receiving English names
        :param move_dir_list: list receiving directors
        :param move_time_list: list receiving release years
        """
        self.move_name_list = move_name_list
        self.move_score_list = move_score_list
        self.move_bieming_list = move_bieming_list
        self.move_ywm_list = move_ywm_list
        self.move_dir_list = move_dir_list
        self.move_time_list = move_time_list
        self.spider()

    def shuzu(self,content):
        """Convert a flat list into a two-dimensional column array.

        :param content: list of values
        :return: NumPy array of shape ``(len(content), 1)``
        """
        column = np.array(content)
        return column.reshape(len(column), 1)

    def pankong(self,content):
        """Return the first element of *content*, or a placeholder if empty.

        :param content: list (here: an XPath result list)
        :return: ``content[0]``, or the string ``'無'`` when the list is empty
        """
        return content[0] if content else '無'

    def _page_url(self, page):
        """Return the listing URL for *page*; page 1 has its own address."""
        if page == 1:
            return 'http://www.imdb.cn/IMDB250/'
        return f'http://www.imdb.cn/imdb250/{page}'

    def spider(self):
        """Fetch every listing page, collect the movie fields and save them.

        Each extracted value is appended to the matching accumulator list and
        printed. When all pages are done, the lists are converted to column
        arrays (rebinding the instance attributes) and passed to
        :meth:`save_data`.

        :return: None
        """
        for page in self.PAGE_NUMBERS:
            response = requests.get(
                url=self._page_url(page),
                headers=self.HEADERS,
                timeout=self.REQUEST_TIMEOUT,
            ).content.decode('utf-8')
            tree = etree.HTML(response)
            a_list = tree.xpath(self.ENTRY_XPATH)
            print(len(a_list))
            for a in a_list:
                for attr, xpath in self.FIELD_XPATHS:
                    # A missing field yields an empty result list; pankong
                    # substitutes the placeholder so all columns stay aligned.
                    value = self.pankong(a.xpath(xpath))
                    getattr(self, attr).append(value)
                    print(value)
        # Turn every accumulator list into an (n, 1) column array.
        for attr, _ in self.FIELD_XPATHS:
            setattr(self, attr, self.shuzu(getattr(self, attr)))
        # Join the columns and write them to the Excel workbook.
        self.save_data(self.move_name_list,self.move_score_list,self.move_bieming_list,self.move_ywm_list,self.move_dir_list,self.move_time_list)

    def _build_frame(self, content):
        """Wrap the joined ``(n, 6)`` array in a DataFrame with a 1-based index.

        The index length follows the number of rows actually scraped, so the
        frame can be built for any row count rather than one fixed size.
        """
        return pd.DataFrame(content,columns=self.COLUMNS,index=range(1,len(content) + 1))

    def save_data(self,s1,s2,s3,s4,s5,s6):
        """Join six column arrays side by side and write them to Excel.

        :param s1: ``(n, 1)`` array of movie names
        :param s2: ``(n, 1)`` array of ratings
        :param s3: ``(n, 1)`` array of aliases
        :param s4: ``(n, 1)`` array of English names
        :param s5: ``(n, 1)`` array of directors
        :param s6: ``(n, 1)`` array of release years
        :return: None
        """
        # Horizontal concatenation: one row per movie, one column per field.
        content = np.concatenate((s1,s2,s3,s4,s5,s6),axis=1)
        print(content)
        print(content.shape)
        data = self._build_frame(content)
        # The index is not written to the sheet.
        data.to_excel(self.OUTPUT_FILE,index=False)

if __name__ == '__main__':
    # Six independent, initially empty accumulators, in constructor order:
    # name, score, alias, English name, director, release year.
    accumulators = [[] for _ in range(6)]
    # Constructing the scraper runs the crawl and writes the workbook.
    IMDB(*accumulators)

運行結果：

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

爬取imdb資料庫

python常見面試題集錄（一）

python希爾排序、歸併排序

python遞歸函數案例

python 二叉樹簡介

計算從2000年1月1日至今有多少個週一

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結