豆瓣爬蟲:根據 tag 爬取書籍,使用 urllib、BeautifulSoup,並導出 Excel

豆瓣爬蟲:根據 tag 爬取書籍,使用 urllib、BeautifulSoup,並導出 Excel

import random
import time
import urllib
import urllib.parse
import urllib.request

import numpy as np
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Candidate request headers, each carrying a different browser User-Agent.
hds = [
    {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
    },
    {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
    },
    {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)',
    },
]


def bookListByTags(tags, *, max_pages=10):
    """Crawl Douban tag pages and collect the books listed under each tag.

    For every tag, up to ``max_pages`` pages are requested; the ``start``
    query offset advances by 20 per page. A random pause of up to five
    seconds precedes each request.

    Args:
        tags: An iterable of tag names. A single tag given as ``str`` or
            ``bytes`` is accepted too and treated as a one-element list
            (otherwise a string would be crawled character by character).
        max_pages: Maximum number of pages to request per tag.

    Returns:
        A flat list of ``[title, desc, rating, detail_href]`` rows.
        ``title`` and ``desc`` are stripped strings, ``rating`` is the
        ``span.rating_nums`` element as returned by BeautifulSoup (``None``
        when the entry has none), and ``detail_href`` is the ``href`` of the
        title link.

    Raises:
        urllib.error.URLError: If a page cannot be fetched.
    """
    if isinstance(tags, (str, bytes)):
        tags = [tags]

    return_list = []

    for tag in tags:
        for page in range(max_pages):
            url = 'http://www.douban.com/tag/' + urllib.parse.quote(tag, '') + '/book?start=' + str(page * 20)

            # Random delay between requests.
            time.sleep(np.random.rand() * 5)

            request = urllib.request.Request(url)
            # Send only the User-Agent string; the entries of hds are dicts.
            request.add_header('User-Agent', random.choice(hds)['User-Agent'])
            # The context manager closes the connection once the page is parsed.
            with urllib.request.urlopen(request) as response:
                soup = BeautifulSoup(response, 'html.parser')

            book_list = soup.find('div', {'class': 'mod book-list'})
            if book_list is None:
                # No book list on this page: stop paging through this tag.
                break

            for dd in book_list.find_all('dd'):
                title_link = dd.find('a', {'class': 'title'})
                if title_link is None:
                    # Entry without a title link carries nothing usable.
                    continue

                # Detail link URL.
                detail_href = title_link.get('href')
                title = title_link.get_text(strip=True)

                desc_div = dd.find('div', {'class': 'desc'})
                desc = desc_div.get_text(strip=True) if desc_div is not None else ''

                rating = dd.find('span', {'class': 'rating_nums'})

                return_list.append([title, desc, rating, detail_href])
    return return_list


def _tag_title(tag):
    """Return *tag* as text, decoding it first when it is ``bytes``."""
    if isinstance(tag, bytes):
        return tag.decode()
    return str(tag)


def _rating_cell(rating):
    """Convert a rating value into something that can be stored in a cell.

    Accepts ``None``, a number, a string, or an object exposing
    ``get_text()`` (such as a BeautifulSoup element). Returns a ``float``
    when the text parses as a number, the stripped text when it does not,
    and ``None`` when there is no rating text at all.
    """
    if rating is None:
        return None
    text = rating.get_text() if hasattr(rating, 'get_text') else str(rating)
    text = text.strip()
    if not text:
        return None
    try:
        return float(text)
    except ValueError:
        return text


def print_book_lists_excel(book_lists, book_tag_lists):
    """Write the crawled books to an .xlsx file, one worksheet per tag.

    Each sheet starts with a header row followed by one row per book:
    running number, title, description and rating.

    Args:
        book_lists: One list of book rows per tag, in the same order as
            ``book_tag_lists``. Each row is indexed as
            ``[title, desc, rating, ...]``.
        book_tag_lists: Tag names (``str`` or ``bytes``) used as sheet
            titles and as part of the output file name.

    The workbook is saved in the current directory as
    ``book_list-<tag1>-<tag2>...xlsx``.

    Raises:
        IndexError: If ``book_lists`` has fewer entries than
            ``book_tag_lists``.
    """
    wb = Workbook(write_only=True)
    tag_titles = [_tag_title(tag) for tag in book_tag_lists]

    for i, tag_title in enumerate(tag_titles):
        ws = wb.create_sheet(tag_title)
        ws.append(['序號', '書名', '描述', '評分'])
        for count, bl in enumerate(book_lists[i], start=1):
            # Columns follow the header: title and description are text,
            # only the rating is converted to a number.
            ws.append([count, bl[0], bl[1], _rating_cell(bl[2])])

    save_path = 'book_list' + ''.join('-' + tag_title for tag_title in tag_titles) + '.xlsx'
    wb.save(save_path)


def do_spider(book_tag_lists):
    """Crawl every tag and return one sorted book list per tag.

    Args:
        book_tag_lists: Iterable of tag names to crawl.

    Returns:
        A list with one entry per tag, in input order. Each entry is the
        list of rows returned by ``bookListByTags`` for that tag, sorted in
        descending order of the value at row index 3.
    """
    book_lists = []
    for book_tag in book_tag_lists:
        # bookListByTags iterates over its argument, so the single tag is
        # wrapped in a list; a bare string would be walked character by
        # character.
        book_list = bookListByTags([book_tag])
        # sorted() takes key/reverse as keyword-only arguments in Python 3.
        # NOTE(review): index 3 is the detail link in the rows built by
        # bookListByTags ([title, desc, rating, detail_href]); kept as in the
        # original call - confirm whether the rating (index 2) was intended.
        book_list = sorted(book_list, key=lambda row: row[3], reverse=True)
        book_lists.append(book_list)
    return book_lists


if __name__ == '__main__':
    # Crawl the listed tags, then export the collected books to Excel.
    tags = ['小說', '隨筆']
    print_book_lists_excel(do_spider(tags), tags)

發表評論
所有評論
還沒有人評論,想成為第一個評論的人嗎?請在上方評論欄輸入並點擊發布。
相關文章