"""Douban book spider.

Fetches the Douban tag listing pages for a set of book tags, extracts the
title, description, rating and detail link of every listed book, and exports
the result to an Excel workbook with one sheet per tag.
"""
import random
import time
import urllib
import urllib.parse
import urllib.request

import numpy as np
from bs4 import BeautifulSoup
from openpyxl import Workbook

# Pool of browser-like headers; one User-Agent is picked at random per request.
hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
]

_MAX_PAGES = 10          # listing pages requested per tag
_PAGE_STEP = 20          # increment of the "start" query parameter per page
_MAX_DELAY_SECONDS = 5   # upper bound of the random pause before each request


def _as_text(value):
    """Return *value* as ``str``, decoding it first when it is ``bytes``."""
    return value.decode() if isinstance(value, bytes) else value


def _fetch_soup(url):
    """Download *url* with a random User-Agent and return the parsed page."""
    request = urllib.request.Request(url)
    # Send only the User-Agent string, not the repr of the whole header dict.
    request.add_header('User-Agent', random.choice(hds)['User-Agent'])
    with urllib.request.urlopen(request) as response:
        # Name the parser explicitly so the result does not depend on which
        # parsers happen to be installed.
        return BeautifulSoup(response.read(), 'html.parser')


def _node_text(node):
    """Return the stripped text of a parsed node, or ``''`` when it is missing."""
    return node.get_text(strip=True) if node is not None else ''


def _parse_rating(node):
    """Return the rating held by *node* as a float, or ``0.0`` if unavailable."""
    try:
        return float(_node_text(node))
    except ValueError:
        return 0.0


def bookListByTags(tags):
    """Scrape the Douban book listing pages of every tag in *tags*.

    Args:
        tags: An iterable of tag names. A single ``str``/``bytes`` tag is
            accepted as well and treated as a one-element list.

    Returns:
        A list of ``[title, desc, rating, detail_href]`` entries, where
        ``rating`` is a float (``0.0`` when the page shows no rating).

    Raises:
        urllib.error.URLError: If a listing page cannot be downloaded.
    """
    if isinstance(tags, (str, bytes)):
        # Iterating a bare string would query one tag per character.
        tags = [tags]

    return_list = []
    for tag in tags:
        for page in range(_MAX_PAGES):
            url = ('http://www.douban.com/tag/' + urllib.parse.quote(tag, '')
                   + '/book?start=' + str(page * _PAGE_STEP))
            # Random pause so requests are not fired in a tight loop.
            time.sleep(np.random.rand() * _MAX_DELAY_SECONDS)
            soup = _fetch_soup(url)

            book_list = soup.find('div', {'class': 'mod book-list'})
            entries = book_list.find_all('dd') if book_list is not None else []
            if not entries:
                # No listing on this page: nothing further to read for the tag.
                break

            for dd in entries:
                link = dd.find('a', {'class': 'title'})
                if link is None:
                    continue
                # Link to the book's detail page.
                detail_href = link.get('href', '')
                title = _node_text(link)
                desc = _node_text(dd.find('div', {'class': 'desc'}))
                rating = _parse_rating(dd.find('span', {'class': 'rating_nums'}))
                return_list.append([title, desc, rating, detail_href])
    return return_list


def print_book_lists_excel(book_lists, book_tag_lists):
    """Write the scraped books to an ``.xlsx`` file, one sheet per tag.

    Args:
        book_lists: One list of ``[title, desc, rating, detail_href]`` entries
            per tag, in the same order as *book_tag_lists*.
        book_tag_lists: Tag names; used as sheet titles and joined with ``-``
            to build the output file name ``book_list-<tag>-...xlsx``.
    """
    tag_names = [_as_text(tag) for tag in book_tag_lists]

    wb = Workbook(write_only=True)
    for tag_name, books in zip(tag_names, book_lists):
        ws = wb.create_sheet(title=tag_name)
        ws.append(['序號', '書名', '描述', '評分'])
        for count, book in enumerate(books, start=1):
            # Columns follow the header: index, title, description, rating.
            ws.append([count, book[0], book[1], book[2]])

    save_path = 'book_list' + ''.join('-' + name for name in tag_names) + '.xlsx'
    wb.save(save_path)


def do_spider(book_tag_lists):
    """Scrape every tag and return its books sorted by rating, best first.

    Args:
        book_tag_lists: Iterable of tag names.

    Returns:
        A list with one book list per tag, in the order of *book_tag_lists*.
    """
    book_lists = []
    for book_tag in book_tag_lists:
        # Wrap the tag so it is handled as one tag, not as a character sequence.
        book_list = bookListByTags([book_tag])
        book_list.sort(key=lambda book: book[2], reverse=True)
        book_lists.append(book_list)
    return book_lists


if __name__ == '__main__':
    tags = ['小說', '隨筆']
    book_lists = do_spider(tags)
    print_book_lists_excel(book_lists, tags)
豆瓣爬蟲:根據 tag 爬取書籍,使用 urllib、BeautifulSoup,並導出至 Excel
發表評論
所有評論
還沒有人評論,想成為第一個評論的人嗎?請在上方評論欄輸入並點擊發布。