爬取豆瓣《將夜》短評並繪製詞雲

《將夜》是根據貓膩小說改編,陳飛宇和宋伊人主演,最近在騰訊視頻熱播的電視劇,閒來無事,在學Python爬蟲的時候想要爬取評論看看。幾經努力,菜鳥水平終於能夠爬取短評了。由於豆瓣的限制,即使在登錄的狀態,依然只能爬取500條評論,具體代碼如下:

# 調用相關包
import json
import random
import requests
import time
import pandas as pd
from pyquery import PyQuery as pq
import pymongo
from bs4 import BeautifulSoup as bs
import re
import os
from pyecharts import Bar, Geo, Line, Overlap
import jieba
from scipy.misc import imread
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from collections import Counter

# MongoDB storage settings for the crawled comments.
MONGO_URL = 'localhost'  # host handed to MongoClient
MONGO_DB = 'douban'  # database name
MONGO_COLLECTION = 'jiangye_comments'  # collection the comment documents are inserted into
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

def save_to_mongo(result):
    """Insert one comment document into the configured MongoDB collection.

    Args:
        result: dict describing a single comment.

    Failures are reported on stdout together with the underlying error
    instead of being raised, so a single failed insert does not abort the
    caller's loop.
    """
    try:
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document call.
        if db[MONGO_COLLECTION].insert_one(result).inserted_id is not None:
            print('存儲到MongoDB成功')
    except Exception as exc:
        # Keep the best-effort behaviour, but show why the insert failed.
        print('存儲到MongoDB失敗', exc)

# One shared HTTP session: the login request and the later page requests all go through it.
session = requests.Session()

def loginin():
    """Log in to Douban through the module-level ``session``.

    Fetches the login page; if it contains a captcha image, downloads the
    image to ``captcha.jpg``, displays it, and asks on stdin for the
    solution, which is added to the login form. Finally posts the form and
    prints the HTTP status code of the response.
    """
    url = 'https://www.douban.com/accounts/login'
    name = '你的用戶名'
    psw = '你的密碼'
    headers = {
        # Plain UA string: the previous value was wrapped in literal single
        # quotes, which were sent to the server as part of the header.
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:53.0) Gecko/20100101 Firefox/53.0",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip,deflate",
        "Connection": "keep-alive",
    }
    data = {
        'form_email': name,
        'form_password': psw,
        'source': 'index_nav',
        'remember': 'on',
    }
    login_page = session.get(url, headers=headers, timeout=30)
    soup = bs(login_page.content, 'lxml')
    img = soup.find_all('img', id='captcha_image')
    print(img)
    if img:
        # Imported here because this script's import block does not bring
        # in PIL, and it is only needed when a captcha is present.
        from PIL import Image

        # Read the attribute from the parsed tag: str(tag) re-escapes "&"
        # as "&amp;", which would corrupt the URL used for the download.
        captcha_url = img[0]['src']
        print("驗證碼所在標籤爲:", captcha_url)
        # The captcha id is the value of the first query parameter.
        first_param = captcha_url.split('&')[0]
        capid = first_param.split('=', 1)[1]
        print(capid)
        cap = session.get(captcha_url, headers=headers, timeout=30).content
        with open('captcha.jpg', 'wb') as f:
            f.write(cap)
        try:
            # Close the image handle before deleting the file; an open
            # handle blocks os.remove() on Windows.
            with Image.open('captcha.jpg') as im:
                im.show()
            capimg = input('請輸入驗證碼:')
        finally:
            os.remove('captcha.jpg')
        data.update({
            'captcha-solution': capimg,
            'captcha-id': capid,
        })
        # Show the submitted form without echoing the password.
        print({key: value for key, value in data.items() if key != 'form_password'})
    else:
        print('不存在驗證碼,請直接登錄')

    r = session.post(url, data=data, headers=headers, timeout=30)
    print(r.status_code)

if __name__ == '__main__':
    loginin()
    # Crawl the data: 25 pages of 20 comments each (start = 0, 20, ..., 480).
    # Indentation is spaces only; the previous mix of spaces and tabs in this
    # block is rejected by Python 3 with a TabError.
    for page in range(25):
        start = page * 20
        url = "https://movie.douban.com/subject/26848645/comments?start=" + str(start) + \
              "&limit=20&sort=new_score&status=P"
        try:
            # Pause between requests.
            time.sleep(2)
            print("crawing:%s" % url)
            html = session.get(url, timeout=30).content
            doc = pq(html)
            for item in doc('#comments .comment-item').items():
                jiangye_comments = {
                    'author': item.find('.avatar a').attr('title'),
                    'votes': item.find('.votes').text(),
                    'rating': item.find('.rating').text(),
                    'date': item.find('.comment-time').text(),
                    'comments': item.find('.short').text(),
                }
                save_to_mongo(jiangye_comments)
        except Exception as exc:
            # Best-effort crawl: report the failed page and go on to the next
            # one. Unlike a bare "except", this lets Ctrl-C stop the script.
            print("failed:%s (%s)" % (url, exc))
            continue

由於豆瓣有反爬機制,爬取次數較多的話會限制登錄,所以採用登錄的方式,獲取Session,然後調用Session打開待爬取網頁。登錄過程中,由於登錄次數過多會要求驗證碼,需要判斷驗證碼是否存在,如果存在,會在本地保存驗證碼,要求在shell輸入,即可正確登錄。
由於豆瓣只能讀取500條評論,讀取的結果如下:
(圖:爬取結果截圖)

## 繪製詞雲
從數據庫中提取短評信息,分詞並設置停用詞;繪製詞雲所用的原圖爲:
陳飛宇
繪製詞雲的代碼如下:

# 調用相關包
import json
import random
import requests
import time
import re
import pandas as pd
import numpy as np
from PIL import Image
from pyquery import PyQuery as pq
import pymongo
import os
from os import path
from pyecharts import Bar, Geo, Line, Overlap
import jieba
from scipy.misc import imread
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from collections import Counter

# Read the stored comments back from MongoDB.
MONGO_URL = 'localhost'  # host handed to MongoClient
MONGO_DB = 'douban'  # database name
MONGO_COLLECTION = 'jiangye_comments'  # collection that is queried below
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Materialise every document returned by find() into a DataFrame.
data = pd.DataFrame(list(db[MONGO_COLLECTION].find()))

def analysis(data):
    """Segment the comments with jieba and render them as a word cloud.

    Args:
        data: table-like object whose ``data['comments'].values`` yields the
            comment strings (the script passes a pandas DataFrame).

    Side effects: loads ``userdict.txt`` into jieba, reads the mask image
    ``chen.jpg``, prints the 50 most frequent words, writes ``jiangye.png``
    and opens the matplotlib window.
    """
    jieba.load_userdict("userdict.txt")

    # Compile the punctuation pattern once instead of once per comment.
    symbol_to_replace = re.compile(
        '[!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
    )
    segmented = []
    for comment in data['comments'].values:
        # Documents without a textual comment (None / NaN) cannot be cleaned.
        if not isinstance(comment, str):
            continue
        cleaned = symbol_to_replace.sub('', comment)
        segmented.append(' '.join(jieba.cut(cleaned, cut_all=False)))
    # Join comments with a space as well; otherwise the last word of one
    # comment is glued to the first word of the next.
    text = ' '.join(segmented)

    # Inside a function "__file__" is never in locals(); it is a module
    # global, so that is where the check has to look.
    d = path.dirname(__file__) if "__file__" in globals() else os.getcwd()
    background_Image = np.array(Image.open(path.join(d, "chen.jpg")))

    # Stop words; chosen by first running a word-frequency count on the text
    # and picking the uninformative high-frequency entries.
    stopwords = {
        '如何', '怎麼', '一個', '什麼', '爲什麼', '還是', '我們', '爲何',
        '可能', '不是', '沒有', '哪些', '成爲', '可以', '背後', '到底',
        '就是', '這麼', '不要', '怎樣', '爲了', '能否', '你們', '還有',
        '這樣', '這個', '真的', '那些', '覺得', '雖然', '除了', '感覺',
        '但是', '很多', '有點', '已經', '那麼', '完全', '實在', '開始',
        '其他', '自己',
    }
    wc = WordCloud(
        background_color='black',
        font_path="C:\\Windows\\Fonts\\simhei.ttf",
        mask=background_Image,
        stopwords=stopwords,
        max_words=2000,
        margin=2,
        max_font_size=100,
        random_state=42,
        scale=2,
    )
    wc.generate_from_text(text)

    # Word frequencies as counted by the word cloud, most frequent first.
    process_word = wc.process_text(text)
    ranked = sorted(process_word.items(), key=lambda e: e[1], reverse=True)
    print(ranked[:50])

    # Recolour the words from the colours of the mask image.
    img_colors = ImageColorGenerator(background_Image)
    wc.recolor(color_func=img_colors)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('jiangye.png', dpi=200)
    plt.show()

# Draw the word cloud from the comments loaded into `data`.
analysis(data)

最後繪製的詞雲如下:
獲得的詞雲

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章