心血來潮,研究了一下詞雲。先上圖,看看最終效果:
代碼
import numpy as np
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud
import PIL.Image as image
import jieba
# Build a word cloud from the danmaku (on-screen comments) of one Bilibili video.

# --- Fetch the danmaku ---
# The number in the URL identifies the video; replace it to target another one.
url = 'https://comment.bilibili.com/128614096.xml'
# A timeout keeps the script from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=10)
# Stop on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
# Decode the body as UTF-8 so the Chinese text comes out readable.
response.encoding = 'utf8'
soup = BeautifulSoup(response.text, 'lxml')
results = soup.find_all('d')  # every <d> element is treated as one comment
# Convert the bs4 tags into a plain list of strings.
comments = [comment.text for comment in results]

# --- Clean the comments ---
comments = [x.upper() for x in comments]  # normalize case
comments_clean = [comment.replace(' ', '') for comment in comments]  # drop spaces
# Noise entries to discard; inspect set(comments_clean) interactively to find
# similar junk for another video.
useless_words = {
    '//TEST',
    '/TESR',
    '/TEST',
    '/TEST/',
    '/TEXT',
    '/TEXTSUPREME',
    '/TSET',
    '/Y',
    '\\TEST',
}
comments_clean = [element for element in comments_clean if element not in useless_words]
# Drop single-character comments.
fnl_words = [word for word in comments_clean if len(word) > 1]

# --- Segment into words ---
# Segment each comment on its own: gluing all comments into one string first
# lets the tokenizer form bogus words across the boundary of two comments.
cut_text = " ".join(
    token for comment in fnl_words for token in jieba.cut(comment)
)

# --- Render the word cloud ---
# Close the mask file as soon as its pixels have been copied into the array.
with image.open(r"C:\Users\jh\PycharmProjects\ftp\mask.png") as mask_image:
    mask = np.array(mask_image)
wordcloud = WordCloud(
    # Mask layer applied to the cloud.
    mask=mask,
    # A font with Chinese glyphs is required, otherwise the Chinese words do
    # not show up. Raw string: the backslashes must not be read as escapes.
    font_path=r"C:\Windows\Fonts\msyh.ttc",
).generate(cut_text)
image_produce = wordcloud.to_image()
image_produce.show()
注意:
想製作其他視頻的詞雲,只需把 URL 中的 cid(即 .xml 前面的那串數字)換成目標視頻的 cid 即可;嗶哩嗶哩視頻 cid 的獲取方法請自行百度。