《想見你》短評分析
前幾天,女朋友一直在追想見你這個臺劇,然後去豆瓣搜了一下,看到了評分竟然高達9.2分
想想我以前做的那些分析,所以就做一個簡單的數據分析來分析一下想見你這部電視劇
爬取短評和評分
其實豆瓣作爲爬蟲新手的練習對象已經夠無奈了,後來豆瓣又從根源上限制爬蟲,在豆瓣短評頁面上最多只會顯示500條短評
這裏可以看到短評有14萬多條,但是你不斷點下一頁就可以發現,最多只會顯示到500條,我覺得非常無奈,然後就去找了時光網、貓眼等等的影評,但是最多也就是湊齊了1600多條影評和評論,下面只演示一下在豆瓣的爬蟲
import requests
from lxml import etree
import pandas as pd
# Request headers sent with every comment-page request: a desktop Chrome
# User-Agent string plus a Cookie string.
# NOTE(review): the Cookie value is hard-coded and looks like a logged-in
# session (it contains dbcl2/ck entries) — confirm, and consider loading it
# from the environment instead of committing it to source.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36","Cookie":'ll="118281"; bid=EDNXYtW8L7w; __yadk_uid=RVvI1oTanjPtjRsv6HtGu4HIC6Kqovwu; __gads=ID=1a054006518de23e:T=1583498908:S=ALNI_MYt37CtrdWsoH7GS4Ybhlpz7wOEtw; ct=y; _vwo_uuid_v2=D308461F850E43F2F56AB5732A9A76C8D|1d99baae6ee74e400ec3c3c2c54ec9fe; push_doumail_num=0; push_noty_num=0; __utmc=30149280; __utmz=30149280.1583642326.4.4.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); __utmc=223695111; ap_v=0,6.0; __utmz=223695111.1583661503.7.5.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); dbcl2="213145170:iSkk3Xy0Uoc"; ck=PhDl; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1583666031%2C%22https%3A%2F%2Fwww.douban.com%2Fsearch%3Fq%3D%25E6%2583%25B3%25E8%25A7%2581%25E4%25BD%25A0%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.1985249643.1583498911.1583661369.1583666031.7; __utmb=30149280.0.10.1583666031; __utma=223695111.1434622178.1583498925.1583661503.1583666031.8; __utmb=223695111.0.10.1583666031; douban-fav-remind=1; _pk_id.100001.4cf6=0b60581925fe7e5b.1583498925.7.1583666840.1583664038.'}
# Scrape the short comments of Douban subject 30468961 into ``data_list`` and
# then into ``data_frame`` (columns: comment, score, date).

# Rating label (the ``title`` attribute of the second span in a comment's
# header) mapped to the numeric score stored for that comment.
SCORE_BY_LABEL = {"很差": 1, "較差": 2, "還行": 3, "推薦": 4, "力薦": 5}

# Placeholder stored when a comment has no third header span to read a date from.
MISSING_DATE = "0000-00-00 00:00:00"

COMMENTS_URL = (
    "https://movie.douban.com/subject/30468961/comments"
    "?start={0}&limit=20&sort=new_score&status=P&percent_type={1}"
)

data_list = []
# Each percent_type value is requested in turn, 25 pages of 20 comments each.
for percent_type in ["h", "m", "l"]:
    for page in range(25):
        response = requests.get(
            url=COMMENTS_URL.format(str(page * 20), percent_type),
            headers=headers,
            timeout=10,  # do not hang forever on a stalled connection
        ).text
        html = etree.HTML(response)
        if html is None:
            # Nothing parseable came back for this page; move on.
            continue
        items = html.xpath("//div[@id='comments']/div[@class='comment-item']")
        for item in items:
            # The XPaths are relative (leading ".") so every field is read from
            # this comment only. Document-wide queries return three independent
            # lists that fall out of step as soon as one comment lacks a field.
            text = item.xpath(".//div[@class='comment']/p/span[@class='short']/text()")
            if not text:
                continue
            score = item.xpath(
                ".//div[@class='comment']/h3/span[@class='comment-info']/span[2]/@title")
            date = item.xpath(
                ".//div[@class='comment']/h3/span[@class='comment-info']/span[3]/@title")

            data_dict = {"comment": text[0].strip().replace("\n", "")}
            # Leave "score" unset when the title is not a known rating label.
            if score and score[0] in SCORE_BY_LABEL:
                data_dict["score"] = SCORE_BY_LABEL[score[0]]
            data_dict["date"] = date[0] if date else MISSING_DATE
            data_list.append(data_dict)
            print(data_dict["comment"])
data_frame = pd.DataFrame(data_list)
代碼邏輯,首先是請求頭,先要僞裝成瀏覽器來訪問,不然很容易就會被封,
然後循環訪問url地址,對獲取到的數據用xpath表達式進行處理,提取出評論內容、評論時間和評分;因爲有些評論沒有時間,所以要加一個錯誤處理來處理掉沒有時間的評論。
人物出場次數
在評論中分析每個人物的出場次數,也就是在評論區出現的次數,來表達觀衆對某個演員或者是某個角色的關注。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 先分析演員的出現次數
class Hero_time(object):
    """Count how many comments mention each cast member and plot the result.

    Every ``a_*`` / ``b_*`` attribute is the list of aliases (actor name,
    character name, nicknames) searched for one cast member.
    """

    DEFAULT_CSV_PATH = "I:/crack/DATA/want_see.csv"

    def __init__(self, csv_path=DEFAULT_CSV_PATH):
        """Set up the alias lists.

        Args:
            csv_path: CSV file with a ``comment`` column; defaults to the
                path the script has always used.
        """
        self.csv_path = csv_path
        # Comments are loaded on first use and then reused.
        self._comments = None
        self.a_1 = ['柯佳嬿', '黃雨萱', '陳韻如', '佳嬿', '雨萱', '韻如', '女主', '女主角']
        self.a_2 = ['嚴藝文', '吳瑛嬋', '藝文', '瑛嬋']
        self.a_3 = ['郭文頤', '昆布', '文頤']
        self.a_4 = ['林子珊', '小黛', '子珊']
        self.a_5 = ['張毓晨', '毓晨', '娜姐']
        self.a_6 = ['曾之喬', 'Sunny老師', 'sunny老師', 'Sunny', 'sunny']
        self.a_7 = ['朱芷瑩', '楊碧雲', '芷瑩', '碧雲']
        self.a_8 = ['簡廷芮', '廷芮', 'Vicky', 'vicky']
        self.a_9 = ['馬惠珍', '惠珍', '莫奶奶']
        self.a_10 = ['梁洳瑄', '蔡旻柔', '洳瑄', '旻柔']
        self.a_11 = ['江少儀', '詮勝母', '詮勝媽媽', '男主的媽媽']
        self.b_1 = ['許光漢', '王詮勝', '李子維', '光漢', '詮勝', '子維', '男主', '男主角']
        self.b_2 = ['施柏宇', '莫俊傑', '柏宇', '俊傑']
        self.b_3 = ['顏毓麟', '謝宗儒', '謝芝齊', '毓麟', '宗儒', '芝齊']
        self.b_4 = ['張翰', '吳文磊', '文磊']
        self.b_5 = ['林鶴軒', '陳思源', '鶴軒', '思源']
        self.b_6 = ['陳匡榮', '阿財', '匡榮']
        self.b_7 = ['徐詣帆', '詣帆', '教官']
        self.b_8 = ['連晨翔', '阿哲', '晨翔']
        self.b_9 = ['黃鴻升', '杜齊閔', '鴻升', '齊閔']
        self.b_10 = ['邱勝翊', '顏力正', '力正', '勝翊', '王子']

    def read_comments(self):
        """Return the ``comment`` column of the CSV as a Series of strings.

        The file is read once and cached; rows with a missing comment are
        dropped so callers can do substring tests without type errors.
        """
        if self._comments is None:
            data = pd.read_csv(self.csv_path, encoding="utf_8_sig")
            pd.set_option('display.max_columns', 1000)
            pd.set_option('display.width', 200)
            pd.set_option('display.max_colwidth', 1000)
            self._comments = data['comment'].dropna().astype(str)
        return self._comments

    def counts(self, heros, hero_name):
        """Count the comments that mention at least one alias in ``heros``.

        A comment counts once however many aliases it contains, so
        overlapping aliases such as '柯佳嬿' and '佳嬿' do not inflate the
        total.

        Args:
            heros: Alias strings to look for.
            hero_name: Label stored under the "英雄名稱" key of the result.

        Returns:
            A dict with the keys "英雄名稱" and "出現次數".
        """
        count = sum(
            1
            for comment in self.read_comments()
            if isinstance(comment, str) and any(name in comment for name in heros)
        )
        return {"英雄名稱": hero_name, "出現次數": count}

    def all_hero(self):
        """Return one ``counts`` result per cast member, in roster order."""
        roster = [
            (self.a_1, "柯佳嬿"),
            (self.a_2, "嚴藝文"),
            (self.a_3, "郭文頤"),
            (self.a_4, "林子珊"),
            (self.a_5, "張毓晨"),
            (self.a_6, "曾之喬"),
            (self.a_7, "朱芷瑩"),
            (self.a_8, "簡廷芮"),
            # a_9 holds the 馬惠珍 aliases and a_10 the 梁洳瑄 aliases.
            (self.a_9, "馬惠珍"),
            (self.a_10, "梁洳瑄"),
            (self.a_11, "江少儀"),
            (self.b_1, "許光漢"),
            (self.b_2, "施柏宇"),
            (self.b_3, "顏毓麟"),
            (self.b_4, "張翰"),
            (self.b_5, "林鶴軒"),
            (self.b_6, "陳匡榮"),
            (self.b_7, "徐詣帆"),
            (self.b_8, "連晨翔"),
            (self.b_9, "黃鴻升"),
            (self.b_10, "邱勝翊"),
        ]
        return [self.counts(aliases, label) for aliases, label in roster]

    def draw(self):
        """Show a bar chart of mention counts, highest first."""
        hero_counts = self.all_hero()
        data = pd.DataFrame(hero_counts).sort_values('出現次數', ascending=False)
        x = data['英雄名稱'].values
        y = data['出現次數'].values
        # Use a font that can render the Chinese labels.
        plt.rcParams['font.family'] = 'SimHei'
        positions = range(len(x))
        plt.bar(positions, y, width=0.5, label="測試豆瓣評論,各英雄出現的次數可視化")
        # Set the tick labels once, keeping the rotation so names do not overlap.
        plt.xticks(positions, x, rotation=45)
        plt.legend()
        plt.show()
def main():
    """Build the analyser and display its bar chart."""
    Hero_time().draw()


if __name__ == '__main__':
    main()
結果
雖然很多人都說男主因爲這部電視劇比較紅,但是從分析中看出,觀衆對女主的關注度比較高,其次都是男演員比較受關注,可能是因爲女觀衆比較多吧,而且幾個男演員也是比較帥的。
評論雲圖
import pandas as pd
import wordcloud
import jieba
from cv2 import imread
from PIL import Image
from os import path
import numpy as np
import matplotlib.pyplot as plt
# Build and display a word cloud from every comment in the CSV.
data = pd.read_csv("I:/crack/DATA/want_see.csv", encoding="utf_8_sig")
# Drop missing comments so the join below only ever sees strings.
comment = data["comment"].dropna().astype(str).tolist()
world_all = "".join(comment)
word_list = jieba.cut(world_all, cut_all=False)
# Separate the tokens with spaces. Joining with "" would glue them back into
# the unsegmented text, and WordCloud would then treat whole runs of Chinese
# text as single "words".
txt = " ".join(word_list)
# Word-cloud configuration. The font path is absolute, like the CSV path above;
# "I:crack/..." without the slash is resolved against drive I's current directory.
w = wordcloud.WordCloud(
    font_path="I:/crack/font/msyh.TTF",
    width=1000,
    height=700,
    background_color="Pink",
)
w.generate(txt)
plt.imshow(w, interpolation="bilinear")
plt.show()
從雲圖中看出,除了電視劇的名字,在評論區出現比較多的還是伍佰的歌,因爲我女朋友在看想見你的時候我也是聽到這首歌比較多。
其次出現比較多的是“你的名字”,我還不知道評論區出現這麼多的這個詞是爲什麼,然後比較受關注的角色是 李子維和陳韻如。
評分分佈
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
# Draw a pie chart of how many comments gave each score.
data = pd.read_csv("I:/crack/DATA/want_see.csv", encoding="utf_8_sig")
plt.rcParams['font.family'] = 'SimHei'
# Rows without a score would show up as NaN keys (each NaN may be counted as
# its own wedge), and float keys would produce labels like "3.0分". Drop the
# missing rows and convert to int first.
socre = data["score"].dropna().astype(int).tolist()
socre_dict = Counter(socre)
socre_data = []
labels = []
# Sort by score so the wedge order does not depend on row order in the CSV.
for key, value in sorted(socre_dict.items()):
    socre_data.append(value)
    labels.append(str(key) + "分")
plt.pie(
    socre_data,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
    radius=1.2,
    pctdistance=0.7,
    labeldistance=1.1,
    wedgeprops={'linewidth': 5},
    textprops={'fontsize': 16, 'color': 'w'},
)
# Set the title.
plt.title('評分圖')
# Equalise the x/y scales so the pie is drawn as a circle, not an ellipse.
plt.axis('equal')
plt.legend(loc='upper right')
plt.show()
這次分析的短評都主要是熱門的評論,所以可以看到評分分佈的狀態還是3星的評分比較多,雖然豆瓣評分是9.2分,但是可能大部分觀衆還是覺得中等吧