此篇文章承接(貓眼電影-爬取)。
將電影數據儲存到MySQL中後,發現評論人數和票房的數據當中存在漢字,後期不好分析,所以需要將漢字轉化爲數值。
保險起見,我先將films表的結構和數據複製成一個新表films_copy,然後新增了2列:people和box_price。
將數據轉化爲便於分析的數據,代碼如下:
import pymysql
# RMB per US dollar applied to box-office figures quoted in dollars.
_USD_TO_CNY = 6.8
# Chinese magnitude suffixes that appear in the scraped text, with multipliers.
_UNIT_FACTORS = (('萬', 10000), ('億', 100000000))


def _to_number(text, allow_usd=False):
    """Convert a scraped figure such as '12.3萬' or '1.2億美元' to an int.

    Args:
        text: Raw column value. Anything that is not a string (e.g. NULL
            read back as ``None``) is returned unchanged.
        allow_usd: When true, a '美元' marker is stripped and the amount is
            multiplied by ``_USD_TO_CNY``.

    Returns:
        The converted integer, or ``text`` itself when it carries no known
        unit suffix or its numeric part cannot be parsed.
    """
    if not isinstance(text, str):
        return text
    for unit, factor in _UNIT_FACTORS:
        if unit not in text:
            continue
        digits = text.replace(unit, '')
        rate = 1
        if allow_usd and '美元' in text:
            digits = digits.replace('美元', '')
            rate = _USD_TO_CNY
        try:
            return int(float(digits) * factor * rate)
        except ValueError:
            # Leave unparsable text as-is instead of aborting the whole run.
            return text
    return text


def get_data():
    """Read ``score_hum`` and ``box_office`` from ``films_copy`` and convert them.

    Returns:
        A ``(data1, data2)`` tuple of lists. ``data1`` holds
        ``(converted, original)`` pairs for ``score_hum`` and ``data2`` holds
        the same pairs for ``box_office``. Both lists are empty when the
        query fails.
    """
    data1 = []
    data2 = []
    db = pymysql.connect(host='localhost', user='root', passwd='password', db='maoyan', port=3306)
    try:
        cursor = db.cursor()
        cursor.execute("SELECT score_hum,box_office FROM films_copy")
        results = cursor.fetchall()
    except pymysql.MySQLError as exc:
        # Report the database error rather than swallowing every exception.
        print("something wrong", exc)
        return data1, data2
    finally:
        db.close()

    # Conversion is done per row, so one odd value no longer drops the rest.
    for score_hum, box_office in results:
        data1.append((_to_number(score_hum), score_hum))
        data2.append((_to_number(box_office, allow_usd=True), box_office))
    return data1, data2
def change_hum(data1):
    """Write converted rating counts into the ``people`` column of ``films_copy``.

    Args:
        data1: Iterable of ``(people, score_hum)`` pairs; every row whose
            ``score_hum`` equals the second item gets ``people`` set to the
            first item.
    """
    # Placeholders let the driver quote the values, so text containing quotes
    # cannot break or alter the statement.
    sql1 = "UPDATE films_copy SET people = %s WHERE score_hum = %s"
    # One connection serves every update instead of reconnecting per row.
    db = pymysql.connect(host='localhost', user='root', passwd='password', db='maoyan', port=3306)
    try:
        cursor = db.cursor()
        for people, score_hum in data1:
            print(people, score_hum)
            try:
                if cursor.execute(sql1, (people, score_hum)):
                    print('Successful')
                db.commit()
            except pymysql.MySQLError:
                db.rollback()
                print('Falied')
    finally:
        db.close()
def change_prices(data2):
    """Write converted box-office amounts into the ``box_price`` column.

    Args:
        data2: Iterable of ``(box_price, box_office)`` pairs; every row whose
            ``box_office`` equals the second item gets ``box_price`` set to
            the first item.
    """
    # Placeholders let the driver quote the values, so text containing quotes
    # cannot break or alter the statement.
    sql2 = "UPDATE films_copy SET box_price = %s WHERE box_office = %s"
    # One connection serves every update instead of reconnecting per row.
    db = pymysql.connect(host='localhost', user='root', passwd='password', db='maoyan', port=3306)
    try:
        cursor = db.cursor()
        for box_price, box_office in data2:
            try:
                if cursor.execute(sql2, (box_price, box_office)):
                    print('Successful')
                db.commit()
            except pymysql.MySQLError:
                db.rollback()
                print('Falied')
    finally:
        db.close()
def main():
    """Convert the scraped text columns and write the numeric values back."""
    # A single get_data() call yields both lists; calling it twice queried and
    # converted the whole table twice.
    data1, data2 = get_data()
    change_hum(data1)
    change_prices(data2)
# Compare the module's __name__ variable, not the string literal '__name__';
# the literal comparison is always False, so main() was never executed.
if __name__ == '__main__':
    main()
現在開始分析數據:(按照公衆號作者的代碼操作,一直沒有出現作者文章中的效果,所以找解決方法找了很久)
1.2018年電影評分TOP10
from collections import Counter

import numpy as np
import pandas as pd
import pymysql
from pyecharts import Bar
from pyecharts import Pie, TreeMap
# Load the whole films_copy table into a DataFrame.
conn = pymysql.connect(
    host='localhost',
    user='root',
    passwd='password',
    db='maoyan',
    port=3306,
    charset='utf8mb4',
)
cursor = conn.cursor()  # created here but not used by the code in this section
sql = 'select * from films_copy'
db = pd.read_sql(sql, conn)

# Order films by score, best first, and take the first ten rows.
df = db.sort_values(by='score', ascending=False)
dom = df[['name', 'score']]
v1 = dom['score'].iloc[:10]
attr = dom['name'].iloc[:10]

# is_convert swaps the axes so the bars are drawn horizontally.
score_chart_options = dict(
    xaxis_min=8,
    xaxis_max=9.8,
    is_yaxis_boundarygap=True,
    is_xaxis_boundarygap=True,
    is_label_show=True,
    is_legend_show=False,
    label_pos='right',
    is_yaxis_inverse=True,
    is_convert=True,
    yaxis_label_textsize=10,
)
bar = Bar("2018年電影評分TOP10", title_pos='center', title_top='18', width=800, height=400)
bar.add('', attr, v1, **score_chart_options)
bar.render("2018年電影評分TOP10-2.html")
因爲代碼運行環境是jupyter,所以出現的效果是x軸與y軸互換之後,y軸的文本沒有出現,然後一直在網絡上找原因與解決辦法。
後來升級了jupyter版本,並將jupyter-echarts-pypkg安裝爲0.1.1版本,然後運行還是不行,y軸還是像上圖一樣,但是用網頁打開就是正常的了。可還是出現了一個問題,就是y軸文本太長,只能顯示部分文字。
2.電影票房TOP10
# Order films by converted box office, highest first.
dom = df[['name', 'box_price']].sort_values(by='box_price', ascending=False)
# First ten amounts, divided by 100000000 and rounded to two decimals.
v1 = (dom['box_price'].iloc[:10] / 100000000).round(2)
attr = dom['name'].iloc[:10]

box_chart_options = dict(
    is_yaxis_boundarygap=True,
    is_xaxis_boundarygap=True,
    is_label_show=True,
    is_legend_show=False,
    label_pos='right',
    is_yaxis_inverse=True,
    is_convert=True,
    yaxis_label_textsize=10,
)
bar = Bar("2018年電影票房TOP10(億元)", title_pos='center', title_top='18', width=800, height=400)
bar.add('', attr, v1, **box_chart_options)
bar.render("2018年電影票房TOP10.html")
bar
3.2018年電影人氣TOP10
# Order films by the converted number of raters, highest first.
dom = df[['name', 'people']].sort_values(by='people', ascending=False)
# First ten counts, divided by 10000 and rounded to two decimals.
v1 = (dom['people'].iloc[:10] / 10000).round(2)
attr = dom['name'].iloc[:10]

people_chart_options = dict(
    is_yaxis_boundarygap=True,
    is_xaxis_boundarygap=True,
    is_label_show=True,
    is_legend_show=False,
    label_pos='right',
    is_yaxis_inverse=True,
    is_convert=True,
    yaxis_label_textsize=10,
)
bar = Bar("2018年電影人氣TOP10(萬人)", title_pos='center', title_top='18', width=800, height=400)
bar.add('', attr, v1, **people_chart_options)
bar.render("2018年電影人氣TOP10.html")
4.2018年電影名利雙收TOP10
def my_sum(a, b, c):
    """Return ``(a + b) / c`` rounded to four decimal places as a float."""
    return float(format((a + b) / c, '.4f'))
# Dense ranks in descending order: rank 1 is the highest box office / score.
db['sort_num_money'] = db['box_price'].rank(ascending=0, method='dense')
db['sort_num_score'] = db['score'].rank(ascending=0, method='dense')
# Sum of both ranks divided by the number of films; the ten smallest are kept.
db['value'] = db.apply(
    lambda row: my_sum(row['sort_num_money'], row['sort_num_score'], len(db.index)),
    axis=1,
)
df = db.sort_values(by="value", ascending=True).iloc[:10]
v1 = ['%.2f' % ((1 - value) * 100) for value in df['value']]
# str.replace returns a new string, which is why the calls can be chained.
attr = [
    name.replace(':無限戰爭', '').replace(':全面瓦解', '')
    for name in df['name'].tolist()
]

fame_chart_options = dict(
    xaxis_max=100,
    xaxis_min=90,
    is_yaxis_boundarygap=True,
    is_xaxis_boundarygap=True,
    is_label_show=True,
    is_legend_show=False,
    label_pos='right',
    is_yaxis_inverse=True,
    is_convert=True,
)
bar = Bar("2018年電影名利雙收TOP10(%)", title_pos='center', title_top='18', width=800, height=400)
bar.add('', attr, v1, **fame_chart_options)
bar.render("2018年電影名利雙收TOP10.html")
bar
5.2018年電影叫座不叫好TOP10
def my_sub(a, b, c):
    """Return ``(a - b) / c`` rounded to four decimal places as a float."""
    return float(format((a - b) / c, '.4f'))
# Dense ranks in descending order: rank 1 is the highest box office / score.
db['sort_num_money'] = db['box_price'].rank(ascending=0, method='dense')
db['sort_num_score'] = db['score'].rank(ascending=0, method='dense')
# Use the rank difference (my_sub). Calling my_sum here, as before, rebuilt the
# summed ranking of the preceding chart, so the same films were selected again.
db['value'] = db.apply(
    lambda row: my_sub(row['sort_num_money'], row['sort_num_score'], len(db.index)),
    axis=1,
)
# Ascending order puts the most negative differences first, i.e. films whose
# box-office rank is furthest ahead of their score rank.
df = db.sort_values(by="value", ascending=True)[0:10]
v1 = ['%.2f' % (value * 100) for value in df['value'][::-1]]
# str.replace returns a new string, which is why the calls can be chained.
attr = [
    name.replace(':無限戰爭', '').replace(':全面瓦解', '')
    for name in np.array(df['name'])[::-1]
]
bar = Bar("2018年電影叫座不叫好TOP10(%)", title_pos='center', title_top='18', width=800, height=400)
# No fixed axis bounds: the differences plotted here are negative, so the former
# 0..6 range would hide every bar. NOTE(review): confirm the desired range.
bar.add('',
        attr,
        v1,
        is_yaxis_boundarygap=True,
        is_xaxis_boundarygap=True,
        is_label_show=True,
        is_legend_show=False,
        label_pos='right',
        is_yaxis_inverse=True,
        is_convert=True)
bar.render("2018年電影叫座不叫好TOP10.html")
bar
6.2018年每月電影上映數量
首先,我是按照自己的想法做的,然後發現數據結果跟作者的不一致。
# Count releases per month over the full table. ``df`` was narrowed to a
# ten-row slice by the ranking code earlier in the file, so read from ``db``.
dateOne = db[['name', 'released']]
# The month is the first two characters after the first '-' in the date text.
month0 = [released.split('-')[1][:2] for released in db['released']]
month_counts = Counter(month0)
monthUq = set(month0)
monthSo = sorted(monthUq)
# Keys are inserted in sorted order so the bars run from the earliest month.
data0 = {month: month_counts[month] for month in monthSo}
print(data0)
v1 = list(data0.values())
attr = [key + '月' for key in data0]
print(v1)
# The title previously ended with an unmatched ')'.
bar = Bar("2018年電影每月上映數量", title_pos='center', title_top='18', width=800, height=400)
bar.add('',
        attr,
        v1,
        is_xaxis_boundarygap=True,
        is_label_show=True,
        is_legend_show=False,
        label_pos='top'
        )
bar.render("2018年電影每月上映數量.html")
bar
以下爲公衆號作者代碼,最後結果與上圖一致,但是跟作者在公衆號上的數據展示不一致,雖然我的數據多了一個12月,但是之前的月份電影數量應該是一致的,除非貓眼後期又更新了數據。
df = db.sort_values(by="released", ascending=False)
dom = df[['name', 'released']]
# Build the month list from ``db`` in its own row order. A plain list is
# assigned to a column by position, so a list built from the sorted frame would
# attach each month to a different row of the unsorted ``db``.
month0 = [released.split('-')[1][:2] for released in db['released']]
db['month'] = month0
month_message = db.groupby(['month'])
month_com = month_message['month'].agg(['count'])
month_com.reset_index(inplace=True)
month_com_last = month_com.sort_index()
# Label only the months present in the data so labels and counts always have
# the same length; int() turns '01' into 1 to keep the '1月' label format.
attr = ["{}月".format(int(month)) for month in month_com_last['month']]
v1 = [str(count) for count in month_com_last['count']]
bar = Bar("2018年電影每月上映數量", title_pos='center', title_top='18', width=800, height=400)
bar.add('',
        attr,
        v1,
        is_label_show=True
        )
bar.render("2018年電影每月上映數量.html")
bar
7.2018年每月電影上映票房
不知道爲什麼,我按照作者的代碼寫的,但是發現跟我實際的數據有出入。以下是作者的代碼:
# Monthly box-office chart, kept exactly as quoted from the referenced article.
# NOTE: this version produces the wrong monthly totals discussed in the text.
# month0 is built by iterating the date-sorted frame (dom comes from df), but it
# is then assigned to the unsorted db as a plain list. List assignment to a
# DataFrame column is positional, so unless db is already in that sorted order
# each row receives another row's month and the sums mix films across months.
df = db.sort_values(by="released", ascending=False)
dom = df[['name','released']]
month0 = []
for i in dom['released']:
St = i.split('-')
month0.append(St[1][:2])
db['month'] = month0
month_message = db.groupby(['month'])
month_com = month_message['box_price'].agg(['sum'])
month_com.reset_index(inplace=True)
month_com_last = month_com.sort_index()
# Labels are fixed to twelve months regardless of which months the data holds.
attr = ["{}".format(str(i)+'月') for i in range(1, 13)]
v1 = np.array(month_com_last['sum'])
# Divide each sum by 100000000 and keep two decimals.
v1 = ["{}".format(float('%.2f' % (float(i) / 100000000))) for i in v1]
# print(attr, v1)
bar = Bar("2018年電影每月票房", title_pos='center', title_top='18', width=800, height=400)
bar.add('',
attr,
v1,
is_label_show=True
)
bar.render("2018年電影每月票房.html")
bar
上圖1月份的數據爲17.91億,但是我用mysql算出來的是51.76億。原因在於作者的代碼先把db按released排序得到df,再按排序後的順序生成month0,卻把這個列表按位置賦值給未排序的db,導致每部電影的月份與票房錯位。以下爲我的代碼,直接從未排序的db生成月份,終於跟我用mysql算出來的一樣了。
# Months are taken from ``db`` in its own row order, so each month stays on the
# row it belongs to when the list is assigned back as a column.
dom = db[['name', 'released']]
month0 = [released.split('-')[1][:2] for released in dom['released']]
db['month'] = month0
grouped = db['box_price'].groupby(db['month'])
sums = grouped.sum()
# Label only the months present in the grouped result so labels and sums always
# line up; int() turns '01' into 1 to keep the '1月' label format.
attr = ["{}月".format(int(month)) for month in sums.index]
# Divide each sum by 100000000 and keep two decimals.
v1 = [str(float('%.2f' % (float(total) / 100000000))) for total in sums]
bar = Bar("2018年電影每月票房(億元)", title_pos='center', title_top='18', width=800, height=400)
bar.add('',
        attr,
        v1,
        is_label_show=True
        )
bar.render("2018年電影每月票房(億元).html")
bar
按照我的數據顯示2月份爲票房最高月份。畢竟除夕在2月,情人節也在2月。而7月,8月正值學生的暑假期間,學生也是貢獻票房的主力軍。
8.2018年各國家電影數量TOP10
# A film may list several comma-separated countries; count each one.
countrys = [country for entry in db['country'] for country in entry.split(',')]
countrysUq = set(countrys)
# Counter tallies in a single pass, and most_common() orders ties by first
# appearance, so the ranking no longer depends on set iteration order.
datas = Counter(countrys)
dic1SortList = datas.most_common()
v1 = [str(count) for _, count in dic1SortList[:10]]
attr = [str(country) for country, _ in dic1SortList[:10]]
bar = Bar("2018年各國電影數量TOP10", title_pos='center', title_top='18', width=800, height=400)
bar.add('',
        attr,
        v1,
        is_xaxis_boundarygap=True,
        is_label_show=True,
        is_legend_show=False,
        label_pos='top'
        )
bar.render("2018年各國電影數量TOP10.html")
bar
2018年中國和美國上映電影要遠大於其他國家。
9.2018年中外電影票房對比(億元)
# A film counts as domestic when its country text starts with '中國'.
country_group = [
    '中國' if country[:2] == '中國' else '外國'
    for country in db['country']
]
db['Country_group'] = country_group
grouped = db['box_price'].groupby(db['Country_group'])
sums = grouped.sum()
# Take the labels from the grouped result so each label stays paired with its
# own total, even if one of the two groups is absent.
attr = list(sums.index)
# Divide each sum by 100000000 and keep two decimals.
v1 = [str(float('%.2f' % (float(total) / 100000000))) for total in sums]


def label_formatter(params):
    """Return the value of a pie slice for use as its label text."""
    return params.value


# Title typo fixed: 對吧 -> 對比.
pie = Pie("2018年中外電影票房對比(億元)", title_pos='center')
pie.add("",
        attr,
        v1,
        radius=[40, 75],
        label_text_color=None,
        is_label_show=True,
        legend_orient="vertical",
        legend_pos="left",
        label_formatter=label_formatter
        )
# NOTE(review): the 'n' in this file name looks like a typo; the path is kept
# unchanged so existing links to the rendered file keep working.
pie.render("2018n年中外電影票房對比.html")
pie
以上數據可以看出,中國的票房還是略低於外國的票房的。
10.2018年電影類型分佈圖
# A film may list several comma-separated genres; count each one.
types = [genre for entry in db['type'] for genre in entry.split(',')]
typeUq = set(types)
# Counter tallies in a single pass and keeps first-appearance order, so the
# node order no longer depends on set iteration order.
datas = Counter(types)


def message():
    """Yield one TreeMap node per genre: a 'genre count' label and its count."""
    for genre, count in datas.items():
        yield {'name': genre + ' ' + str(count), 'value': count}


data0 = message()
dom = list(data0)
treemap = TreeMap("2018年電影類型分佈圖", title_pos='center', title_top='5', width=800, height=400)
treemap.add('數據',
            dom,
            is_label_show=True,
            label_pos='inside',
            is_legend_show=False
            )
treemap.render('2018年電影類型分佈圖.html')
treemap
按作者公衆號操作的電影分析到此結束,鼓掌。因爲從作者的文字中接觸到了pyecharts,這對於我而言是一個新的知識,所以還需要再練習練習。