#-------Dask解決方案-------#
import dask.bag as db
import ujson as json
import pandas as pd
import numpy as np
import gzip
import re
# Gzipped review dump; every line holds one JSON-encoded review.
reviews_path = r'F:/kaggle_dataset/亞馬遜評論/reviews_Digital_Music_5.json.gz'

# Lazily parse each line of the dump as JSON into a dask bag.
b = db.read_text(reviews_path, encoding='utf-8').map(json.loads)
b.take(1)  # peek at the first record (only displayed in a notebook/REPL)

# Count the records by streaming the lines; the handle is closed afterwards.
with gzip.open(reviews_path) as review_lines:
    print(sum(1 for _ in review_lines))

# Directory that receives the hash-bucket text files written by hashFile().
tempDir = 'F:/kaggle_dataset/亞馬遜評論/製作亞馬遜用戶評論詞雲'
# Stop words: first column of a header-less file, one entry per row.
# NOTE(review): pandas turns entries such as "null" or "NA" into NaN by
# default; confirm the stop-word list contains no such tokens.
stopwords = set(pd.read_csv('C:/Users/zhangshuai_lc/stopwords_en.txt', header=None)[0])
pattern = re.compile(r'\w+')  # matches runs of word characters
def hashFile():
    """Partition the words of all reviews into 100 bucket files by hash.

    Streams the gzipped review dump line by line, extracts the words of each
    record's ``reviewText`` with the module-level ``pattern``, and writes
    every word that is at least two characters long and whose lower-cased
    form is not in the module-level ``stopwords`` -- lower-cased, one per
    line -- to one of the files ``1.txt`` .. ``100.txt`` inside ``tempDir``.
    The file is selected with ``hash(word) % 100``, so equal words always
    end up in the same file and each file can be counted on its own.

    Existing bucket files are truncated.  Returns ``None``.

    Raises:
        OSError: if the dump or one of the bucket files cannot be opened.
        KeyError: if a record has no ``reviewText`` field.
    """
    from contextlib import ExitStack

    bucket_count = 100
    reviews_path = r'F:/kaggle_dataset/亞馬遜評論/reviews_Digital_Music_5.json.gz'

    # ExitStack closes every handle opened so far, even when a later open()
    # or a malformed record raises half-way through.
    with ExitStack() as stack:
        # NOTE(review): the files use the platform default encoding, the same
        # one the later counting step reads them with; a word that encoding
        # cannot represent would raise UnicodeEncodeError.
        buckets = [
            stack.enter_context(open(tempDir + '/' + str(i) + '.txt', mode='w'))
            for i in range(1, bucket_count + 1)
        ]
        reviews = stack.enter_context(
            gzip.open(reviews_path, 'rt', encoding='utf-8')
        )
        for line in reviews:
            # Parse as JSON rather than eval(): the dump is external data and
            # is parsed as JSON elsewhere in this file too.
            record = json.loads(line)
            for token in pattern.findall(record['reviewText']):
                word = token.lower()
                if word not in stopwords and len(token) >= 2:
                    # str hashes are salted per process: the bucket of a word
                    # is stable within one run, not across runs.
                    buckets[hash(word) % bucket_count].write(word + '\n')


hashFile()
# Finally, take the 1000 most frequent words of each of the 100 files and keep the overall top 100.
import os
from collections import Counter
import heapq

# Merge the per-file word counts.  Only the 1000 most frequent words of each
# bucket file are kept before the overall top 100 is selected.
results = Counter()
for root, _dirs, files in os.walk(r'F:/kaggle_dataset/亞馬遜評論/製作亞馬遜用戶評論詞雲'):
    for file in files:
        with open(os.path.join(root, file)) as f:
            # Count while streaming so a bucket is never materialised as a list.
            bucket_counts = Counter(line.strip('\n') for line in f)
        word_common_1000 = bucket_counts.most_common(1000)
        # Pass a mapping: given a list of (word, count) pairs, Counter.update()
        # would count the pairs themselves instead of adding up the counts.
        results.update(dict(word_common_1000))

# (word, count) pairs, in the order the words were first seen.
words_fren_list = list(results.items())
words_fren_list_100 = heapq.nlargest(100, words_fren_list, key=lambda x: x[1])
len(words_fren_list_100)  # bare expression: only displayed in a notebook/REPL
# word -> count mapping, the input format the word cloud is built from.
word_frequence = {x[0]: x[1] for x in words_fren_list_100}
word_frequence  # bare expression: only displayed in a notebook/REPL
{'10': 11136,
'album': 140585,
'albums': 22047,
'amazing': 6245,
'artist': 5869,
'bad': 9842,
'band': 22550,
'bands': 4970,
'beat': 10468,
'beats': 7370,
'beautiful': 7736,
'bit': 8199,
'blues': 5310,
'buy': 7583,
'catchy': 5772,
'cd': 38605,
'classic': 13913,
'collection': 8004,
'dance': 5722}
# Next, draw the word cloud with WordCloud:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Build a word cloud from the word -> count mapping, save it as a JPEG and
# display it in a 7x4 matplotlib figure with the axes hidden.
f, ax = plt.subplots(figsize=(7, 4))
cloud_options = dict(
    background_color='white',
    max_font_size=40,
    max_words=100,
    relative_scaling=.5,
)
wordcloud = WordCloud(**cloud_options).fit_words(word_frequence)
wordcloud.to_file("Amazonwordcloud.jpg")
plt.imshow(wordcloud)
plt.axis("off")
plt.show()