統計詞頻
英文文本詞頻統計
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Read input data
def get_data(file_path):
    """Return the whole contents of the UTF-8 text file at *file_path*.

    Undecodable bytes are ignored and surrounding whitespace is stripped.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as fh:
        return fh.read().strip()
# Expand English contractions
def replace_abbreviations(text):
    """Lower-case *text* and expand common English contractions.

    Each contraction is replaced in order, then any remaining "'s"
    (possessives and unexpanded cases) is stripped.

    Fixes over the original chained-.replace() version:
    - "i've" expanded to "i have" (the original inserted a stray leading
      space: " i have").
    - "what's" is now actually expanded (the original's pattern was the
      typo "waht's", so real input never matched).
    """
    # Ordered (contraction, expansion) pairs. Order matters: e.g. "she's"
    # contains "he's" as a substring, and the original applied "he's" first
    # (the net result, "she is", is preserved by keeping the same order).
    pairs = [
        ("it's", "it is"), ("i'm", "i am"), ("he's", "he is"), ("she's", "she is"),
        ("we're", "we are"), ("they're", "they are"), ("you're", "you are"), ("that's", "that is"),
        ("this's", "this is"), ("can't", "can not"), ("don't", "do not"), ("doesn't", "does not"),
        ("we've", "we have"), ("i've", "i have"), ("isn't", "is not"), ("won't", "will not"),
        ("hasn't", "has not"), ("wasn't", "was not"), ("weren't", "were not"), ("let's", "let us"),
        ("didn't", "did not"), ("hadn't", "had not"), ("what's", "what is"), ("couldn't", "could not"),
        ("you'll", "you will"), ("you've", "you have"),
    ]
    text = text.lower()
    for contraction, expansion in pairs:
        text = text.replace(contraction, expansion)
    # Drop possessive / leftover "'s" (e.g. "tom's" -> "tom").
    return text.replace("'s", "")
# Remove punctuation, digits and any other non-letter characters
def clear_str(text):
    """Keep only ASCII letters, collapsing everything else to single spaces."""
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = letters_only.split()
    return " ".join(words)
# Lemmatization (verb form)
def stem_words(text):
    """Lemmatize each whitespace-separated word of *text* as a verb.

    Builds its own WordNetLemmatizer instead of relying on the global
    `lemma` that the original created only under the __main__ guard —
    importing this module and calling stem_words() previously raised
    NameError. Output is unchanged.
    """
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word, pos='v') for word in text.split())
# Count word frequencies
def collection_words(text):
    """Count whitespace-separated tokens in *text*.

    Returns a plain dict mapping word -> occurrence count, insertion-ordered
    by first appearance (identical to the original hand-rolled loop, but
    using the stdlib Counter).
    """
    from collections import Counter  # local import: keeps the file's top imports untouched
    return dict(Counter(text.split()))
# Entry point: full English word-frequency pipeline
def main(file_path):
    """Read the file at *file_path* and return its word-frequency dict.

    Pipeline: read -> expand contractions -> strip non-letters -> lemmatize
    -> count.
    """
    raw = get_data(file_path)
    expanded = replace_abbreviations(raw)
    cleaned = clear_str(expanded)
    stemmed = stem_words(cleaned)
    return collection_words(stemmed)
if __name__ == "__main__":
    file_path = 'D:/Python/JupyterNotebook/wordcounter/sophiesworld_1_to_2.txt'
    # Shared lemmatizer used by stem_words via the module-level name `lemma`.
    lemma = WordNetLemmatizer()
    words_freq = main(file_path)
    # Sort (word, count) pairs by descending frequency and print them.
    words_freq_sorted = sorted(words_freq.items(), key=lambda pair: pair[1], reverse=True)
    print(words_freq_sorted)
中文文本詞頻統計
1、方法1
構建字典
import jieba
import re

# Sample posts (a football programme's social-media feed).
text = ['今晚19:30《天下足球》直播互動話題:國家德比,巴薩取勝的關鍵之處?歡迎積極留言,我們將選擇您的精彩留言與全國觀衆分享。',
        '德甲前四捉對廝殺,“羅貝里”復活拜仁大勝、門興多特平分秋色。',
        '今晚《天下足球》19:30,直播內容:專題《歐洲盃豪門恩怨》;專題《名人堂:蘇格拉底,大師遠去》;尤文米蘭雙雙取勝,積分榜上你追我趕。',
        '今晚《天下足球》19:30,直播內容:國米羅馬遭遇詭異失利;巴薩皇馬用勝利迎國家德比。']

# Tokenize each post after stripping punctuation and digits.
text_cut = []
for sentence in text:
    sentence = re.sub("[:《》,?。“”‘’;!、0-9()]", "", sentence)  # drop punctuation and digits
    text_cut.append(list(jieba.cut(sentence)))

# Tally word frequencies by hand with a plain dict.
words_freq = {}
for tokens in text_cut:
    for token in tokens:
        # First sighting initializes the count to 0, then increment.
        words_freq[token] = words_freq.get(token, 0) + 1
print(words_freq)
輸出:
{‘今晚’: 3, ‘天下足球’: 3, ‘直播’: 3, ‘互動’: 1, ‘話題’: 1, ‘國家’: 2, ‘德比’: 2, ‘巴薩’: 2, ‘取勝’: 2, ‘的’: 2, ‘關鍵’: 1, ‘之’: 1, ‘處’: 1, ‘歡迎’: 1, ‘積極’: 1, ‘留言’: 2, ‘我們’: 1, ‘將’: 1, ‘選擇’: 1, ‘您’: 1, ‘精彩’: 1, ‘與’: 1, ‘全國’: 1, ‘觀衆’: 1, ‘分享’: 1, ‘德甲’: 1, ‘前四捉’: 1, ‘對’: 1, ‘廝殺’: 1, ‘羅’: 1, ‘貝里’: 1, ‘復活’: 1, ‘拜仁’: 1, ‘大勝’: 1, ‘門興’: 1, ‘多特’: 1, ‘平分秋色’: 1, ‘內容’: 2, ‘專題’: 2, ‘歐洲盃’: 1, ‘豪門’: 1, ‘恩怨’: 1, ‘名人堂’: 1, ‘蘇格拉底’: 1, ‘大師’: 1, ‘遠去’: 1, ‘尤文’: 1, ‘米蘭’: 1, ‘雙雙’: 1, ‘積分榜’: 1, ‘上’: 1, ‘你追我趕’: 1, ‘國米’: 1, ‘羅馬’: 1, ‘遭遇’: 1, ‘詭異’: 1, ‘失利’: 1, ‘皇馬’: 1, ‘用’: 1, ‘勝利’: 1, ‘迎’: 1}
2、方法2
使用collections庫
import collections
import jieba
import re

# Same sample posts as method 1.
text = ['今晚19:30《天下足球》直播互動話題:國家德比,巴薩取勝的關鍵之處?歡迎積極留言,我們將選擇您的精彩留言與全國觀衆分享。',
        '德甲前四捉對廝殺,“羅貝里”復活拜仁大勝、門興多特平分秋色。',
        '今晚《天下足球》19:30,直播內容:專題《歐洲盃豪門恩怨》;專題《名人堂:蘇格拉底,大師遠去》;尤文米蘭雙雙取勝,積分榜上你追我趕。',
        '今晚《天下足球》19:30,直播內容:國米羅馬遭遇詭異失利;巴薩皇馬用勝利迎國家德比。']

# Build one big space-separated token string across all posts.
text_str = ""
for post in text:
    post = re.sub("[:《》,?。“”‘’;!、0-9()]", "", post)  # drop punctuation and digits
    text_str += " ".join(jieba.cut(post)) + " "

# split(" ") on the trailing-space string yields a final empty token; [:-1] drops it.
words_freq = collections.Counter(text_str.split(" ")[:-1])
print(words_freq)
輸出:
Counter({‘今晚’: 3, ‘天下足球’: 3, ‘直播’: 3, ‘國家’: 2, ‘德比’: 2, ‘巴薩’: 2, ‘取勝’: 2, ‘的’: 2, ‘留言’: 2, ‘內容’: 2, ‘專題’: 2, ‘互動’: 1, ‘話題’: 1, ‘關鍵’: 1, ‘之’: 1, ‘處’: 1, ‘歡迎’: 1, ‘積極’: 1, ‘我們’: 1, ‘將’: 1, ‘選擇’: 1, ‘您’: 1, ‘精彩’: 1, ‘與’: 1, ‘全國’: 1, ‘觀衆’: 1, ‘分享’: 1, ‘德甲’: 1, ‘前四捉’: 1, ‘對’: 1, ‘廝殺’: 1, ‘羅’: 1, ‘貝里’: 1, ‘復活’: 1, ‘拜仁’: 1, ‘大勝’: 1, ‘門興’: 1, ‘多特’: 1, ‘平分秋色’: 1, ‘歐洲盃’: 1, ‘豪門’: 1, ‘恩怨’: 1, ‘名人堂’: 1, ‘蘇格拉底’: 1, ‘大師’: 1, ‘遠去’: 1, ‘尤文’: 1, ‘米蘭’: 1, ‘雙雙’: 1, ‘積分榜’: 1, ‘上’: 1, ‘你追我趕’: 1, ‘國米’: 1, ‘羅馬’: 1, ‘遭遇’: 1, ‘詭異’: 1, ‘失利’: 1, ‘皇馬’: 1, ‘用’: 1, ‘勝利’: 1, ‘迎’: 1})
下面,我對《人民的名義》這篇小說進行詞頻統計:
import re
import jieba
# Load the input file
def get_data(file_path):
    """Read and return the stripped contents of the UTF-8 file *file_path*,
    silently skipping any bytes that fail to decode."""
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as source:
        contents = source.read()
    return contents.strip()
# Remove non-Chinese characters
def clear_str(text):
    """Keep only Chinese characters (CJK Unified Ideographs, U+4E00-U+9FFF).

    The original blacklist regex was buggy: the range "A-z" also matched
    the punctuation characters between 'Z' and 'a', and the quoted
    sub-strings (e.g. '©', '\\ufeff') added stray quote characters to the
    class. Whitelisting CJK characters implements the stated intent —
    delete every non-Chinese character — directly and robustly.
    """
    return re.sub(r"[^\u4e00-\u9fff]", "", text)
# Word segmentation
def text_cut(text):
    """Segment *text* into a list of tokens with jieba.

    jieba handles some character and place names from the novel poorly, so
    each one is registered via suggest_freq() before cutting, in the same
    order as the original call sequence.
    """
    # Names from "In the Name of the People" that jieba should keep whole.
    custom_words = ['沙瑞金', '易學習', '王大路', '歐陽菁', '高育良', '李達康',
                    '侯亮平', '趙東來', '京州', '毛婭', '陳海', '丁義珍',
                    '趙德漢', '祁同偉', '陸亦可', '陳岩石', '鄭西坡', '陳清泉',
                    '蔡成功', '孫連城', '偵察處', '高小琴']
    for word in custom_words:
        jieba.suggest_freq(word, True)
    return list(jieba.cut(text))
# Count token frequencies
def collects_words(text):
    """Count each token in the iterable *text* (normally a list of words).

    Returns a plain dict mapping token -> count, insertion-ordered by first
    appearance — identical to the original hand-rolled loop, but using the
    stdlib Counter.
    """
    from collections import Counter  # local import: keeps the file's top imports untouched
    return dict(Counter(text))
# Entry point: full Chinese word-frequency pipeline
def main(file_path):
    """Return the word-frequency dict of the file at *file_path*.

    Pipeline: read -> keep Chinese chars -> jieba segmentation -> count.
    """
    tokens = text_cut(clear_str(get_data(file_path)))
    return collects_words(tokens)
if __name__ == "__main__":
    # Path to the novel's plain-text file.
    file_path = 'D:/Python/JupyterNotebook/in_the_name_of_people.txt'
    words_freq = main(file_path)
    # Sort (word, count) pairs with the most frequent first, then print.
    words_freq_sorted = sorted(words_freq.items(), key=lambda pair: pair[1], reverse=True)
    print(words_freq_sorted)