Word Frequency Counting

Word Frequency Counting for English Text

import re
from nltk.stem.wordnet import WordNetLemmatizer

# Read the data
def get_data(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read().strip()
    return text

# Expand English contractions
def replace_abbreviations(text):
    text = text.lower().replace("it's", "it is").replace("i'm", "i am").replace("he's", "he is").replace("she's", "she is")\
        .replace("we're", "we are").replace("they're", "they are").replace("you're", "you are").replace("that's", "that is")\
        .replace("this's", "this is").replace("can't", "can not").replace("don't", "do not").replace("doesn't", "does not")\
        .replace("we've", "we have").replace("i've", "i have").replace("isn't", "is not").replace("won't", "will not")\
        .replace("hasn't", "has not").replace("wasn't", "was not").replace("weren't", "were not").replace("let's", "let us")\
        .replace("didn't", "did not").replace("hadn't", "had not").replace("what's", "what is").replace("couldn't", "could not")\
        .replace("you'll", "you will").replace("you've", "you have")
    
    # Strip the remaining possessive 's
    text = text.replace("'s", "")
    
    return text
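
A table-driven variant is easier to maintain than the long chain of replace calls above. The following is only a minimal sketch; the contraction table is illustrative and can be extended as needed:

import re

# Illustrative contraction table; extend with any contractions your text contains
CONTRACTIONS = {
    "it's": "it is", "i'm": "i am", "don't": "do not",
    "can't": "can not", "won't": "will not", "you're": "you are",
}

def replace_abbreviations_table(text):
    # Build one alternation pattern from the table and expand everything in a single pass
    pattern = re.compile("|".join(re.escape(k) for k in CONTRACTIONS))
    return pattern.sub(lambda m: CONTRACTIONS[m.group(0)], text.lower())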

# Remove punctuation, digits, and other non-letter characters
def clear_str(text):
    text = re.sub("[^a-zA-Z]", " ", text)
    return " ".join(text.split())

# Lemmatization: reduce each word to its base verb form (lemma is the WordNetLemmatizer created in the __main__ block)
def stem_words(text):
    text_words_stem = [lemma.lemmatize(word, pos='v') for word in text.split()]
    return " ".join(text_words_stem)

# Count word frequencies
def collection_words(text):
    # Dictionary mapping word -> count
    words_freq = {}
    for word in text.split():
        words_freq.setdefault(word, 0)
        words_freq[word] += 1
    return words_freq

# Main function
def main(file_path):
    text = get_data(file_path)
    text = replace_abbreviations(text)
    text = clear_str(text)
    text = stem_words(text)
    words_freq = collection_words(text)
    return words_freq

if __name__ == "__main__":
    file_path = 'D:/Python/JupyterNotebook/wordcounter/sophiesworld_1_to_2.txt'
    lemma = WordNetLemmatizer()
    words_freq = main(file_path)
    
    # Print the raw frequency dict
    # print(words_freq)
    
    # Sort by frequency in descending order
    words_freq_sorted = sorted(words_freq.items(), key=lambda x: x[1], reverse=True)
    print(words_freq_sorted)
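
Two small notes on this script. WordNetLemmatizer relies on NLTK's WordNet corpus, which has to be downloaded once before the first run, and the manual counting in collection_words can be written more compactly with collections.Counter. A minimal sketch:

import collections
import nltk

nltk.download('wordnet')  # one-time download of the WordNet data used by WordNetLemmatizer

def collection_words(text):
    # Counter builds the same word -> frequency mapping in a single call
    return collections.Counter(text.split())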

Word Frequency Counting for Chinese Text

Method 1: build a dictionary by hand

import jieba
import re

text = ['今晚19:30《天下足球》直播互動話題:國家德比,巴薩取勝的關鍵之處?歡迎積極留言,我們將選擇您的精彩留言與全國觀衆分享。',
        '德甲前四捉對廝殺,“羅貝里”復活拜仁大勝、門興多特平分秋色。',
        '今晚《天下足球》19:30,直播內容:專題《歐洲盃豪門恩怨》;專題《名人堂:蘇格拉底,大師遠去》;尤文米蘭雙雙取勝,積分榜上你追我趕。',
        '今晚《天下足球》19:30,直播內容:國米羅馬遭遇詭異失利;巴薩皇馬用勝利迎國家德比。']
text_cut = []

for item in text:
    item = re.sub("[:《》,?。“”‘’;!、0-9()]", "", item)  # Remove punctuation and digits
    item_cut = list(jieba.cut(item))  # Segment the sentence into words
    text_cut.append(item_cut)
    
# Dictionary used to count word frequencies
words_freq = {}
for item in text_cut:
    for w in item:
        words_freq.setdefault(w, 0)  # Add the word with a count of 0 if it is not in the dict yet
        words_freq[w] += 1  # Increment its count
        
print(words_freq)

Output:
{'今晚': 3, '天下足球': 3, '直播': 3, '互動': 1, '話題': 1, '國家': 2, '德比': 2, '巴薩': 2, '取勝': 2, '的': 2, '關鍵': 1, '之': 1, '處': 1, '歡迎': 1, '積極': 1, '留言': 2, '我們': 1, '將': 1, '選擇': 1, '您': 1, '精彩': 1, '與': 1, '全國': 1, '觀衆': 1, '分享': 1, '德甲': 1, '前四捉': 1, '對': 1, '廝殺': 1, '羅': 1, '貝里': 1, '復活': 1, '拜仁': 1, '大勝': 1, '門興': 1, '多特': 1, '平分秋色': 1, '內容': 2, '專題': 2, '歐洲盃': 1, '豪門': 1, '恩怨': 1, '名人堂': 1, '蘇格拉底': 1, '大師': 1, '遠去': 1, '尤文': 1, '米蘭': 1, '雙雙': 1, '積分榜': 1, '上': 1, '你追我趕': 1, '國米': 1, '羅馬': 1, '遭遇': 1, '詭異': 1, '失利': 1, '皇馬': 1, '用': 1, '勝利': 1, '迎': 1}
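
As in the English example, the dictionary can then be sorted by frequency to get the most common words:

words_freq_sorted = sorted(words_freq.items(), key=lambda x: x[1], reverse=True)
print(words_freq_sorted[:10])  # the ten most frequent words and their counts
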
Method 2: use the collections library

import jieba
import collections
import re

text = ['今晚19:30《天下足球》直播互動話題:國家德比,巴薩取勝的關鍵之處?歡迎積極留言,我們將選擇您的精彩留言與全國觀衆分享。',
        '德甲前四捉對廝殺,“羅貝里”復活拜仁大勝、門興多特平分秋色。',
        '今晚《天下足球》19:30,直播內容:專題《歐洲盃豪門恩怨》;專題《名人堂:蘇格拉底,大師遠去》;尤文米蘭雙雙取勝,積分榜上你追我趕。',
        '今晚《天下足球》19:30,直播內容:國米羅馬遭遇詭異失利;巴薩皇馬用勝利迎國家德比。']

text_str = ""

for item in text:
    item = re.sub("[:《》,?。“”‘’;!、0-9()]", "", item)  # Remove punctuation and digits
    item_cut = list(jieba.cut(item))  # Segment the sentence into words
    text_str += " ".join(item_cut)  # Append the segmented words as one space-separated string
    text_str += " "

words_freq = collections.Counter(text_str.split(" ")[:-1])  # Drop the last element, an empty string left by the trailing space
print(words_freq)

Output:
Counter({'今晚': 3, '天下足球': 3, '直播': 3, '國家': 2, '德比': 2, '巴薩': 2, '取勝': 2, '的': 2, '留言': 2, '內容': 2, '專題': 2, '互動': 1, '話題': 1, '關鍵': 1, '之': 1, '處': 1, '歡迎': 1, '積極': 1, '我們': 1, '將': 1, '選擇': 1, '您': 1, '精彩': 1, '與': 1, '全國': 1, '觀衆': 1, '分享': 1, '德甲': 1, '前四捉': 1, '對': 1, '廝殺': 1, '羅': 1, '貝里': 1, '復活': 1, '拜仁': 1, '大勝': 1, '門興': 1, '多特': 1, '平分秋色': 1, '歐洲盃': 1, '豪門': 1, '恩怨': 1, '名人堂': 1, '蘇格拉底': 1, '大師': 1, '遠去': 1, '尤文': 1, '米蘭': 1, '雙雙': 1, '積分榜': 1, '上': 1, '你追我趕': 1, '國米': 1, '羅馬': 1, '遭遇': 1, '詭異': 1, '失利': 1, '皇馬': 1, '用': 1, '勝利': 1, '迎': 1})
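
Counter also has a most_common method, so the top words can be read off directly without a separate sort:

print(words_freq.most_common(5))  # the five most frequent words and their counts
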
Next, I run a word frequency count on the novel 《人民的名義》 (In the Name of the People):

import re
import jieba

# Read the data
def get_data(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read().strip()
    return text

# Remove non-Chinese characters (punctuation, digits, Latin letters, whitespace, BOM, etc.)
def clear_str(text):
    text = re.sub(r"[^\u4e00-\u9fa5]", "", text)
    return text

# Word segmentation
def text_cut(text):
    # jieba does not segment some person and place names well, but we can help it by registering the following words, e.g.:
    jieba.suggest_freq('沙瑞金', True)
    jieba.suggest_freq('易學習', True)
    jieba.suggest_freq('王大路', True)
    jieba.suggest_freq('歐陽菁', True)
    jieba.suggest_freq('高育良', True)
    jieba.suggest_freq('李達康', True)
    jieba.suggest_freq('侯亮平', True)
    jieba.suggest_freq('趙東來', True)
    jieba.suggest_freq('京州', True)
    jieba.suggest_freq('毛婭', True)
    jieba.suggest_freq('陳海', True)
    jieba.suggest_freq('丁義珍', True)
    jieba.suggest_freq('趙德漢', True)
    jieba.suggest_freq('祁同偉', True)
    jieba.suggest_freq('陸亦可', True)
    jieba.suggest_freq('陳岩石', True)
    jieba.suggest_freq('鄭西坡', True)
    jieba.suggest_freq('陳清泉', True)
    jieba.suggest_freq('蔡成功', True)
    jieba.suggest_freq('孫連城', True)
    jieba.suggest_freq('偵察處', True)
    jieba.suggest_freq('高小琴', True)
    
    text = list(jieba.cut(text))
    return text

# Count word frequencies
def collects_words(text):
    words_freq = {}
    for word in text:
        words_freq.setdefault(word, 0)
        words_freq[word] += 1
    return words_freq

# Main function
def main(file_path):
    text = get_data(file_path)
    text = clear_str(text)
    text = text_cut(text)
    words_freq = collects_words(text)
    return words_freq

if __name__ == "__main__":
    # File path
    file_path = 'D:/Python/JupyterNotebook/in_the_name_of_people.txt'
    words_freq = main(file_path)
    # print(words_freq)
    
    # Sort by frequency in descending order
    words_freq_sorted = sorted(words_freq.items(), key=lambda x: x[1], reverse=True)
    print(words_freq_sorted)
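
When the list of names grows, the repeated suggest_freq calls can be replaced by a custom user dictionary that is loaded once. The sketch below assumes a hypothetical file people_names.txt, with one entry per line in jieba's user-dictionary format (word, optional frequency, optional part-of-speech tag):

import jieba

# people_names.txt is a hypothetical file, e.g. one name per line:
# 沙瑞金
# 侯亮平 10 nr
jieba.load_userdict('people_names.txt')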