本次NLP作業需要每個人在小組選定領域下進行子領域詞典製作,我們小組選定的領域爲動物。我個人選定的子領域爲昆蟲,原始語料庫來自《昆蟲記》這本書。通過爬蟲或者複製粘貼可以在本地得到關於《昆蟲記》的文本文件。
數據的處理
- 讀取文本,將句號替換成換行,跳過空行
- 通過自建篩選字典和清華動物字典,對文本進行處理,保留每行含有動物詞彙的行
- 按照7:3的比例,劃分訓練集和測試集
- 讀取訓練集,生成昆蟲領域詞典。(most_common可以指定返回數目,因爲有些動物名只出現一次,並且生成詞典較小。所以這裏並未指定,而是全部返回,然後人工篩選)
- 用jieba對訓練集進行分詞處理
評估
中文分詞系統的評價指標一般有切分準確率(Precision)、召回率(Recall)、分詞精度(F1-Measure)等。本實驗將F1作爲評測指標來評價HMM模型對昆蟲領域的分詞效果,F1的計算公式如下所示:F1 = 2 × Precision × Recall / (Precision + Recall)。
python程序
import os
import sys
import random
import json
import jieba
import jieba.posseg
from collections import Counter
class ProcessData(object):
    """Corpus-processing helpers for building an insect-domain lexicon.

    Loads one or more dictionary files on construction, then offers
    dictionary matching, train/test splitting, POS-tag statistics,
    word-frequency counting, and jieba segmentation.
    """

    def __init__(self, *dic_path):
        # All dictionary words; ``maximum`` is the length of the longest
        # entry and bounds the match window in return_data().
        self.dictionary = set()
        self.maximum = 0
        # Load every dictionary file given.
        for path in dic_path:
            self.load_dic(path)

    def load_dic(self, dic_path):
        """Load a dictionary file.

        The first whitespace-separated token of each non-empty line is
        taken as the word (frequency columns, if any, are ignored).
        """
        with open(dic_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                tokens = line.strip().split()
                # Fix: the original indexed [0] before checking for
                # emptiness, raising IndexError on blank lines.
                if not tokens:
                    continue
                word = tokens[0]
                self.dictionary.add(word)
                self.maximum = max(self.maximum, len(word))

    def return_data(self, text):
        """Return True iff ``text`` contains any dictionary word.

        Scans each start position with window sizes from the longest
        dictionary entry down to 1 (forward maximum matching).
        """
        for index in range(len(text)):
            # Windows longer than the remaining text cannot match.
            longest = min(self.maximum, len(text) - index)
            for size in range(longest, 0, -1):
                if text[index:index + size] in self.dictionary:
                    return True
        return False

    def train_test_split(self, data_path, train_path, test_path):
        """Shuffle the lines of ``data_path`` and write the first 70%
        to ``train_path`` and the remaining 30% to ``test_path``."""
        try:
            with open(data_path, 'r', encoding='utf-8') as fp, \
                    open(train_path, 'w', encoding='utf-8') as train, \
                    open(test_path, 'w', encoding='utf-8') as test:
                lines = fp.readlines()
                threshold = int(len(lines) * 0.7)
                random.shuffle(lines)
                for i, line in enumerate(lines):
                    if i < threshold:
                        train.write(line)
                    else:
                        test.write(line)
        except OSError:
            # Fix: the original called print(sys.stderr, ...), which
            # prints the stream object to stdout; it also raised a new
            # bare Exception (losing the original traceback) followed
            # by an unreachable sys.exit(1).
            print("文件讀寫出現錯誤", file=sys.stderr)
            raise

    def postag_txt(self, inputFile, outputFile):
        """POS-tag ``inputFile`` with jieba and write a JSON object of
        {POS tag: count} to ``outputFile``.

        Returns True on success.
        """
        with open(inputFile, 'r', encoding="utf-8") as fin, \
                open(outputFile, 'w+', encoding="utf-8", newline='') as fout:
            tag_counts = {}
            for each_line in fin:
                stripped = each_line.strip()
                # Skip blank lines.
                if not stripped:
                    continue
                # Sentence-final periods become line separators.
                line = stripped.replace('。', os.linesep)
                for pair in jieba.posseg.cut(line):
                    tag_counts[pair.flag] = tag_counts.get(pair.flag, 0) + 1
            fout.write(json.dumps(tag_counts))
        return True

    def count_words(self, text):
        """Segment ``text`` with jieba and return "word count" lines,
        most frequent first, skipping stop words and 1-char tokens."""
        stop_path = os.path.join(sys.path[0], r'.\data\stop_words.utf8')
        # A set gives O(1) membership tests, and ``with`` closes the
        # file deterministically (the original leaked the handle and
        # used an O(n) list lookup per token).
        with open(stop_path, 'r', encoding='utf-8') as fp:
            stopwords = {line.strip() for line in fp}
        counter = Counter()
        for word in jieba.cut(text):
            if word not in stopwords and len(word) > 1 and word != os.linesep:
                counter[word] += 1
        # most_common() without a limit returns every word: rare animal
        # names occur only once and are filtered manually afterwards.
        return "".join("{} {}{}".format(word, num, os.linesep)
                       for word, num in counter.most_common())

    def seq_word(self, input_path, output_path):
        """Segment ``input_path`` line by line with jieba (precise mode)
        and write space-joined tokens to ``output_path``."""
        with open(input_path, 'r', encoding='utf-8') as fin, \
                open(output_path, 'w', encoding='utf-8') as fout:
            for line in fin:
                fout.write(' '.join(jieba.cut(line, cut_all=False)))
        print("jeiba分詞完成")
def Process():
    """Run the full pipeline: filter the corpus to lines that mention an
    animal word, split train/test 7:3, build the frequency lexicon from
    the training set, segment both sets with jieba, and collect POS-tag
    statistics for the training set."""
    dict1_path = os.path.join(sys.path[0],
                              r'.\data\THUOCL_animal.txt')  # Tsinghua animal lexicon
    dict2_path = os.path.join(sys.path[0], r'.\data\my_animal.txt')  # hand-built lexicon
    input_path = os.path.join(sys.path[0], r'.\data\insect_origin.txt')
    output_path = os.path.join(sys.path[0], r'.\data\insect.txt')
    train_path = os.path.join(sys.path[0],
                              r'.\data\train_insect.txt')  # training-set output path
    test_path = os.path.join(sys.path[0], r'.\data\test_insect.txt')  # test-set output path
    pro = ProcessData(dict1_path, dict2_path)  # load the lexicons
    try:
        with open(input_path, 'r', encoding='utf-8') as input_text, \
                open(output_path, 'w', encoding='utf-8', newline='') as output:
            # Keep only lines containing at least one dictionary word.
            for line in input_text:
                if pro.return_data(line.strip()):
                    print("line:", line)
                    output.write(line)
    except OSError:
        # Fix: the original printed the stream object to stdout via
        # print(sys.stderr, ...), raised a new bare Exception (dropping
        # the original error), and had an unreachable sys.exit(1).
        print("文件打開錯誤", file=sys.stderr)
        raise
    print("數據處理完成")
    pro.train_test_split(output_path, train_path, test_path)
    print("訓練集和測試集生成")
    my_dict = os.path.join(sys.path[0], r'.\data\my_dict.txt')
    # Build the domain lexicon (word-frequency list) from the training set.
    with open(train_path, 'r', encoding='utf-8') as fin, \
            open(my_dict, 'w', encoding='utf-8', newline="") as fout:
        fout.write(pro.count_words(fin.read()))
    # Segment the training set with jieba.
    jieba_train = os.path.join(sys.path[0], r'.\data\jieba_train.txt')
    pro.seq_word(train_path, jieba_train)
    # Segment the test set with jieba (fix: the original reused the
    # ``jieba_train`` variable name for the test-set path).
    jieba_test = os.path.join(sys.path[0], r'.\data\jieba_test.txt')
    pro.seq_word(test_path, jieba_test)
    # POS-tag statistics for the training set.
    output_tag = os.path.join(sys.path[0], r'.\data\train_tag.txt')
    pro.postag_txt(train_path, output_tag)
# F1值計算
def estimate_F1():
    """Evaluate the HMM segmentation against the jieba reference.

    Reads the two segmented files line by line, counts overlapping
    tokens, and prints token count, precision, recall and F1.
    """
    hmm_path = r'.\data\hmm_output.txt'
    jieba_path = r'.\data\jieba_test.txt'
    with open(hmm_path, 'r', encoding='utf-8') as f_hmm, \
            open(jieba_path, 'r', encoding='utf-8') as f_jieba:
        all_words_answer = 0  # total tokens in the HMM output
        all_words_sample = 0  # total tokens in the jieba reference
        correct = 0           # tokens matched between the two lines
        for sentence1 in f_hmm:
            sentence2 = f_jieba.readline()
            hmm_tokens = sentence1.split()
            jieba_tokens = sentence2.split()
            # Fix: the original collapsed the jieba line into a set, so
            # duplicate reference tokens were dropped from the sample
            # count and every duplicate HMM token matched "for free".
            # A multiset intersection counts each token at most as
            # often as it appears on both sides.
            correct += sum((Counter(hmm_tokens) & Counter(jieba_tokens)).values())
            all_words_answer += len(hmm_tokens)
            all_words_sample += len(jieba_tokens)
    # Fix: guard against ZeroDivisionError on empty files or zero overlap.
    if not all_words_answer or not all_words_sample or not correct:
        recall = precision = f_mesure = 0.0
    else:
        recall = correct / all_words_answer
        precision = correct / all_words_sample
        f_mesure = (2 * precision * recall) / (precision + recall)
    print("詞數:", all_words_answer)
    print("Precision:", round(precision, 4), "Recall", round(recall, 4),
          "F-mesure", round(f_mesure, 4))
# Script entry point: run the full data-processing pipeline when
# executed directly (no effect when imported as a module).
if __name__ == "__main__":
    Process()
清華動物詞典
自建篩選詞典
stop_words文件
insect.txt文件
生成詞典my_dict