本次NLP作業需要每個人在小組選定領域下進行子領域詞典製作,我們小組選定的領域爲動物。我個人選定的子領域爲昆蟲,原始語料庫來自《昆蟲記》這本書。通過爬蟲或者複製粘貼可以在本地得到關於《昆蟲記》的文本文件。
數據的處理
- 讀取文本,將句號替換成換行,跳過空行
- 通過自建篩選字典和清華動物字典,對文本進行處理,保留每行含有動物詞彙的行
- 按照7:3的比例,劃分訓練集和測試集
- 讀取訓練集,生成昆蟲領域詞典。(most_common可以指定返回數目,因爲有些動物名只出現一次,並且生成詞典較小。所以這裏並未指定,而是全部返回,然後人工篩選)
- 用jieba對訓練集進行分詞處理
評估
中文分詞系統的評價指標一般有切分準確率(Precision)、召回率(Recall)、分詞精度(F1-Measure)等。本實驗將F1作爲評測指標來評價HMM模型對昆蟲領域的分詞效果,F1的計算公式如下所示:F1 = 2 × Precision × Recall / (Precision + Recall)。
python程序
import os
import sys
import random
import json
import jieba
import jieba.posseg
from collections import Counter
class ProcessData(object):
    """Corpus-processing helpers for building an insect-domain lexicon.

    Loads one or more dictionary files on construction, then offers
    dictionary matching, train/test splitting, POS-tag statistics,
    word-frequency counting, and jieba segmentation.
    """

    def __init__(self, *dic_path):
        # All dictionary words; ``maximum`` is the length of the longest
        # entry and bounds the match window in return_data().
        self.dictionary = set()
        self.maximum = 0
        # Load every dictionary file given.
        for path in dic_path:
            self.load_dic(path)

    def load_dic(self, dic_path):
        """Load a dictionary file.

        The first whitespace-separated token of each non-empty line is
        taken as the word (frequency columns, if any, are ignored).
        """
        with open(dic_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                tokens = line.strip().split()
                # Fix: the original indexed [0] before checking for
                # emptiness, raising IndexError on blank lines.
                if not tokens:
                    continue
                word = tokens[0]
                self.dictionary.add(word)
                self.maximum = max(self.maximum, len(word))

    def return_data(self, text):
        """Return True iff ``text`` contains any dictionary word.

        Scans each start position with window sizes from the longest
        dictionary entry down to 1 (forward maximum matching).
        """
        for index in range(len(text)):
            # Windows longer than the remaining text cannot match.
            longest = min(self.maximum, len(text) - index)
            for size in range(longest, 0, -1):
                if text[index:index + size] in self.dictionary:
                    return True
        return False

    def train_test_split(self, data_path, train_path, test_path):
        """Shuffle the lines of ``data_path`` and write the first 70%
        to ``train_path`` and the remaining 30% to ``test_path``."""
        try:
            with open(data_path, 'r', encoding='utf-8') as fp, \
                    open(train_path, 'w', encoding='utf-8') as train, \
                    open(test_path, 'w', encoding='utf-8') as test:
                lines = fp.readlines()
                threshold = int(len(lines) * 0.7)
                random.shuffle(lines)
                for i, line in enumerate(lines):
                    if i < threshold:
                        train.write(line)
                    else:
                        test.write(line)
        except OSError:
            # Fix: the original called print(sys.stderr, ...), which
            # prints the stream object to stdout; it also raised a new
            # bare Exception (losing the original traceback) followed
            # by an unreachable sys.exit(1).
            print("文件讀寫出現錯誤", file=sys.stderr)
            raise

    def postag_txt(self, inputFile, outputFile):
        """POS-tag ``inputFile`` with jieba and write a JSON object of
        {POS tag: count} to ``outputFile``.

        Returns True on success.
        """
        with open(inputFile, 'r', encoding="utf-8") as fin, \
                open(outputFile, 'w+', encoding="utf-8", newline='') as fout:
            tag_counts = {}
            for each_line in fin:
                stripped = each_line.strip()
                # Skip blank lines.
                if not stripped:
                    continue
                # Sentence-final periods become line separators.
                line = stripped.replace('。', os.linesep)
                for pair in jieba.posseg.cut(line):
                    tag_counts[pair.flag] = tag_counts.get(pair.flag, 0) + 1
            fout.write(json.dumps(tag_counts))
        return True

    def count_words(self, text):
        """Segment ``text`` with jieba and return "word count" lines,
        most frequent first, skipping stop words and 1-char tokens."""
        stop_path = os.path.join(sys.path[0], r'.\data\stop_words.utf8')
        # A set gives O(1) membership tests, and ``with`` closes the
        # file deterministically (the original leaked the handle and
        # used an O(n) list lookup per token).
        with open(stop_path, 'r', encoding='utf-8') as fp:
            stopwords = {line.strip() for line in fp}
        counter = Counter()
        for word in jieba.cut(text):
            if word not in stopwords and len(word) > 1 and word != os.linesep:
                counter[word] += 1
        # most_common() without a limit returns every word: rare animal
        # names occur only once and are filtered manually afterwards.
        return "".join("{} {}{}".format(word, num, os.linesep)
                       for word, num in counter.most_common())

    def seq_word(self, input_path, output_path):
        """Segment ``input_path`` line by line with jieba (precise mode)
        and write space-joined tokens to ``output_path``."""
        with open(input_path, 'r', encoding='utf-8') as fin, \
                open(output_path, 'w', encoding='utf-8') as fout:
            for line in fin:
                fout.write(' '.join(jieba.cut(line, cut_all=False)))
        print("jeiba分詞完成")
def Process():
    """Run the full pipeline: filter the corpus to lines that mention an
    animal word, split train/test 7:3, build the frequency lexicon from
    the training set, segment both sets with jieba, and collect POS-tag
    statistics for the training set."""
    dict1_path = os.path.join(sys.path[0],
                              r'.\data\THUOCL_animal.txt')  # Tsinghua animal lexicon
    dict2_path = os.path.join(sys.path[0], r'.\data\my_animal.txt')  # hand-built lexicon
    input_path = os.path.join(sys.path[0], r'.\data\insect_origin.txt')
    output_path = os.path.join(sys.path[0], r'.\data\insect.txt')
    train_path = os.path.join(sys.path[0],
                              r'.\data\train_insect.txt')  # training-set output path
    test_path = os.path.join(sys.path[0], r'.\data\test_insect.txt')  # test-set output path
    pro = ProcessData(dict1_path, dict2_path)  # load the lexicons
    try:
        with open(input_path, 'r', encoding='utf-8') as input_text, \
                open(output_path, 'w', encoding='utf-8', newline='') as output:
            # Keep only lines containing at least one dictionary word.
            for line in input_text:
                if pro.return_data(line.strip()):
                    print("line:", line)
                    output.write(line)
    except OSError:
        # Fix: the original printed the stream object to stdout via
        # print(sys.stderr, ...), raised a new bare Exception (dropping
        # the original error), and had an unreachable sys.exit(1).
        print("文件打開錯誤", file=sys.stderr)
        raise
    print("數據處理完成")
    pro.train_test_split(output_path, train_path, test_path)
    print("訓練集和測試集生成")
    my_dict = os.path.join(sys.path[0], r'.\data\my_dict.txt')
    # Build the domain lexicon (word-frequency list) from the training set.
    with open(train_path, 'r', encoding='utf-8') as fin, \
            open(my_dict, 'w', encoding='utf-8', newline="") as fout:
        fout.write(pro.count_words(fin.read()))
    # Segment the training set with jieba.
    jieba_train = os.path.join(sys.path[0], r'.\data\jieba_train.txt')
    pro.seq_word(train_path, jieba_train)
    # Segment the test set with jieba (fix: the original reused the
    # ``jieba_train`` variable name for the test-set path).
    jieba_test = os.path.join(sys.path[0], r'.\data\jieba_test.txt')
    pro.seq_word(test_path, jieba_test)
    # POS-tag statistics for the training set.
    output_tag = os.path.join(sys.path[0], r'.\data\train_tag.txt')
    pro.postag_txt(train_path, output_tag)
# F1值計算
def estimate_F1():
    """Evaluate the HMM segmentation against the jieba reference.

    Reads the two segmented files line by line, counts overlapping
    tokens, and prints token count, precision, recall and F1.
    """
    hmm_path = r'.\data\hmm_output.txt'
    jieba_path = r'.\data\jieba_test.txt'
    with open(hmm_path, 'r', encoding='utf-8') as f_hmm, \
            open(jieba_path, 'r', encoding='utf-8') as f_jieba:
        all_words_answer = 0  # total tokens in the HMM output
        all_words_sample = 0  # total tokens in the jieba reference
        correct = 0           # tokens matched between the two lines
        for sentence1 in f_hmm:
            sentence2 = f_jieba.readline()
            hmm_tokens = sentence1.split()
            jieba_tokens = sentence2.split()
            # Fix: the original collapsed the jieba line into a set, so
            # duplicate reference tokens were dropped from the sample
            # count and every duplicate HMM token matched "for free".
            # A multiset intersection counts each token at most as
            # often as it appears on both sides.
            correct += sum((Counter(hmm_tokens) & Counter(jieba_tokens)).values())
            all_words_answer += len(hmm_tokens)
            all_words_sample += len(jieba_tokens)
    # Fix: guard against ZeroDivisionError on empty files or zero overlap.
    if not all_words_answer or not all_words_sample or not correct:
        recall = precision = f_mesure = 0.0
    else:
        recall = correct / all_words_answer
        precision = correct / all_words_sample
        f_mesure = (2 * precision * recall) / (precision + recall)
    print("詞數:", all_words_answer)
    print("Precision:", round(precision, 4), "Recall", round(recall, 4),
          "F-mesure", round(f_mesure, 4))
# Script entry point: run the full data-processing pipeline when
# executed directly (no effect when imported as a module).
if __name__ == "__main__":
    Process()
清華動物詞典
自建篩選詞典
stop_words文件
insect.txt文件
生成詞典my_dict