詞典分詞
切分算法
詞典導入
# -*- coding:utf-8 -*-
# Author:AG
# Date: 2020-07-5
from pyhanlp import *
def load_dictionary():
    """Load HanLP's core dictionary and return its keys as a Python set.

    The dictionary file is the one configured at
    ``HanLP.Config.CoreDictionaryPath``; it is read through HanLP's
    ``IOUtil.loadDictionary`` helper.

    Returns:
        set: The key set of the loaded dictionary (presumably the dictionary
        words -- confirm against the HanLP dictionary format).
    """
    io_util = JClass('com.hankcs.hanlp.corpus.io.IOUtil')
    core_path = HanLP.Config.CoreDictionaryPath
    # loadDictionary takes a list of paths; only the core dictionary is passed.
    entries = io_util.loadDictionary([core_path])
    return set(entries.keySet())
完全切分
完全切分事實上還不能稱之爲中文分詞:它只是列出文本中出現的所有詞典詞,並沒有給出一個有意義的詞語序列
# -*- encoding: utf-8 -*-
"""
@File : fully_seg.py
@Contact : [email protected]
@License : (C)Copyright 2019-2020, CodingPark
@Modify Time @Author @Version @Desciption
------------ ------- -------- -----------
2020-07-05 21:29 AG 1.0 None
"""
from dictionarySeg import utility
def fully_segment(dic, text):
    """Return every substring of ``text`` that is contained in ``dic``.

    Args:
        dic: Container of known words; only membership tests are used.
        text: The string to scan.

    Returns:
        list: Matching substrings, ordered by start position and, for the
        same start position, from shortest to longest.
    """
    size = len(text)
    return [
        text[begin:stop]
        for begin in range(size)
        for stop in range(begin + 1, size + 1)
        if text[begin:stop] in dic
    ]
if __name__ == '__main__':
    # Demo only: guarded so that importing this module neither loads the
    # dictionary nor prints anything.
    dic = utility.load_dictionary()
    text = '我愛智能信息處理研究所'
    print(fully_segment(dic, text))
結果展示
正向最長匹配
# -*- encoding: utf-8 -*-
"""
@File : forward_seg.py
@Contact : [email protected]
@License : (C)Copyright 2019-2020, CodingPark
@Modify Time @Author @Version @Desciption
------------ ------- -------- -----------
2020-07-05 21:51 AG 1.0 None
"""
from dictionarySeg import utility
def forward_seg(dic, text):
    """Segment ``text`` left to right, taking the longest dictionary word each time.

    Args:
        dic: Container of known words; only membership tests are used.
        text: The string to segment.

    Returns:
        list: The words in text order. A position where no multi-character
        dictionary word starts yields that single character.
    """
    tokens = []
    start, end = 0, len(text)
    while start < end:
        best = text[start]  # fall back to the single character
        # Candidates grow by one character per step, so the last hit is
        # the longest one; single-character candidates never win, hence +2.
        for stop in range(start + 2, end + 1):
            candidate = text[start:stop]
            if candidate in dic:
                best = candidate
        tokens.append(best)
        start += len(best)
    return tokens
if __name__ == '__main__':
    # Demo only: this module is imported by other scripts for forward_seg,
    # so the dictionary load and the prints must not run on import.
    dic = utility.load_dictionary()
    text1 = '項目的研究'
    text2 = '研究生命起源'
    print()
    print(forward_seg(dic, text1))
    print(forward_seg(dic, text2))
結果展示
逆向最長匹配
# -*- encoding: utf-8 -*-
"""
@File : backward_seg.py
@Contact : [email protected]
@License : (C)Copyright 2019-2020, CodingPark
@Modify Time @Author @Version @Desciption
------------ ------- -------- -----------
2020-07-06 12:56 AG 1.0 None
"""
from dictionarySeg import utility
def backward_seg(dic, text):
    """Segment ``text`` right to left, taking the longest dictionary word each time.

    Args:
        dic: Container of known words; only membership tests are used.
        text: The string to segment.

    Returns:
        list: The words in text order. A position where no multi-character
        dictionary word ends yields that single character.
    """
    word_list = []
    i = len(text) - 1
    while i >= 0:
        longest_word = text[i]  # fall back to the single character
        # Candidates ending at i are tried from longest to shortest, so the
        # first hit is the longest match and the scan must stop there.
        for j in range(i):
            word = text[j:i + 1]
            if word in dic:
                longest_word = word
                break
        # Words are found back to front; collect them with O(1) appends and
        # reverse once at the end instead of inserting at index 0 each time.
        word_list.append(longest_word)
        i -= len(longest_word)
    word_list.reverse()
    return word_list
if __name__ == '__main__':
    # Demo only: this module is imported by other scripts for backward_seg,
    # so the dictionary load and the prints must not run on import.
    dic = utility.load_dictionary()
    text = '研究生命起源'
    print()
    print(backward_seg(dic, text))
結果展示
雙向最長匹配
# -*- encoding: utf-8 -*-
"""
@File : bidirectional_seg.py
@Contact : [email protected]
@License : (C)Copyright 2019-2020, CodingPark
@Modify Time @Author @Version @Desciption
------------ ------- -------- -----------
2020-07-06 13:16 AG 1.0 其實雙向,不怎麼樣
"""
from dictionarySeg import utility
from dictionarySeg import forward_seg
from dictionarySeg import backward_seg
def count_single_char(word_list: list) -> int:
    """Return how many words in ``word_list`` are exactly one character long.

    Args:
        word_list: Segmentation result, a list of strings.

    Returns:
        int: Number of single-character words.
    """
    # Compare the word's length, not the word itself, with 1: a string never
    # equals the integer 1, which made the count always 0.
    return sum(1 for word in word_list if len(word) == 1)
def bidirectional_seg(dic, text):
    """Run forward and backward longest matching and return one of the results.

    Selection rules, applied in order:
      1. the result with fewer words;
      2. on equal word counts, the forward result if ``count_single_char``
         reports a strictly smaller value for it;
      3. otherwise the backward result.

    Args:
        dic: Dictionary passed through to both segmenters.
        text: The string to segment.

    Returns:
        The word list produced by the chosen segmenter.
    """
    forward = forward_seg.forward_seg(dic, text)
    backward = backward_seg.backward_seg(dic, text)

    if len(forward) != len(backward):
        return forward if len(forward) < len(backward) else backward

    # Same number of words: compare the single-character counts; ties go
    # to the backward result.
    if count_single_char(forward) < count_single_char(backward):
        return forward
    return backward
if __name__ == '__main__':
    # Demo only: this module is imported by the speed benchmark for
    # bidirectional_seg, so the dictionary load and the print must not run
    # on import.
    dic = utility.load_dictionary()
    text = '商品和服務'
    print(bidirectional_seg(dic, text))
函數參數中的“:”是參數的類型建議符(告訴程序員希望傳入的實參的類型)
函數後面跟着的“->”是函數返回值的類型建議符(用來說明該函數返回的值是什麼類型)
舉個例子:
def isValid(s: 'str') -> 'bool':
    """Return ``s`` unchanged; the annotations are hints only and are not enforced."""
    return s
這裏「參數: 註解內容」和「-> 註解內容」的寫法是爲了標註參數和返回值的類型,使代碼更具可讀性;註解只是提示,Python 在運行時不會據此檢查類型
和 def isValid(s):
return s
效果上其實沒有區別
速度測評
# -*- encoding: utf-8 -*-
"""
@File : speedEvl.py
@Contact : [email protected]
@License : (C)Copyright 2019-2020, CodingPark
@Modify Time @Author @Version @Desciption
------------ ------- -------- -----------
2020-07-06 13:55 AG 1.0 None
"""
import time
from dictionarySeg import utility
from dictionarySeg import forward_seg
from dictionarySeg import backward_seg
from dictionarySeg import bidirectional_seg
def evaluate_speed(segment, dic, text, pressure=10000):
    """Benchmark a segmenter and print its throughput.

    Args:
        segment: Callable invoked as ``segment(dic, text)``.
        dic: Dictionary passed through to ``segment``.
        text: Sample text passed through to ``segment``.
        pressure: Number of times ``segment`` is called. It is an explicit
            parameter so the function no longer depends on a module-level
            ``pressure`` that only exists when the file runs as a script;
            the default equals the value the script uses.

    Returns:
        None. The speed is printed in units of 10,000 characters per second.
    """
    # perf_counter is monotonic and has the highest available resolution,
    # which matters for short runs.
    start_time = time.perf_counter()
    for _ in range(pressure):
        segment(dic, text)
    elapsed_time = time.perf_counter() - start_time

    chars = len(text) * pressure
    # Guard against a zero elapsed time instead of raising ZeroDivisionError.
    speed = chars / 10000 / elapsed_time if elapsed_time > 0 else float('inf')
    print('%.2f 萬字/秒' % speed)
if __name__ == '__main__':
    # Benchmark settings; ``pressure`` is the repetition count used by
    # evaluate_speed.
    pressure = 10000
    text = "江西鄱陽湖乾枯,中國最大淡水湖變成大草原"
    dic = utility.load_dictionary()

    print('\n由於JPype調用開銷巨大,以下速度顯著慢於原生Java')

    # (banner, segmenter) pairs, benchmarked in this order.
    benchmarks = (
        ('\n-----前向最大匹配時速-----', forward_seg.forward_seg),
        ('\n-----後向最大匹配時速-----', backward_seg.backward_seg),
        ('\n-----雙向最大匹配時速-----', bidirectional_seg.bidirectional_seg),
    )
    for banner, segmenter in benchmarks:
        print(banner)
        evaluate_speed(segmenter, dic, text)

    print('\n我認爲 後向最大匹配 在速度與準確率上均很出色')
結果展示