切分算法[完全切分 & 正向 & 逆向 & 雙向最長匹配 & 速度測評]_CodingPark編程公園

詞典分詞

切分算法

詞典導入

# -*- coding:utf-8 -*-
# Author:AG
# Date: 2020-07-5

from pyhanlp import *


def load_dictionary():
    """Load the HanLP core dictionary and return its entries as a set.

    The dictionary file is the one configured at
    ``HanLP.Config.CoreDictionaryPath``; it is read through HanLP's Java
    ``IOUtil.loadDictionary`` helper.

    :return: a Python ``set`` built from the key set of the loaded dictionary
    """
    core_path = HanLP.Config.CoreDictionaryPath
    io_util = JClass('com.hankcs.hanlp.corpus.io.IOUtil')
    entries = io_util.loadDictionary([core_path])
    return set(entries.keySet())


完全切分

完全切分只是找出文本中所有出現在詞典裏的片段,事實上還不能稱之爲中文分詞,因爲它輸出的並不是一個有意義的詞語序列。

# -*- encoding: utf-8 -*-
"""
@File    :   fully_seg.py
@Contact :   [email protected]
@License :   (C)Copyright 2019-2020, CodingPark

@Modify Time      @Author    @Version    @Desciption
------------      -------    --------    -----------
2020-07-05 21:29   AG         1.0         None
"""
from dictionarySeg import utility


def fully_segment(dic, text):
    """Return every substring of ``text`` that is an entry of ``dic``.

    Substrings are enumerated by start offset and then by increasing length,
    and the result keeps that order. A word that occurs at several positions
    is reported once per occurrence.

    :param dic: container of known words (anything supporting ``in``)
    :param text: the string to scan
    :return: list of the matched substrings
    """
    size = len(text)
    return [
        text[start:end]
        for start in range(size)
        for end in range(start + 1, size + 1)
        if text[start:end] in dic
    ]

# Demo driver: these statements sit at module level, so they run whenever the
# file is executed or imported.
# Load the word set once via the project's utility module.
dic = utility.load_dictionary()
# Sample sentence to segment.
text = '我愛智能信息處理研究所'

# Print every dictionary word found anywhere in the sample sentence.
print(fully_segment(dic, text))



結果展示
在這裏插入圖片描述

正向最長匹配

# -*- encoding: utf-8 -*-
"""
@File    :   forward_seg.py    
@Contact :   [email protected]
@License :   (C)Copyright 2019-2020, CodingPark

@Modify Time      @Author    @Version    @Desciption
------------      -------    --------    -----------
2020-07-05 21:51   AG         1.0         None
"""
from dictionarySeg import utility


def forward_seg(dic, text):
    """Segment ``text`` left to right with forward longest matching.

    At each position the longest substring starting there that is found in
    ``dic`` is emitted; when no multi-character substring matches, the single
    character at that position is emitted instead. Scanning then resumes right
    after the emitted token.

    :param dic: container of known words (anything supporting ``in``)
    :param text: the string to segment
    :return: list of tokens in reading order; joined together they equal ``text``
    """
    tokens = []
    total = len(text)
    start = 0
    while start < total:
        # Fall back to the single character when nothing longer matches.
        best = text[start]
        # Candidates grow by one character per step, so the last dictionary
        # hit seen is also the longest one.
        for end in range(start + 2, total + 1):
            candidate = text[start:end]
            if candidate in dic:
                best = candidate

        tokens.append(best)
        start += len(best)

    return tokens


# Demo driver. NOTE: these statements are at module level (no __main__ guard),
# so the dictionary load and the prints below also run when this module is
# imported by another script.
dic = utility.load_dictionary()
# Two sample phrases to segment with forward longest matching.
text1 = '項目的研究'
text2 = '研究生命起源'

# Emit an empty line before the results.
print()
print(forward_seg(dic, text1))
print(forward_seg(dic, text2))


結果展示
在這裏插入圖片描述

逆向最長匹配

# -*- encoding: utf-8 -*-
"""
@File    :   backward_seg.py    
@Contact :   [email protected]
@License :   (C)Copyright 2019-2020, CodingPark

@Modify Time      @Author    @Version    @Desciption
------------      -------    --------    -----------
2020-07-06 12:56   AG         1.0         None
"""
from dictionarySeg import utility


def backward_seg(dic, text):
    """Segment ``text`` right to left with backward longest matching.

    Starting from the end of the text, the longest substring ending at the
    current position that is found in ``dic`` is taken as a token; when no
    multi-character substring matches, the single character is taken. The
    scan then continues to the left of that token.

    :param dic: container of known words (anything supporting ``in``)
    :param text: the string to segment
    :return: list of tokens in reading order; joined together they equal ``text``
    """
    tokens = []
    end = len(text)  # exclusive right edge of the part not yet segmented
    while end > 0:
        # Fall back to the single character when nothing longer matches.
        best = text[end - 1]
        # Candidates shrink from the left, so the first dictionary hit is the
        # longest one; stopping there is what makes this "longest" matching.
        for start in range(end - 1):
            candidate = text[start:end]
            if candidate in dic:
                best = candidate
                break

        tokens.append(best)
        end -= len(best)

    # Tokens were collected from right to left; restore reading order.
    tokens.reverse()
    return tokens


# Demo driver. NOTE: these statements are at module level (no __main__ guard),
# so the dictionary load and the prints below also run when this module is
# imported by another script.
dic = utility.load_dictionary()
# Sample phrase to segment with backward longest matching.
text = '研究生命起源'

# Emit an empty line before the result.
print()
print(backward_seg(dic, text))



結果展示
在這裏插入圖片描述

雙向最長匹配

# -*- encoding: utf-8 -*-
"""
@File    :   bidirectional_seg.py    
@Contact :   [email protected]
@License :   (C)Copyright 2019-2020, CodingPark

@Modify Time      @Author    @Version    @Desciption
------------      -------    --------    -----------
2020-07-06 13:16   AG         1.0         其實雙向,不怎麼樣
"""
from dictionarySeg import utility
from dictionarySeg import forward_seg
from dictionarySeg import backward_seg


def count_single_char(word_list: list) -> int:
    """Count the one-character words in ``word_list``.

    :param word_list: list of word strings (a segmentation result)
    :return: how many of the words have length 1
    """
    # Compare the word's length, not the word itself: a string never equals
    # the integer 1, so `word == 1` would always yield a count of 0.
    return sum(1 for word in word_list if len(word) == 1)

def bidirectional_seg(dic, text):
    """Pick between the forward and backward longest-matching segmentations.

    Both segmentations are computed, then one is chosen:

    1. the result with fewer tokens wins;
    2. on a tie in token count, the forward result wins only if
       ``count_single_char`` gives it a strictly smaller value;
    3. otherwise the backward result is returned.

    :param dic: container of known words, passed through to both segmenters
    :param text: the string to segment
    :return: the chosen token list
    """
    forward = forward_seg.forward_seg(dic, text)
    backward = backward_seg.backward_seg(dic, text)

    # Rule 1: different token counts decide immediately.
    if len(forward) != len(backward):
        return forward if len(forward) < len(backward) else backward

    # Rules 2 and 3: tie-break on the single-character count, backward by default.
    prefer_forward = count_single_char(forward) < count_single_char(backward)
    return forward if prefer_forward else backward

# Demo driver. NOTE: these statements are at module level (no __main__ guard),
# so they also run when this module is imported by another script.
dic = utility.load_dictionary()
# Sample phrase to segment with bidirectional longest matching.
text = '商品和服務'

print(bidirectional_seg(dic, text))




函數參數中的“:”是參數的類型建議符(告訴程序員希望傳入的實參的類型)

函數後面跟着的“->”是函數返回值的類型建議符(用來說明該函數返回的值是什麼類型)


舉個例子:

def isValid(s: 'str') -> 'bool':
    # Illustration only: the annotations are hints for the reader. This body
    # simply returns `s` unchanged, whatever the annotations say.

    return s

這裏「參數: 註解內容」和「-> 註解內容」的寫法是爲了標註參數和返回值的類型,使代碼更具可讀性;Python 在運行時並不會依據這些註解做類型檢查

和 def isValid(s):

        return s

效果上其實沒有區別


速度測評

# -*- encoding: utf-8 -*-
"""
@File    :   speedEvl.py    
@Contact :   [email protected]
@License :   (C)Copyright 2019-2020, CodingPark

@Modify Time      @Author    @Version    @Desciption
------------      -------    --------    -----------
2020-07-06 13:55   AG         1.0         None
"""
import time
from dictionarySeg import utility
from dictionarySeg import forward_seg
from dictionarySeg import backward_seg
from dictionarySeg import bidirectional_seg



def evaluate_speed(segment, dic, text, pressure=None):
    """Time repeated calls of ``segment(dic, text)`` and print the throughput.

    :param segment: callable invoked as ``segment(dic, text)``
    :param dic: dictionary object passed through to ``segment``
    :param text: text passed through to ``segment``; its length is the
        per-call workload
    :param pressure: number of repetitions. When omitted, the module-level
        ``pressure`` is used if one exists, otherwise 10000.
    :return: None; the speed is printed in units of 10,000 characters per second
    """
    if pressure is None:
        # Keep reading the module global that the script sets, but fall back to
        # a default instead of raising NameError when this module is imported
        # and no such global has been defined.
        pressure = globals().get('pressure', 10000)

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump and may be too coarse for short runs.
    start_time = time.perf_counter()
    for _ in range(pressure):
        segment(dic, text)
    elapsed_time = time.perf_counter() - start_time

    if elapsed_time > 0:
        speed = len(text) * pressure / 10000 / elapsed_time
    else:
        # The run was too short for the clock to register; avoid dividing by zero.
        speed = float('inf')
    print('%.2f 萬字/秒' % speed)


if __name__ == '__main__':
    # Number of times each segmenter is run per measurement. This is assigned
    # at module level, and evaluate_speed() picks it up as a global.
    pressure = 10000
    # Sample sentence used as the workload for every segmenter.
    text = "江西鄱陽湖乾枯,中國最大淡水湖變成大草原"
    # Load the dictionary once and share it across all three measurements.
    dic = utility.load_dictionary()

    print('\n由於JPype調用開銷巨大,以下速度顯著慢於原生Java')
    # Forward longest matching.
    print('\n-----前向最大匹配時速-----')
    evaluate_speed(forward_seg.forward_seg, dic, text)
    # Backward longest matching.
    print('\n-----後向最大匹配時速-----')
    evaluate_speed(backward_seg.backward_seg, dic, text)
    # Bidirectional longest matching (runs both of the above per call).
    print('\n-----雙向最大匹配時速-----')
    evaluate_speed(bidirectional_seg.bidirectional_seg, dic, text)

    # Author's closing remark, printed verbatim.
    print('\n我認爲 後向最大匹配 在速度與準確率上均很出色')



結果展示
在這裏插入圖片描述

在這裏插入圖片描述

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章