數據列表文本相似度去重,中文英文都可以

代碼

import spacy
from itertools import combinations

# Load the large Chinese spaCy pipeline (word vectors are required for
# Doc.similarity below). Swap in the English model for English text.
nlp = spacy.load("zh_core_web_lg")
# nlp = spacy.load("en_core_web_md")

# 1. Pre-process the title list: strip stopwords (Chinese e.g. '的',
#    English e.g. 'of') from each title, then rejoin the remaining tokens.
# 2. With an English spaCy model you would append ``token.lemma_`` for
#    lemmatization; the Chinese model has no lemmatizer, so ``token.text``
#    is used instead.
def pre_process(titles):
    """
    Pre-process titles by removing stopwords and rejoining the tokens.

    :param titles: list of strings, the raw titles.
    :return: list of strings, one pre-processed (space-joined) title per
        input title, index-aligned with ``titles``.
    """
    preprocessed_title_docs = []
    for title_doc in (nlp(title) for title in titles):
        # Keep only non-stopword tokens; token.text (not lemma_) because
        # the Chinese pipeline provides no lemmas.
        kept_tokens = [token.text for token in title_doc if not token.is_stop]
        preprocessed_title_docs.append(" ".join(kept_tokens))

    # Bug fix: the original returned the untouched input list (``titles``)
    # instead of the pre-processed results, silently making this whole
    # function a no-op for its callers.
    return preprocessed_title_docs

# 1. Expand the title list into all unordered pairs (n*(n-1)/2 pairs).
# 2. Keep the pairs whose spaCy cosine similarity exceeds 0.8.
# 3. Recursively drop the flagged duplicates until none remain.
def similarity_filter(titles):
    """
    Recursively remove near-duplicate titles.

    For every pair of pre-processed titles whose cosine similarity exceeds
    0.8, the first member of the pair is dropped from ``titles`` (the list
    is mutated in place), and the function recurses on the remainder.

    :param titles: list of strings to de-duplicate.
    :return: the de-duplicated title list once no similar pair remains.
    """
    docs = pre_process(titles)

    # Every unordered pair scoring above the similarity threshold.
    flagged_pairs = [
        pair
        for pair in combinations(docs, 2)
        if nlp(pair[0]).similarity(nlp(pair[1])) > 0.8
    ]

    # Index (within docs, hence within titles) of the first title of each
    # flagged pair — these are the entries slated for removal.
    removal_indices = {docs.index(pair[0]) for pair in flagged_pairs}

    # Base case: nothing similar was found, the list is clean.
    if not removal_indices:
        return titles

    # Map indices back to the original titles, drop them, and recurse.
    duplicates = [title for idx, title in enumerate(titles) if idx in removal_indices]
    for duplicate in duplicates:
        titles.remove(duplicate)

    return similarity_filter(titles)

if __name__ == '__main__':
    # Eleven identical titles plus five distinct ones.
    sample_titles = ["我愛中國人民"] * 11 + [
        "2020年家庭",
        "1月銷售預測",
        "賬戶餘額2017-2018",
        "銷售預測1月份",
        "年終晚會回顧-2020",
    ]

    print(similarity_filter(sample_titles))
# console output ['我愛中國人民', '2020年家庭', '賬戶餘額2017-2018', '銷售預測1月份', '年終晚會回顧-2020']

spacy安裝教程

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章