Person Relation Extraction Based on Feature Engineering

The code in this post may not be reproduced without permission.

# -*- coding: utf-8 -*-
# Author: lx
# extract features from the text

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_array
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from data_process import load_data_and_labels
from nltk.corpus import stopwords
import nltk
from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.corenlp import CoreNLPParser
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE

# Load the data
trainFile = r'D:\file_download\BaiduNetdiskDownload\PyCharm_File\graduation\person_relation.txt'
# e1, e2: token position indices; pos1, pos2: relative positions centred on e1/e2 (centre = 100)
texts, raw_label, e1, e2, pos1, pos2 = load_data_and_labels(trainFile)

# Tokenisation
def token(texts):
    tokens = []
    for text_raw in texts:
        tokens.append(nltk.word_tokenize(text_raw))
    return tokens

# Part-of-speech tagging; collect the results in a list first
def pos(texts):
    rfiltered_list = []
    for text_raw in texts:
        text = nltk.word_tokenize(text_raw)
        # Strip punctuation?
        # english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
        # text = [w for w in text if w not in english_punctuations]
        # Remove stop words or not? Left in for now.
        # filtered = [w for w in text if w not in stopwords.words('english')]
        rfiltered = nltk.pos_tag(text)
        rfiltered_list.append(rfiltered)
    return rfiltered_list
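
# Quick sanity check of the two helpers above (a minimal sketch; it assumes the
# NLTK 'punkt' and 'averaged_perceptron_tagger' data packages are installed):
sample = ["Trump hugged his wife Melania"]
print(token(sample))  # [['Trump', 'hugged', 'his', 'wife', 'Melania']]
print(pos(sample))    # [[('Trump', 'NNP'), ('hugged', 'VBD'), ...]]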

# Named-entity recognition; returns a list with one ne_chunk tree per sentence
def ner(texts):
    # (variant that filters out the root "S" tree:)
    # ner_list = []
    # for text in pos(texts):
    #     tree_list = []
    #     for tree in nltk.ne_chunk(text, binary=False).subtrees():
    #         # skip the root tree
    #         if tree.label() == "S":
    #             continue
    #         tree_list.append(tree)
    #     ner_list.append(tree_list)
    # Without filtering the root tree: a list of all tree objects
    ner_list = []
    for text in pos(texts):
        ner_list.append(nltk.ne_chunk(text, binary=False))
    return ner_list
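
# Minimal sketch of what ner() returns for one sentence (assumes the NLTK
# 'maxent_ne_chunker' and 'words' data packages are installed); named
# entities show up as labelled subtrees such as (PERSON Trump/NNP):
print(ner(["Trump hugged Melania in Washington"])[0])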

# Constituency parsing (+ dependency parsing); returns a list of parse trees,
# one per sentence
def parser(texts):
    # the StanfordParser wrappers are problematic, so go through the CoreNLP
    # server instead; 'http://localhost:9000' is the library default and
    # assumes a server is running locally
    parser = CoreNLPParser(url='http://localhost:9000')
    parser_list = []
    for text in texts:
        parse_result = parser.parse(nltk.word_tokenize(text))
        # print(parse_result)
        parser_list.append(parse_result)
    return parser_list
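
# Standalone sketch of calling the CoreNLP route above; it assumes a Stanford
# CoreNLP server is already running locally, e.g. started with:
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
# cnlp = CoreNLPParser(url='http://localhost:9000')
# tree = next(cnlp.parse(nltk.word_tokenize("Trump hugged his wife")))
# tree.pretty_print()  # draws the constituency tree as ASCII art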

# Build the feature vectors
def featurevector(texts):
    # tokenise and tag the corpus once up front; calling token()/pos() inside
    # the loop would re-parse the whole corpus on every iteration
    tokens = token(texts)
    tags = pos(texts)
    x = []
    # Lexical features:
    # f1/f2: POS of e1/e2; f3/f4: word before e1/e2; f5/f6: POS of the word before e1/e2;
    # f7/f8: second word before e1/e2; f9/f10: POS of the second word before e1/e2
    for i in range(len(tags)):
        x1 = tags[i][e1[i]][1]
        x2 = tags[i][e2[i]][1]
        if e1[i] == 0:
            x3 = 'null'
            x5 = 'null'
            x7 = 'null'
            x9 = 'null'
        else:
            x3 = tokens[i][e1[i] - 1]
            x5 = tags[i][e1[i] - 1][1]
            x7 = tokens[i][e1[i] - 2]
            x9 = tags[i][e1[i] - 2][1]
        x4 = tokens[i][e2[i] - 1]
        x6 = tags[i][e2[i] - 1][1]
        x8 = tokens[i][e2[i] - 2]
        x10 = tags[i][e2[i] - 2][1]
        # f11, position feature: here, the distance between the two entities
        x11 = int(e2[i] - e1[i])
        print(i)
        x.append([x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11])
    feature = pd.DataFrame(data=x, columns=["x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11"])
    return feature
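
# Sketch of how DictVectorizer (used below) encodes rows like these:
# string-valued columns become one-hot indicator columns, numeric columns
# pass through unchanged. Toy values, not taken from the real corpus:
toy_rows = [{"x1": "NNP", "x3": "null", "x11": 5},
            {"x1": "NN", "x3": "his", "x11": 2}]
toy_vec = DictVectorizer(sparse=False)
print(toy_vec.fit_transform(toy_rows))
print(toy_vec.feature_names_)  # ['x11', 'x1=NN', 'x1=NNP', 'x3=his', 'x3=null']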

# PCA dimensionality reduction down to 2 components
def trans_pca(feature):
    estimator = PCA(n_components=2)
    x_pca = estimator.fit_transform(feature)
    return x_pca
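
# Quick standalone check of the PCA helper on random stand-in data:
print(trans_pca(np.random.RandomState(0).rand(10, 6)).shape)  # (10, 2)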


# Further feature ideas, not wired in yet:
# x2 = ner(texts)     # 2: named entities
# x3 = parser(texts)  # 3: parse trees

# Vectorise the features and balance the classes with SMOTE
smo = SMOTE(random_state=42)
dict_vec = DictVectorizer(sparse=False)
feature = dict_vec.fit_transform(featurevector(texts).to_dict(orient='records'))

print(dict_vec.feature_names_)
# fit_resample is the current imblearn API (releases before 0.4 called it
# fit_sample); reshape to (n_samples, -1) so the corpus size is not hard-coded
X_smo, y_smo = smo.fit_resample(np.array(feature).reshape(len(texts), -1),
                                np.array(raw_label).reshape(len(texts), -1))
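
# What SMOTE does, shown on a toy imbalanced set (a standalone sketch using
# sklearn's make_classification; none of these toy names appear above):
from collections import Counter
from sklearn.datasets import make_classification

X_toy, y_toy = make_classification(n_samples=100, weights=[0.9, 0.1],
                                   n_clusters_per_class=1, random_state=42)
X_res, y_res = SMOTE(random_state=42).fit_resample(X_toy, y_toy)
print(Counter(y_toy), Counter(y_res))  # the minority class is synthesised up to parity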

x_train, x_test, y_train, y_test = train_test_split(X_smo, y_smo, train_size=0.8, random_state=33)
# x_train, x_test, y_train, y_test = train_test_split(featurevector(texts), raw_label, train_size=0.8, random_state=33)
# wordVectorizer = CountVectorizer(min_df=3, token_pattern="\t", ngram_range=(1, 2))
# wordVectorizer = CountVectorizer(ngram_range=(1, 2))
# train_feature = wordVectorizer.fit_transform(x_train)
# train_feature = trans_pca(train_feature)
# # wordTransformer = TfidfTransformer()
# # train_feature = wordTransformer.fit_transform(train_feature)
# # test_feature = wordTransformer.transform(wordVectorizer.transform(x_test))
# test_feature = wordVectorizer.fit_transform(x_test)
# # onevsrest = OneVsRestClassifier(LogisticRegression(C = 1, tol = 0.01))
# dict_vec = DictVectorizer(sparse=False)
# X_train = dict_vec.fit_transform(x_train.to_dict(orient='records'))
# X_test = dict_vec.transform(x_test.to_dict(orient='records'))
#
# print(dict_vec.feature_names_)

train_feature = x_train
test_feature = x_test
onevsrest = OneVsRestClassifier(LinearSVC(C=1, tol=0.0001, dual=True))
onevsrest.fit(train_feature, y_train)
y_pred = onevsrest.predict(test_feature)
print "the mean accuracy:"
print onevsrest.score(test_feature, y_test)

print "詳細的評估指標:"
print classification_report(y_pred, y_test)



predictions = onevsrest.decision_function(test_feature)
predictions[predictions == 0] = -np.inf
# predictions = onevsrest.predict_proba(test_feature)

# Top-N accuracy: a sample counts as a hit at N if any of its true labels
# falls among the N highest-scoring classes
acc_list = [0.0] * 10
for prediction, label in zip(predictions, y_test):
    topN = np.argsort(prediction)[-10:]
    for i in range(1, 11):
        if label[topN[-i:]].sum() > 0:
            acc_list[i - 1] += 1

for i, acc in enumerate(acc_list, 1):
    print(i, acc / len(y_test))
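
# Worked example of the top-N metric above, assuming (as the indexing implies)
# that each row of y_test is a one-hot / binary indicator vector:
demo_scores = np.array([0.1, 0.9, 0.5, 0.3])  # decision values for 4 classes
demo_label = np.array([0, 0, 1, 0])           # the true class is index 2
demo_top = np.argsort(demo_scores)[-3:]       # 3 best classes: [3, 2, 1]
print(demo_label[demo_top[-1:]].sum() > 0)    # top-1 hit: False (argmax is class 1)
print(demo_label[demo_top[-2:]].sum() > 0)    # top-2 hit: True (class 2 ranks 2nd)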


# if __name__ == "__main__":

    # print(pos(texts))
    # print(ner(texts))
    # print(ner(texts)[0].draw())
    # for t in tree:
    #     print(t)
    #     print(t.draw())
    # text = "After repeating the 35-word oath of office, Trump stretched his arms wide and hugged his wife, Melania, and other members of his family"
    # text = "This tradition has been narrated in prophet's early biographies"
    # # the two arguments are the parser jar and the models jar
    # parser = StanfordParser(r'D:\file_download\BaiduNetdiskDownload\standford-nlp\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-parser.jar', r'D:\file_download\BaiduNetdiskDownload\standford-nlp\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-parser-3.6.0-models.jar')
    # # parse_result = parser.parse(nltk.word_tokenize(text))
    # # for i in parse_result:
    # #     print(i)
    # # for i in list(parse_result):
    # #     i.draw()
    # lexparser = StanfordDependencyParser(r'D:\file_download\BaiduNetdiskDownload\standford-nlp\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-parser.jar', r'D:\file_download\BaiduNetdiskDownload\standford-nlp\stanford-parser-full-2015-12-09\stanford-parser-full-2015-12-09\stanford-parser-3.6.0-models.jar')
    # parse_result2 = lexparser.parse(nltk.word_tokenize(text))
    # for i in list(parse_result2)[0].triples():
    #     print(i)

    # from nltk.tag import StanfordPOSTagger
    # nltk.internals.config_java(bin=r"D:/Java/bin/java")
    # eng_tagger = StanfordPOSTagger(model_filename=r'D:/file_download\BaiduNetdiskDownload/standford-nlp/stanford-postagger-full-2015-12-09/stanford-postagger-full-2015-12-09/models/english-bidirectional-distsim.tagger',path_to_jar=r'D:/file_download/BaiduNetdiskDownload/standford-nlp/stanford-postagger-full-2015-12-09/stanford-postagger-full-2015-12-09/stanford-postagger.jar')
    # print(eng_tagger.tag('What is the airspeed of an unladen swallow'.split()))
    # print(featurevector(texts))
    # print(token(texts))
    # tex = "This tradition has been narrated in prophet's early biographies"
    # tok = nltk.word_tokenize(tex)
    # print(nltk.pos_tag(tok))
    # print(nltk.ne_chunk(nltk.pos_tag(tok), binary=False))
    # print(nltk.ne_chunk(nltk.pos_tag(tok), binary=False).draw())

 
