1. 模型理論與應用
以下幾個問題都是比較經典的問題,對模型的深入理解會有很大的幫助。 特別是邏輯迴歸的二階導數的求解過程,可以用來證明一個函數是否爲凸函數。
1.1 邏輯迴歸相關
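假設我們有訓練數據 $\mathcal{D}=\{(\mathbf{x}_1,y_1),\dots,(\mathbf{x}_n,y_n)\}$, 其中 $(\mathbf{x}_i,y_i)$ 爲每一個樣本,而且 $\mathbf{x}_i$ 是樣本的特徵並且 $\mathbf{x}_i\in\mathbb{R}^d$, $y_i$ 代表樣本數據的標籤(label), 取值爲 $0$ 或者 $1$. 在邏輯迴歸中,模型的參數爲 $(\mathbf{w},b)$。對於向量,我們一般用粗體來表達。請回答以下問題。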
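(a) 在邏輯迴歸模型下,請寫出目標函數(objective function), 也就是我們需要"最小化"的目標(也稱之爲損失函數或者loss function),不需要考慮正則。
$$L(\mathbf{w},b) = -\sum_{i=1}^n \left[ y_i\log\sigma(\mathbf{w}^T\mathbf{x}_i + b) + (1 - y_i)\log\left(1 - \sigma(\mathbf{w}^T\mathbf{x}_i + b)\right) \right]$$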
(b) 求出 $L(\mathbf{w},b)$ 的梯度(或者計算導數),需要必要的中間過程。
$$\begin{aligned}
\frac{\partial L(\mathbf{w},b)}{\partial \mathbf{w}}
&= -\sum_{i=1}^n \left[ y_i\frac{\sigma(w^Tx_i + b)[1 - \sigma(w^Tx_i + b)]x_i}{\sigma(w^Tx_i + b)} + (1 - y_i)\frac{(-1)\sigma(w^Tx_i + b)[1 - \sigma(w^Tx_i + b)]x_i}{1 - \sigma(w^Tx_i + b)} \right] \\
&= -\sum_{i=1}^n \left[ y_i[1 - \sigma(w^Tx_i + b)]x_i + (y_i - 1)\sigma(w^Tx_i + b)x_i \right] \\
&= -\sum_{i=1}^n \left[ [y_i - y_i\sigma(w^Tx_i + b)]x_i + y_i\sigma(w^Tx_i + b)x_i - \sigma(w^Tx_i + b)x_i \right] \\
&= -\sum_{i=1}^n [y_i - \sigma(w^Tx_i + b)]x_i \\
&= \sum_{i=1}^n [\sigma(w^Tx_i + b) - y_i]x_i
\end{aligned}$$
$$\begin{aligned}
\frac{\partial L(\mathbf{w},b)}{\partial b}
&= -\sum_{i=1}^n \left[ y_i\frac{\sigma(w^Tx_i + b)[1 - \sigma(w^Tx_i + b)]}{\sigma(w^Tx_i + b)} + (1 - y_i)\frac{(-1)\sigma(w^Tx_i + b)[1 - \sigma(w^Tx_i + b)]}{1 - \sigma(w^Tx_i + b)} \right] \\
&= -\sum_{i=1}^n \left[ y_i[1 - \sigma(w^Tx_i + b)] + (1 - y_i)(-1)\sigma(w^Tx_i + b) \right] \\
&= -\sum_{i=1}^n [y_i - \sigma(w^Tx_i + b)] \\
&= \sum_{i=1}^n [\sigma(w^Tx_i + b) - y_i]
\end{aligned}$$
(c) 請寫出基於梯度下降法(batch)的對於 $\mathbf{w}$ 和 $b$ 的更新
(d) 假設在(a)的基礎上加了一個L2正則項,請寫出基於梯度下降法(batch)的對於 $\mathbf{w}$ 和 $b$ 的更新
(e) 在(b)的基礎上接着對 $\mathbf{w}$ 求導(等於二階導數,二階導數的維度爲 $d\times d$,其中 $d$ 爲特徵的維度),這個二階導數也稱之爲Hessian Matrix(https://en.wikipedia.org/wiki/Hessian_matrix) 對於矩陣、向量的求導請參考:https://www.math.uwaterloo.ca/~hwolkowi/matrixcookbook.pdf
(f) 請說明在(e)得出來的Hessian Matrix是Positive Semidefinite. 提示:爲了證明一個 $d\times d$ 的矩陣 $H$ 爲Positive Semidefinite,需要證明對於任意一個非零向量 $\mathbf{v}\in\mathbb{R}^d$, 都可以得出 $\mathbf{v}^TH\mathbf{v}\geq 0$
請推導或者說明:
證明:
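假設 $\mathbf{v}\in\mathbb{R}^d$ 爲任意一個非零向量,並記 $\sigma_i=\sigma(\mathbf{w}^T\mathbf{x}_i+b)$,由(e)可得 $H=\sum_{i=1}^n\sigma_i(1-\sigma_i)\mathbf{x}_i\mathbf{x}_i^T$
那麼,有 $\mathbf{v}^TH\mathbf{v}=\sum_{i=1}^n\sigma_i(1-\sigma_i)\,\mathbf{v}^T\mathbf{x}_i\mathbf{x}_i^T\mathbf{v}=\sum_{i=1}^n\sigma_i(1-\sigma_i)(\mathbf{x}_i^T\mathbf{v})^2$
因爲 $\sigma_i$、$1-\sigma_i$ 以及 $(\mathbf{x}_i^T\mathbf{v})^2$ 都是大於等於0的
所以 $\sigma_i(1-\sigma_i)(\mathbf{x}_i^T\mathbf{v})^2\geq 0$ 對每一個 $i$ 都成立
所以,最終得出 $\mathbf{v}^TH\mathbf{v}\geq 0$,即Hessian Matrix是Positive Semidefinite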
2. 情感分析項目
文本讀取
import re
import jieba
import numpy as np
# File readers
def read_train_file(file_path='', comments=None, labels=None, val=''):
    """Load one single-class training file and append its reviews to the output lists.

    The file is read as UTF-8 and every space and newline is removed. The body
    of each ``<review id="N">...</review>`` element (``N`` is 1-4 digits) is
    then extracted, segmented with ``jieba.cut`` and stored as the tokens
    joined by ``,``.

    Args:
        file_path: Path of the training file.
        comments: List that receives one segmented review string per match.
            A new list is created when omitted.
        labels: List that receives ``val`` once per review. A new list is
            created when omitted.
        val: Label recorded for every review found in this file.

    Returns:
        ``(comments, labels)`` - the list objects that were passed in, or the
        newly created ones.
    """
    # None defaults avoid sharing one list object between unrelated calls.
    if comments is None:
        comments = []
    if labels is None:
        labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Whitespace is stripped first, which is why the tag reads "<reviewid=" below.
        text = f.read().replace(' ', '').replace('\n', '')
    review_pattern = r'<reviewid="\d{1,4}">(.*?)</review>'
    for body in re.findall(review_pattern, text):
        comments.append(','.join(jieba.cut(body)))
        labels.append(val)
    return comments, labels
def read_test_file(file_path='', comments=None, labels=None):
    """Load the combined test file and append its reviews and labels to the output lists.

    The file is read as UTF-8 and every space and newline is removed. Each
    ``<review id="N" label="L">...</review>`` element (``N`` is 1-4 digits,
    ``L`` a single digit) contributes its label string to ``labels`` and its
    body, segmented with ``jieba.cut`` and joined by ``,``, to ``comments``.

    Args:
        file_path: Path of the test file.
        comments: List that receives one segmented review string per element.
            A new list is created when omitted.
        labels: List that receives one label string per element. A new list
            is created when omitted.

    Returns:
        ``(comments, labels)`` - the list objects that were passed in, or the
        newly created ones.

    Raises:
        IndexError: If a matched review element carries no ``label`` attribute.
    """
    # None defaults avoid sharing one list object between unrelated calls.
    if comments is None:
        comments = []
    if labels is None:
        labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Whitespace is stripped first, which is why the tag reads "<reviewid=" below.
        text = f.read().replace(' ', '').replace('\n', '')
    review_pattern = r'<reviewid="\d{1,4}".*?</review>'
    # Compiled once here instead of being rebuilt for every review.
    label_pattern = re.compile(r'<reviewid="\d{1,4}"label="(\d)">')
    body_pattern = re.compile(r'>(.*?)</review>')
    for review in re.findall(review_pattern, text):
        label = label_pattern.findall(review)[0]
        body = body_pattern.findall(review)[0]
        labels.append(label)
        comments.append(','.join(jieba.cut(body)))
    # Sanity check: the two output lists must stay aligned.
    assert len(comments) == len(labels)
    return comments, labels
# File-reading section: the parsed contents end up in the four lists below.
train_comments, train_labels = [], []
test_comments, test_labels = [], []


def process_file():
    """Fill the module-level train/test lists from the three data files.

    Training reviews come from one positive and one negative file and are
    labelled '1' and '0' respectively; the combined test file supplies its
    own labels. All parsing is delegated to read_train_file / read_test_file.
    """
    # (path, label) per single-class training file: positive first, then negative.
    labelled_sources = (
        ("data/train.positive.txt", '1'),
        ("data/train.negative.txt", '0'),
    )
    for source_path, source_label in labelled_sources:
        read_train_file(source_path, train_comments, train_labels, source_label)

    # Test data: labels are parsed out of the file itself.
    read_test_file("data/test.combined.txt", test_comments, test_labels)


process_file()
print(len(train_comments), len(train_labels), len(test_comments), len(test_labels))
簡單的可視化分析
import matplotlib.pyplot as plt
import numpy as np
# Per-class accumulators: comment string lengths and the comments themselves.
pos_comments_count, neg_comments_count = [], []
pos_train_comments, neg_train_comments = [], []


def get_comments():
    """Split the training comments by label and record each comment's string length.

    Comments labelled '1' go to the positive lists; every other label goes to
    the negative lists.
    """
    for position, flag in enumerate(train_labels):
        comment = train_comments[position]
        if flag == '1':
            length_bucket, text_bucket = pos_comments_count, pos_train_comments
        else:
            length_bucket, text_bucket = neg_comments_count, neg_train_comments
        length_bucket.append(len(comment))
        text_bucket.append(comment)


get_comments()
pos_total_count = len(pos_comments_count)
neg_total_count = len(neg_comments_count)
print(pos_total_count, neg_total_count)
# Relative frequency of each distinct string length
def cal_statics(comments_count=[]):
    """Return the share of entries in ``comments_count`` that equal each distinct value.

    Args:
        comments_count: Sequence of lengths (any hashable values work).

    Returns:
        Dict mapping each distinct value, in first-seen order, to
        ``occurrences / len(comments_count)``. Empty input gives ``{}``.
    """
    total = len(comments_count)
    occurrences = {}
    for value in comments_count:
        if value in occurrences:
            occurrences[value] += 1
        else:
            occurrences[value] = 1
    return {value: hits / total for value, hits in occurrences.items()}
def _plot_length_histogram(lengths, ratios, bar_color, caption):
    """Draw one width-1 bar chart on a new figure and return that figure."""
    figure = plt.figure()
    plt.bar(lengths, ratios, 1, color=bar_color)
    plt.xlabel("every comment string length")
    plt.ylabel("percentage of this string length")
    plt.title(caption)
    return figure


pos_statics = cal_statics(pos_comments_count)
neg_statics = cal_statics(neg_comments_count)
print(len(pos_statics), len(neg_statics))

# Re-key both distributions in ascending order of string length.
pos_statics = {length: pos_statics[length] for length in sorted(pos_statics)}
neg_statics = {length: neg_statics[length] for length in sorted(neg_statics)}

# x = string length, y = share of comments with that length.
pos_x, pos_y = list(pos_statics), list(pos_statics.values())
neg_x, neg_y = list(neg_statics), list(neg_statics.values())

# Positive-sample histogram first, then the negative one.
fig = _plot_length_histogram(pos_x, pos_y, "red", "positive comments histogram")
fig = _plot_length_histogram(neg_x, neg_y, "green", "negative comments histogram")
import collections
import jieba
def get_top20_words(comments=[]):
    """Return the 20 most frequent jieba tokens across ``comments``.

    Args:
        comments: Iterable of strings; each one is segmented with ``jieba.cut``.

    Returns:
        List of at most 20 tokens, most frequent first.
    """
    tally = collections.Counter()
    for text in comments:
        # Counting straight from the token stream; no intermediate word list.
        tally.update(jieba.cut(text))
    return [token for token, _ in tally.most_common(20)]
pos_top20_words = get_top20_words(pos_train_comments)
neg_top20_words = get_top20_words(neg_train_comments)
print(f'pos_top20_words:{pos_top20_words}')
print(f'neg_top20_words:{neg_top20_words}')

# Stop words: alphanumeric tokens that rank in the top 20 of both the
# positive and the negative comments, kept in positive-ranking order.
stop_words = [
    word
    for word in pos_top20_words
    if word.isalnum() and word in neg_top20_words
]
print(f'stop_words:{stop_words}')
pos_top20_words:[',', ',', '的', '。', '了', '是', '!', '很', '我', '也', '在', '有', '~', '都', '好', '.', '不錯', '就', '買', '這']
neg_top20_words:[',', ',', '的', '。', '了', '!', '是', '我', '不', '買', '就', '也', '都', '很', '有', '在', '?', '沒有', '!', '.']
stop_words:['的', '了', '是', '很', '我', '也', '在', '有', '都', '就', '買']
文本處理部分
import string
def text_preprocessing(comments=[]):
    """Drop stop words, punctuation and pure-digit tokens from every comment.

    Each comment is segmented with ``jieba.cut``. A token is kept only when it
    is alphanumeric, is not made up solely of digits, and is not listed in the
    module-level ``stop_words``.

    Args:
        comments: Iterable of comment strings.

    Returns:
        List with one cleaned string per input comment.
    """
    def _is_kept(token):
        return token.isalnum() and not token.isdigit() and token not in stop_words

    # NOTE(review): surviving tokens are concatenated with no separator, so the
    # word boundaries found by jieba are lost in the result - confirm intended.
    return [''.join(filter(_is_kept, jieba.cut(comment))) for comment in comments]
# Clean both corpora with text_preprocessing and print how many comments each holds.
# NOTE(review): the TfidfVectorizer calls later in this file are fed
# train_comments / test_comments rather than these *_new lists - confirm
# which input is intended.
train_comments_new = text_preprocessing(train_comments)
test_comments_new = text_preprocessing(test_comments)
print(len(train_comments_new), len(test_comments_new))
從文本中提取特徵
from sklearn.feature_extraction.text import TfidfVectorizer
# Build TF-IDF features with the vectorizer's default settings.
# NOTE(review): the inputs are train_comments / test_comments (the comma-joined
# jieba output), not the train_comments_new / test_comments_new lists produced
# by text_preprocessing - confirm this is intended.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_comments) # training features; the vectorizer is fitted on this data
y_train = np.array(train_labels) # training labels
X_test = vectorizer.transform(test_comments) # test features, transformed with the already-fitted vectorizer
y_test = np.array(test_labels) # test labels
print(np.shape(X_train), np.shape(X_test), np.shape(y_train), np.shape(y_test))
訓練模型以及選擇合適的超參數
利用邏輯迴歸來訓練模型
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
def process_text(text=''):
    """Strip non-alphanumeric characters from ``text`` and segment what is left.

    Returns:
        The ``jieba.cut`` tokens of the cleaned text, joined by ``', '``.
    """
    kept_chars = [ch for ch in text if ch.isalnum()]
    tokens = jieba.cut(''.join(kept_chars))
    return ', '.join(tokens)
# 5-fold grid search over seven values of C, log-spaced from 1e-3 to 1e3.
parameters = {'C': np.logspace(-3, 3, 7)}
lr = LogisticRegression(solver='liblinear')
clf = GridSearchCV(lr, parameters, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)

y_predict = clf.predict(X_test)
print(classification_report(y_test, y_predict))

# clf = LogisticRegression(C=1.0).fit(X_train, y_train)
# Score on the training data
print(f"訓練數據上的準確率爲:{clf.score(X_train, y_train)}")
# Score on the test data
print(f"測試數據上的準確率爲: {clf.score(X_test, y_test)}")

# Spot check: classify one hand-written comment.
test_comment1 = '這個寶貝還是比較不錯滴'
test_comment2 = '很不好,太差了'
test = [process_text(test_comment2)]
print(test)
print(vectorizer.transform(test))
print(clf.predict(vectorizer.transform(test)))
{'C': 1.0}
precision recall f1-score support
0 0.86 0.54 0.66 1250
1 0.67 0.91 0.77 1250
micro avg 0.73 0.73 0.73 2500
macro avg 0.76 0.73 0.72 2500
weighted avg 0.76 0.73 0.72 2500
訓練數據上的準確率爲:0.8721636701797892
測試數據上的準確率爲: 0.7268
['很, 不好, 太差, 了']
(0, 10188) 0.8064523512198745
(0, 3669) 0.591299082708519
['0']
利用SVM來訓練模型
from sklearn import svm
# TODO: train the model with an SVM
# 5-fold grid search over four kernels crossed with seven log-spaced C values (1e-3 to 1e3).
parameters = {'kernel':('linear', 'rbf', 'poly', 'sigmoid'), 'C':np.logspace(-3, 3, 7)}
svc = svm.SVC(gamma='scale')
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)
# Report per-class metrics on the test set using the fitted search object.
y_predict = clf.predict(X_test)
print(classification_report(y_test, y_predict))
{'C': 1.0, 'kernel': 'sigmoid'}
precision recall f1-score support
0 0.85 0.59 0.70 1250
1 0.69 0.89 0.78 1250
micro avg 0.74 0.74 0.74 2500
macro avg 0.77 0.74 0.74 2500
weighted avg 0.77 0.74 0.74 2500
仍然使用SVM模型,但在這裏使用Bayesian Optimization來尋找最好的超參數
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
from sklearn.svm import SVC
def svm_cv(C, gamma):
    """Objective for the Bayesian search: mean 5-fold CV score of an SVC.

    Both arguments are base-10 exponents, so the search runs in log space.

    Args:
        C: Exponent; the SVC is built with ``C=10 ** C``.
        gamma: Exponent; the SVC is built with ``gamma=10 ** gamma``.

    Returns:
        Mean of the five ``cross_val_score`` results on ``X_train`` / ``y_train``.
    """
    # Named `model` so the local does not shadow the `sklearn.svm` module.
    model = SVC(C=10 ** C, gamma=10 ** gamma, random_state=1)
    return cross_val_score(model, X_train, y_train, cv=5).mean()


# Exponent ranges: C from 1e-3 to 1e3 (same span as the grid search),
# gamma from 1e-4 to 1e1. The previous gamma exponents (2, 20) meant gamma
# ran from 1e2 to 1e20, and every probe scored the same ~0.62.
pbounds = {'C': (-3, 3), 'gamma': (-4, 1)}
# Fixed seed so the sequence of probed points is reproducible.
svm_bo = BayesianOptimization(svm_cv, pbounds=pbounds, random_state=1)
svm_bo.maximize()
| iter | target | C | gamma |
-------------------------------------------------
| 1 | 0.6206 | 0.3705 | 6.928 |
| 2 | 0.6206 | 0.9682 | 4.705 |
| 3 | 0.6206 | 0.7015 | 7.333 |
| 4 | 0.6206 | 0.5141 | 12.72 |
| 5 | 0.6206 | 0.6732 | 6.483 |
| 6 | 0.6206 | 0.6284 | 19.99 |
| 7 | 0.6206 | 0.04032 | 19.99 |
| 8 | 0.6208 | 0.8602 | 2.037 |
| 9 | 0.6206 | 0.1939 | 20.0 |
| 10 | 0.6208 | 0.2209 | 2.017 |
| 11 | 0.6206 | 0.8674 | 20.0 |
| 12 | 0.6208 | 0.58 | 2.033 |
| 13 | 0.6208 | 0.859 | 2.009 |
| 14 | 0.6206 | 0.9947 | 19.94 |
| 15 | 0.6208 | 0.06059 | 2.017 |
| 16 | 0.6206 | 0.2054 | 19.95 |
| 17 | 0.6208 | 0.8543 | 2.084 |
| 18 | 0.6208 | 0.103 | 2.021 |
| 19 | 0.6206 | 0.8894 | 19.97 |
| 20 | 0.6208 | 0.3986 | 2.015 |
| 21 | 0.6208 | 0.4768 | 2.015 |
| 22 | 0.6206 | 0.2138 | 19.98 |
| 23 | 0.6208 | 0.3656 | 2.135 |
| 24 | 0.6208 | 0.4324 | 2.012 |
| 25 | 0.6206 | 0.06989 | 20.0 |
| 26 | 0.6208 | 0.5922 | 2.132 |
| 27 | 0.6208 | 0.9851 | 2.101 |
| 28 | 0.6208 | 0.1901 | 2.146 |
| 29 | 0.6206 | 0.2751 | 20.0 |
| 30 | 0.6208 | 0.5422 | 2.036 |
=================================================
特徵: 添加n-gram特徵
# Rebuild the TF-IDF features with unigrams and bigrams (ngram_range=(1, 2)).
# NOTE(review): the inputs are train_comments / test_comments, not the
# train_comments_new / test_comments_new lists produced by text_preprocessing
# - confirm this is intended.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(train_comments) # training features with bigrams added; the vectorizer is fitted here
y_train = np.array(train_labels) # training labels
X_test = vectorizer.transform(test_comments) # test features with bigrams added
y_test = np.array(test_labels) # test labels
print (np.shape(X_train), np.shape(X_test), np.shape(y_train), np.shape(y_test))
利用邏輯迴歸來訓練模型
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
def process_text(text=''):
    """Strip non-alphanumeric characters from ``text`` and segment what is left.

    Returns:
        The ``jieba.cut`` tokens of the cleaned text, joined by ``', '``.
    """
    kept_chars = [ch for ch in text if ch.isalnum()]
    tokens = jieba.cut(''.join(kept_chars))
    return ', '.join(tokens)
# 5-fold grid search over seven values of C, log-spaced from 1e-3 to 1e3.
parameters = {'C': np.logspace(-3, 3, 7)}
lr = LogisticRegression(solver='liblinear')
clf = GridSearchCV(lr, parameters, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)

y_predict = clf.predict(X_test)
print(classification_report(y_test, y_predict))

# clf = LogisticRegression(C=1.0).fit(X_train, y_train)
# Score on the training data
print(f"訓練數據上的準確率爲:{clf.score(X_train, y_train)}")
# Score on the test data
print(f"測試數據上的準確率爲: {clf.score(X_test, y_test)}")

# Spot check: classify one hand-written comment.
test_comment1 = '這個寶貝還是比較不錯滴'
test_comment2 = '很不好,太差了'
test = [process_text(test_comment2)]
print(test)
print(vectorizer.transform(test))
print(clf.predict(vectorizer.transform(test)))
{'C': 10.0}
precision recall f1-score support
0 0.84 0.61 0.71 1250
1 0.69 0.89 0.78 1250
micro avg 0.75 0.75 0.75 2500
macro avg 0.77 0.75 0.74 2500
weighted avg 0.77 0.75 0.74 2500
訓練數據上的準確率爲:0.9952882827030378
測試數據上的準確率爲: 0.7484
['很, 不好, 太差, 了']
(0, 55400) 0.8064523512198745
(0, 15496) 0.591299082708519
['0']
利用SVM來訓練模型
from sklearn import svm
# 5-fold grid search over four kernels crossed with seven log-spaced C values (1e-3 to 1e3).
parameters = {'kernel':('linear', 'rbf', 'poly', 'sigmoid'), 'C':np.logspace(-3, 3, 7)}
svc = svm.SVC(gamma='scale')
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(X_train, y_train)
print(clf.best_params_)
# Report per-class metrics on the test set using the fitted search object.
y_predict = clf.predict(X_test)
print(classification_report(y_test, y_predict))
{'C': 1.0, 'kernel': 'linear'}
precision recall f1-score support
0 0.85 0.61 0.71 1250
1 0.70 0.89 0.78 1250
micro avg 0.75 0.75 0.75 2500
macro avg 0.77 0.75 0.75 2500
weighted avg 0.77 0.75 0.75 2500
仍然使用SVM模型,但在這裏使用Bayesian Optimization來尋找最好的超參數
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
from sklearn.svm import SVC
def svm_cv(C, gamma):
    """Objective for the Bayesian search: mean 5-fold CV score of an SVC.

    Both arguments are base-10 exponents, so the search runs in log space.

    Args:
        C: Exponent; the SVC is built with ``C=10 ** C``.
        gamma: Exponent; the SVC is built with ``gamma=10 ** gamma``.

    Returns:
        Mean of the five ``cross_val_score`` results on ``X_train`` / ``y_train``.
    """
    # Named `model` so the local does not shadow the `sklearn.svm` module.
    model = SVC(C=10 ** C, gamma=10 ** gamma, random_state=1)
    return cross_val_score(model, X_train, y_train, cv=5).mean()


# Exponent ranges: C from 1e-3 to 1e3 (same span as the grid search),
# gamma from 1e-4 to 1e1. The previous gamma exponents (2, 20) meant gamma
# ran from 1e2 to 1e20, and every probe scored the same ~0.62.
pbounds = {'C': (-3, 3), 'gamma': (-4, 1)}
# Fixed seed so the sequence of probed points is reproducible.
svm_bo = BayesianOptimization(svm_cv, pbounds=pbounds, random_state=1)
svm_bo.maximize()
| iter | target | C | gamma |
-------------------------------------------------
| 1 | 0.6202 | 0.1987 | 16.93 |
| 2 | 0.6202 | 0.9998 | 8.928 |
| 3 | 0.6202 | 0.381 | 12.99 |
| 4 | 0.6202 | 0.2872 | 15.86 |
| 5 | 0.6202 | 0.845 | 7.817 |
| 6 | 0.6201 | 0.9857 | 2.006 |
| 7 | 0.6202 | 0.2213 | 20.0 |
| 8 | 0.6202 | 0.04001 | 2.014 |
| 9 | 0.6202 | 0.9077 | 20.0 |
| 10 | 0.6201 | 0.6421 | 2.061 |
| 11 | 0.6202 | 0.8399 | 20.0 |
| 12 | 0.6201 | 0.2333 | 2.027 |
| 13 | 0.6202 | 0.5979 | 19.97 |
| 14 | 0.6202 | 0.6701 | 2.109 |
| 15 | 0.6202 | 0.1631 | 19.95 |
| 16 | 0.6201 | 0.4138 | 2.022 |
| 17 | 0.6202 | 0.1256 | 19.98 |
| 18 | 0.6201 | 0.09698 | 2.062 |
| 19 | 0.6202 | 0.3008 | 19.91 |
| 20 | 0.6202 | 0.281 | 19.97 |
| 21 | 0.6202 | 0.6433 | 2.072 |
| 22 | 0.6202 | 0.5776 | 19.98 |
| 23 | 0.6201 | 0.8474 | 2.026 |
| 24 | 0.6202 | 0.8294 | 19.94 |
| 25 | 0.6202 | 0.005122 | 19.99 |
| 26 | 0.6201 | 0.9513 | 2.034 |
| 27 | 0.6202 | 0.03517 | 19.99 |
| 28 | 0.6201 | 0.1467 | 2.003 |
| 29 | 0.6202 | 0.5118 | 19.99 |
| 30 | 0.6201 | 0.7277 | 2.028 |
=================================================