【tensorflow學習】Ftrl學習

【tensorflow學習】處理MNIST數據集

理論

理論知識
交叉熵理解

應用

#encoding=utf8
import tensorflow as tf
import pandas as pd
import argparse
import numpy as np
import gzip
import os
import sys

def parse_arg():
    """Parse the command-line options of the training script.

    Options:
        --train_data_dir (str, required): directory holding the training files.
        --batch_size (int, default 10).
        --feature_num (int, default 14).

    Returns:
        argparse.Namespace with attributes ``train_data_dir``, ``batch_size``
        and ``feature_num``.
    """
    # The first positional parameter of ArgumentParser is `prog`, not
    # `description`; pass the sentence by keyword so the usage line keeps the
    # real script name and the sentence appears as the help description.
    parser = argparse.ArgumentParser(
        description="Training for FTRL Ctr model.")
    parser.add_argument(
        "--train_data_dir",
        type=str,
        required=True,
        help="The path of training data.")
    parser.add_argument(
        "--batch_size",
        type=int,
        default=10,
        help="The number of batch size.")
    parser.add_argument(
        "--feature_num",
        type=int,
        default=14,
        help="The number of features.")
    return parser.parse_args()

def smart_open(file_name):
    """
    @Brief: Open a file, using gzip for names that end in ".gz".

    Both branches open with mode 'r'. For gzip.open that is binary mode, so
    iterating a ".gz" handle yields bytes while a plain file yields str.
    On any failure the error is reported on stderr and None is returned.
    """
    try:
        # Pick the opener from the last three characters of the name.
        opener = gzip.open if file_name[-3:] == ".gz" else open
        return opener(file_name, 'r')
    except Exception as err:
        print("smart open file_name: {0} error: {1}".format(file_name, err), file=sys.stderr)
        return None
        
def one_hot_feature(feature_list):
    """One-hot encode a table of categorical values, column by column.

    Each column gets its own vocabulary; a value's index is the order in
    which it was first seen in that column. Every row is encoded as the
    concatenation of one one-hot segment per column.

    Args:
        feature_list: sequence of rows. The number of columns is taken from
            the first row; every row must have at least that many entries
            (extra entries are ignored), and values must be hashable.

    Returns:
        A float numpy array of shape (rows, total distinct values over all
        columns). An empty input yields an array of shape (0, 0).
    """
    m = len(feature_list)
    if m == 0:
        return np.zeros((0, 0))
    n = len(feature_list[0])

    # One independent dict per column. `[{}] * n` would repeat a reference to
    # a single dict, merging all column vocabularies and producing colliding
    # indices and over-wide segments.
    vocab_by_col = [{} for _ in range(n)]
    for row in feature_list:
        for j in range(n):
            vocab = vocab_by_col[j]
            if row[j] not in vocab:
                vocab[row[j]] = len(vocab)

    # Starting position of each column's segment in the encoded row.
    offsets = []
    width = 0
    for vocab in vocab_by_col:
        offsets.append(width)
        width += len(vocab)

    encoded = np.zeros((m, width))
    for i, row in enumerate(feature_list):
        for j in range(n):
            encoded[i, offsets[j] + vocab_by_col[j][row[j]]] = 1.0
    return encoded
    
def read_data(file_dir, feature_num):
    """Read labels and raw feature names from every file directly in a directory.

    Each line is split on tabs and only the first tab field is used. That
    field is split on single spaces: token 0 is ignored, token 1 is parsed as
    the float label, and each following token contributes the text before its
    first ':' as one feature.

    Args:
        file_dir: directory to scan (sub-directories are skipped).
        feature_num: number of features to keep per sample; lines with fewer
            features are dropped entirely.

    Returns:
        (labels, features): `labels` is a list of one-element lists
        ``[float]`` and `features` is a list of lists of `feature_num`
        strings. Both lists have the same length and matching order.

    Raises:
        ValueError: if the label token of a kept line is not a valid float.
    """
    labels = []
    features = []
    for name in os.listdir(file_dir):
        path = file_dir + "/" + name
        if os.path.isdir(path):
            continue
        handle = smart_open(path)
        if handle is None:
            # smart_open has already reported the failure on stderr.
            continue
        # Close the handle even if parsing raises.
        with handle:
            for line in handle:
                # gzip handles yield bytes and plain files yield str; test the
                # value itself, since a substring test on the path misfires
                # for names such as "logz.txt".
                if isinstance(line, bytes):
                    line = line.decode()
                data = line.strip().split('\t')
                label_feature = data[0].split(' ')
                if len(label_feature) < 2:
                    # Blank or truncated line: no label token to read.
                    continue
                feature = [item.split(':')[0] for item in label_feature[2:]]
                if len(feature) < feature_num:
                    continue
                # Append the label only for kept samples so labels[i] always
                # corresponds to features[i].
                labels.append([float(label_feature[1])])
                features.append(feature[:feature_num])
    return labels, features


def model(feature, label):
    """Fit a logistic-regression model with the FTRL optimizer and print progress.

    Builds a TensorFlow 1.x graph (placeholders, one weight column, one bias),
    then runs 10 full-batch training steps in a fresh session. After every
    step it prints the step number, the loss and the streaming AUC.

    Args:
        feature: 2-D array whose second dimension is the input width
            (read through ``feature.shape[1]``); fed to a float32
            placeholder of shape [None, n].
        label: values fed to a float32 placeholder of shape [None, 1].

    Returns:
        None. Results are only printed.
    """
    n = feature.shape[1]
    x = tf.placeholder(dtype=tf.float32, name='x', shape=[None, n])
    y = tf.placeholder(dtype=tf.float32, name='y', shape=[None, 1])
    W = tf.Variable(tf.truncated_normal([n, 1], stddev=0.1))
    bias = tf.Variable(tf.zeros([1]))

    # Compute the logits once and derive both the probability and the loss
    # from them.
    logits = tf.matmul(x, W) + bias
    predict_y = tf.sigmoid(logits)

    # Summed binary cross-entropy. It is the same quantity as
    # -y^T log(p) - (1 - y)^T log(1 - p), but evaluated from the logits so a
    # saturated sigmoid cannot produce log(0) and turn the loss into NaN/inf.
    loss = tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits))
    opt = tf.train.FtrlOptimizer(
        0.03,
        l1_regularization_strength=0.01,
        l2_regularization_strength=0.01).minimize(loss)

    # tf.metrics.auc keeps its accumulators in local variables, hence the
    # local initializer below.
    auc_value, auc_op = tf.metrics.auc(labels=y, predictions=predict_y)
    init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init)
        for i in range(10):
            # auc_op is fetched to update the metric accumulators before
            # auc_value is read.
            _, _loss, _ = sess.run([opt, loss, auc_op], feed_dict={x: feature, y: label})
            print("pass: {0}  loss = {1}  auc = {2}".format(i, _loss, sess.run(auc_value)))


if __name__ == '__main__':
    # Script entry point: parse options, load the raw samples, one-hot encode
    # the features and train on them.
    cli_args = parse_arg()
    raw_labels, raw_features = read_data(cli_args.train_data_dir, cli_args.feature_num)
    model(one_hot_feature(raw_features), raw_labels)

To Do

實現大規模離散LR的參數稀疏存儲及更新

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章