本人代码库: https://github.com/beathahahaha/tensorflow-DeepFM-master-original
DeepFM原作者代码库: https://github.com/ChenglongChen/tensorflow-DeepFM
解析DeepFM代码 博客推荐:https://mp.weixin.qq.com/s/QrO48ZdP483TY_EnnWFhsQ
为了熟悉该代码的使用,我在example文件夹编写了一个test_1.py文件,可以直接运行
一、定义DeepFM 输入:
需要train.csv(59列,既有连续型数值,也有离散型数值,其中多分类特征都用0,1,2,3表示),test.csv是kaggle比赛时用来生成输出结果的数据,非必要
(参考该数据格式:https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data?select=train.csv)
二、定义DeepFM 输出:
yy = dfm.predict(Xi_valid_, Xv_valid_) 得到一维np.array,其中每个数值为float类型,代表概率值
tensorflow 建议1.14 gpu版本
如果自己要DIY的话,要注意哪些地方呢?
答:
1. config.py 里面的设置,和输入数据密切相关,要定义好离散型和连续型的列
2. 喂入的数据格式必须严格统一,注意修改test_1.py 中的列标签名字相关的内容(因此建议使用test_1.py 而不是原作者的main.py)
test_1.py:
import tensorflow as tf
from sklearn.metrics import roc_auc_score
import os
import sys
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import config
from metrics import gini_norm
from DataReader import FeatureDictionary, DataParser
sys.path.append("..")
from DeepFM import DeepFM
def _load_data():
    """Load the train/test CSV files and extract the raw feature arrays.

    Reads ``config.TRAIN_FILE`` and ``config.TEST_FILE`` with pandas. The
    feature columns are every column of the training frame except ``"id"``,
    ``"target"`` and the names listed in ``config.IGNORE_COLS``; the same
    column list is applied to the test frame.

    Returns:
        A 7-tuple ``(dfTrain, dfTest, X_train, y_train, X_test, ids_test,
        cat_features_indices)``:

        * ``dfTrain`` / ``dfTest`` -- the DataFrames as read from disk.
        * ``X_train`` / ``X_test`` -- ``.values`` of the selected feature
          columns of each frame.
        * ``y_train`` -- ``.values`` of the training ``"target"`` column.
        * ``ids_test`` -- ``.values`` of the test ``"id"`` column.
        * ``cat_features_indices`` -- positions (within the selected feature
          columns) of the columns named in ``config.CATEGORICAL_COLS``.
    """
    dfTrain = pd.read_csv(config.TRAIN_FILE)
    dfTest = pd.read_csv(config.TEST_FILE)

    # Keep the training frame's column order; drop the identifier, the label
    # and anything the config marks as ignored.
    non_feature_cols = ("id", "target")
    cols = [
        c for c in dfTrain.columns
        if c not in non_feature_cols and c not in config.IGNORE_COLS
    ]

    X_train = dfTrain[cols].values
    y_train = dfTrain["target"].values
    X_test = dfTest[cols].values
    ids_test = dfTest["id"].values

    cat_features_indices = [
        i for i, c in enumerate(cols) if c in config.CATEGORICAL_COLS
    ]
    return dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    """Train one DeepFM model per fold and collect its predictions.

    Args:
        dfTrain: Training DataFrame; it is parsed with ``has_label=True``.
        dfTest: Test DataFrame; it is parsed without labels.
        folds: Sized sequence of ``(train_idx, valid_idx)`` index pairs into
            the parsed training rows (``len(folds)`` is used for averaging).
        dfm_params: Keyword arguments for ``DeepFM``. NOTE: this dict is
            updated in place with ``"feature_size"`` and ``"field_size"``
            before the models are built.

    Returns:
        ``(y_train_meta, y_test_meta)``, two float arrays of shape
        ``(n_rows, 1)``:

        * ``y_train_meta`` holds, for every row that appears in some fold's
          validation indices, the value ``dfm.predict`` produced for it in
          that fold (rows never used for validation stay 0).
        * ``y_test_meta`` holds the test-set predictions averaged over all
          folds.

    Side effects:
        Prints the validation accuracy (predictions thresholded at 0.5) of
        every fold.
    """
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    # The third value returned for the unlabeled frame is not needed here.
    Xi_test, Xv_test, _ = data_parser.parse(df=dfTest)

    # These two sizes depend on the data, so they are filled in here rather
    # than hard-coded by the caller.
    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

    def _get(rows, indices):
        """Return the elements of *rows* at the given *indices* as a list."""
        return [rows[i] for i in indices]

    for train_idx, valid_idx in folds:
        # K-fold cross-validation. Per the original author's note, each fit()
        # runs `epoch` training rounds, every round is fed in batches, and
        # fit() also evaluates on the train and validation splits.
        Xi_train_ = _get(Xi_train, train_idx)
        Xv_train_ = _get(Xv_train, train_idx)
        y_train_ = _get(y_train, train_idx)
        Xi_valid_ = _get(Xi_train, valid_idx)
        Xv_valid_ = _get(Xv_train, valid_idx)
        y_valid_ = _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        # predict() yields probabilities. Threshold a *separate* array for the
        # accuracy report so the values stored in y_train_meta stay
        # probabilities, consistent with what is accumulated in y_test_meta.
        valid_proba = np.asarray(dfm.predict(Xi_valid_, Xv_valid_))
        valid_labels = np.where(valid_proba <= 0.5, 0, 1)
        print("accuracy_score(y_valid_, yy):", accuracy_score(y_valid_, valid_labels))

        y_train_meta[valid_idx, 0] = valid_proba
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

    # Turn the per-fold sum of test predictions into a mean.
    y_test_meta /= float(len(folds))
    return y_train_meta, y_test_meta
# Keyword arguments passed to DeepFM(**dfm_params). "feature_size" and
# "field_size" are not listed here: _run_base_model_dfm derives them from the
# parsed data and adds them to this dict before building the model.
dfm_params = {
    # Model structure
    "use_fm": True,
    "use_deep": True,
    "embedding_size": 8,
    "dropout_fm": [1.0, 1.0],
    "deep_layers": [32, 32],
    "dropout_deep": [0.5, 0.5, 0.5],
    "deep_layers_activation": tf.nn.relu,
    # Training schedule and optimizer
    "epoch": 10,
    "batch_size": 1024,
    "learning_rate": 0.001,
    "optimizer_type": "adam",
    # Normalization / regularization
    "batch_norm": 1,
    "batch_norm_decay": 0.995,
    "l2_reg": 0.01,
    # Reporting and reproducibility
    "verbose": True,
    "eval_metric": roc_auc_score,
    "random_seed": 2017,
}
# Read the CSV files named in config and pull out the raw feature arrays.
(
    dfTrain,
    dfTest,
    X_train,
    y_train,
    X_test,
    ids_test,
    cat_features_indices,
) = _load_data()

# Materialize the stratified, shuffled K-fold (train_idx, valid_idx) pairs
# once so the same splits can be indexed and counted later.
folds = list(
    StratifiedKFold(
        n_splits=config.NUM_SPLITS,
        shuffle=True,
        random_state=config.RANDOM_SEED,
    ).split(X_train, y_train)
)

# Train one DeepFM per fold and collect the train/test predictions.
y_train_dfm, y_test_dfm = _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params)
print("over")
手机扫一扫
移动阅读更方便
你可能感兴趣的文章