DeepFM: adapting the TensorFlow code

My code repository: https://github.com/beathahahaha/tensorflow-DeepFM-master-original

Original DeepFM author's repository: https://github.com/ChenglongChen/tensorflow-DeepFM

Recommended blog post walking through the DeepFM code: https://mp.weixin.qq.com/s/QrO48ZdP483TY_EnnWFhsQ

To get familiar with how the code is used, I wrote a test_1.py script in the example folder that can be run directly (run it from inside the example directory).

I. Defining the DeepFM inputs:

  You need a train.csv (59 columns, containing both continuous and categorical values; the multi-class categorical features are all coded as 0, 1, 2, 3). test.csv is only needed for producing the Kaggle competition submission and is optional.

  (See this dataset for the reference format: https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data?select=train.csv)
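
To make the expected layout concrete, here is a minimal, hand-made sketch of data in that shape; the column names mimic the Porto Seguro naming and all values are invented, so treat this purely as an illustration:

```python
import pandas as pd

# Tiny stand-in for train.csv: an id column, a binary target,
# an integer-coded categorical column and a continuous column.
toy_train = pd.DataFrame({
    "id":            [1, 2, 3, 4],
    "target":        [0, 1, 0, 0],
    "ps_car_01_cat": [0, 2, 1, 3],          # categorical, coded as 0/1/2/3
    "ps_reg_01":     [0.7, 0.8, 0.0, 0.9],  # continuous
})
toy_train.to_csv("train.csv", index=False)
```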

II. Defining the DeepFM outputs:

  yy = dfm.predict(Xi_valid_, Xv_valid_) returns a 1-D np.array of floats, each value being a predicted probability.
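
For example, a minimal sketch of consuming that output (assuming a fitted dfm and the parsed Xi_valid_, Xv_valid_, y_valid_ from the script below):

```python
from sklearn.metrics import roc_auc_score

proba = dfm.predict(Xi_valid_, Xv_valid_)     # 1-D np.array of probabilities in [0, 1]
labels = (proba > 0.5).astype(int)            # hard 0/1 predictions at a 0.5 threshold
print("valid AUC:", roc_auc_score(y_valid_, proba))
```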

TensorFlow 1.14 (GPU build) is recommended.

What should you watch out for if you want to adapt (DIY) the code to your own data?

Answer:

1. The settings in config.py are tightly coupled to the input data; define the categorical and numeric columns correctly (see the config.py sketch after this list).

2. The format of the data fed in must be strictly consistent; remember to update the column-name related parts of test_1.py (which is why I recommend using test_1.py rather than the original author's main.py).
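
A minimal sketch of what config.py needs to define, judging from the names referenced in test_1.py (TRAIN_FILE, TEST_FILE, NUM_SPLITS, RANDOM_SEED, IGNORE_COLS, CATEGORICAL_COLS, NUMERIC_COLS); the paths and column lists here are placeholders to be replaced with your own:

```python
# config.py (sketch; adjust the paths and column lists to your own data)
TRAIN_FILE = "./data/train.csv"
TEST_FILE = "./data/test.csv"

NUM_SPLITS = 3        # number of folds for StratifiedKFold
RANDOM_SEED = 2017

# columns dropped from the feature set
IGNORE_COLS = ["id", "target"]

# integer-coded categorical columns (example names)
CATEGORICAL_COLS = ["ps_car_01_cat", "ps_ind_02_cat"]

# continuous-valued columns (example names)
NUMERIC_COLS = ["ps_reg_01", "ps_reg_02"]
```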

test_1.py:

```python
import tensorflow as tf
from sklearn.metrics import roc_auc_score
import os
import sys

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

import config
from metrics import gini_norm
from DataReader import FeatureDictionary, DataParser

sys.path.append("..")
from DeepFM import DeepFM


def _load_data():
    dfTrain = pd.read_csv(config.TRAIN_FILE)
    dfTest = pd.read_csv(config.TEST_FILE)

    cols = [c for c in dfTrain.columns if c not in ["id", "target"]]
    cols = [c for c in cols if (not c in config.IGNORE_COLS)]

    X_train = dfTrain[cols].values
    y_train = dfTrain["target"].values
    X_test = dfTest[cols].values
    ids_test = dfTest["id"].values
    cat_features_indices = [i for i, c in enumerate(cols) if c in config.CATEGORICAL_COLS]

    return dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices


def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    for i, (train_idx, valid_idx) in enumerate(folds):
        # k-fold CV: within each fold, fit() runs `epoch` rounds of training,
        # and each epoch feeds the data in mini-batches
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)  # fit() also evaluates on train and valid

        yy = dfm.predict(Xi_valid_, Xv_valid_)  # yy: np.array of raw probabilities; y_valid_: list

        # threshold at 0.5 to turn the probabilities into hard 0/1 labels
        # (this overwrites yy in place, so y_train_meta below stores labels, not probabilities)
        for index in range(len(yy)):
            if yy[index] <= 0.5:
                yy[index] = 0
            else:
                yy[index] = 1

        print("accuracy_score(y_valid_, yy):", accuracy_score(y_valid_, yy))

        y_train_meta[valid_idx, 0] = yy

        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

    y_test_meta /= float(len(folds))

    return y_train_meta, y_test_meta


# params
dfm_params = {
    "use_fm": True,
    "use_deep": True,
    "embedding_size": 8,
    "dropout_fm": [1.0, 1.0],
    "deep_layers": [32, 32],
    "dropout_deep": [0.5, 0.5, 0.5],
    "deep_layers_activation": tf.nn.relu,
    "epoch": 10,
    "batch_size": 1024,
    "learning_rate": 0.001,
    "optimizer_type": "adam",
    "batch_norm": 1,
    "batch_norm_decay": 0.995,
    "l2_reg": 0.01,
    "verbose": True,
    "eval_metric": roc_auc_score,
    "random_seed": 2017
}

dfTrain, dfTest, X_train, y_train, X_test, ids_test, cat_features_indices = _load_data()

folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS, shuffle=True,
                             random_state=config.RANDOM_SEED).split(X_train, y_train))

y_train_dfm, y_test_dfm = _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params)

print("over")
```

For reference, the training and validation data follow this preparation pattern from the original repository, where prepare(…) stands for whatever produces data in the required format (in test_1.py that role is played by DataParser.parse):

```python
Xi_train, Xv_train, y_train = prepare(…)
Xi_valid, Xv_valid, y_valid = prepare(…)
```
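
As far as I can tell from the upstream code (DataParser and DeepFM.fit), the required format is one list per sample: Xi holds a feature index per field (looked up in the FeatureDictionary), Xv holds the matching feature value (1.0 for one-hot categorical fields, the raw value for numeric fields), and y holds the binary labels. A hypothetical, hand-written example:

```python
# Hypothetical illustration of the Xi/Xv/y format (indices and values are made up)
Xi = [[1, 5, 9],        # sample 0: one feature index per field (3 fields here)
      [2, 5, 9]]        # sample 1
Xv = [[1.0, 1.0, 0.7],  # 1.0 for the categorical (one-hot) fields, raw value for the numeric field
      [1.0, 1.0, 0.3]]
y = [0, 1]              # binary targets
```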