Quellcode durchsuchen

add start shell and rank model

xielixun vor 4 Jahren
Ursprung
Commit
a4d1f51b2c

+ 11 - 54
.idea/workspace.xml

@@ -2,57 +2,12 @@
 <project version="4">
   <component name="ChangeListManager">
     <list default="true" id="6b55ac01-3e23-447a-a42e-54e011f0a33e" name="Default Changelist" comment="">
-      <change afterPath="$PROJECT_DIR$/deep_rank/DeepFM_tzld_rank.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/deep_rank/WideNDeep_tzld_rank.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-dssm/app_dssm_0329.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-dssm/embedding_manager.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-dssm/embedding_manager_user.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-fm/app_fm_0329.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-fm/embedding_manager.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-fm/embedding_manager_user.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-item2vec/app_item2vec_0316.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-item2vec/embedding_manager.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-item2vec/embedding_manager_user.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/Dssm_tzld_match.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/FM_tzld_match.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/LightGCN-PyTorch/code/Procedure.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/LightGCN-PyTorch/code/dataloader.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/LightGCN-PyTorch/code/main.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/LightGCN-PyTorch/code/model.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/LightGCN-PyTorch/code/parse.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/LightGCN-PyTorch/code/register.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/LightGCN-PyTorch/code/utils.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/LightGCN-PyTorch/code/world.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/LightGCN-PyTorch/imgs/tf.jpg" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/LightGCN-PyTorch/imgs/torch.png" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/LightGCN-PyTorch/requirements.txt" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/LightGCN.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/__init__.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/cpp/apt_evaluate_foldout.cpp" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/cpp/apt_evaluate_foldout.cpython-37m-x86_64-linux-gnu.so" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/cpp/apt_evaluate_foldout.pyx" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/cpp/apt_tools.cpp" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/cpp/apt_tools.cpython-37m-x86_64-linux-gnu.so" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/cpp/apt_tools.pyx" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/cpp/build/temp.linux-x86_64-3.7/apt_evaluate_foldout.o" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/cpp/build/temp.linux-x86_64-3.7/apt_tools.o" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/cpp/evaluate_foldout.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/cpp/include/evaluate_foldout.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/cpp/include/thread_pool.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/cpp/include/tools.h" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/python/evaluate_foldout.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/evaluator/python/evaluate_loo.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/utility/batch_test.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/utility/helper.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/utility/load_data.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/LightGCN_match/utility/parser.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/item2Vec-userEmbedding-tzld.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/mind_tzld_match.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/preprocess_tzld210322_gen.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/preprocess_tzld210423_gen.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/spark-item2Vec-tzld.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/video_semantic_emb_bert.py" afterDir="false" />
-      <change afterPath="$PROJECT_DIR$/match_recall/youtube_tzld_match.py" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/deep_rank/FiBiNET_tzld_rank.py" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/deep_rank/xdeepfm_tzld_rank.py" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-dssm/start-prod.sh" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-fm/start-prod.sh" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-item2vec/start-prod.sh" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
     </list>
     <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
     <option name="SHOW_DIALOG" value="false" />
@@ -72,13 +27,15 @@
   </component>
   <component name="ProjectId" id="1reOMONht7RSJ9cTOLXJqtDnNmi" />
   <component name="PropertiesComponent">
-    <property name="last_opened_file_path" value="$PROJECT_DIR$/emb_faiss" />
+    <property name="last_opened_file_path" value="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-item2vec" />
   </component>
   <component name="RecentsManager">
     <key name="CopyFile.RECENT_KEYS">
-      <recent name="$PROJECT_DIR$/emb_faiss" />
+      <recent name="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-item2vec" />
+      <recent name="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-fm" />
+      <recent name="$PROJECT_DIR$/emb_faiss/faiss-tzld-videos-users-dssm" />
       <recent name="$PROJECT_DIR$/deep_rank" />
-      <recent name="$PROJECT_DIR$/match_recall" />
+      <recent name="$PROJECT_DIR$/emb_faiss" />
     </key>
   </component>
   <component name="RunDashboard">

+ 181 - 0
deep_rank/FiBiNET_tzld_rank.py

@@ -0,0 +1,181 @@
+import numpy as np
+import pandas as pd
+import gc
+import os
+import time
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.python.keras.preprocessing.sequence import pad_sequences
+
+from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
+
+import tensorflow as tf
+from deepctr.models import *
+
+def split(x):
+    key_ans = x.split('|')
+    for key in key_ans:
+        if key not in key2index:
+            # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
+            key2index[key] = len(key2index) + 1
+    return list(map(lambda x: key2index[x], key_ans))
+
+
+if __name__ == "__main__":  # Script entry: load the CSV, encode features, train FiBiNET, save the model.
+
+    begin_time = time.time()  # start of the wall-clock measurement printed at the end of the script
+    data = pd.read_csv("/work/xielixun/dwa_sum_graphembedding_user_action_feature_app_20210225.csv")
+    sparse_features = ["videoid", "mid",
+                       "videoGenre1", "videoGenre2",  "userRatedVideo1", "userRatedVideo2", "userRatedVideo3", "userGenre1", "userGenre2", "userCity",
+                       "authorid", "userRealplayCount", "videoRealPlayCount", "videoDuration"]  # every column listed here becomes a SparseFeat below
+
+    target = ['label']  # label column passed to model.fit
+    feature_max_idx = {}  # column name -> (largest encoded id + 1)
+    data = data[data["mid"] != "unknown"].copy()  # drop rows whose mid is the literal "unknown"
+    data["mid"].replace("unknown", "N000111111D", inplace=True)  # NOTE(review): no "unknown" value survives the filter above, so this looks like a no-op
+    data = data[data["mid"] != "N000111111D"].copy()  # drops rows whose mid is the placeholder "N000111111D"
+
+    # 和上面函数的功能是一样的,见 deepMatch  DSSM
+    def add_index_column(param_df, column_name):
+        values = list(param_df[column_name].unique())
+        value_index_dict = {value: idx for idx, value in enumerate(values)}
+        if column_name == "mid":
+            param_df["uidx"] = param_df[column_name].copy()
+            param_df["mid"] = param_df[column_name].map(value_index_dict)
+            feature_max_idx["mid"] = param_df["mid"].max() + 1
+
+    add_index_column(data, "mid")  # raw mid is kept in "uidx"; "mid" becomes a dense integer index
+
+    for column_name in sparse_features:
+        lbe = LabelEncoder()  # fresh encoder for every column
+        print("\n\n--------------   " + column_name)
+        print(data[column_name])
+
+        if column_name == "videoGenre1" or column_name == "videoGenre2" or \
+                column_name == "videoGenre3" or column_name == "userGenre1" or column_name == "userGenre2" or column_name == "userGenre3":  # NOTE(review): videoGenre3 / userGenre3 are not in sparse_features, so those two comparisons never match
+            data[column_name].fillna("社会", inplace=True)  # default value for missing genres
+
+        if column_name == "userCity":
+            data[column_name].fillna("北京", inplace=True)  # default value for a missing city
+
+        if column_name == "mid":  # mid was already indexed by add_index_column above
+            continue
+
+        data[column_name] = lbe.fit_transform(data[column_name])  # replace values with integer ids (videoid included)
+        feature_max_idx[column_name] = data[column_name].max() + 1  # number of ids for this column
+
+    key2index = {}  # token -> id table used by split(); stays empty in this script
+
+    print("\n\n  ************ data process finish")
+    user_video_list_df = data[data["label"] > 0].copy().groupby("mid")['videoid'].apply(list).reset_index()  # per-user list of videoids from rows with label > 0
+
+    user_video_list_df.rename(columns={'videoid': 'hist_video_id'}, inplace=True)
+    max_len = 50  # length every history sequence is padded / truncated to
+    # print(list(user_video_length.keys()))
+    # print(list(user_video_length.keys))
+    mid_list = list(user_video_list_df["mid"])  # NOTE(review): not referenced again in this script
+    print(user_video_list_df["mid"])
+    # print(mid_list)
+
+    user_video_list_df["hist_len"] = user_video_list_df["hist_video_id"].apply(lambda x: len(x))  # full (untruncated) history length
+    print(user_video_list_df)
+    print(len(user_video_list_df))
+    emb_dim = 10  # embedding size shared by all sparse features
+
+    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(), embedding_dim=emb_dim)
+                              for feat in sparse_features]  # vocabulary size is the number of distinct values per column
+    print(fixlen_feature_columns)
+
+    use_weighted_sequence = False  # hard-coded, so only the else branch below ever runs
+    if use_weighted_sequence:
+        varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
+            key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
+                                                   weight_name='genres_weight')]  # Notice: value 0 is reserved for padding in sequence input features
+    else:
+        # varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
+        #     key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
+        #                                            weight_name=None)]  # Notice : value 0 is for padding for sequence input feature
+        varlen_feature_columns = [VarLenSparseFeat(SparseFeat('hist_video_id', vocabulary_size=feature_max_idx["videoid"], embedding_dim=emb_dim,
+                                                              embedding_name="videoid"), maxlen=max_len, combiner='mean',
+                                                   length_name="hist_len")]  # Notice: value 0 is reserved for padding; embedding_name="videoid" reuses the videoid embedding name
+
+    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
+    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns  # same columns feed both model inputs
+    print(dnn_feature_columns)
+    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)  # NOTE(review): not referenced again in this script
+    df_merge = pd.merge(left=data,
+                        right=user_video_list_df,
+                        left_on="mid",
+                        right_on="mid",
+                        how="right")  # attaches hist_video_id / hist_len to each row; only users with a positive row are kept
+    df_merge.head()  # result is discarded
+
+    print("df_merge len is:  ", len(df_merge))
+
+    df_merge = df_merge.sample(frac=1.0)  # shuffle all rows
+    del data, user_video_list_df
+    gc.collect()  # release the intermediate frames before training
+
+    print("after sample df_merge len is:  ", len(df_merge))
+
+    model_input = {name: df_merge[name] for name in sparse_features}  # one input per sparse feature column
+    video_hist_seq_pad = pad_sequences(df_merge["hist_video_id"], maxlen=max_len, padding='post', truncating='post', value=0)  # NOTE(review): pad value 0 is also a valid LabelEncoder id for videoid, and hist_len is not capped at max_len — confirm both are intended
+
+    model_input["hist_video_id"] = video_hist_seq_pad
+    print("\n\n\n")
+    print(video_hist_seq_pad)
+    print("\n\nuser_vids_input len is:  ", len(df_merge["hist_video_id"]))
+
+    # model_input["genres_weight"] = np.random.randn(data.shape[0], max_len, 1)
+    # model_input["hist_len"] = np.array(user_vids_len_input)
+    model_input["hist_len"] = df_merge["hist_len"]  # matches length_name="hist_len" of the varlen column
+
+    print("\n\nuser_vids_len_input len is:  ", len(df_merge["hist_len"]))
+    model = FiBiNET(linear_feature_columns, dnn_feature_columns, task='binary')
+    logdir = os.path.join("log_callbacks")  # TensorBoard needs a directory
+    if not os.path.exists(logdir):
+        os.mkdir(logdir)
+    output_model_file = os.path.join(logdir,
+                                     'xdeepfm_model.h5')  # NOTE(review): xDeepFM file name in the FiBiNET script — confirm it is not a copy-paste leftover
+
+    callbacks = [
+        tf.keras.callbacks.TensorBoard(logdir),
+        tf.keras.callbacks.ModelCheckpoint(output_model_file,
+                                           save_best_only=True),
+        tf.keras.callbacks.EarlyStopping(patience=5, min_delta=1e-3),  # NOTE(review): fit() below runs epochs=5, so patience=5 looks like it can never trigger
+    ]
+
+    METRICS = [
+        tf.keras.metrics.TruePositives(name='tp'),
+        tf.keras.metrics.FalsePositives(name='fp'),
+        tf.keras.metrics.TrueNegatives(name='tn'),
+        tf.keras.metrics.FalseNegatives(name='fn'),
+        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
+        tf.keras.metrics.Precision(name='precision'),
+        tf.keras.metrics.Recall(name='recall'),
+        # tf.keras.metrics.AUC(name='auc'),
+        tf.keras.metrics.AUC(name='auc-ROC', curve='ROC'),
+        tf.keras.metrics.AUC(name='auc-PRC', curve='PR')
+    ]
+
+    model.compile(
+        loss='binary_crossentropy',
+        optimizer='adam',
+        metrics=METRICS
+    )
+
+    model.fit(model_input, df_merge[target].values,
+                        batch_size=2048, epochs=5,
+              verbose=2, validation_split=0.2, callbacks=callbacks)  # last 20% of the shuffled rows are held out for validation
+
+    model.save("./tensorflow_xdeepfm-0325-tzld-2.h5")  # NOTE(review): same file name as the xDeepFM script writes — confirm the overwrite is intended
+
+    tf.keras.models.save_model(model,
+                               "file:///work/xielixun/xDeepFM0325/tensorflow_xdeepfm-0325-tzld/001",
+                               overwrite=True,
+                               include_optimizer=True,
+                               save_format=None,
+                               signatures=None,
+                               options=None)  # NOTE(review): xDeepFM export path with overwrite=True — confirm this script should replace that model
+
+    print("*******  train FiBiNET cost time is:  " + str(time.time() - begin_time))
+

+ 219 - 0
deep_rank/xdeepfm_tzld_rank.py

@@ -0,0 +1,219 @@
+import numpy as np
+import pandas as pd
+import gc
+import os
+import time
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.python.keras.preprocessing.sequence import pad_sequences
+
+from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
+
+import tensorflow as tf
+from deepctr.models import xDeepFM
+
+
+def split(x):
+    key_ans = x.split('|')
+    for key in key_ans:
+        if key not in key2index:
+            # Notice : input value 0 is a special "padding",so we do not use 0 to encode valid feature for sequence input
+            key2index[key] = len(key2index) + 1
+    return list(map(lambda x: key2index[x], key_ans))
+
+
+if __name__ == "__main__":  # Script entry: load the CSV, encode features, export user/video CSVs, train xDeepFM, save the model.
+
+    begin_time = time.time()  # start of the wall-clock measurement printed at the end of the script
+    data = pd.read_csv("/work/xielixun/dwa_sum_graphembedding_user_action_feature_app_20210225.csv")
+    sparse_features = ["videoid", "mid",
+                       "videoGenre1", "videoGenre2",  "userRatedVideo1", "userRatedVideo2", "userRatedVideo3", "userGenre1", "userGenre2", "userCity",
+                       "authorid", "userRealplayCount", "videoRealPlayCount", "videoDuration"]  # every column listed here becomes a SparseFeat below
+
+    # target = ['rating']
+    target = ['label']  # label column passed to model.fit
+
+    feature_max_idx = {}  # column name -> vocabulary size handed to SparseFeat
+
+    data = data[data["mid"] != "unknown"].copy()  # drop rows whose mid is the literal "unknown"
+    data["mid"].replace("unknown", "N000111111D", inplace=True)  # NOTE(review): no "unknown" value survives the filter above, so this looks like a no-op
+    data = data[data["mid"] != "N000111111D"].copy()  # drops rows whose mid is the placeholder "N000111111D"
+
+    # 和上面函数的功能是一样的,见 deepMatch  DSSM
+    def add_index_column(param_df, column_name):
+        values = list(param_df[column_name].unique())
+        value_index_dict = {value: idx for idx, value in enumerate(values)}
+        if column_name == "mid":
+            param_df["uidx"] = param_df[column_name].copy()
+            param_df["mid"] = param_df[column_name].map(value_index_dict)
+            feature_max_idx["mid"] = param_df["mid"].max() + 1
+
+    add_index_column(data, "mid")  # raw mid is kept in "uidx"; "mid" becomes a dense integer index
+    feature_max_idx["videoid"] = data["videoid"].max() + 1  # videoid is used as-is; NOTE(review): assumes it is already a non-negative integer — confirm
+
+    for column_name in sparse_features:
+        lbe = LabelEncoder()  # fresh encoder for every column
+        print("\n\n--------------   " + column_name)
+        print(data[column_name])
+
+        if column_name == "videoGenre1" or column_name == "videoGenre2" or \
+                column_name == "videoGenre3" or column_name == "userGenre1" or column_name == "userGenre2" or column_name == "userGenre3":  # NOTE(review): videoGenre3 / userGenre3 are not in sparse_features, so those two comparisons never match
+            data[column_name].fillna("社会", inplace=True)  # default value for missing genres
+
+        if column_name == "userCity":
+            data[column_name].fillna("北京", inplace=True)  # default value for a missing city
+
+        if column_name == "mid":  # mid was already indexed by add_index_column above
+            continue
+        if column_name == "videoid":  # videoid keeps its raw values; its size was recorded before the loop
+            continue
+
+        data[column_name] = lbe.fit_transform(data[column_name])  # replace values with integer ids
+        feature_max_idx[column_name] = data[column_name].max() + 1  # number of ids for this column
+
+    key2index = {}  # token -> id table used by split(); stays empty in this script
+
+    print("\n\n  ************ data process finish")
+    user_video_list_df = data[data["label"] > 0].copy().groupby("mid")['videoid'].apply(list).reset_index()  # per-user list of videoids from rows with label > 0
+    user_video_list_df.rename(columns={'videoid': 'hist_video_id'}, inplace=True)
+
+    print(user_video_list_df)
+    print(type(user_video_list_df))
+
+    max_len = 50  # length every history sequence is padded / truncated to
+    # print(list(user_video_length.keys()))
+    # print(list(user_video_length.keys))
+    mid_list = list(user_video_list_df["mid"])  # NOTE(review): not referenced again in this script
+    print(user_video_list_df["mid"])
+    user_video_list_df["hist_len"] = user_video_list_df["hist_video_id"].apply(lambda x: len(x))  # full (untruncated) history length
+    print(user_video_list_df)
+    print(len(user_video_list_df))
+    emb_dim = 10  # embedding size shared by all sparse features
+
+    fixlen_feature_columns = [SparseFeat(feat, feature_max_idx[feat], embedding_dim=emb_dim)
+                              for feat in sparse_features]  # vocabulary size is (largest id + 1) per column
+
+    print(fixlen_feature_columns)
+
+    use_weighted_sequence = False  # hard-coded, so only the else branch below ever runs
+    if use_weighted_sequence:
+        varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
+            key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
+                                                   weight_name='genres_weight')]  # Notice: value 0 is reserved for padding in sequence input features
+    else:
+        # varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
+        #     key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
+        #                                            weight_name=None)]  # Notice : value 0 is for padding for sequence input feature
+        varlen_feature_columns = [VarLenSparseFeat(SparseFeat('hist_video_id', vocabulary_size=feature_max_idx["videoid"], embedding_dim=emb_dim,
+                                                              embedding_name="videoid"), maxlen=max_len, combiner='mean',
+                                                   length_name="hist_len")]  # Notice: value 0 is reserved for padding; embedding_name="videoid" reuses the videoid embedding name
+
+    print(varlen_feature_columns)
+    #linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
+    linear_feature_columns = fixlen_feature_columns  # the varlen history column is built above but not passed to the model
+    dnn_feature_columns = fixlen_feature_columns # + varlen_feature_columns (disabled)
+    print(dnn_feature_columns)
+    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)  # NOTE(review): not referenced again in this script
+
+    print("data len is:  ", len(data))
+
+    df_merge = pd.merge(left=data,
+                        right=user_video_list_df,
+                        left_on="mid",
+                        right_on="mid",
+                        #how="right")
+                        how="inner")  # attaches hist_video_id / hist_len to each row; only users with a positive row are kept
+    df_merge.head()  # result is discarded
+
+    print("df_merge len is:  ", len(df_merge))
+
+    df_merge = df_merge.sample(frac=1.0)  # shuffle all rows
+    #df_merge = df_merge.sample(frac=0.3)
+    del data, user_video_list_df
+    gc.collect()  # release the intermediate frames before training
+
+    print("after sample df_merge len is:  ", len(df_merge))
+
+    model_input = {name: df_merge[name] for name in sparse_features}  # one input per sparse feature column
+
+    # df_merge["hist_video_id"].fillna(str([0, 0, 0]), inplace=True)
+    # df_merge["hist_len"].fillna(0, inplace=True)
+
+    print(df_merge)
+    video_hist_seq_pad = pad_sequences(df_merge["hist_video_id"], maxlen=max_len, padding='post', truncating='post', value=0)  # histories cut / zero-padded at the end to max_len
+
+    model_input["hist_video_id"] = video_hist_seq_pad  # NOTE(review): the model is built from fixlen columns only — confirm Keras accepts this extra input key
+    print("\n\n\n")
+    print(video_hist_seq_pad)
+    print("\n\nuser_vids_input len is:  ", len(df_merge["hist_video_id"]))
+
+    model_input["hist_len"] = df_merge["hist_len"]  # NOTE(review): likewise not consumed by any feature column passed to the model
+
+    print("\n\nuser_vids_len_input len is:  ", len(df_merge["hist_len"]))
+    user_features = ["uidx", "mid",
+                     "userRatedVideo1", "userRatedVideo2", "userRatedVideo3",
+                     "userGenre1", "userGenre2", "userCity",
+                     "userRealplayCount", "hist_video_id", "hist_len"]  # columns written to the user CSV
+
+    item_features = ["videoid", "videoGenre1", "videoGenre2",
+                     "authorid", "videoRealPlayCount", "videoDuration"]  # columns written to the video CSV
+
+    df_merge["hist_video_id"] = list(map(lambda x: ' '.join(list(map(str, x))), video_hist_seq_pad))  # store each padded history as a space-joined string (original note: the statements above and below do the same thing)
+
+    user_df = df_merge[user_features].drop_duplicates('uidx')  # one row per raw user id
+    video_df = df_merge[item_features].drop_duplicates('videoid')  # one row per video
+
+    from datetime import datetime
+
+    TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.now())  # suffix that keeps exports from different runs apart
+    user_df.to_csv("user_df" + TIMESTAMP + ".csv", index=False)
+    video_df.to_csv("video_df" + TIMESTAMP + ".csv", index=False)
+
+    model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
+
+    logdir = os.path.join("log_callbacks")  # TensorBoard needs a directory
+    if not os.path.exists(logdir):
+        os.mkdir(logdir)
+    output_model_file = os.path.join(logdir,
+                                     'xdeepfm_model.h5')  # checkpoint file written by ModelCheckpoint below
+
+    callbacks = [
+        tf.keras.callbacks.TensorBoard(logdir),
+        tf.keras.callbacks.ModelCheckpoint(output_model_file,
+                                           save_best_only=True),
+        tf.keras.callbacks.EarlyStopping(patience=5, min_delta=1e-3),  # NOTE(review): fit() below runs epochs=2, so patience=5 looks like it can never trigger
+    ]
+
+    METRICS = [
+        tf.keras.metrics.TruePositives(name='tp'),
+        tf.keras.metrics.FalsePositives(name='fp'),
+        tf.keras.metrics.TrueNegatives(name='tn'),
+        tf.keras.metrics.FalseNegatives(name='fn'),
+        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
+        tf.keras.metrics.Precision(name='precision'),
+        tf.keras.metrics.Recall(name='recall'),
+        # tf.keras.metrics.AUC(name='auc'),
+        tf.keras.metrics.AUC(name='auc-ROC', curve='ROC'),
+        tf.keras.metrics.AUC(name='auc-PRC', curve='PR')
+    ]
+
+    model.compile(
+        loss='binary_crossentropy',
+        optimizer='adam',
+        metrics=METRICS
+    )
+
+    model.fit(model_input, df_merge[target].values,
+                        batch_size=4096, epochs=2,
+              verbose=2, validation_split=0.2, callbacks=callbacks)  # last 20% of the shuffled rows are held out for validation
+
+    model.save("./tensorflow_xdeepfm-0325-tzld-2.h5")  # local HDF5 copy of the trained model
+    tf.keras.models.save_model(model,
+                               "file:///work/xielixun/xDeepFM0325/tensorflow_xdeepfm-0325-tzld/001",
+                               overwrite=True,
+                               include_optimizer=True,
+                               save_format=None,
+                               signatures=None,
+                               options=None)  # second export under the /work path; an existing export there is overwritten
+
+    print("*******  train xdeepfm cost time is:  " + str(time.time() - begin_time))
+

+ 24 - 0
emb_faiss/faiss-tzld-videos-users-dssm/start-prod.sh

@@ -0,0 +1,24 @@
+#!/bin/sh
+
+file_log=./log
+file_img_video=/datalog/video_data
+
+mkdir -p ${file_log}
+mkdir -p ${file_img_video}
+
+if [ ! -d ${file_log}]; then
+    mkdir -p ${file_log}
+else
+    echo "log文件夹已经存在"
+fi
+
+if [ ! -d ${file_img_video}]; then
+    mkdir -p ${file_img_video}
+else
+    echo "img_video文件夹已经存在"
+fi
+
+#rm -rf ${file_log}/run*
+ps -ef | grep "app_dssm_0329.py" | grep -v grep | awk '{print $2}' | xargs kill -9
+
+nohup python3 -u  ./app_dssm_0329.py --env prod > /dev/null 2>&1 &

+ 24 - 0
emb_faiss/faiss-tzld-videos-users-fm/start-prod.sh

@@ -0,0 +1,24 @@
+#!/bin/sh
+
+file_log=./log
+file_img_video=/datalog/video_data
+
+mkdir -p ${file_log}
+mkdir -p ${file_img_video}
+
+if [ ! -d ${file_log}]; then
+    mkdir -p ${file_log}
+else
+    echo "log文件夹已经存在"
+fi
+
+if [ ! -d ${file_img_video}]; then
+    mkdir -p ${file_img_video}
+else
+    echo "img_video文件夹已经存在"
+fi
+
+#rm -rf ${file_log}/run*
+ps -ef | grep "app_fm_0329.py" | grep -v grep | awk '{print $2}' | xargs kill -9
+
+nohup python3 -u  ./app_fm_0329.py --env prod > /dev/null 2>&1 &

+ 24 - 0
emb_faiss/faiss-tzld-videos-users-item2vec/start-prod.sh

@@ -0,0 +1,24 @@
+#!/bin/sh
+
+file_log=./log
+file_img_video=/datalog/video_data
+
+mkdir -p ${file_log}
+mkdir -p ${file_img_video}
+
+if [ ! -d ${file_log}]; then
+    mkdir -p ${file_log}
+else
+    echo "log文件夹已经存在"
+fi
+
+if [ ! -d ${file_img_video}]; then
+    mkdir -p ${file_img_video}
+else
+    echo "img_video文件夹已经存在"
+fi
+
+#rm -rf ${file_log}/run*
+ps -ef | grep "app_item2vec_0316.py" | grep -v grep | awk '{print $2}' | xargs kill -9
+
+nohup python3 -u  ./app_item2vec_0316.py --env prod > /dev/null 2>&1 &