@@ -0,0 +1,219 @@
+import gc
+import os
+import time
+
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+from sklearn.preprocessing import LabelEncoder
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+
+from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
+from deepctr.models import xDeepFM
+
+
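+# Encode a '|'-separated multi-value string as a list of integer ids, growing
+# the global key2index vocabulary on the fly (0 is reserved for padding).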
+def split(x):
+    key_ans = x.split('|')
+    for key in key_ans:
+        if key not in key2index:
+            # Notice: input value 0 is a special "padding", so we do not use 0
+            # to encode a valid feature for sequence input.
+            key2index[key] = len(key2index) + 1
+    return list(map(lambda k: key2index[k], key_ans))
+
+
+if __name__ == "__main__":
+
+    begin_time = time.time()
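+    # Load the raw user-action feature table; sparse_features lists the
+    # categorical columns fed to the model.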
+ data = pd.read_csv("/work/xielixun/dwa_sum_graphembedding_user_action_feature_app_20210225.csv")
+    sparse_features = ["videoid", "mid",
+                       "videoGenre1", "videoGenre2", "userRatedVideo1", "userRatedVideo2",
+                       "userRatedVideo3", "userGenre1", "userGenre2", "userCity",
+                       "authorid", "userRealplayCount", "videoRealPlayCount", "videoDuration"]
+
+    # target = ['rating']
+    target = ['label']
+
+    feature_max_idx = {}
+
+    # Drop rows whose user id (mid) is unknown.
+    data = data[data["mid"] != "unknown"].copy()
+
+    # Same functionality as the function above; see the DeepMatch DSSM example.
+    def add_index_column(param_df, column_name):
+        values = list(param_df[column_name].unique())
+        value_index_dict = {value: idx for idx, value in enumerate(values)}
+        if column_name == "mid":
+            # Keep the raw mid as uidx and remap mid to a dense 0-based index.
+            param_df["uidx"] = param_df[column_name].copy()
+            param_df["mid"] = param_df[column_name].map(value_index_dict)
+            feature_max_idx["mid"] = param_df["mid"].max() + 1
+
+ add_index_column(data, "mid")
+    feature_max_idx["videoid"] = data["videoid"].max() + 1
+
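+    # Label-encode every sparse feature; genre/city NaNs get default values
+    # first, and mid/videoid are skipped because they were indexed above.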
+ for column_name in sparse_features:
+        lbe = LabelEncoder()
+        print("\n\n-------------- " + column_name)
+        print(data[column_name])
+
+        if column_name in ("videoGenre1", "videoGenre2", "videoGenre3",
+                           "userGenre1", "userGenre2", "userGenre3"):
+            data[column_name].fillna("社会", inplace=True)
+
+        if column_name == "userCity":
+            data[column_name].fillna("北京", inplace=True)
+
+        # mid and videoid already have dense indices.
+        if column_name in ("mid", "videoid"):
+            continue
+
+        data[column_name] = lbe.fit_transform(data[column_name])
+        feature_max_idx[column_name] = data[column_name].max() + 1
+
+    key2index = {}
+
+ print("\n\n ************ data process finish")
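+    # Collect each user's positively-labeled videos as their watch history.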
+ user_video_list_df = data[data["label"] > 0].copy().groupby("mid")['videoid'].apply(list).reset_index()
+    user_video_list_df.rename(columns={'videoid': 'hist_video_id'}, inplace=True)
+
+    print(user_video_list_df)
+    print(type(user_video_list_df))
+
+ max_len = 50
+    mid_list = list(user_video_list_df["mid"])
+    print(user_video_list_df["mid"])
+    user_video_list_df["hist_len"] = user_video_list_df["hist_video_id"].apply(len)
+    print(user_video_list_df)
+    print(len(user_video_list_df))
+ emb_dim = 10
+
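+    # One embedding column per sparse feature, all with the same embedding size.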
+ fixlen_feature_columns = [SparseFeat(feat, feature_max_idx[feat], embedding_dim=emb_dim)
+                              for feat in sparse_features]
+
+    print(fixlen_feature_columns)
+
+    use_weighted_sequence = False
+    if use_weighted_sequence:
+        varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
+            key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
+            weight_name='genres_weight')]  # Notice: value 0 is for padding for sequence input feature
+    else:
+        # varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
+        #     key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
+        #     weight_name=None)]  # Notice: value 0 is for padding for sequence input feature
+        # The history sequence shares its embedding table with videoid via embedding_name.
+        varlen_feature_columns = [VarLenSparseFeat(SparseFeat('hist_video_id', vocabulary_size=feature_max_idx["videoid"], embedding_dim=emb_dim,
+                                                              embedding_name="videoid"), maxlen=max_len, combiner='mean',
+                                                   length_name="hist_len")]  # Notice: value 0 is for padding for sequence input feature
+
+    print(varlen_feature_columns)
+    # linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
+    linear_feature_columns = fixlen_feature_columns
+    dnn_feature_columns = fixlen_feature_columns  # + varlen_feature_columns
+    print(dnn_feature_columns)
+    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
+
+ print("data len is: ", len(data))
+
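+    # Attach each user's history columns to every sample; the inner join drops
+    # users that have no positive history.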
+ df_merge = pd.merge(left=data,
+                        right=user_video_list_df,
+                        left_on="mid",
+                        right_on="mid",
+                        # how="right")
+                        how="inner")
+    print(df_merge.head())
+
+ print("df_merge len is: ", len(df_merge))
+
+    # Shuffle once up front so the validation split drawn by fit() is a random sample.
+    df_merge = df_merge.sample(frac=1.0)
+    # df_merge = df_merge.sample(frac=0.3)
+    del data, user_video_list_df
+    gc.collect()
+
+    print("after sample df_merge len is: ", len(df_merge))
+
+    model_input = {name: df_merge[name] for name in sparse_features}
+
+    # df_merge["hist_video_id"].fillna(str([0, 0, 0]), inplace=True)
+    # df_merge["hist_len"].fillna(0, inplace=True)
+
+    print(df_merge)
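+    # Pad/truncate each history to max_len (post-padding with the reserved id 0).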
+ video_hist_seq_pad = pad_sequences(df_merge["hist_video_id"], maxlen=max_len, padding='post', truncating='post', value=0)
+
+    model_input["hist_video_id"] = video_hist_seq_pad
+    print("\n\n\n")
+    print(video_hist_seq_pad)
+    print("\n\nuser_vids_input len is: ", len(df_merge["hist_video_id"]))
+
+    model_input["hist_len"] = df_merge["hist_len"]
+
+    print("\n\nuser_vids_len_input len is: ", len(df_merge["hist_len"]))
+ user_features = ["uidx", "mid",
+                     "userRatedVideo1", "userRatedVideo2", "userRatedVideo3",
+                     "userGenre1", "userGenre2", "userCity",
+                     "userRealplayCount", "hist_video_id", "hist_len"]
+
+    item_features = ["videoid", "videoGenre1", "videoGenre2",
+                     "authorid", "videoRealPlayCount", "videoDuration"]
+
+    # Serialize the padded history as a space-separated string (same content as video_hist_seq_pad).
+    df_merge["hist_video_id"] = list(map(lambda x: ' '.join(map(str, x)), video_hist_seq_pad))
+
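+    # Export deduplicated user and item feature tables, stamped with the current time.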
+ user_df = df_merge[user_features].drop_duplicates('uidx')
+    video_df = df_merge[item_features].drop_duplicates('videoid')
+
+    from datetime import datetime
+
+    TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.now())
+    user_df.to_csv("user_df" + TIMESTAMP + ".csv", index=False)
+    video_df.to_csv("video_df" + TIMESTAMP + ".csv", index=False)
+
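+    # Binary-classification xDeepFM (linear + CIN + DNN) over the fixed-length columns.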
+ model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
+
+    logdir = os.path.join("log_callbacks")  # TensorBoard needs a directory for its logs
+    if not os.path.exists(logdir):
+        os.mkdir(logdir)
+    output_model_file = os.path.join(logdir, 'xdeepfm_model.h5')
+
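+    # TensorBoard logging, best-model checkpointing (on val_loss by default),
+    # and early stopping when validation loss stops improving.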
+ callbacks = [
+        tf.keras.callbacks.TensorBoard(logdir),
+        tf.keras.callbacks.ModelCheckpoint(output_model_file,
+                                           save_best_only=True),
+        tf.keras.callbacks.EarlyStopping(patience=5, min_delta=1e-3),
+    ]
+
+ METRICS = [
+        tf.keras.metrics.TruePositives(name='tp'),
+        tf.keras.metrics.FalsePositives(name='fp'),
+        tf.keras.metrics.TrueNegatives(name='tn'),
+        tf.keras.metrics.FalseNegatives(name='fn'),
+        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
+        tf.keras.metrics.Precision(name='precision'),
+        tf.keras.metrics.Recall(name='recall'),
+        # tf.keras.metrics.AUC(name='auc'),
+        tf.keras.metrics.AUC(name='auc-ROC', curve='ROC'),
+        tf.keras.metrics.AUC(name='auc-PRC', curve='PR')
+    ]
+
+ model.compile(
+        loss='binary_crossentropy',
+        optimizer='adam',
+        metrics=METRICS
+    )
+
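+    # Train on the 'label' target with a 20% validation split.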
+ model.fit(model_input, df_merge[target].values,
+              batch_size=4096, epochs=2,
+              verbose=2, validation_split=0.2, callbacks=callbacks)
+
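+    # Persist the model twice: as a single HDF5 file and as a versioned
+    # directory (SavedModel format, version 001).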
+ model.save("./tensorflow_xdeepfm-0325-tzld-2.h5")
+    tf.keras.models.save_model(model,
+                               "file:///work/xielixun/xDeepFM0325/tensorflow_xdeepfm-0325-tzld/001",
+                               overwrite=True,
+                               include_optimizer=True,
+                               save_format=None,
+                               signatures=None,
+                               options=None)
+
+    print("******* train xdeepfm cost time is: " + str(time.time() - begin_time))