4 years ago · f2ce233c7c
--- a/match_recall/Dssm_tzld_match.py
+++ b/match_recall/Dssm_tzld_match.py
@@ -213,7 +213,6 @@ if __name__ == "__main__":
 
				 
			
 
				     if tf.__version__ >= '2.0.0':
			
 
				         tf.compat.v1.disable_eager_execution()
			
 
				-    # 3.Define Model and train
			
 
				 
			
 
				     model = DSSM(user_feature_columns, item_feature_columns)
			
 
				     logdir = os.path.join("log_callbacks_dssm")  # Tensorboard需要一个文件夹
			
--- a/match_recall/FM_tzld_match.py
+++ b/match_recall/FM_tzld_match.py
@@ -168,7 +168,6 @@ if __name__ == "__main__":
 
				     user_profile.set_index("mid", inplace=True)
			
 
				     print(data)
			
 
				     print("\n\n after group by mid videoid")
			
 
				-    # print(data)
			
 
				     del data
			
 
				     gc.collect()
			
 
				     # 按序列采样，没有加负采样，会再训练时的batch里进行batch负采样
			
@@ -215,7 +214,6 @@ if __name__ == "__main__":
 
				 
			
 
				     if tf.__version__ >= '2.0.0':
			
 
				         tf.compat.v1.disable_eager_execution()
			
 
				-    # 3.Define Model and train
			
 
				 
			
 
				     model = FM(user_feature_columns, item_feature_columns)
			
 
				     logdir = os.path.join("log_callbacks_fm")  # Tensorboard需要一个文件夹
			
--- a/match_recall/item2Vec-userEmbedding-tzld.py
+++ b/match_recall/item2Vec-userEmbedding-tzld.py
@@ -1,12 +1,9 @@
 
				-#!/usr/bin/env python
			
 
				 # coding: utf-8
			
 
				 import pandas as pd
			
 
				 import time
			
 
				 
			
 
				 begin_time = time.time()
			
 
				 
			
 
				-# In[31]:
			
 
				-
			
 
				 
			
 
				 #df = pd.read_csv("./datas/item2vecTzld1106.csv")
			
 
				 #df = pd.read_csv("/root/xielixun/item2vec_app_20201126.csv")
			
--- a/match_recall/spark-item2Vec-tzld.py
+++ b/match_recall/spark-item2Vec-tzld.py
@@ -1,4 +1,3 @@
 
				-#!/usr/bin/env python
			
 
				 # coding: utf-8
			
 
				 
			
 
				 import pandas as pd
			
@@ -64,7 +63,7 @@ sc = spark.sparkContext
 
				 #df = spark.read.csv("./datas/tzld_uid_videoids_app_20210406.csv", header=True)
			
 
				 df = spark.read.csv("./datas/tzld_uid_videoids_app_20210419.csv", header=True)
			
 
				 df.show(15)
			
 
				-# In[9]:
			
 
				+
			
 
				 from pyspark.sql import functions as F
			
 
				 from pyspark.sql import types as T
			
 
				 
			
--- a/match_recall/youtube_tzld_match.py
+++ b/match_recall/youtube_tzld_match.py
@@ -265,8 +265,7 @@ if __name__ == "__main__":
 
				 
			
 
				     item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)
			
 
				 
			
 
				-    # #### 得到user embedding
			
 
				-
			
 
				+    # 得到user embedding
			
 
				     user_layer_model = tf.keras.models.Model(
			
 
				         inputs=[model.user_input],
			
 
				         # outputs=model.get_layer("user_embedding").output
			
@@ -275,7 +274,7 @@ if __name__ == "__main__":
 
				 
			
 
				     user_embeddings = []
			
 
				 
			
 
				-    # #### 得到video embedding
			
 
				+    # 得到video embedding
			
 
				     video_layer_model = tf.keras.models.Model(
			
 
				         inputs=[model.item_input],
			
 
				         # outputs=model.get_layer("item_embedding").output
			
@@ -292,7 +291,6 @@ if __name__ == "__main__":
 
				             np.reshape(row["videoGenre1"], [1, 1]),
			
 
				             # np.reshape(row["videoGenre2"], [1, 1]),
			
 
				             np.reshape(row["authorid"], [1, 1]),
			
 
				-            #
			
 
				             np.reshape(row["videoRealPlayCount"], [1, 1]),
			
 
				             np.reshape(row["videoDuration"], [1, 1])
			
 
				         ]