Browse Source

release: training code of the DSSM model

often 6 months ago
parent
commit
96db7e190c

+ 3 - 3
recommend-model-produce/src/main/python/models/dssm/bq_reader_train_ps.py

@@ -12,9 +12,9 @@ class DSSMReader(MultiSlotDataGenerator):
     def line_process(self, line):
         try:
             # 按tab分割样本的各个字段
-            sample_values = line.rstrip('\n').split('    ')
-            if len(sample_values) == 4: # 训练格式
-                sample_id, label, left_features, right_features = sample_values
+            sample_values = line.rstrip('\n').split('\t')
+            if len(sample_values) == 6: # 训练格式
+                sample_id, label, vid_left,vid_right,left_features, right_features = sample_values
                 # 转换label为整数
                 label = int(label)
                 

+ 74 - 12
recommend-model-produce/src/main/python/models/dssm/net.py

@@ -7,35 +7,97 @@ class DSSMLayer(nn.Layer):
     def __init__(self, feature_nums=[5,5,5,5,5], embedding_dim=8, output_dim=16, 
                  hidden_layers=[40, 32], hidden_acts=["relu", "relu"]):
         super(DSSMLayer, self).__init__()
-        
-        self.feature_num = len(feature_nums)
+        tag_features_dict = {
+            "vid":3407301,
+            "cate1":42,
+            "cate2":67,
+            "video_style":6517,
+            "valid_time":728,
+            "captions_color":656,
+            "audience_age_group":65,
+            "audience_value_type":61,
+            "font_size":49,
+            "cover_persons_num":44,
+            "audience_gender":37,
+            "sentiment_tendency":11,
+            "video_type":8,
+            "background_music_type":6,
+            "captions":3,
+            "has_end_credit_guide":2
+        }
+                
+        self.stat_features_num = 47 
+        self.stat_features_num_embeddings = 100
+        
+        self.feature_num = len(list(tag_features_dict.values()))
+        feature_nums = list(tag_features_dict.values())
         self.embedding_dim = embedding_dim
         self.output_dim = output_dim
         # 第一层的输入维度是所有特征的embedding拼接
-        self.hidden_layers = [self.feature_num * embedding_dim] + hidden_layers + [output_dim]
+        self.hidden_layers = [self.feature_num * embedding_dim + 47*3*embedding_dim] + hidden_layers + [output_dim]
         self.hidden_acts = hidden_acts
         
         
         # 为每个特征创建对应维度的Embedding层
-        self.left_embeddings = nn.LayerList([
-            nn.Embedding(
+        self.left_embeddings = nn.LayerList()
+        for i in range(self.feature_num): # 视频的静态特征
+            layer = nn.Embedding(
                 num_embeddings=feature_nums[i],
                 embedding_dim=embedding_dim,
                 weight_attr=paddle.ParamAttr(
                     initializer=paddle.nn.initializer.XavierNormal()
                 )
-            ) for i in range(self.feature_num)
-        ])
+            )
+            self.left_embeddings.append(layer)
+        for i in range(self.stat_features_num): #长短期和品类的表现 分桶后映射成index
+            layer = nn.Embedding(
+                num_embeddings=self.stat_features_num_embeddings,
+                embedding_dim=embedding_dim,
+                weight_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.XavierNormal()
+                )
+            )
+            self.left_embeddings.append(layer)
+        for i in range(self.stat_features_num*2): # 长短期和品类的表现,hive表的原始值映射到0-1之间的小数 + hive表的原始值
+            layer = nn.Linear(
+                in_features=1,
+                out_features=embedding_dim,
+                weight_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.XavierNormal()
+                )
+            )
+            self.left_embeddings.append(layer)                        
+        
 
-        self.right_embeddings = nn.LayerList([
-            nn.Embedding(
-                num_embeddings=feature_nums[i], 
+        self.right_embeddings = nn.LayerList()
+        for i in range(self.feature_num): # 视频的静态特征
+            layer = nn.Embedding(
+                num_embeddings=feature_nums[i],
+                embedding_dim=embedding_dim,
+                weight_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.XavierNormal()
+                )
+            )
+            self.right_embeddings.append(layer)
+        for i in range(self.stat_features_num): #长短期和品类的表现 分桶后映射成index
+            layer = nn.Embedding(
+                num_embeddings=self.stat_features_num_embeddings,
                 embedding_dim=embedding_dim,
                 weight_attr=paddle.ParamAttr(
                     initializer=paddle.nn.initializer.XavierNormal()
                 )
-            ) for i in range(self.feature_num)
-        ])
+            )
+            self.right_embeddings.append(layer)
+        for i in range(self.stat_features_num*2): # 长短期和品类的表现,hive表的原始值映射到0-1之间的小数 + hive表的原始值
+            layer = nn.Linear(
+                in_features=1,
+                out_features=embedding_dim,
+                weight_attr=paddle.ParamAttr(
+                    initializer=paddle.nn.initializer.XavierNormal()
+                )
+            )
+            self.right_embeddings.append(layer)  
+
 
         # 左视频塔
         self._left_tower = []