| 
					
				 | 
			
			
				@@ -244,8 +244,7 @@ object video_dssm_sampler { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-      // 获取tag特征 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      // 获取左视频tag特征 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       val tagFeatures = { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         val rdd = odpsOps.readTable( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				           project = "loghubods", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -262,7 +261,7 @@ object video_dssm_sampler { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-      // 获取统计特征 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      // 获取左视频统计特征 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       val statFeatures = { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         val rdd = odpsOps.readTable( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				           project = "loghubods", 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -304,11 +303,73 @@ object video_dssm_sampler { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         .persist(StorageLevel.MEMORY_AND_DISK) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-      // 合并所有右侧特征并生成最终结果 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      // 获取右视频tag特征 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      val tagRightFeatures = { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        val rdd = odpsOps.readTable( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          project = "loghubods", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          table = "t_vid_tag_feature", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          partition = s"dt='$dt'", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          transfer = funcTagFeatures, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          numPartition = CONFIG("shuffle.partitions").toInt 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        val schema = StructType(Array( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          StructField("vid", StringType, true), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          StructField("vid_right_tag_feature", StringType, true) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        )) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        spark.createDataFrame(rdd.map(t => Row(t._1, t._2)), schema) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      // Fetch right-video statistical features (joined on vid_right as vid_right_stat_feature) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      val statRightFeatures = { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        val rdd = odpsOps.readTable( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          project = "loghubods", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          table = "t_vid_stat_feature", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          partition = s"dt='$dt'", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          transfer = funcStatFeatures, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          numPartition = CONFIG("shuffle.partitions").toInt 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        val schema = StructType(Array( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          StructField("vid", StringType, true), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          StructField("vid_right_stat_feature", StringType, true) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        )) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        spark.createDataFrame(rdd.map(t => Row(t._1, t._2)), schema) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      val categoryRightWithStats = categoryData 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        .join(broadcast(l1CatStatFeatures), categoryData("category1") === l1CatStatFeatures("category1"), "left") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        .join(broadcast(l2CatStatFeatures), categoryData("category2") === l2CatStatFeatures("category2"), "left") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        .select( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          col("vid"), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          col("cate_l1_feature").as("vid_right_cate_l1_feature"), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          col("cate_l2_feature").as("vid_right_cate_l2_feature") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       val result = vidLeftFeatures 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-      //  .join(broadcast(tagFeatures), col("vid_right") === tagFeatures("vid"), "left") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-      //  .join(broadcast(statFeatures), col("vid_right") === statFeatures("vid"), "left") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-      //  .join(broadcast(categoryData), col("vid_right") === categoryData("vid"), "left") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        .join(broadcast(tagRightFeatures), col("vid_right") === tagRightFeatures("vid"), "left") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        .drop(tagRightFeatures("vid")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        .join(broadcast(statRightFeatures), col("vid_right") === statRightFeatures("vid"), "left") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        .drop(statRightFeatures("vid")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        .join(broadcast(categoryRightWithStats), col("vid_right") === categoryRightWithStats("vid"), "left") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        .drop(categoryRightWithStats("vid")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        .select( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          vidLeftFeatures("*"), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          col("vid_right_tag_feature"), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          col("vid_right_stat_feature"), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          col("vid_right_cate_l1_feature"), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          col("vid_right_cate_l2_feature") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        .persist(StorageLevel.MEMORY_AND_DISK) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       // 保存结果到HDFS 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       val resultWithDt = result.withColumn("dt", lit(s"$dt")) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -321,10 +382,19 @@ object video_dssm_sampler { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       // 8. 清理缓存 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       positivePairs.unpersist() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      negativeSamplesDF.unpersist() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       allSamples.unpersist() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      l1CatStatFeatures.unpersist() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      l2CatStatFeatures.unpersist() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      tagFeatures.unpersist() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      statFeatures.unpersist() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      categoryData.unpersist() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      categoryWithStats.unpersist() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       vidLeftFeatures.unpersist() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      tagRightFeatures.unpersist() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      statRightFeatures.unpersist() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      categoryRightWithStats.unpersist() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      result.unpersist() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       // 输出统计信息 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       stats.logStats() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 |