@@ -1,11 +1,16 @@
 package com.aliyun.odps.spark.examples.makedata_recsys
-import org.apache.spark.sql.{DataFrame, SparkSession, Row}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{ParamUtils, env}
+import org.apache.spark.sql.{DataFrame, Row, SparkSession}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
 import org.apache.spark.storage.StorageLevel
+import com.jayway.jsonpath.JsonPath
+
 import scala.util.Random
 import scala.collection.mutable.ArrayBuffer
-import org.apache.log4j.{Logger, Level}
+import org.apache.log4j.{Level, Logger}
 
 object video_dssm_sampler {
   private val logger = Logger.getLogger(this.getClass)
@@ -48,46 +53,83 @@ object video_dssm_sampler {
     spark
   }
 
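+  // Transfer functions passed to odpsOps.readTable: each one maps a raw ODPS
+  // Record to the row or tuple shape the downstream DataFrame expects.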
+  def funcPositive(record: Record, schema: TableSchema): Row = {
+    Row(
+      record.getString("dt"),
+      record.getString("hh"),
+      record.getString("vid_left"),
+      record.getString("vid_right"),
+      record.getString("extend"),
+      record.getString("view_24h"),
+      record.getString("total_return_uv"),
+      record.getString("ts_right"),
+      record.getString("apptype"),
+      record.getString("pagesource"),
+      record.getString("mid"))
+  }
+
+  def funcVids(record: Record, schema: TableSchema): String = {
+    record.getString("vid")
+  }
+  def funcTagFeatures(record: Record, schema: TableSchema): (String, String) = {
+    (record.getString("vid"), record.getString("feature"))
+  }
+
+  def funcStatFeatures(record: Record, schema: TableSchema): (String, String) = {
+    (record.getString("vid"), record.getString("feature"))
+  }
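+  // Both feature tables expose the same (vid, feature) column pair, which is
+  // why the two transfers above are identical.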
+  // Extract the category features from the tag-feature JSON.
+  def funcCategoryFeatures(record: Record, schema: TableSchema): (String, String, String, String) = {
+    val feature = record.getString("feature")
+    // The old SQL used get_json_object, which returns NULL for a missing key;
+    // JsonPath.read instead throws PathNotFoundException, so guard the reads
+    // to preserve the original NULL semantics.
+    def readOrNull(path: String): String =
+      try JsonPath.read[String](feature, path) catch { case _: Exception => null }
+    (record.getString("vid"), readOrNull("$.category1"), readOrNull("$.category2_1"), feature)
+  }
+
   def generateNegativeSamples(spark: SparkSession, dt: String, outputPath: String): Unit = {
     val stats = ProcessingStats()
 
     try {
       logger.info(s"Starting negative sample generation for date: $dt")
+      val sc = spark.sparkContext
 
-      // 1. 获取并缓存正样本对数据
+      // Set up the ODPS reader
+      val odpsOps = env.getODPS(sc)
+      // 1. Load the positive sample pairs
       val positivePairs = time({
-        val df = spark.sql(s"""
-          SELECT
-            dt,
-            hh,
-            vid_left,
-            vid_right,
-            extend,
-            view_24h,
-            total_return_uv,
-            ts_right,
-            apptype,
-            pagesource,
-            mid
-          FROM loghubods.alg_dssm_sample
-          WHERE dt = '$dt'
-        """).persist(StorageLevel.MEMORY_AND_DISK)
-
+        val schema = StructType(Array(
+          StructField("dt", StringType, true),
+          StructField("hh", StringType, true),
+          StructField("vid_left", StringType, true),
+          StructField("vid_right", StringType, true),
+          StructField("extend", StringType, true),
+          StructField("view_24h", StringType, true),
+          StructField("total_return_uv", StringType, true),
+          StructField("ts_right", StringType, true),
+          StructField("apptype", StringType, true),
+          StructField("pagesource", StringType, true),
+          StructField("mid", StringType, true)
+        ))
+
+        // Only the DataFrame is persisted: the intermediate RDD is consumed
+        // exactly once by createDataFrame, so caching it as well would just
+        // waste executor memory.
+        val rdd = odpsOps.readTable(
+          project = "loghubods",
+          table = "alg_dssm_sample",
+          partition = s"dt='$dt'",
+          transfer = funcPositive,
+          numPartition = CONFIG("shuffle.partitions").toInt
+        )
+        val df = spark.createDataFrame(rdd, schema).persist(StorageLevel.MEMORY_AND_DISK)
         stats.positiveSamplesCount = df.count()
         df
       }, "Loading positive pairs")
 
-      // 2. 获取所有可用的vid列表
+      // 2. Collect the set of all candidate vids
       val allVids = time({
-        spark.sql(s"""
-          SELECT vid
-          FROM loghubods.t_vid_tag_feature
-          WHERE dt = '$dt'
-        """)
-          .select("vid")
-          .collect()
-          .map(_.getString(0))
-          .toSet
+        // collect().toSet materializes every vid on the driver; this assumes
+        // the vid vocabulary comfortably fits in driver memory.
+        odpsOps.readTable(
+          project = "loghubods",
+          table = "t_vid_tag_feature",
+          partition = s"dt='$dt'",
+          transfer = funcVids,
+          numPartition = CONFIG("shuffle.partitions").toInt
+        ).collect().toSet
      }, "Loading all vids")
 
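+      // If the vid set grows large, consider sc.broadcast(allVids) rather
+      // than capturing the Set directly in the UDF closure defined below.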
-      // 3. 定义UDF函数来生成负样本
+      // 3. Define the UDF that generates negative samples
@@ -134,98 +176,74 @@
 
-      // 6. 获取左侧特征
-      // 获取tag特征
+      // 6. Build the left-side features
+      // Tag features
-      val tagFeatures = spark.sql(s"""
-        SELECT vid, feature as vid_left_tag_feature
-        FROM loghubods.t_vid_tag_feature
-        WHERE dt = '$dt'
-      """)
-
-      // 获取统计特征
-      val statFeatures = spark.sql(s"""
-        SELECT vid, feature as vid_left_stat_feature
-        FROM loghubods.t_vid_stat_feature
-        WHERE dt = '$dt'
-      """)
-
-      // 获取类别特征
-      val categoryFeatures = spark.sql(s"""
-        SELECT
-          a.vid,
-          b.feature as vid_left_cate_l1_feature,
-          c.feature as vid_left_cate_l2_feature
-        FROM (
-          SELECT
-            vid,
-            get_json_object(feature,"$$.category1") as category1,
-            get_json_object(feature,"$$.category2_1") as category2
-          FROM loghubods.t_vid_tag_feature
-          WHERE dt = '$dt'
-        ) a
-        LEFT JOIN (
-          SELECT category1, feature
-          FROM t_vid_l1_cat_stat_feature
-          WHERE dt = '$dt'
-        ) b ON a.category1 = b.category1
-        LEFT JOIN (
-          SELECT category2, feature
-          FROM t_vid_l2_cat_stat_feature
-          WHERE dt = '$dt'
-        ) c ON a.category2 = c.category2
-      """)
+      val tagFeatures = {
+        val rdd = odpsOps.readTable(
+          project = "loghubods",
+          table = "t_vid_tag_feature",
+          partition = s"dt='$dt'",
+          transfer = funcTagFeatures,
+          numPartition = CONFIG("shuffle.partitions").toInt
+        )
+        val schema = StructType(Array(
+          StructField("vid", StringType, true),
+          StructField("vid_left_tag_feature", StringType, true)
+        ))
+        spark.createDataFrame(rdd.map(t => Row(t._1, t._2)), schema)
+      }
+
+      // Stat features
+      val statFeatures = {
+        val rdd = odpsOps.readTable(
+          project = "loghubods",
+          table = "t_vid_stat_feature",
+          partition = s"dt='$dt'",
+          transfer = funcStatFeatures,
+          numPartition = CONFIG("shuffle.partitions").toInt
+        )
+        val schema = StructType(Array(
+          StructField("vid", StringType, true),
+          StructField("vid_left_stat_feature", StringType, true)
+        ))
+        spark.createDataFrame(rdd.map(t => Row(t._1, t._2)), schema)
+      }
+
+      // Read and process the category features
+      val categoryData = {
+        val rdd = odpsOps.readTable(
+          project = "loghubods",
+          table = "t_vid_tag_feature",
+          partition = s"dt='$dt'",
+          transfer = funcCategoryFeatures,
+          numPartition = CONFIG("shuffle.partitions").toInt
+        )
+
+        // Define the schema
+        val schema = StructType(Array(
+          StructField("vid", StringType, true),
+          StructField("category1", StringType, true),
+          StructField("category2", StringType, true),
+          StructField("feature", StringType, true)
+        ))
+
+        // Convert the RDD to a DataFrame
+        spark.createDataFrame(rdd.map(t => Row(t._1, t._2, t._3, t._4)), schema)
+      }
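+      // The broadcast() hints below assume the three feature tables are small
+      // enough to ship to every executor; drop the hint if they grow.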
-      // 合并所有左侧特征
+      // Join all left-side features
       val vidLeftFeatures = allSamples
         .join(broadcast(tagFeatures), col("vid_left") === tagFeatures("vid"), "left")
         .join(broadcast(statFeatures), col("vid_left") === statFeatures("vid"), "left")
-        .join(broadcast(categoryFeatures), col("vid_left") === categoryFeatures("vid"), "left")
+        .join(broadcast(categoryData), col("vid_left") === categoryData("vid"), "left")
         .persist(StorageLevel.MEMORY_AND_DISK)
 
-      // 7. 获取右侧特征并生成最终结果
-      // 获取tag特征
-      val rightTagFeatures = spark.sql(s"""
-        SELECT vid, feature as vid_right_tag_feature
-        FROM loghubods.t_vid_tag_feature
-        WHERE dt = '$dt'
-      """)
-
-      // 获取统计特征
-      val rightStatFeatures = spark.sql(s"""
-        SELECT vid, feature as vid_right_stat_feature
-        FROM loghubods.t_vid_stat_feature
-        WHERE dt = '$dt'
-      """)
-
-      // 获取类别特征
-      val rightCategoryFeatures = spark.sql(s"""
-        SELECT
-          a.vid,
-          b.feature as vid_right_cate_l1_feature,
-          c.feature as vid_right_cate_l2_feature
-        FROM (
-          SELECT
-            vid,
-            get_json_object(feature,"$$.category1") as category1,
-            get_json_object(feature,"$$.category2_1") as category2
-          FROM loghubods.t_vid_tag_feature
-          WHERE dt = '$dt'
-        ) a
-        LEFT JOIN (
-          SELECT category1, feature
-          FROM t_vid_l1_cat_stat_feature
-          WHERE dt = '$dt'
-        ) b ON a.category1 = b.category1
-        LEFT JOIN (
-          SELECT category2, feature
-          FROM t_vid_l2_cat_stat_feature
-          WHERE dt = '$dt'
-        ) c ON a.category2 = c.category2
-      """)
 
-      // 合并所有右侧特征并生成最终结果
+      // Join the right-side features and produce the final result.
+      // Re-joining tagFeatures/statFeatures/categoryData directly would
+      // duplicate their column names and make references such as
+      // tagFeatures("vid") ambiguous after the left-side joins above, so
+      // rename the right-side copies first.
+      val rightTagFeatures = tagFeatures.toDF("vid", "vid_right_tag_feature")
+      val rightStatFeatures = statFeatures.toDF("vid", "vid_right_stat_feature")
+      val rightCategoryData =
+        categoryData.toDF("vid", "category1_right", "category2_right", "feature_right")
       val result = vidLeftFeatures
-        .join(broadcast(rightTagFeatures), col("vid_right") === rightTagFeatures("vid"), "left")
-        .join(broadcast(rightStatFeatures), col("vid_right") === rightStatFeatures("vid"), "left")
-        .join(broadcast(rightCategoryFeatures), col("vid_right") === rightCategoryFeatures("vid"), "left")
+        .join(broadcast(rightTagFeatures), col("vid_right") === rightTagFeatures("vid"), "left")
+        .join(broadcast(rightStatFeatures), col("vid_right") === rightStatFeatures("vid"), "left")
+        .join(broadcast(rightCategoryData), col("vid_right") === rightCategoryData("vid"), "left")
 
-      // 保存结果到HDFS
+      // Save the result to HDFS
       result.write
@@ -239,7 +257,7 @@ object video_dssm_sampler {
       positivePairs.unpersist()
       allSamples.unpersist()
       vidLeftFeatures.unpersist()
-      rightTagFeatures.unpersist()
+
 
-      // 输出统计信息
+      // Log the run statistics
       stats.logStats()