often 5 months ago
parent
commit
f8b0d7e904

+ 51 - 49
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/video_dssm_sampler.scala

@@ -129,66 +129,72 @@ object video_dssm_sampler {
         partition = s"dt=20241124,hh=08",
         transfer = funcPositive,
         numPartition = CONFIG("shuffle.partitions").toInt
-      ).sample(false, 0.001) // randomly sample 0.1% of the data
-       .persist(StorageLevel.MEMORY_AND_DISK_SER)
+      ).persist(StorageLevel.MEMORY_AND_DISK_SER)
       println("开始执行partiton:" + partition)
 
       val positivePairs = spark.createDataFrame(rdd, schema).persist(StorageLevel.MEMORY_AND_DISK_SER)
       stats.positiveSamplesCount = positivePairs.count()
       logger.info(s"start read vid list for date $dt")
 
-      val numberedPos = positivePairs
-        .withColumn("pos_id", row_number().over(Window.orderBy("vid_left", "vid_right")))
-      // 2. number the candidate video table in groups of 100
-      val vidsRDD = odpsOps.readTable(
-        project = "loghubods",
-        table = "t_vid_tag_feature",
-        partition = s"dt='$dt'",
-        transfer = funcVids,
-        numPartition = CONFIG("shuffle.partitions").toInt
-      )
+      // 2. fetch the full list of available vids
+      val allVids = time({
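+        // NOTE: collect() pulls the complete vid list onto the driver (and into
+        // each executor via the broadcast below); this assumes the vid
+        // vocabulary is small enough to fit in memory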
+        odpsOps.readTable(
+          project = "loghubods",
+          table = "t_vid_tag_feature",
+          partition = s"dt='$dt'",
+          transfer = funcVids,
+          numPartition = CONFIG("shuffle.partitions").toInt
+        ).collect().toSet
+      }, "Loading all vids")
+
+      // 3. broadcast the vid set to every executor
+      val allVidsBroadcast = spark.sparkContext.broadcast(allVids)
+
+      // 4. generate negative samples
+      val negativeSamplesRDD = time({
+        positivePairs.rdd
+          .mapPartitions { iter =>
+            // process rows in batches instead of one at a time
+            val batchSize = 100
+            val localVids = allVidsBroadcast.value.toArray // convert the broadcast set to an array only once
+            val random = new Random()
+
+            iter.grouped(batchSize).flatMap { batch =>
+              // build the negative-sample pool once for the entire batch
+              val excludeVids = batch.flatMap(row =>
+                Seq(row.getAs[String]("vid_left"), row.getAs[String]("vid_right"))
+              ).toSet
+
+              // pre-filter the candidate pool once
+              val candidatePool = localVids.filter(vid => !excludeVids.contains(vid))
+
+              batch.flatMap { row =>
+                val vid_left = row.getAs[String]("vid_left")
+                // sample quickly from the pre-filtered pool: shuffle its first
+                // 1000 entries and keep 20 (.toSeq so Random.shuffle accepts it)
+                random.shuffle(candidatePool.take(1000).toSeq)
+                  .take(20)
+                  .map(negative_vid => Row(vid_left, negative_vid, null, null, null, null, null, null, null))
+              }
+            }
+          }
+      }, "Generating negative samples")
+      // convert the RDD back to a DataFrame
+      val negativeSamplesDF = spark.createDataFrame(negativeSamplesRDD, schema)
+      stats.negativeSamplesCount = negativeSamplesRDD.count()
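
(For reference, the batching trick above can be distilled into a standalone
sketch. The names below are illustrative, not this job's real API; it assumes a
positives DataFrame with string columns vid_left / vid_right and a driver-side
vid set. Note the caveat in the comments: shuffling candidates.take(1000)
permutes only the first 1000 survivors of the filter, so the draw is not
uniform over the whole pool.)

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import scala.util.Random

def sampleNegatives(positives: DataFrame,
                    vidsBc: Broadcast[Set[String]],
                    perRow: Int = 20): RDD[(String, String)] =
  positives.rdd.mapPartitions { iter =>
    val pool = vidsBc.value.toArray      // materialize the broadcast set once per partition
    val random = new Random()
    iter.grouped(100).flatMap { batch => // amortize the exclusion set over 100 rows
      val exclude = batch.flatMap(r =>
        Seq(r.getAs[String]("vid_left"), r.getAs[String]("vid_right"))).toSet
      val candidates = pool.filterNot(exclude.contains)
      batch.flatMap { r =>
        // shuffle only the first 1000 filtered candidates, then keep perRow of them
        random.shuffle(candidates.take(1000).toSeq)
          .take(perRow)
          .map(neg => (r.getAs[String]("vid_left"), neg))
      }
    }
  }
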
 
-      // convert the RDD to a DataFrame
-      val numberedVid = spark.createDataFrame(vidsRDD.map(vid => Row(vid)),
-          StructType(Seq(StructField("vid", StringType, true))))
-        .withColumn("group_id", floor((row_number().over(Window.orderBy(rand())) - 1) / 100))
-        .persist(StorageLevel.MEMORY_AND_DISK_SER)
-
-      val negativeSamples = numberedPos
-        .join(numberedVid,
-          floor((col("pos_id") - 1) / 100) === col("group_id"))
-        .where(col("vid") =!= col("vid_left") && col("vid") =!= col("vid_right"))
-        .select(
-          lit(null).as("extend"),
-          lit(null).as("view_24h"),
-          lit(null).as("total_return_uv"),
-          lit(null).as("ts_right"),
-          lit(null).as("apptype"),
-          lit(null).as("pagesource"),
-          lit(null).as("mid"),
-          col("vid_left"),
-          col("vid").as("vid_right")
-        )
-        .withColumn("rn", row_number().over(Window.partitionBy("vid_left").orderBy(rand())))
-      // 4. keep 2 negative samples per vid_left
-      val filteredNegative = negativeSamples
-        .where(col("rn") <= 2)
-        .drop("rn")
 
       // 5. merge positive and negative samples
       val allSamples = positivePairs
         .withColumn("label", lit(1))
         .withColumn("logid", concat(lit("pos_"), monotonically_increasing_id()))
         .union(
-          filteredNegative
+          negativeSamplesDF
             .withColumn("label", lit(0))
             .withColumn("logid", concat(lit("neg_"), monotonically_increasing_id()))
-        )
-        .persist(StorageLevel.MEMORY_AND_DISK_SER)
+        ).persist(StorageLevel.MEMORY_AND_DISK_SER)
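
(A caveat on the union step: DataFrame.union aligns columns by position, not by
name, so negativeSamplesDF must reproduce the exact column order of
positivePairs. If the schemas share names but the ordering is uncertain,
unionByName, available since Spark 2.3, is the safer variant; posDF / negDF
below are hypothetical stand-ins.)

import org.apache.spark.sql.functions.lit

val merged = posDF.withColumn("label", lit(1))
  .unionByName(negDF.withColumn("label", lit(0)))
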
 
       // 6. fetch left-side features
       // read L1 category statistics features
-      /*
       val l1CatStatFeatures = {
         val rdd = odpsOps.readTable(
           project = "loghubods",
@@ -365,12 +371,11 @@ object video_dssm_sampler {
           col("vid_right_cate_l2_feature")
         )
         .persist(StorageLevel.MEMORY_AND_DISK_SER)
-      */
+
 
 
       // save the result to HDFS
-      //val resultWithDt = result.withColumn("dt", lit(s"$dt"))
-      val resultWithDt = allSamples.withColumn("dt", lit(s"$dt"))
+      val resultWithDt = result.withColumn("dt", lit(s"$dt"))
       resultWithDt.write
         .mode("overwrite")
         .partitionBy("dt")
@@ -380,9 +385,8 @@ object video_dssm_sampler {
 
       // 8. clear cached data
       positivePairs.unpersist()
-      negativeSamples.unpersist()
+
       allSamples.unpersist()
-      /*
       l1CatStatFeatures.unpersist()
       l2CatStatFeatures.unpersist()
       tagFeatures.unpersist()
@@ -394,8 +398,6 @@ object video_dssm_sampler {
       statRightFeatures.unpersist()
       categoryRightWithStats.unpersist()
       result.unpersist()
-      */
-
       // log summary statistics
       stats.logStats()
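
(The time(...) wrapper used by the new code is defined elsewhere in this file
and not shown in the hunk. A plausible minimal shape, assuming it evaluates the
block, logs the elapsed wall-clock time under the given label, and returns the
block's result:)

// hypothetical reconstruction; the real helper may differ
def time[T](block: => T, label: String): T = {
  val start = System.currentTimeMillis()
  val result = block
  println(s"$label took ${System.currentTimeMillis() - start} ms")
  result
}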