
feat: add vid-dimension ros sample data production

zhaohaipeng 1 month ago
parent
commit
57a4081354

+ 0 - 22
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/v20250218/makedata_recsys_41_originData_20250218.scala

@@ -9,11 +9,8 @@ import examples.extractor.v20250218.ExtractFeature20250218
 import examples.utils.{FestiveUtil, SimilarityUtils}
 import org.apache.hadoop.io.compress.GzipCodec
 import org.apache.spark.sql.SparkSession
-import org.xm.Similarity
 
 import java.util
-import scala.collection.JavaConversions._
-import scala.collection.mutable.ArrayBuffer
 
 /*
   20250218 feature extraction
@@ -175,23 +172,4 @@ object makedata_recsys_41_originData_20250218 {
     record
   }
 
-  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
-    // match count, matched words, max semantic similarity score, mean semantic similarity score
-    val tagsList = tags.split(",")
-    var d1 = 0.0
-    val d2 = new ArrayBuffer[String]()
-    var d3 = 0.0
-    var d4 = 0.0
-    for (tag <- tagsList) {
-      if (title.contains(tag)) {
-        d1 = d1 + 1.0
-        d2.add(tag)
-      }
-      val score = Similarity.conceptSimilarity(tag, title)
-      d3 = if (score > d3) score else d3
-      d4 = d4 + score
-    }
-    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
-    (d1, d2.mkString(","), d3, d4)
-  }
 }
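
For reference, the deleted funcC34567ForTags computed four tag-vs-title statistics: match count, matched words, the maximum semantic similarity score, and the mean semantic similarity score. A minimal sketch of the same logic in idiomatic Scala, without the JavaConversions/ArrayBuffer imports the old code needed, assuming Similarity.conceptSimilarity(tag, title): Double behaves as in the removed import:

import org.xm.Similarity

// (match count, matched words, max similarity, mean similarity)
def tagTitleStats(tags: String, title: String): (Double, String, Double, Double) = {
  val tagsList = tags.split(",")
  val matched  = tagsList.filter(title.contains)         // tags found verbatim in the title
  val scores   = tagsList.map(tag => Similarity.conceptSimilarity(tag, title))
  val maxScore = if (scores.isEmpty) 0.0 else scores.max // highest per-tag score
  val avgScore = if (scores.isEmpty) 0.0 else scores.sum / scores.length
  (matched.length.toDouble, matched.mkString(","), maxScore, avgScore)
}

Since the method is no longer referenced anywhere in the job, dropping it together with the imports it pulled in is the right cleanup; the sketch above only documents what was removed.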

+ 13 - 22
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/v20250218/makedata_recsys_41_vid_ros_train_data_20250324.scala

@@ -7,7 +7,6 @@ import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUt
 import examples.extractor.v20250218.ExtractFeature20250218
 import examples.utils.{FestiveUtil, SimilarityUtils}
 import org.apache.hadoop.io.compress.GzipCodec
-import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.SparkSession
 
 import java.time.LocalDateTime
@@ -46,16 +45,12 @@ object makedata_recsys_41_vid_ros_train_data_20250324 {
 
 // 3 run data production in a loop
     val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
-    val partitions = timeRange.map { dt_hh =>
+    for (dt_hh <- timeRange) {
       val dt = dt_hh.substring(0, 8)
       val hh = dt_hh.substring(8, 10)
-      s"dt=$dt,hh=$hh"
-    }
-
-    var odpsData: RDD[String] = sc.emptyRDD[String] // 初始化空RDD
-    for (partition <- partitions) {
+      val partition = s"dt=$dt,hh=$hh"
       println(s"开始读取分区: $partition")
-      val partitionData = odpsOps.readTable(
+      val odpsData = odpsOps.readTable(
           project = project,
           table = table,
           partition = partition,
@@ -66,10 +61,6 @@ object makedata_recsys_41_vid_ros_train_data_20250324 {
           FestiveUtil.init()
           p.map(record => {
             val featureMap = new JSONObject()
-            val vid = if (record.isNull("vid")) "" else record.getString("vid")
-
-            val hh = record.getString("hh").toInt
-
            // a video features
             val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else JSON.parseObject(record.getString("b1_feature"))
             val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else JSON.parseObject(record.getString("b2_feature"))
@@ -120,17 +111,17 @@ object makedata_recsys_41_vid_ros_train_data_20250324 {
 
           })
         })
-      odpsData = odpsData.union(partitionData)
-    }
 
-    // 4 save data to HDFS
-    val hdfsPath = savePath
-    if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
-      println("Deleting path and starting data write: " + hdfsPath)
-      MyHdfsUtils.delete_hdfs_path(hdfsPath)
-      odpsData.coalesce(repartition, shuffle = true).saveAsTextFile(hdfsPath, classOf[GzipCodec])
-    } else {
-      println("Path invalid, cannot write: " + hdfsPath)
+      // 4 save data to HDFS
+      val savePartition = dt + hh
+      val hdfsPath = savePath + "/" + savePartition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and starting data write: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Path invalid, cannot write: " + hdfsPath)
+      }
     }
   }
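
The net effect in this file: instead of unioning every hourly partition into one RDD and writing a single output directory, the job now reads and writes each hour inside the loop, producing one directory per hour under savePath. A minimal sketch of the resulting control flow, with readPartition as a hypothetical stand-in for the odpsOps.readTable call above and MyDateUtils/MyHdfsUtils being the project utilities already used in the diff:

import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.rdd.RDD

// One read + one write per hour, rather than a union of all hours
// followed by a single save. `readPartition` is a hypothetical stand-in
// for the odpsOps.readTable call in the diff above.
def writeHourly(beginStr: String, endStr: String, savePath: String,
                repartition: Int, readPartition: String => RDD[String]): Unit = {
  for (dt_hh <- MyDateUtils.getDateHourRange(beginStr, endStr)) {
    val dt = dt_hh.substring(0, 8)
    val hh = dt_hh.substring(8, 10)
    val odpsData = readPartition(s"dt=$dt,hh=$hh")
    val hdfsPath = savePath + "/" + dt + hh            // one directory per hour
    if (hdfsPath.startsWith("/dw/recommend/model/")) { // path guard, as in the diff
      MyHdfsUtils.delete_hdfs_path(hdfsPath)           // makes per-hour re-runs idempotent
      odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
    }
  }
}

One detail worth noting: the new save call drops shuffle = true. coalesce without a shuffle can only reduce the partition count, which is typically fine for a single hourly slice but will not spread data back out if the hourly RDD has fewer partitions than repartition.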