@@ -7,7 +7,6 @@ import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUt
 import examples.extractor.v20250218.ExtractFeature20250218
 import examples.utils.{FestiveUtil, SimilarityUtils}
 import org.apache.hadoop.io.compress.GzipCodec
-import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.SparkSession
 
 import java.time.LocalDateTime
@@ -46,16 +45,12 @@ object makedata_recsys_41_vid_ros_train_data_20250324 {
 
     // 3 循环执行数据生产
     val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
-    val partitions = timeRange.map { dt_hh =>
+    for (dt_hh <- timeRange) {
       val dt = dt_hh.substring(0, 8)
       val hh = dt_hh.substring(8, 10)
-      s"dt=$dt,hh=$hh"
-    }
-
-    var odpsData: RDD[String] = sc.emptyRDD[String] // 初始化空RDD
-    for (partition <- partitions) {
+      val partition = s"dt=$dt,hh=$hh"
       println(s"开始读取分区: $partition")
-      val partitionData = odpsOps.readTable(
+      val odpsData = odpsOps.readTable(
         project = project,
         table = table,
         partition = partition,
@@ -66,10 +61,6 @@ object makedata_recsys_41_vid_ros_train_data_20250324 {
         FestiveUtil.init()
         p.map(record => {
           val featureMap = new JSONObject()
-          val vid = if (record.isNull("vid")) "" else record.getString("vid")
-
-          val hh = record.getString("hh").toInt
-
           // a 视频特征
           val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else JSON.parseObject(record.getString("b1_feature"))
           val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else JSON.parseObject(record.getString("b2_feature"))
@@ -120,17 +111,17 @@ object makedata_recsys_41_vid_ros_train_data_20250324 {
 
         })
       })
-      odpsData = odpsData.union(partitionData)
-    }
-
-    // 4 保存数据到hdfs
-    val hdfsPath = savePath
-    if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
-      println("删除路径并开始数据写入:" + hdfsPath)
-      MyHdfsUtils.delete_hdfs_path(hdfsPath)
-      odpsData.coalesce(repartition, shuffle = true).saveAsTextFile(hdfsPath, classOf[GzipCodec])
-    } else {
-      println("路径不合法,无法写入:" + hdfsPath)
+
+      // 4 保存数据到hdfs
+      val savePartition = dt + hh
+      val hdfsPath = savePath + "/" + savePartition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
+      }
     }
   }
 }
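Net effect of the diff, as a minimal standalone sketch (not the production job): each hour of timeRange is now read and written inside the loop, landing in its own sub-directory under savePath, instead of unioning every partition into one RDD and writing once at the end. The object name, the timeRange/savePath/repartition literals and the readPartition stub below are illustrative placeholders; the real job gets these values via ParamUtils/MyDateUtils, reads records with odpsOps.readTable, and clears the target directory with MyHdfsUtils.delete_hdfs_path before writing.

import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object PerHourDumpSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("per_hour_dump_sketch").getOrCreate()
    val sc = spark.sparkContext

    // Hypothetical stand-ins for values the real job derives from its parameters.
    val timeRange: Seq[String] = Seq("2025032400", "2025032401") // "yyyyMMddHH"
    val savePath = "/dw/recommend/model/41_recsys_ros_train_data" // illustrative root
    val repartition = 64

    for (dt_hh <- timeRange) {
      val dt = dt_hh.substring(0, 8)
      val hh = dt_hh.substring(8, 10)
      val partition = s"dt=$dt,hh=$hh"

      // Placeholder for odpsOps.readTable(...): one RDD of serialized samples per dt/hh partition.
      val odpsData: RDD[String] = readPartition(sc, partition)

      // Each hour is written to its own sub-directory, e.g. <savePath>/2025032400.
      // The real job first clears the target via MyHdfsUtils.delete_hdfs_path(hdfsPath).
      val hdfsPath = savePath + "/" + dt + hh
      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
        odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
      }
    }
  }

  // Hypothetical reader used only to keep the sketch self-contained.
  private def readPartition(sc: org.apache.spark.SparkContext, partition: String): RDD[String] =
    sc.parallelize(Seq(s"sample record from $partition"))
}

Writing per hour keeps each hour's output independent, so the mutable accumulator RDD and its union are no longer needed, which is why the org.apache.spark.rdd.RDD import and the shuffle = true coalesce disappear in this diff.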