Prechádzať zdrojové kódy

feat:添加分桶脚本

zhaohaipeng pred 2 mesiacmi
rodič
commit
1cac1766ff

Rozdielové dáta súboru neboli zobrazené, pretože súbor je príliš veľký
+ 0 - 0
src/main/resources/20250218_bucket_322.txt


+ 10 - 5
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/v20250218/makedata_recsys_43_bucketData_20250218.scala

@@ -24,6 +24,7 @@ object makedata_recsys_43_bucketData_20250218 {
     val endStr = param.getOrElse("endStr", "20240703")
     val repartition = param.getOrElse("repartition", "100").toInt
     val filterNames = param.getOrElse("filterNames", "").split(",").filter(_.nonEmpty).toSet
+    val noBucketFeature = param.getOrElse("noBucketFeature", "hour,is_greeting,day_of_week").split(",").filter(_.nonEmpty).toSet
     val whatLabel = param.getOrElse("whatLabel", "is_share")
     val whatApps = param.getOrElse("whatApps", "0,4,2,32,17,18,21,22,24,25,26,27,28,29,3,30,31,33,34,35,36").split(",").toSet
     val fileName = param.getOrElse("fileName", "20250218_bucket_322.txt")
@@ -92,12 +93,16 @@ object makedata_recsys_43_bucketData_20250218 {
                     ""
                   } else {
                     if (score > 1E-8) {
-                      if (bucketsMap.contains(name)) {
-                        val (bucketsNum, buckets) = bucketsMap(name)
-                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
-                        name + ":" + scoreNew.toString
-                      } else {
+                      if (noBucketFeature.nonEmpty && noBucketFeature.contains(name)) {
                         name + ":" + score.toString
+                      } else {
+                        if (bucketsMap.contains(name)) {
+                          val (bucketsNum, buckets) = bucketsMap(name)
+                          val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                          name + ":" + scoreNew.toString
+                        } else {
+                          ""
+                        }
                       }
                     } else {
                       ""

Niektoré súbory nie sú zobrazené, pretože tieto rozdielové dáta obsahujú príliš veľa zmenených súborov