Sfoglia il codice sorgente

feat:修改20240726分桶脚本

zhaohaipeng 9 mesi fa
parent
commit
d3740c0984

File diff suppressed because it is too large
+ 8 - 0
src/main/resources/20240718_ad_bucket_517.txt


+ 517 - 0
src/main/resources/20240718_ad_feature_name_517.txt

@@ -0,0 +1,517 @@
+cpa
+b2_1h_ctr
+b2_1h_ctcvr
+b2_1h_cvr
+b2_1h_conver
+b2_1h_click
+b2_1h_conver*log(view)
+b2_1h_conver*ctcvr
+b2_2h_ctr
+b2_2h_ctcvr
+b2_2h_cvr
+b2_2h_conver
+b2_2h_click
+b2_2h_conver*log(view)
+b2_2h_conver*ctcvr
+b2_3h_ctr
+b2_3h_ctcvr
+b2_3h_cvr
+b2_3h_conver
+b2_3h_click
+b2_3h_conver*log(view)
+b2_3h_conver*ctcvr
+b2_6h_ctr
+b2_6h_ctcvr
+b2_6h_cvr
+b2_6h_conver
+b2_6h_click
+b2_6h_conver*log(view)
+b2_6h_conver*ctcvr
+b2_12h_ctr
+b2_12h_ctcvr
+b2_12h_cvr
+b2_12h_conver
+b2_12h_click
+b2_12h_conver*log(view)
+b2_12h_conver*ctcvr
+b2_1d_ctr
+b2_1d_ctcvr
+b2_1d_cvr
+b2_1d_conver
+b2_1d_click
+b2_1d_conver*log(view)
+b2_1d_conver*ctcvr
+b2_3d_ctr
+b2_3d_ctcvr
+b2_3d_cvr
+b2_3d_conver
+b2_3d_click
+b2_3d_conver*log(view)
+b2_3d_conver*ctcvr
+b2_7d_ctr
+b2_7d_ctcvr
+b2_7d_cvr
+b2_7d_conver
+b2_7d_click
+b2_7d_conver*log(view)
+b2_7d_conver*ctcvr
+b2_yesterday_ctr
+b2_yesterday_ctcvr
+b2_yesterday_cvr
+b2_yesterday_conver
+b2_yesterday_click
+b2_yesterday_conver*log(view)
+b2_yesterday_conver*ctcvr
+b2_today_ctr
+b2_today_ctcvr
+b2_today_cvr
+b2_today_conver
+b2_today_click
+b2_today_conver*log(view)
+b2_today_conver*ctcvr
+b3_1h_ctr
+b3_1h_ctcvr
+b3_1h_cvr
+b3_1h_conver
+b3_1h_click
+b3_1h_conver*log(view)
+b3_1h_conver*ctcvr
+b3_2h_ctr
+b3_2h_ctcvr
+b3_2h_cvr
+b3_2h_conver
+b3_2h_click
+b3_2h_conver*log(view)
+b3_2h_conver*ctcvr
+b3_3h_ctr
+b3_3h_ctcvr
+b3_3h_cvr
+b3_3h_conver
+b3_3h_click
+b3_3h_conver*log(view)
+b3_3h_conver*ctcvr
+b3_6h_ctr
+b3_6h_ctcvr
+b3_6h_cvr
+b3_6h_conver
+b3_6h_click
+b3_6h_conver*log(view)
+b3_6h_conver*ctcvr
+b3_12h_ctr
+b3_12h_ctcvr
+b3_12h_cvr
+b3_12h_conver
+b3_12h_click
+b3_12h_conver*log(view)
+b3_12h_conver*ctcvr
+b3_1d_ctr
+b3_1d_ctcvr
+b3_1d_cvr
+b3_1d_conver
+b3_1d_click
+b3_1d_conver*log(view)
+b3_1d_conver*ctcvr
+b3_3d_ctr
+b3_3d_ctcvr
+b3_3d_cvr
+b3_3d_conver
+b3_3d_click
+b3_3d_conver*log(view)
+b3_3d_conver*ctcvr
+b3_7d_ctr
+b3_7d_ctcvr
+b3_7d_cvr
+b3_7d_conver
+b3_7d_click
+b3_7d_conver*log(view)
+b3_7d_conver*ctcvr
+b3_yesterday_ctr
+b3_yesterday_ctcvr
+b3_yesterday_cvr
+b3_yesterday_conver
+b3_yesterday_click
+b3_yesterday_conver*log(view)
+b3_yesterday_conver*ctcvr
+b3_today_ctr
+b3_today_ctcvr
+b3_today_cvr
+b3_today_conver
+b3_today_click
+b3_today_conver*log(view)
+b3_today_conver*ctcvr
+b4_1h_ctr
+b4_1h_ctcvr
+b4_1h_cvr
+b4_1h_conver
+b4_1h_click
+b4_1h_conver*log(view)
+b4_1h_conver*ctcvr
+b4_2h_ctr
+b4_2h_ctcvr
+b4_2h_cvr
+b4_2h_conver
+b4_2h_click
+b4_2h_conver*log(view)
+b4_2h_conver*ctcvr
+b4_3h_ctr
+b4_3h_ctcvr
+b4_3h_cvr
+b4_3h_conver
+b4_3h_click
+b4_3h_conver*log(view)
+b4_3h_conver*ctcvr
+b4_6h_ctr
+b4_6h_ctcvr
+b4_6h_cvr
+b4_6h_conver
+b4_6h_click
+b4_6h_conver*log(view)
+b4_6h_conver*ctcvr
+b4_12h_ctr
+b4_12h_ctcvr
+b4_12h_cvr
+b4_12h_conver
+b4_12h_click
+b4_12h_conver*log(view)
+b4_12h_conver*ctcvr
+b4_1d_ctr
+b4_1d_ctcvr
+b4_1d_cvr
+b4_1d_conver
+b4_1d_click
+b4_1d_conver*log(view)
+b4_1d_conver*ctcvr
+b4_3d_ctr
+b4_3d_ctcvr
+b4_3d_cvr
+b4_3d_conver
+b4_3d_click
+b4_3d_conver*log(view)
+b4_3d_conver*ctcvr
+b4_7d_ctr
+b4_7d_ctcvr
+b4_7d_cvr
+b4_7d_conver
+b4_7d_click
+b4_7d_conver*log(view)
+b4_7d_conver*ctcvr
+b4_yesterday_ctr
+b4_yesterday_ctcvr
+b4_yesterday_cvr
+b4_yesterday_conver
+b4_yesterday_click
+b4_yesterday_conver*log(view)
+b4_yesterday_conver*ctcvr
+b4_today_ctr
+b4_today_ctcvr
+b4_today_cvr
+b4_today_conver
+b4_today_click
+b4_today_conver*log(view)
+b4_today_conver*ctcvr
+b5_1h_ctr
+b5_1h_ctcvr
+b5_1h_cvr
+b5_1h_conver
+b5_1h_click
+b5_1h_conver*log(view)
+b5_1h_conver*ctcvr
+b5_2h_ctr
+b5_2h_ctcvr
+b5_2h_cvr
+b5_2h_conver
+b5_2h_click
+b5_2h_conver*log(view)
+b5_2h_conver*ctcvr
+b5_3h_ctr
+b5_3h_ctcvr
+b5_3h_cvr
+b5_3h_conver
+b5_3h_click
+b5_3h_conver*log(view)
+b5_3h_conver*ctcvr
+b5_6h_ctr
+b5_6h_ctcvr
+b5_6h_cvr
+b5_6h_conver
+b5_6h_click
+b5_6h_conver*log(view)
+b5_6h_conver*ctcvr
+b5_12h_ctr
+b5_12h_ctcvr
+b5_12h_cvr
+b5_12h_conver
+b5_12h_click
+b5_12h_conver*log(view)
+b5_12h_conver*ctcvr
+b5_1d_ctr
+b5_1d_ctcvr
+b5_1d_cvr
+b5_1d_conver
+b5_1d_click
+b5_1d_conver*log(view)
+b5_1d_conver*ctcvr
+b5_3d_ctr
+b5_3d_ctcvr
+b5_3d_cvr
+b5_3d_conver
+b5_3d_click
+b5_3d_conver*log(view)
+b5_3d_conver*ctcvr
+b5_7d_ctr
+b5_7d_ctcvr
+b5_7d_cvr
+b5_7d_conver
+b5_7d_click
+b5_7d_conver*log(view)
+b5_7d_conver*ctcvr
+b5_yesterday_ctr
+b5_yesterday_ctcvr
+b5_yesterday_cvr
+b5_yesterday_conver
+b5_yesterday_click
+b5_yesterday_conver*log(view)
+b5_yesterday_conver*ctcvr
+b5_today_ctr
+b5_today_ctcvr
+b5_today_cvr
+b5_today_conver
+b5_today_click
+b5_today_conver*log(view)
+b5_today_conver*ctcvr
+b8_1h_ctr
+b8_1h_ctcvr
+b8_1h_cvr
+b8_1h_conver
+b8_1h_click
+b8_1h_conver*log(view)
+b8_1h_conver*ctcvr
+b8_2h_ctr
+b8_2h_ctcvr
+b8_2h_cvr
+b8_2h_conver
+b8_2h_click
+b8_2h_conver*log(view)
+b8_2h_conver*ctcvr
+b8_3h_ctr
+b8_3h_ctcvr
+b8_3h_cvr
+b8_3h_conver
+b8_3h_click
+b8_3h_conver*log(view)
+b8_3h_conver*ctcvr
+b8_6h_ctr
+b8_6h_ctcvr
+b8_6h_cvr
+b8_6h_conver
+b8_6h_click
+b8_6h_conver*log(view)
+b8_6h_conver*ctcvr
+b8_12h_ctr
+b8_12h_ctcvr
+b8_12h_cvr
+b8_12h_conver
+b8_12h_click
+b8_12h_conver*log(view)
+b8_12h_conver*ctcvr
+b8_1d_ctr
+b8_1d_ctcvr
+b8_1d_cvr
+b8_1d_conver
+b8_1d_click
+b8_1d_conver*log(view)
+b8_1d_conver*ctcvr
+b8_3d_ctr
+b8_3d_ctcvr
+b8_3d_cvr
+b8_3d_conver
+b8_3d_click
+b8_3d_conver*log(view)
+b8_3d_conver*ctcvr
+b8_7d_ctr
+b8_7d_ctcvr
+b8_7d_cvr
+b8_7d_conver
+b8_7d_click
+b8_7d_conver*log(view)
+b8_7d_conver*ctcvr
+b8_yesterday_ctr
+b8_yesterday_ctcvr
+b8_yesterday_cvr
+b8_yesterday_conver
+b8_yesterday_click
+b8_yesterday_conver*log(view)
+b8_yesterday_conver*ctcvr
+b8_today_ctr
+b8_today_ctcvr
+b8_today_cvr
+b8_today_conver
+b8_today_click
+b8_today_conver*log(view)
+b8_today_conver*ctcvr
+b9_1h_ctr
+b9_1h_ctcvr
+b9_1h_cvr
+b9_1h_conver
+b9_1h_click
+b9_1h_conver*log(view)
+b9_1h_conver*ctcvr
+b9_2h_ctr
+b9_2h_ctcvr
+b9_2h_cvr
+b9_2h_conver
+b9_2h_click
+b9_2h_conver*log(view)
+b9_2h_conver*ctcvr
+b9_3h_ctr
+b9_3h_ctcvr
+b9_3h_cvr
+b9_3h_conver
+b9_3h_click
+b9_3h_conver*log(view)
+b9_3h_conver*ctcvr
+b9_6h_ctr
+b9_6h_ctcvr
+b9_6h_cvr
+b9_6h_conver
+b9_6h_click
+b9_6h_conver*log(view)
+b9_6h_conver*ctcvr
+b9_12h_ctr
+b9_12h_ctcvr
+b9_12h_cvr
+b9_12h_conver
+b9_12h_click
+b9_12h_conver*log(view)
+b9_12h_conver*ctcvr
+b9_1d_ctr
+b9_1d_ctcvr
+b9_1d_cvr
+b9_1d_conver
+b9_1d_click
+b9_1d_conver*log(view)
+b9_1d_conver*ctcvr
+b9_3d_ctr
+b9_3d_ctcvr
+b9_3d_cvr
+b9_3d_conver
+b9_3d_click
+b9_3d_conver*log(view)
+b9_3d_conver*ctcvr
+b9_7d_ctr
+b9_7d_ctcvr
+b9_7d_cvr
+b9_7d_conver
+b9_7d_click
+b9_7d_conver*log(view)
+b9_7d_conver*ctcvr
+b9_yesterday_ctr
+b9_yesterday_ctcvr
+b9_yesterday_cvr
+b9_yesterday_conver
+b9_yesterday_click
+b9_yesterday_conver*log(view)
+b9_yesterday_conver*ctcvr
+b9_today_ctr
+b9_today_ctcvr
+b9_today_cvr
+b9_today_conver
+b9_today_click
+b9_today_conver*log(view)
+b9_today_conver*ctcvr
+b6_7d_ctr
+b6_7d_ctcvr
+b6_7d_cvr
+b6_7d_conver
+b6_7d_click
+b6_7d_conver*log(view)
+b6_7d_conver*ctcvr
+b6_14d_ctr
+b6_14d_ctcvr
+b6_14d_cvr
+b6_14d_conver
+b6_14d_click
+b6_14d_conver*log(view)
+b6_14d_conver*ctcvr
+b7_7d_ctr
+b7_7d_ctcvr
+b7_7d_cvr
+b7_7d_conver
+b7_7d_click
+b7_7d_conver*log(view)
+b7_7d_conver*ctcvr
+b7_14d_ctr
+b7_14d_ctcvr
+b7_14d_cvr
+b7_14d_conver
+b7_14d_click
+b7_14d_conver*log(view)
+b7_14d_conver*ctcvr
+viewAll
+clickAll
+converAll
+incomeAll
+ctr_all
+ctcvr_all
+cvr_all
+timediff_view
+timediff_click
+timediff_conver
+actionstatic_view
+actionstatic_click
+actionstatic_conver
+actionstatic_income
+actionstatic_ctr
+actionstatic_ctcvr
+actionstatic_cvr
+e1_tags_3d_matchnum
+e1_tags_3d_maxscore
+e1_tags_3d_avgscore
+e1_tags_7d_matchnum
+e1_tags_7d_maxscore
+e1_tags_7d_avgscore
+e1_tags_14d_matchnum
+e1_tags_14d_maxscore
+e1_tags_14d_avgscore
+e2_tags_3d_matchnum
+e2_tags_3d_maxscore
+e2_tags_3d_avgscore
+e2_tags_7d_matchnum
+e2_tags_7d_maxscore
+e2_tags_7d_avgscore
+e2_tags_14d_matchnum
+e2_tags_14d_maxscore
+e2_tags_14d_avgscore
+d1_feature_3h_ctr
+d1_feature_3h_ctcvr
+d1_feature_3h_cvr
+d1_feature_3h_conver
+d1_feature_6h_ctr
+d1_feature_6h_ctcvr
+d1_feature_6h_cvr
+d1_feature_6h_conver
+d1_feature_12h_ctr
+d1_feature_12h_ctcvr
+d1_feature_12h_cvr
+d1_feature_12h_conver
+d1_feature_1d_ctr
+d1_feature_1d_ctcvr
+d1_feature_1d_cvr
+d1_feature_1d_conver
+d1_feature_3d_ctr
+d1_feature_3d_ctcvr
+d1_feature_3d_cvr
+d1_feature_3d_conver
+d1_feature_7d_ctr
+d1_feature_7d_ctcvr
+d1_feature_7d_cvr
+d1_feature_7d_conver
+vid_rank_ctr_1d
+vid_rank_ctr_3d
+vid_rank_ctr_7d
+vid_rank_ctr_14d
+vid_rank_ctcvr_1d
+vid_rank_ctcvr_3d
+vid_rank_ctcvr_7d
+vid_rank_ctcvr_14d
+ctitle_vtitle_similarity

+ 25 - 25
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_33_bucketData_20240726.scala

@@ -33,12 +33,12 @@ object makedata_ad_33_bucketData_20240726 {
     val repartition = param.getOrElse("repartition", "100").toInt
     val filterNames = param.getOrElse("filterNames", "").split(",").toSet
     val whatLabel = param.getOrElse("whatLabel", "ad_is_conversion")
-    val featureNameFile = param.getOrElse("featureNameFile", "20240718_ad_feature_name.txt");
+    val featureNameFile = param.getOrElse("featureNameFile", "20240718_ad_feature_name_517.txt");
 
 
     val loader = getClass.getClassLoader
 
-    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_688.txt")
+    val resourceUrlBucket = loader.getResource("20240718_ad_bucket_517.txt")
     val buckets =
       if (resourceUrlBucket != null) {
         val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
@@ -87,6 +87,13 @@ object makedata_ad_33_bucketData_20240726 {
           jsons.foreach(r => {
             features.put(r._1, jsons.getDoubleValue(r._1))
           })
+
+          for (name <- contentList) {
+            if (!features.contains(name)) {
+              features.put(name, 0)
+            }
+          }
+
           (logKey, labelKey, features)
         })
         .filter {
@@ -106,39 +113,32 @@ object makedata_ad_33_bucketData_20240726 {
           val bucketsMap = bucketsMap_br.value
           row.foreach {
             case (label, features) =>
-              val featuresBucket = new ArrayBuffer[String]()
-              for (name <- contentList) {
-                var ifFilter = false
-                if (filterNames.nonEmpty) {
-                  filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
-                    ifFilter = true
-                  })
-                }
-                if (!ifFilter) {
-                  if (features.contains(name)) {
-                    val score = features(name)
+              val featuresBucket = features.map {
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty) {
+                    filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
+                      ifFilter = true
+                    })
+                  }
+                  if (ifFilter) {
+                    ""
+                  } else {
                     if (score > 1E-8) {
                       if (bucketsMap.contains(name)) {
                         val (bucketsNum, buckets) = bucketsMap(name)
-                        val scoreNew = 0.01 + 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
-                        featuresBucket.add(name + ":" + scoreNew.toString)
+                        val scoreNew = 0.01 + (1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0))
+                        name + ":" + scoreNew.toString
                       } else {
-                        featuresBucket.add(name + ":" + score.toString)
+                        name + ":" + score.toString
                       }
                     } else {
-                      featuresBucket.add(name + ":" + "0.01")
+                      name + ":" + "0.01"
                     }
-
-                  } else {
-                    featuresBucket.add(name + ":" + "0.01")
                   }
-                }
-
-              }
-
+              }.filter(_.nonEmpty)
               result.add(label + "\t" + featuresBucket.mkString("\t"))
           }
-
           result.iterator
         })
 

Some files were not shown because too many files changed in this diff