Explorar el Código

Update makedata_ad_33_bucketDataFromOriginToHive_20250228: add smooth

StrayWarrior hace 1 mes
padre
commit
6d2e07c611

+ 31 - 19
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketDataFromOriginToHive_20250228.scala

@@ -16,6 +16,10 @@ import scala.language.postfixOps
 import scala.util.Random
 
 object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
+  val CTR_SMOOTH_BETA_FACTOR = 25
+  val CVR_SMOOTH_BETA_FACTOR = 10
+  val CTCVR_SMOOTH_BETA_FACTOR = 100
+
   def main(args: Array[String]): Unit = {
     val spark = SparkSession
       .builder()
@@ -53,7 +57,8 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
       .filter(r => r.nonEmpty)
       .map(r => {
         val rList = r.split("\t")
-        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+        val featureName = rList(0).replace("*", "_x_").replace("(view)", "_view")
+        (featureName, (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
       }).toMap
     val bucketsMap_br = sc.broadcast(bucketsMap)
     val denseFeatureNames = bucketsMap.keySet
@@ -186,11 +191,17 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
                   val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
                   val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
                   val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
-                  val f1 = RankExtractorFeature_20240530.calDiv(click, view)
-                  val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
-                  val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                  // NOTE(zhoutian):
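+                  // cpc here is only an auxiliary quantity used to compute the smoothed cpm; it has no business meaning of its own, because cpm is not a ratio and is not suited to direct Wilson smoothing
+                  // cpa is not used because ads may be billed by cpc in the future, or conversion counts may be unavailable; clicks are the more stable signal
+                  // The other feature groups follow the same logic
+                  // 2025-02-17: switched to fixed-denominator smoothing; income could in fact now take part in the cpm smoothing directly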
+                  val cpc = if (click == 0) 0D else income / click
+                  val f1 = RankExtractorFeature_20240530.divSmooth2(click, view, CTR_SMOOTH_BETA_FACTOR)
+                  val f2 = RankExtractorFeature_20240530.divSmooth2(conver, view, CTCVR_SMOOTH_BETA_FACTOR)
+                  val f3 = RankExtractorFeature_20240530.divSmooth2(conver, click, CVR_SMOOTH_BETA_FACTOR)
                   val f4 = conver
-                  val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                  val f5 = RankExtractorFeature_20240530.divSmooth2(click, view, CTR_SMOOTH_BETA_FACTOR) * cpc * 1000
                   featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
                   featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
                   featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
@@ -198,8 +209,8 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
                   featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
 
                   featureMap.put(prefix1 + "_" + prefix2 + "_" + "click", click)
-                  featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*log(view)", conver * RankExtractorFeature_20240530.calLog(view))
-                  featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*ctcvr", conver * f2)
+                  featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver_x_log_view", conver * RankExtractorFeature_20240530.calLog(view))
+                  featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver_x_ctcvr", conver * f2)
                 }
               }
 
@@ -213,11 +224,12 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
                   val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
                   val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
                   val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
-                  val f1 = RankExtractorFeature_20240530.calDiv(click, view)
-                  val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
-                  val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                  val cpc = if (click == 0) 0D else income / click
+                  val f1 = RankExtractorFeature_20240530.divSmooth2(click, view, CTR_SMOOTH_BETA_FACTOR)
+                  val f2 = RankExtractorFeature_20240530.divSmooth2(conver, view, CTCVR_SMOOTH_BETA_FACTOR)
+                  val f3 = RankExtractorFeature_20240530.divSmooth2(conver, click, CVR_SMOOTH_BETA_FACTOR)
                   val f4 = conver
-                  val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                  val f5 = RankExtractorFeature_20240530.divSmooth2(click, view, CTR_SMOOTH_BETA_FACTOR) * cpc * 1000
                   featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
                   featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
                   featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
@@ -225,8 +237,8 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
                   featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
 
                   featureMap.put(prefix1 + "_" + prefix2 + "_" + "click", click)
-                  featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*log(view)", conver * RankExtractorFeature_20240530.calLog(view))
-                  featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*ctcvr", conver * f2)
+                  featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver_x_log_view", conver * RankExtractorFeature_20240530.calLog(view))
+                  featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver_x_ctcvr", conver * f2)
                 }
               }
 
@@ -385,11 +397,12 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
                   val click = if (!d1.containsKey("ad_click_" + prefix)) 0D else d1.getIntValue("ad_click_" + prefix).toDouble
                   val conver = if (!d1.containsKey("ad_conversion_" + prefix)) 0D else d1.getIntValue("ad_conversion_" + prefix).toDouble
                   val income = if (!d1.containsKey("ad_income_" + prefix)) 0D else d1.getIntValue("ad_income_" + prefix).toDouble
-                  val f1 = RankExtractorFeature_20240530.calDiv(click, view)
-                  val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
-                  val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
+                  val cpc = if (click == 0) 0D else income / click
+                  val f1 = RankExtractorFeature_20240530.divSmooth2(click, view, CTR_SMOOTH_BETA_FACTOR)
+                  val f2 = RankExtractorFeature_20240530.divSmooth2(conver, view, CTCVR_SMOOTH_BETA_FACTOR)
+                  val f3 = RankExtractorFeature_20240530.divSmooth2(conver, click, CVR_SMOOTH_BETA_FACTOR)
                   val f4 = conver
-                  val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
+                  val f5 = RankExtractorFeature_20240530.divSmooth2(click, view, CTR_SMOOTH_BETA_FACTOR) * cpc * 1000
                   featureMap.put("d1_feature" + "_" + prefix + "_" + "ctr", f1)
                   featureMap.put("d1_feature" + "_" + prefix + "_" + "ctcvr", f2)
                   featureMap.put("d1_feature" + "_" + prefix + "_" + "cvr", f3)
@@ -510,14 +523,13 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
             val bucketsMap = bucketsMap_br.value
             var resultMap = denseFeatures.collect {
               case (name, score) if !filterNames.exists(name.contains) && score > 1E-8 =>
-                var key = name.replace("*", "_x_").replace("(view)", "_view")
                 val value = if (bucketsMap.contains(name)) {
                   val (bucketsNum, buckets) = bucketsMap(name)
                   1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
                 } else {
                   score
                 }
-                key -> value.toString
+                name -> value.toString
             }.toMap
             sparseFeatures.foreach(kv => {
               resultMap += (kv._1 -> kv._2)