|
@@ -16,6 +16,10 @@ import scala.language.postfixOps
|
|
|
import scala.util.Random
|
|
|
|
|
|
object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
|
|
|
+ val CTR_SMOOTH_BETA_FACTOR = 25
|
|
|
+ val CVR_SMOOTH_BETA_FACTOR = 10
|
|
|
+ val CTCVR_SMOOTH_BETA_FACTOR = 100
|
|
|
+
|
|
|
def main(args: Array[String]): Unit = {
|
|
|
val spark = SparkSession
|
|
|
.builder()
|
|
@@ -53,7 +57,8 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
|
|
|
.filter(r => r.nonEmpty)
|
|
|
.map(r => {
|
|
|
val rList = r.split("\t")
|
|
|
- (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
|
|
|
+ val featureName = rList(0).replace("*", "_x_").replace("(view)", "_view")
|
|
|
+ (featureName, (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
|
|
|
}).toMap
|
|
|
val bucketsMap_br = sc.broadcast(bucketsMap)
|
|
|
val denseFeatureNames = bucketsMap.keySet
|
|
@@ -186,11 +191,17 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
|
|
|
val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
|
|
|
val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
|
|
|
val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
|
|
|
- val f1 = RankExtractorFeature_20240530.calDiv(click, view)
|
|
|
- val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
|
|
|
- val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
|
|
|
+ // NOTE(zhoutian):
|
|
|
+ // 这里cpc只是为了计算cpm的平滑的工具量,没有实际业务意义,因为cpm并非比率,本身不适合直接计算Wilson平滑
|
|
|
+ // 不使用cpa的原因是未来可能出现广告采用cpc计费的情况或者无法获取转化量的情况,用点击更为稳定
|
|
|
+ // 其它几组特征亦采用相同逻辑
|
|
|
+ // 2025-02-17改为增加固定分母平滑,income实际已经可以直接参与cpm平滑计算
|
|
|
+ val cpc = if (click == 0) 0D else income / click
|
|
|
+ val f1 = RankExtractorFeature_20240530.divSmooth2(click, view, CTR_SMOOTH_BETA_FACTOR)
|
|
|
+ val f2 = RankExtractorFeature_20240530.divSmooth2(conver, view, CTCVR_SMOOTH_BETA_FACTOR)
|
|
|
+ val f3 = RankExtractorFeature_20240530.divSmooth2(conver, click, CVR_SMOOTH_BETA_FACTOR)
|
|
|
val f4 = conver
|
|
|
- val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
|
|
|
+ val f5 = RankExtractorFeature_20240530.divSmooth2(click, view, CTR_SMOOTH_BETA_FACTOR) * cpc * 1000
|
|
|
featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
|
|
|
featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
|
|
|
featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
|
|
@@ -198,8 +209,8 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
|
|
|
featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
|
|
|
|
|
|
featureMap.put(prefix1 + "_" + prefix2 + "_" + "click", click)
|
|
|
- featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*log(view)", conver * RankExtractorFeature_20240530.calLog(view))
|
|
|
- featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*ctcvr", conver * f2)
|
|
|
+ featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver_x_log_view", conver * RankExtractorFeature_20240530.calLog(view))
|
|
|
+ featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver_x_ctcvr", conver * f2)
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -213,11 +224,12 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
|
|
|
val click = if (bn.isEmpty) 0D else bn.getIntValue("ad_click_" + prefix2).toDouble
|
|
|
val conver = if (bn.isEmpty) 0D else bn.getIntValue("ad_conversion_" + prefix2).toDouble
|
|
|
val income = if (bn.isEmpty) 0D else bn.getIntValue("ad_income_" + prefix2).toDouble
|
|
|
- val f1 = RankExtractorFeature_20240530.calDiv(click, view)
|
|
|
- val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
|
|
|
- val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
|
|
|
+ val cpc = if (click == 0) 0D else income / click
|
|
|
+ val f1 = RankExtractorFeature_20240530.divSmooth2(click, view, CTR_SMOOTH_BETA_FACTOR)
|
|
|
+ val f2 = RankExtractorFeature_20240530.divSmooth2(conver, view, CTCVR_SMOOTH_BETA_FACTOR)
|
|
|
+ val f3 = RankExtractorFeature_20240530.divSmooth2(conver, click, CVR_SMOOTH_BETA_FACTOR)
|
|
|
val f4 = conver
|
|
|
- val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
|
|
|
+ val f5 = RankExtractorFeature_20240530.divSmooth2(click, view, CTR_SMOOTH_BETA_FACTOR) * cpc * 1000
|
|
|
featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctr", f1)
|
|
|
featureMap.put(prefix1 + "_" + prefix2 + "_" + "ctcvr", f2)
|
|
|
featureMap.put(prefix1 + "_" + prefix2 + "_" + "cvr", f3)
|
|
@@ -225,8 +237,8 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
|
|
|
featureMap.put(prefix1 + "_" + prefix2 + "_" + "ecpm", f5)
|
|
|
|
|
|
featureMap.put(prefix1 + "_" + prefix2 + "_" + "click", click)
|
|
|
- featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*log(view)", conver * RankExtractorFeature_20240530.calLog(view))
|
|
|
- featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver*ctcvr", conver * f2)
|
|
|
+ featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver_x_log_view", conver * RankExtractorFeature_20240530.calLog(view))
|
|
|
+ featureMap.put(prefix1 + "_" + prefix2 + "_" + "conver_x_ctcvr", conver * f2)
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -385,11 +397,12 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
|
|
|
val click = if (!d1.containsKey("ad_click_" + prefix)) 0D else d1.getIntValue("ad_click_" + prefix).toDouble
|
|
|
val conver = if (!d1.containsKey("ad_conversion_" + prefix)) 0D else d1.getIntValue("ad_conversion_" + prefix).toDouble
|
|
|
val income = if (!d1.containsKey("ad_income_" + prefix)) 0D else d1.getIntValue("ad_income_" + prefix).toDouble
|
|
|
- val f1 = RankExtractorFeature_20240530.calDiv(click, view)
|
|
|
- val f2 = RankExtractorFeature_20240530.calDiv(conver, view)
|
|
|
- val f3 = RankExtractorFeature_20240530.calDiv(conver, click)
|
|
|
+ val cpc = if (click == 0) 0D else income / click
|
|
|
+ val f1 = RankExtractorFeature_20240530.divSmooth2(click, view, CTR_SMOOTH_BETA_FACTOR)
|
|
|
+ val f2 = RankExtractorFeature_20240530.divSmooth2(conver, view, CTCVR_SMOOTH_BETA_FACTOR)
|
|
|
+ val f3 = RankExtractorFeature_20240530.divSmooth2(conver, click, CVR_SMOOTH_BETA_FACTOR)
|
|
|
val f4 = conver
|
|
|
- val f5 = RankExtractorFeature_20240530.calDiv(income * 1000, view)
|
|
|
+ val f5 = RankExtractorFeature_20240530.divSmooth2(click, view, CTR_SMOOTH_BETA_FACTOR) * cpc * 1000
|
|
|
featureMap.put("d1_feature" + "_" + prefix + "_" + "ctr", f1)
|
|
|
featureMap.put("d1_feature" + "_" + prefix + "_" + "ctcvr", f2)
|
|
|
featureMap.put("d1_feature" + "_" + prefix + "_" + "cvr", f3)
|
|
@@ -510,14 +523,13 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250228 {
|
|
|
val bucketsMap = bucketsMap_br.value
|
|
|
var resultMap = denseFeatures.collect {
|
|
|
case (name, score) if !filterNames.exists(name.contains) && score > 1E-8 =>
|
|
|
- var key = name.replace("*", "_x_").replace("(view)", "_view")
|
|
|
val value = if (bucketsMap.contains(name)) {
|
|
|
val (bucketsNum, buckets) = bucketsMap(name)
|
|
|
1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
|
|
|
} else {
|
|
|
score
|
|
|
}
|
|
|
- key -> value.toString
|
|
|
+ name -> value.toString
|
|
|
}.toMap
|
|
|
sparseFeatures.foreach(kv => {
|
|
|
resultMap += (kv._1 -> kv._2)
|