Explorar o código

Merge branch 'dev-xym-realtime-mask-cid' of algorithm/recommend-emr-dataprocess into feature/20250104-zt-update

fengzhoutian hai 19 horas
pai
achega
839d8e3345

+ 8 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketDataFromOriginToHive_20250522.scala

@@ -45,6 +45,7 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250522 {
     val negSampleRate = param.getOrElse("negSampleRate", "1").toDouble
     // 分割样本集的比例,splitRate部分输出至outputTable,补集输出至outputTable2(如果outputTable2不为空)
     val splitRate = param.getOrElse("splitRate", "0.9").toDouble
+    val maskFeatureRate = param.getOrElse("maskFeatureRate", "0.0").toDouble
 
     val loader = getClass.getClassLoader
     val resourceUrlBucket = loader.getResource("20250217_ad_bucket_688.txt")
@@ -584,6 +585,13 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250522 {
               }
               featureMap.put("vid", reqFeature.getString("vid"))
 
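+              // Feature masking for training: with probability maskFeatureRate, blank out the ad ID features (cid, adid, adverid) by setting them to ""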
+              if (Random.nextDouble() < maskFeatureRate) {
+                featureMap.put("cid", "")
+                featureMap.put("adid", "")
+                featureMap.put("adverid", "")
+              }
+
               /*
             广告
               sparse:cid adid adverid targeting_conversion