Przeglądaj źródła

mask部分稀疏特征

xueyiming 20 godzin temu
rodzic
commit
21bf2729d9

+ 17 - 8
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketDataFromOriginToHive_20250522.scala

@@ -45,6 +45,9 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250522 {
     val negSampleRate = param.getOrElse("negSampleRate", "1").toDouble
     // 分割样本集的比例,splitRate部分输出至outputTable,补集输出至outputTable2(如果outputTable2不为空)
     val splitRate = param.getOrElse("splitRate", "0.9").toDouble
+    val maskFeature = param.getOrElse("maskFeature", "1").toInt
+    val maskFeatureRate = param.getOrElse("maskFeatureRate", "0.0005").toDouble
+
 
     val loader = getClass.getClassLoader
     val resourceUrlBucket = loader.getResource("20250217_ad_bucket_688.txt")
@@ -221,16 +224,22 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250522 {
                 featureMap.put("abcode_" + extend.getString("abcode"), idDefaultValue)
               }
 
-              if (reqFeature.containsKey("cid") && reqFeature.getString("cid").nonEmpty) {
-                featureMap.put("cid", reqFeature.getString("cid"))
-              }
+              if (maskFeature > 0 && Random.nextDouble() < maskFeatureRate) {
+                featureMap.put("cid", "")
+                featureMap.put("adid", "")
+                featureMap.put("adverid", "")
+              } else {
+                if (reqFeature.containsKey("cid") && reqFeature.getString("cid").nonEmpty) {
+                  featureMap.put("cid", reqFeature.getString("cid"))
+                }
 
-              if (reqFeature.containsKey("adid") && reqFeature.getString("adid").nonEmpty) {
-                featureMap.put("adid", reqFeature.getString("adid"))
-              }
+                if (reqFeature.containsKey("adid") && reqFeature.getString("adid").nonEmpty) {
+                  featureMap.put("adid", reqFeature.getString("adid"))
+                }
 
-              if (reqFeature.containsKey("adverid") && reqFeature.getString("adverid").nonEmpty) {
-                featureMap.put("adverid", reqFeature.getString("adverid"))
+                if (reqFeature.containsKey("adverid") && reqFeature.getString("adverid").nonEmpty) {
+                  featureMap.put("adverid", reqFeature.getString("adverid"))
+                }
               }
 
               if (reqFeature.containsKey("profession") && reqFeature.getString("profession").nonEmpty) {