Przeglądaj źródła

优化代码 & 设置参数

xueyiming 10 godzin temu
rodzic
commit
3c42ecaab0

+ 3 - 1
ad/25_xgb_make_data_origin_bucket.sh

@@ -109,6 +109,7 @@ make_bucket_feature_to_hive() {
 make_bucket_feature_from_origin_to_hive() {
   local step_start_time=$(date +%s)
   neg_sample_rate=${NEG_SAMPLE_RATE:-0.04}
+  mask_feature_rate=0.0005
   
   /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
   --class com.aliyun.odps.spark.examples.makedata_ad.v20240718.makedata_ad_33_bucketDataFromOriginToHive_20250228 \
@@ -139,7 +140,8 @@ make_bucket_feature_from_origin_to_hive() {
   filterNames:_4h_,_5h_,adid_,targeting_conversion_ \
   outputTable:${outputTable1} \
   inputTable:alg_recsys_ad_sample_all \
-  negSampleRate:${neg_sample_rate}
+  negSampleRate:${neg_sample_rate} \
+  maskFeatureRate:${mask_feature_rate}
   local task2=$!
 
   wait ${task1}

+ 14 - 40
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_33_bucketDataFromOriginToHive_20250522.scala

@@ -222,47 +222,22 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250522 {
                 featureMap.put("abcode_" + extend.getString("abcode"), idDefaultValue)
               }
 
-              if (reqFeature.containsKey("cid") && reqFeature.getString("cid").nonEmpty) {
-                featureMap.put("cid", reqFeature.getString("cid"))
-              }
-
-              if (reqFeature.containsKey("adid") && reqFeature.getString("adid").nonEmpty) {
-                featureMap.put("adid", reqFeature.getString("adid"))
-              }
-
-              if (reqFeature.containsKey("adverid") && reqFeature.getString("adverid").nonEmpty) {
-                featureMap.put("adverid", reqFeature.getString("adverid"))
-              }
-
-              if (reqFeature.containsKey("profession") && reqFeature.getString("profession").nonEmpty) {
-                featureMap.put("profession", reqFeature.getString("profession"))
-              }
-
-              if (reqFeature.containsKey("region") && reqFeature.getString("region").nonEmpty) {
-                featureMap.put("region", reqFeature.getString("region"))
-              }
-
-              if (reqFeature.containsKey("city") && reqFeature.getString("city").nonEmpty) {
-                featureMap.put("city", reqFeature.getString("city"))
-              }
-
-              if (reqFeature.containsKey("is_first_layer") && reqFeature.getString("is_first_layer").nonEmpty) {
-                featureMap.put("is_first_layer", reqFeature.getString("is_first_layer"))
-              }
-
-              if (reqFeature.containsKey("root_source_scene") && reqFeature.getString("root_source_scene").nonEmpty) {
-                featureMap.put("root_source_scene", reqFeature.getString("root_source_scene"))
-              }
-
-              if (reqFeature.containsKey("root_source_channel") && reqFeature.getString("root_source_channel").nonEmpty) {
-                featureMap.put("root_source_channel", reqFeature.getString("root_source_channel"))
-              }
-
-              if (reqFeature.containsKey("brand") && reqFeature.getString("brand").nonEmpty) {
-                featureMap.put("brand", reqFeature.getString("brand"))
+              // Request-level feature keys that are copied as-is into featureMap.
+              val reqFeatureKeys = List(
+                "cid", "adid", "adverid", "profession", "region",
+                "city", "is_first_layer", "root_source_scene",
+                "root_source_channel", "brand", "vid"
+              )
+
+              // Copy every key whose value is present and non-empty.
+              // NOTE(review): reqFeature is assumed to be a JSONObject-style Java map
+              // (the code this replaces used containsKey/getString on it). Its get(key)
+              // returns the raw value rather than a scala.Option, so matching the result
+              // against `Some(value)` never succeeds and every feature would be silently
+              // skipped. Wrapping getString in Option(...) handles both a missing key and
+              // a null value.
+              reqFeatureKeys.foreach { key =>
+                Option(reqFeature.getString(key)).filter(_.nonEmpty).foreach { value =>
+                  featureMap.put(key, value)
+                }
               }
 
-
               if (b1.containsKey("cpa")) {
                 featureMap.put("cpa", b1.getString("cpa").toDouble)
               }
@@ -583,7 +558,6 @@ object makedata_ad_33_bucketDataFromOriginToHive_20250522 {
                 featureMap.put("cate2", d3.getOrDefault("merge_second_level_cate", ""))
                 featureMap.put("title_split", d3.getOrDefault("title_split", ""))
               }
-              featureMap.put("vid", reqFeature.getString("vid"))
 
               // 随机mask部分特征供模型训练
               if (Random.nextDouble() < maskFeatureRate) {