Browse Source

删除sparse特征

zhangbo 10 months ago
parent
commit
16362a5d8d

+ 16 - 9
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_33_bucketData_20240622.scala

@@ -51,6 +51,7 @@ object makedata_ad_33_bucketData_20240622 {
     val beginStr = param.getOrElse("beginStr", "20240620")
     val endStr = param.getOrElse("endStr", "20240620")
     val repartition = param.getOrElse("repartition", "200").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").map(_.trim).filter(_.nonEmpty).toSet // drop empty entries: "".split(",") yields Array(""), and every name startsWith("")
 
     val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
     for (date <- dateRange) {
@@ -84,16 +85,22 @@ object makedata_ad_33_bucketData_20240622 {
             case (label, features) =>
               val featuresBucket = features.map{
                 case (name, score) =>
-                  if (score > 1E-8) {
-                    if (bucketsMap.contains(name)){
-                      val (_, buckets) = bucketsMap(name)
-                      val scoreNew = 1.0 / (buckets.length + 1) * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
-                      name + ":" + scoreNew.toString
-                    }else{
-                      name + ":" + score.toString
-                    }
-                  } else {
+                  // Drop the feature when its name starts with any configured prefix; empty prefixes are ignored because every string starts with "".
+                  val ifFilter = filterNames.exists(prefix => prefix.nonEmpty && name.startsWith(prefix))
+                  if (ifFilter){
                     ""
+                  }else{
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (_, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / (buckets.length + 1) * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
                   }
               }.filter(_.nonEmpty)
               result.add(label + "\t" + featuresBucket.mkString("\t"))

+ 5 - 3
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-广告

@@ -6,10 +6,10 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 --master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 tablePart:64 repartition:32 \
-beginStr:2024062009 endStr:2024062023 \
+beginStr:2024062308 endStr:2024062311 \
 savePath:/dw/recommend/model/31_ad_sample_data/ \
 table:alg_recsys_ad_sample_all \
-> p31_2024062008.log 2>&1 &
+> p31_2024062308.log 2>&1 &
 
 
 
@@ -26,7 +26,9 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 --class com.aliyun.odps.spark.examples.makedata_ad.makedata_ad_33_bucketData_20240622 \
 --master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-beginStr:20240620 endStr:20240620 repartition:400 \
+beginStr:20240623 endStr:20240623 repartition:400 \
+filterNames:cid_,adid_,adverid_,targeting_conversion_ \
+savePath:/dw/recommend/model/33_ad_train_data_nosparse/ \
 > p33_data.log 2>&1 &
 
 

+ 8 - 0
zhangbo/01_train.sh

@@ -14,3 +14,11 @@ $HADOOP fs -text ${train_path}/${day}/* | /root/sunmingze/alphaFM/bin/fm_train -
 # nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_aka8 1,1,8 >p1_model_aka8.log 2>&1 &
 # nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_aka0 1,1,0 >p1_model_aka0.log 2>&1 &
 # nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_aka4 1,1,4 >p1_model_aka4.log 2>&1 &
+
+
+
+# nohup sh 01_train.sh 20240620 /dw/recommend/model/33_ad_train_data/ model_bkb8_2 1,1,8 >p1_model_bkb8_2.log 2>&1 &
+# nohup sh 01_train.sh 20240620 /dw/recommend/model/33_ad_train_data/ model_bkb0_2 1,1,0 >p1_model_bkb0_2.log 2>&1 &
+# nohup sh 01_train.sh 20240620 /dw/recommend/model/33_ad_train_data/ model_bkb4 1,1,4 >p1_model_bkb4.log 2>&1 &
+# nohup sh 01_train.sh 20240620 /dw/recommend/model/33_ad_train_data/ model_bkb12 1,1,12 >p1_model_bkb12.log 2>&1 &
+# nohup sh 01_train.sh 20240620 /dw/recommend/model/33_ad_train_data/ model_bkb16 1,1,16 >p1_model_bkb16.log 2>&1 &

+ 3 - 0
zhangbo/02_train_go.sh

@@ -23,3 +23,6 @@ while [[ "$current_date" != "$end_date" ]]; do
 done
 
 # nohup sh 02_train_go.sh 20240615 20240616 model_aka8 /dw/recommend/model/16_train_data/ 1,1,8 >p2_model_aka8.log 2>&1 &
+
+
+# nohup sh 02_train_go.sh 20240622 20240623 model_bkb0 /dw/recommend/model/33_ad_train_data/ 1,1,0 >p2_model_bkb0.log 2>&1 &

+ 9 - 0
zhangbo/03_predict.sh

@@ -31,3 +31,12 @@ cat predict/${output_file}_$day.txt | /root/sunmingze/AUC/AUC
 # nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v4/ model_aka8_20240608.txt v4 8 >v4.log 2>&1 &
 # nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v5/ model_aka8_20240608.txt v4 8 >v5.log 2>&1 &
 # nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v6/ model_aka8_20240608.txt v4 8 >v6.log 2>&1 &
+
+
+
+
+# nohup sh 03_predict.sh 20240623 /dw/recommend/model/33_ad_train_data/ model_bkb0_20240622.txt model_bkb0_20240622 0 >p3_model_bkb0.log 2>&1 &
+# nohup sh 03_predict.sh 20240621 /dw/recommend/model/33_ad_train_data/ model_bkb4_20240620.txt model_bkb4_20240620 4 >p3_model_bkb4.log 2>&1 &
+# nohup sh 03_predict.sh 20240623 /dw/recommend/model/33_ad_train_data/ model_bkb8_20240622.txt model_bkb8_20240622 8 >p3_model_bkb8.log 2>&1 &
+# nohup sh 03_predict.sh 20240621 /dw/recommend/model/33_ad_train_data/ model_bkb12_20240620.txt model_bkb12_20240620 12 >p3_model_bkb12.log 2>&1 &
+# nohup sh 03_predict.sh 20240621 /dw/recommend/model/33_ad_train_data/ model_bkb16_20240620.txt model_bkb16_20240620 16 >p3_model_bkb16.log 2>&1 &

+ 8 - 0
zhangbo/04_upload.sh

@@ -23,3 +23,11 @@ awk -F " " '{
 }' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka8_20240608_change.txt
 
 dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka8_20240608_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/model_aka8.txt
+
+
+
+
+# ----- 广告 (ads) -----
+
+cat /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_bkb0_20240622.txt | awk -F " " '{print $1"\t"$2}' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_bkb0_20240622_change.txt
+dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_bkb0_20240622_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/model_bkb0.txt