
Merge branch 'feature/zhangbo_makedata_v2' into feature_zhaohaipeng

zhaohaipeng, 10 months ago
commit de3898c85d

+ 2 - 1
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_17_bucketDataPrint_20240617.scala

@@ -236,7 +236,8 @@ object makedata_17_bucketDataPrint_20240617 {
         }).filter{
           case (apptype, pagesource, level, label, abcode, allfeaturemap, featureMap, flag) =>
             apptype.equals("3") && pagesource.endsWith("recommend") &&
-            Set("ab0", "ab1", "ab2", "ab3").contains(abcode) && level.equals("0") && !flag
+//            Set("ab0", "ab1", "ab2", "ab3").contains(abcode) &&
+              level.equals("0") && !flag
         }.mapPartitions(row => {
           val result = new ArrayBuffer[String]()
           val bucketsMap = bucketsMap_br.value
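
In effect, this hunk relaxes the print-check filter: the ab-code whitelist is commented out, so rows from every ab code now pass as long as the remaining conditions hold. A minimal sketch of the revised predicate, with field types assumed from the case pattern above:

    // Sketch only: the keep-condition after this commit.
    def keep(apptype: String, pagesource: String, level: String, flag: Boolean): Boolean =
      apptype.equals("3") && pagesource.endsWith("recommend") &&
        level.equals("0") && !flag // ab-code whitelist no longer applied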

+ 3 - 3
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_31_originData_20240620.scala

@@ -215,16 +215,16 @@ object makedata_ad_31_originData_20240620 {
               midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
             ))
           }
-          if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("timediff_conver_" + cid)) {
+          if (midActionStatic.contains("actionstatic_view_" + cid) && midActionStatic.contains("actionstatic_conver_" + cid)) {
             featureMap.put("actionstatic_ctcvr", RankExtractorFeature_20240530.calDiv(
-              midActionStatic.getOrDefault("timediff_conver_" + cid, 0.0),
+              midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0),
               midActionStatic.getOrDefault("actionstatic_view_" + cid, 0.0)
             ))
           }
           if (midActionStatic.contains("actionstatic_conver_" + cid) && midActionStatic.contains("actionstatic_click_" + cid)) {
             featureMap.put("actionstatic_cvr", RankExtractorFeature_20240530.calDiv(
               midActionStatic.getOrDefault("actionstatic_click_" + cid, 0.0),
-              midActionStatic.getOrDefault("timediff_conver_" + cid, 0.0)
+              midActionStatic.getOrDefault("actionstatic_conver_" + cid, 0.0)
             ))
           }
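
The bug fixed here is a wrong map key: both the actionstatic_ctcvr and actionstatic_cvr ratios were reading timediff_conver_<cid>, while the conversion count is stored under actionstatic_conver_<cid>. A standalone sketch of the corrected lookup; calDiv's real implementation is not shown in this diff, so a zero-safe a / b is assumed:

    import java.util.{HashMap => JHashMap}

    object CtcvrFixSketch {
      // Assumption: RankExtractorFeature_20240530.calDiv(a, b) is a zero-safe a / b.
      def calDiv(a: Double, b: Double): Double = if (b == 0.0) 0.0 else a / b

      def main(args: Array[String]): Unit = {
        val cid = "42" // hypothetical creative id
        val stat = new JHashMap[String, Double]()
        stat.put("actionstatic_view_" + cid, 200.0)
        stat.put("actionstatic_conver_" + cid, 5.0)
        // Corrected key: the conversion count, not the unrelated timediff_ entry.
        val ctcvr = calDiv(stat.getOrDefault("actionstatic_conver_" + cid, 0.0),
                           stat.getOrDefault("actionstatic_view_" + cid, 0.0))
        println(ctcvr) // 5 / 200 = 0.025
      }
    }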
 

File diff suppressed because it is too large
+ 396 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_33_bucketDataPrint_20240628.scala


+ 18 - 9
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_33_bucketData_20240622.scala

@@ -51,6 +51,7 @@ object makedata_ad_33_bucketData_20240622 {
     val beginStr = param.getOrElse("beginStr", "20240620")
     val endStr = param.getOrElse("endStr", "20240620")
     val repartition = param.getOrElse("repartition", "200").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").toSet
 
     val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
     for (date <- dateRange) {
@@ -84,16 +85,24 @@ object makedata_ad_33_bucketData_20240622 {
             case (label, features) =>
               val featuresBucket = features.map{
                 case (name, score) =>
-                  if (score > 1E-8) {
-                    if (bucketsMap.contains(name)){
-                      val (_, buckets) = bucketsMap(name)
-                      val scoreNew = 1.0 / (buckets.length + 1) * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
-                      name + ":" + scoreNew.toString
-                    }else{
-                      name + ":" + score.toString
-                    }
-                  } else {
+                  var ifFilter = false
+                  if (filterNames.nonEmpty){
+                    filterNames.foreach(r=> if (!ifFilter && name.startsWith(r)) {ifFilter = true} )
+                  }
+                  if (ifFilter){
                     ""
+                  }else{
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (_, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / (buckets.length + 1) * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
                   }
               }.filter(_.nonEmpty)
               result.add(label + "\t" + featuresBucket.mkString("\t"))
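
Two notes on the new filter, sketched standalone below. First, the var/foreach loop is equivalent to an exists over the prefixes. Second, filterNames is built as param.getOrElse("filterNames", "").split(",").toSet, and "".split(",") yields Array(""), so with the parameter unset every name matches the empty prefix and would be dropped; the sketch guards against that by discarding empty tokens. findInsertPosition is assumed to return the insertion index of score into the ascending buckets array.

    object BucketSketch {
      // Assumption: ExtractorUtils.findInsertPosition(buckets, score) returns the
      // index at which score would be inserted into the ascending buckets array.
      def findInsertPosition(buckets: Array[Double], score: Double): Int =
        buckets.count(_ <= score)

      def bucketize(name: String, score: Double, rawFilterNames: String,
                    bucketsMap: Map[String, Array[Double]]): String = {
        // Drop empty tokens: "".split(",") is Array(""), and every name
        // starts with "" -- an unset parameter would otherwise filter everything.
        val filterNames = rawFilterNames.split(",").filter(_.nonEmpty).toSet
        if (filterNames.exists(p => name.startsWith(p))) "" // prefix-filtered out
        else if (score <= 1e-8) ""                          // near-zero scores dropped
        else bucketsMap.get(name) match {
          case Some(buckets) =>
            // Normalize the raw score to a bucket index in (0, 1].
            val scoreNew = 1.0 / (buckets.length + 1) * (findInsertPosition(buckets, score) + 1.0)
            name + ":" + scoreNew.toString
          case None => name + ":" + score.toString
        }
      }
    }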

+ 28 - 3
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-广告

@@ -6,8 +6,8 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 --master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 tablePart:64 repartition:32 \
-beginStr:2024062009 endStr:2024062023 \
-savePath:/dw/recommend/model/31_ad_sample_data/ \
+beginStr:2024062008 endStr:2024062223 \
+savePath:/dw/recommend/model/31_ad_sample_data_fix/ \
 table:alg_recsys_ad_sample_all \
 > p31_2024062008.log 2>&1 &
 
@@ -19,6 +19,8 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 --master yarn --driver-memory 16G --executor-memory 1G --executor-cores 1 --num-executors 16 \
 --conf spark.driver.maxResultSize=16G \
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+fileName:20240620_100_fix \
+savePath:/dw/recommend/model/32_bucket_data/ \
 > p32_data.log 2>&1 &
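
All of these jobs take their arguments as key:value tokens, which the Scala side reads via param.getOrElse (see the filterNames line in the hunk above). The repo's actual parser is not shown in this diff; a minimal sketch of one, splitting on the first colon only so values like savePath:/dw/... survive:

    object ParamSketch {
      // Hypothetical parser for "key:value" CLI tokens.
      def parse(args: Array[String]): Map[String, String] =
        args.flatMap { token =>
          token.split(":", 2) match {
            case Array(k, v) => Some(k -> v)
            case _           => None
          }
        }.toMap

      def main(args: Array[String]): Unit = {
        val param = parse(Array("beginStr:20240623", "savePath:/dw/recommend/model/32_bucket_data/"))
        println(param.getOrElse("beginStr", "20240620")) // 20240623
      }
    }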
 
 
@@ -26,9 +28,32 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 --class com.aliyun.odps.spark.examples.makedata_ad.makedata_ad_33_bucketData_20240622 \
 --master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-beginStr:20240620 endStr:20240620 repartition:400 \
+beginStr:20240623 endStr:20240623 repartition:400 \
+filterNames:XXXXXXX \
 > p33_data.log 2>&1 &
 
+filterNames:cid_,adid_,adverid_,targeting_conversion_ \
+savePath:/dw/recommend/model/33_ad_train_data_nosparse/ \
+
 
 /dw/recommend/model/31_ad_sample_data/
 /dw/recommend/model/33_ad_train_data/
+
+/dw/recommend/model/31_ad_sample_data_fix/
+
+
+
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_ad.makedata_ad_33_bucketDataPrint_20240628 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+beginStr:2024062717 endStr:2024062723 \
+readDate:20240627 \
+table:alg_recsys_ad_sample_all_new \
+savePath:/dw/recommend/model/33_for_check/ \
+> p33_data_check.log 2>&1 &
+
+
+/dw/recommend/model/33_for_check_v1/

+ 5 - 3
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本 → src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-推荐

@@ -156,6 +156,8 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 --class com.aliyun.odps.spark.examples.makedata.makedata_17_bucketDataPrint_20240617 \
 --master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
-beginStr:2024061800 endStr:2024061814 \
-readDate:20240618 \
-> p17_data_check.log 2>&1 &
+beginStr:2024062700 endStr:2024062723 \
+readDate:20240627 \
+> p17_data_check.log 2>&1 &
+
+/dw/recommend/model/17_for_check/

+ 7 - 0
zhangbo/01_train.sh

@@ -14,3 +14,10 @@ $HADOOP fs -text ${train_path}/${day}/* | /root/sunmingze/alphaFM/bin/fm_train -
 # nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_aka8 1,1,8 >p1_model_aka8.log 2>&1 &
 # nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_aka0 1,1,0 >p1_model_aka0.log 2>&1 &
 # nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_aka4 1,1,4 >p1_model_aka4.log 2>&1 &
+
+
+# nohup sh 01_train.sh 20240620 /dw/recommend/model/33_ad_train_data_nosparse/ model_bkb0_3 1,1,0 >p1_model_bkb0.log 2>&1 &
+# nohup sh 01_train.sh 20240620 /dw/recommend/model/33_ad_train_data/ model_bkb8_2 1,1,8 >p1_model_bkb8_2.log 2>&1 &
+# nohup sh 01_train.sh 20240620 /dw/recommend/model/33_ad_train_data/ model_bkb4 1,1,4 >p1_model_bkb4.log 2>&1 &
+# nohup sh 01_train.sh 20240620 /dw/recommend/model/33_ad_train_data/ model_bkb12 1,1,12 >p1_model_bkb12.log 2>&1 &
+# nohup sh 01_train.sh 20240620 /dw/recommend/model/33_ad_train_data/ model_bkb16 1,1,16 >p1_model_bkb16.log 2>&1 &

+ 5 - 0
zhangbo/02_train_go.sh

@@ -23,3 +23,8 @@ while [[ "$current_date" != "$end_date" ]]; do
 done
 
 # nohup sh 02_train_go.sh 20240615 20240616 model_aka8 /dw/recommend/model/16_train_data/ 1,1,8 >p2_model_aka8.log 2>&1 &
+
+
+# nohup sh 02_train_go.sh 20240623 20240624 model_bkb8 /dw/recommend/model/33_ad_train_data/ 1,1,8 >p2_model_bkb8.log 2>&1 &
+
+# nohup sh 02_train_go.sh 20240621 20240623 model_bkb0_3 /dw/recommend/model/33_ad_train_data_nosparse/ 1,1,0 >p2_model_bkb0.log 2>&1 &

+ 10 - 1
zhangbo/03_predict.sh

@@ -22,7 +22,7 @@ cat predict/${output_file}_$day.txt | /root/sunmingze/AUC/AUC
 
 
 
-# cat tmpfile | /root/sunmingze/alphaFM/bin/fm_predict -m model/model_aka8_20240608.txt -dim 8 -core 1 -out tmpfile_out.txt
+# cat tmpfile | /root/sunmingze/alphaFM/bin/fm_predict -m model/model_bkb0_20240622.txt -dim 0 -core 1 -out tmpfile_out.txt
 
 
 # nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v1/ model_aka8_20240608.txt v1 8 >v1.log 2>&1 &
@@ -31,3 +31,12 @@ cat predict/${output_file}_$day.txt | /root/sunmingze/AUC/AUC
 # nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v4/ model_aka8_20240608.txt v4 8 >v4.log 2>&1 &
 # nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v5/ model_aka8_20240608.txt v4 8 >v5.log 2>&1 &
 # nohup sh 03_predict.sh 20240618 /dw/recommend/model/17_for_check_v6/ model_aka8_20240608.txt v4 8 >v6.log 2>&1 &
+
+
+
+
+# nohup sh 03_predict.sh 20240623 /dw/recommend/model/33_ad_train_data/ model_bkb0_20240622.txt model_bkb0_20240622 0 >p3_model_bkb0.log 2>&1 &
+# nohup sh 03_predict.sh 20240621 /dw/recommend/model/33_ad_train_data/ model_bkb4_20240620.txt model_bkb4_20240620 4 >p3_model_bkb4.log 2>&1 &
+# nohup sh 03_predict.sh 20240624 /dw/recommend/model/33_ad_train_data/ model_bkb8_20240622.txt model_bkb8_20240622 8 >p3_model_bkb8.log 2>&1 &
+# nohup sh 03_predict.sh 20240621 /dw/recommend/model/33_ad_train_data/ model_bkb12_20240620.txt model_bkb12_20240620 12 >p3_model_bkb12.log 2>&1 &
+# nohup sh 03_predict.sh 20240621 /dw/recommend/model/33_ad_train_data/ model_bkb16_20240620.txt model_bkb16_20240620 16 >p3_model_bkb16.log 2>&1 &

+ 8 - 0
zhangbo/04_upload.sh

@@ -23,3 +23,11 @@ awk -F " " '{
 }' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka8_20240608_change.txt
 
 dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka8_20240608_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/model_aka8.txt
+
+
+
+
+-----广告-----
+
+cat /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_bkb0_20240622.txt | awk -F " " '{print $1"\t"$2}' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_bkb0_20240622_change.txt
+dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_bkb0_20240622_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/model_bkb0.txt
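
The awk one-liner keeps the first two space-separated columns of the alphaFM model file and re-joins them with a tab before the OSS upload. The same transform as a Scala sketch, with file names copied from the command above:

    import java.io.PrintWriter
    import scala.io.Source

    object ModelConvertSketch {
      def main(args: Array[String]): Unit = {
        val in  = Source.fromFile("model/model_bkb0_20240622.txt")
        val out = new PrintWriter("model/model_bkb0_20240622_change.txt")
        try {
          // Keep columns 1 and 2 ($1"\t"$2 in the awk above), tab-separated.
          in.getLines().foreach { line =>
            val cols = line.split(" ")
            if (cols.length >= 2) out.println(cols(0) + "\t" + cols(1))
          }
        } finally { in.close(); out.close() }
      }
    }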

Some files were not shown because too many files changed in this diff