zhangbo 8 months ago
parent
commit
a26db94b2c

+ 50 - 1
01-脚本记录

@@ -70,7 +70,7 @@ oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/model_xgb_1000.tar.gz
 
 
 nohup /opt/apps/SPARK3/spark-3.3.1-hadoop3.2-1.0.5/bin/spark-class org.apache.spark.deploy.SparkSubmit \
---class com.tzld.piaoquan.recommend.model.pred_01_xgb_ad_20240813 \
+--class com.tzld.piaoquan.recommend.model.pred_01_xgb_ad_jsonfile_20240813 \
 --master yarn --driver-memory 6G --executor-memory 6G --executor-cores 1 --num-executors 32 \
 --conf spark.yarn.executor.memoryoverhead=1024 \
 --conf spark.shuffle.service.enabled=true \
@@ -85,3 +85,52 @@ featureFile:20240809_ad_feature_name_517.txt \
 savePath:/dw/recommend/model/34_ad_predict_data/case_tmp/ \
 modelPath:/dw/recommend/model/35_ad_model/model_xgb_1000 \
 > p5.log 2>&1 &
+
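+# Batch prediction: score the 20240814 partition with the 1000-round model; output lands in savePath.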
+nohup /opt/apps/SPARK3/spark-3.3.1-hadoop3.2-1.0.5/bin/spark-class org.apache.spark.deploy.SparkSubmit \
+--class com.tzld.piaoquan.recommend.model.pred_01_xgb_ad_hdfsfile_20240813 \
+--master yarn --driver-memory 5G --executor-memory 5G --executor-cores 1 --num-executors 30 \
+--conf spark.yarn.executor.memoryoverhead=1024 \
+--conf spark.shuffle.service.enabled=true \
+--conf spark.shuffle.service.port=7337 \
+--conf spark.shuffle.consolidateFiles=true \
+--conf spark.shuffle.manager=sort \
+--conf spark.storage.memoryFraction=0.4 \
+--conf spark.shuffle.memoryFraction=0.5 \
+--conf spark.default.parallelism=200 \
+./target/recommend-model-produce-jar-with-dependencies.jar \
+featureFile:20240809_ad_feature_name_517.txt \
+testPath:/dw/recommend/model/33_ad_train_data_v4/20240814/ \
+savePath:/dw/recommend/model/34_ad_predict_data/20240814_v1/ \
+modelPath:/dw/recommend/model/35_ad_model/model_xgb_1000 \
+> p1_pred_20240814.log 2>&1 &
+
+
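+# Training: fit a 1000-round XGBoost model on 20240807-20240813 and evaluate it on the 20240814 partition.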
+nohup /opt/apps/SPARK3/spark-3.3.1-hadoop3.2-1.0.5/bin/spark-class org.apache.spark.deploy.SparkSubmit \
+--class com.tzld.piaoquan.recommend.model.train_01_xgb_ad_20240808 \
+--master yarn --driver-memory 6G --executor-memory 9G --executor-cores 1 --num-executors 31 \
+--conf spark.yarn.executor.memoryoverhead=1000 \
+--conf spark.shuffle.service.enabled=true \
+--conf spark.shuffle.service.port=7337 \
+--conf spark.shuffle.consolidateFiles=true \
+--conf spark.shuffle.manager=sort \
+--conf spark.storage.memoryFraction=0.4 \
+--conf spark.shuffle.memoryFraction=0.5 \
+--conf spark.default.parallelism=200 \
+./target/recommend-model-produce-jar-with-dependencies.jar \
+featureFile:20240809_ad_feature_name_517.txt \
+trainPath:/dw/recommend/model/33_ad_train_data_v4/2024080[7-9],/dw/recommend/model/33_ad_train_data_v4/2024081[0-3] \
+testPath:/dw/recommend/model/33_ad_train_data_v4/20240814/ \
+savePath:/dw/recommend/model/34_ad_predict_data/20240814_1000/ \
+modelPath:/dw/recommend/model/35_ad_model/model_xgb_7day \
+eta:0.01 gamma:0.0 max_depth:5 num_round:1000 num_worker:30 \
+repartition:20 \
+> p1_train_0814.log 2>&1 &
+
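+# Export: pull the trained model from HDFS, repackage it, and replace the copy on OSS.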
+hdfs dfs -get /dw/recommend/model/35_ad_model/model_xgb_7day ./
+tar -czvf model_xgb_1000.tar.gz -C model_xgb_7day .
+rm -rf .model.tar.gz.crc
+hdfs dfs -rm -r -skipTrash oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/model.tar.gz
+hdfs dfs -put model_xgb_1000.tar.gz oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/
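
The source of train_01_xgb_ad_20240808 is not part of this commit, so the following is only a minimal sketch, assuming xgboost4j-spark, of how the command-line hyperparameters (eta:0.01 gamma:0.0 max_depth:5 num_round:1000 num_worker:30) could map onto a trainer. TrainSketch, the toy DataFrame, the binary:logistic objective, and the local save path are illustrative, not the production code.

import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object TrainSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("xgb-train-sketch").getOrCreate()
    import spark.implicits._
    // Toy stand-in for the parsed HDFS training rows (label + feature vector).
    val train = Seq(
      (1.0, Vectors.dense(0.3, 1.0)),
      (0.0, Vectors.dense(0.1, 0.0))
    ).toDF("label", "features")
    // Hyperparameters mirror the command line above.
    val xgb = new XGBoostClassifier(Map(
      "eta" -> 0.01,
      "gamma" -> 0.0,
      "max_depth" -> 5,
      "num_round" -> 1000,
      "num_workers" -> 1,              // the real job passes num_worker:30
      "objective" -> "binary:logistic" // assumed from the binary label + probability column
    )).setFeaturesCol("features").setLabelCol("label")
    val model = xgb.fit(train)
    // The real job writes to modelPath on HDFS; a local path is used here.
    model.nativeBooster.saveModel("model_xgb_sketch")
    spark.stop()
  }
}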

+ 47 - 1
recommend-model-produce/src/main/scala/com/tzld/piaoquan/recommend/model/pred_01_xgb_ad_hdfsfile_20240813.scala

@@ -93,6 +93,10 @@ object pred_01_xgb_ad_hdfsfile_20240813{
     val auc = evaluator.evaluate(predictions.select("label", "probability"))
     println("zhangbo:auc:" + auc)
 
+    println("---------------------------------\n")
+    println("----------zhangbo-------------\n")
+    println("---------------------------------\n")
+
      // Score stats broken down by cid
     sc.textFile(hdfsPath).map(r => {
       val rList = r.split("\t")
@@ -108,7 +112,36 @@ object pred_01_xgb_ad_hdfsfile_20240813{
         (cid, all, zheng, scores, zheng / all, scores / all)
     }.collect().sortBy(-_._2).map(_.productIterator.mkString("\t")).foreach(println)
 
+    println("---------------------------------\n")
+    println("----------zhangbo-------------\n")
+    println("---------------------------------\n")
 
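+    // Aggregate per (cid, apptype, abcode): count, positive-label sum, score sum, CTR, mean score (hour is parsed but not keyed on).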
+    sc.textFile(hdfsPath).map(r => {
+      val rList = r.split("\t")
+      val cid_hour_apptype_abcode = rList(3).split("_")
+      val cid = cid_hour_apptype_abcode(0)
+      val hour = cid_hour_apptype_abcode(1)
+      val apptype = cid_hour_apptype_abcode(2)
+      var abcode = cid_hour_apptype_abcode(3)
+      if (Set("ab0", "ab1", "ab2", "ab3", "ab4", "ab5", "ab6", "ab7").contains(abcode)) {
+        abcode = "实验组" // experiment group
+      } else {
+        abcode = "基线组" // baseline group
+      }
+
+      val score = rList(2).replace("[", "").replace("]", "")
+        .split(",")(1).toDouble
+      val label = rList(0).toDouble
+      ((cid, apptype, abcode), (1, label, score))
+    }).reduceByKey {
+      case (a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3)
+    }.map {
+      case ((cid, apptype, abcode), (all, zheng, scores)) =>
+        (cid, apptype, abcode, all, zheng, scores, zheng / all, scores / all)
+    }.collect().sortBy {
+      case (cid, apptype, abcode, all, _, _, _, _) => (cid, apptype, abcode, -all)
+    }.map(_.productIterator.mkString("\t")).foreach(println)
 
   }
 
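
For reference, a minimal sketch of the prediction-output line format that both aggregations above rely on: column 0 is the label, column 2 the "[p0,p1]" probability vector, and column 3 the "cid_hour_apptype_abcode" key packed by the feature-parsing code below. ParseSketch and the sample values are hypothetical.

object ParseSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical output row; "raw" stands in for column 1, which the aggregations ignore.
    val line = "1.0\traw\t[0.73,0.27]\t1234_08_3_ab1"
    val cols = line.split("\t")
    val Array(cid, hour, apptype, abcode) = cols(3).split("_")
    val score = cols(2).replace("[", "").replace("]", "").split(",")(1).toDouble // 0.27
    val label = cols(0).toDouble
    println(Seq(cid, hour, apptype, abcode, label, score).mkString("\t"))
  }
}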
@@ -121,12 +154,25 @@ object pred_01_xgb_ad_hdfsfile_20240813{
       val label: Int = NumberUtils.toInt(line(0))
       val map: util.Map[String, Double] = new util.HashMap[String, Double]
       var cid = "-1"
+      var hour = "-1"
+      var apptype = "-1"
+      var abcode = "-1"
       for (i <- 1 until line.length) {
         val fv: Array[String] = StringUtils.split(line(i), ':')
         map.put(fv(0), NumberUtils.toDouble(fv(1), 0.0))
         if(fv(0).startsWith("cid_")){
           cid = fv(0).split("_")(1)
         }
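+        // hour/apptype/abcode mirror the cid extraction above: one-hot feature names like "hour_08" yield the value after "_"; absent features keep the "-1" default.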
+        if (fv(0).startsWith("hour_")) {
+          hour = fv(0).split("_")(1)
+        }
+        if (fv(0).startsWith("apptype_")) {
+          apptype = fv(0).split("_")(1)
+        }
+        if (fv(0).startsWith("abcode_")) {
+          abcode = fv(0).split("_")(1)
+        }
       }
 
       val v: Array[Any] = new Array[Any](features.length + 2)
@@ -134,7 +180,7 @@ object pred_01_xgb_ad_hdfsfile_20240813{
       for (i <- 0 until features.length) {
         v(i + 1) = map.getOrDefault(features(i), 0.0d)
       }
-      v(features.length + 1) = cid
+      v(features.length + 1) = (cid, hour, apptype, abcode).productIterator.mkString("_")
       Row(v: _*)
     })
   }
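
Note on the packed key column: every row now carries (cid, hour, apptype, abcode) joined with "_" in its last column, so any of the four that never appears in the input line stays at "-1" and downstream splits still yield four fields. A standalone illustration (KeySketch and the sample values are hypothetical):

object KeySketch {
  def main(args: Array[String]): Unit = {
    // Mirrors v(features.length + 1) above; "-1" marks a feature missing from the line.
    val key = ("1234", "-1", "3", "ab1").productIterator.mkString("_")
    println(key) // 1234_-1_3_ab1
  }
}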