瀏覽代碼

scala train

zhangbo 8 月之前
父節點
當前提交
321561a456

+ 351 - 0
recommend-model-produce/src/main/resources/20240703_ad_feature_name.txt

@@ -0,0 +1,351 @@
+cpa
+b2_3h_ctr
+b2_3h_ctcvr
+b2_3h_cvr
+b2_3h_conver
+b2_3h_ecpm
+b2_3h_click
+b2_3h_conver*log(view)
+b2_3h_conver*ctcvr
+b2_6h_ctr
+b2_6h_ctcvr
+b2_6h_cvr
+b2_6h_conver
+b2_6h_ecpm
+b2_6h_click
+b2_6h_conver*log(view)
+b2_6h_conver*ctcvr
+b2_12h_ctr
+b2_12h_ctcvr
+b2_12h_cvr
+b2_12h_conver
+b2_12h_ecpm
+b2_12h_click
+b2_12h_conver*log(view)
+b2_12h_conver*ctcvr
+b2_1d_ctr
+b2_1d_ctcvr
+b2_1d_cvr
+b2_1d_conver
+b2_1d_ecpm
+b2_1d_click
+b2_1d_conver*log(view)
+b2_1d_conver*ctcvr
+b2_3d_ctr
+b2_3d_ctcvr
+b2_3d_cvr
+b2_3d_conver
+b2_3d_ecpm
+b2_3d_click
+b2_3d_conver*log(view)
+b2_3d_conver*ctcvr
+b2_7d_ctr
+b2_7d_ctcvr
+b2_7d_cvr
+b2_7d_conver
+b2_7d_ecpm
+b2_7d_click
+b2_7d_conver*log(view)
+b2_7d_conver*ctcvr
+b3_3h_ctr
+b3_3h_ctcvr
+b3_3h_cvr
+b3_3h_conver
+b3_3h_ecpm
+b3_3h_click
+b3_3h_conver*log(view)
+b3_3h_conver*ctcvr
+b3_6h_ctr
+b3_6h_ctcvr
+b3_6h_cvr
+b3_6h_conver
+b3_6h_ecpm
+b3_6h_click
+b3_6h_conver*log(view)
+b3_6h_conver*ctcvr
+b3_12h_ctr
+b3_12h_ctcvr
+b3_12h_cvr
+b3_12h_conver
+b3_12h_ecpm
+b3_12h_click
+b3_12h_conver*log(view)
+b3_12h_conver*ctcvr
+b3_1d_ctr
+b3_1d_ctcvr
+b3_1d_cvr
+b3_1d_conver
+b3_1d_ecpm
+b3_1d_click
+b3_1d_conver*log(view)
+b3_1d_conver*ctcvr
+b3_3d_ctr
+b3_3d_ctcvr
+b3_3d_cvr
+b3_3d_conver
+b3_3d_ecpm
+b3_3d_click
+b3_3d_conver*log(view)
+b3_3d_conver*ctcvr
+b3_7d_ctr
+b3_7d_ctcvr
+b3_7d_cvr
+b3_7d_conver
+b3_7d_ecpm
+b3_7d_click
+b3_7d_conver*log(view)
+b3_7d_conver*ctcvr
+b4_3h_ctr
+b4_3h_ctcvr
+b4_3h_cvr
+b4_3h_conver
+b4_3h_ecpm
+b4_3h_click
+b4_3h_conver*log(view)
+b4_3h_conver*ctcvr
+b4_6h_ctr
+b4_6h_ctcvr
+b4_6h_cvr
+b4_6h_conver
+b4_6h_ecpm
+b4_6h_click
+b4_6h_conver*log(view)
+b4_6h_conver*ctcvr
+b4_12h_ctr
+b4_12h_ctcvr
+b4_12h_cvr
+b4_12h_conver
+b4_12h_ecpm
+b4_12h_click
+b4_12h_conver*log(view)
+b4_12h_conver*ctcvr
+b4_1d_ctr
+b4_1d_ctcvr
+b4_1d_cvr
+b4_1d_conver
+b4_1d_ecpm
+b4_1d_click
+b4_1d_conver*log(view)
+b4_1d_conver*ctcvr
+b4_3d_ctr
+b4_3d_ctcvr
+b4_3d_cvr
+b4_3d_conver
+b4_3d_ecpm
+b4_3d_click
+b4_3d_conver*log(view)
+b4_3d_conver*ctcvr
+b4_7d_ctr
+b4_7d_ctcvr
+b4_7d_cvr
+b4_7d_conver
+b4_7d_ecpm
+b4_7d_click
+b4_7d_conver*log(view)
+b4_7d_conver*ctcvr
+b5_3h_ctr
+b5_3h_ctcvr
+b5_3h_cvr
+b5_3h_conver
+b5_3h_ecpm
+b5_3h_click
+b5_3h_conver*log(view)
+b5_3h_conver*ctcvr
+b5_6h_ctr
+b5_6h_ctcvr
+b5_6h_cvr
+b5_6h_conver
+b5_6h_ecpm
+b5_6h_click
+b5_6h_conver*log(view)
+b5_6h_conver*ctcvr
+b5_12h_ctr
+b5_12h_ctcvr
+b5_12h_cvr
+b5_12h_conver
+b5_12h_ecpm
+b5_12h_click
+b5_12h_conver*log(view)
+b5_12h_conver*ctcvr
+b5_1d_ctr
+b5_1d_ctcvr
+b5_1d_cvr
+b5_1d_conver
+b5_1d_ecpm
+b5_1d_click
+b5_1d_conver*log(view)
+b5_1d_conver*ctcvr
+b5_3d_ctr
+b5_3d_ctcvr
+b5_3d_cvr
+b5_3d_conver
+b5_3d_ecpm
+b5_3d_click
+b5_3d_conver*log(view)
+b5_3d_conver*ctcvr
+b5_7d_ctr
+b5_7d_ctcvr
+b5_7d_cvr
+b5_7d_conver
+b5_7d_ecpm
+b5_7d_click
+b5_7d_conver*log(view)
+b5_7d_conver*ctcvr
+b8_3h_ctr
+b8_3h_ctcvr
+b8_3h_cvr
+b8_3h_conver
+b8_3h_ecpm
+b8_3h_click
+b8_3h_conver*log(view)
+b8_3h_conver*ctcvr
+b8_6h_ctr
+b8_6h_ctcvr
+b8_6h_cvr
+b8_6h_conver
+b8_6h_ecpm
+b8_6h_click
+b8_6h_conver*log(view)
+b8_6h_conver*ctcvr
+b8_12h_ctr
+b8_12h_ctcvr
+b8_12h_cvr
+b8_12h_conver
+b8_12h_ecpm
+b8_12h_click
+b8_12h_conver*log(view)
+b8_12h_conver*ctcvr
+b8_1d_ctr
+b8_1d_ctcvr
+b8_1d_cvr
+b8_1d_conver
+b8_1d_ecpm
+b8_1d_click
+b8_1d_conver*log(view)
+b8_1d_conver*ctcvr
+b8_3d_ctr
+b8_3d_ctcvr
+b8_3d_cvr
+b8_3d_conver
+b8_3d_ecpm
+b8_3d_click
+b8_3d_conver*log(view)
+b8_3d_conver*ctcvr
+b8_7d_ctr
+b8_7d_ctcvr
+b8_7d_cvr
+b8_7d_conver
+b8_7d_ecpm
+b8_7d_click
+b8_7d_conver*log(view)
+b8_7d_conver*ctcvr
+b6_7d_ctr
+b6_7d_ctcvr
+b6_7d_cvr
+b6_7d_conver
+b6_7d_ecpm
+b6_7d_click
+b6_7d_conver*log(view)
+b6_7d_conver*ctcvr
+b6_14d_ctr
+b6_14d_ctcvr
+b6_14d_cvr
+b6_14d_conver
+b6_14d_ecpm
+b6_14d_click
+b6_14d_conver*log(view)
+b6_14d_conver*ctcvr
+b7_7d_ctr
+b7_7d_ctcvr
+b7_7d_cvr
+b7_7d_conver
+b7_7d_ecpm
+b7_7d_click
+b7_7d_conver*log(view)
+b7_7d_conver*ctcvr
+b7_14d_ctr
+b7_14d_ctcvr
+b7_14d_cvr
+b7_14d_conver
+b7_14d_ecpm
+b7_14d_click
+b7_14d_conver*log(view)
+b7_14d_conver*ctcvr
+viewAll
+clickAll
+converAll
+incomeAll
+ctr_all
+ctcvr_all
+cvr_all
+ecpm_all
+timediff_view
+timediff_click
+timediff_conver
+actionstatic_view
+actionstatic_click
+actionstatic_conver
+actionstatic_income
+actionstatic_ctr
+actionstatic_ctcvr
+actionstatic_cvr
+e1_tags_3d_matchnum
+e1_tags_3d_maxscore
+e1_tags_3d_avgscore
+e1_tags_7d_matchnum
+e1_tags_7d_maxscore
+e1_tags_7d_avgscore
+e1_tags_14d_matchnum
+e1_tags_14d_maxscore
+e1_tags_14d_avgscore
+e2_tags_3d_matchnum
+e2_tags_3d_maxscore
+e2_tags_3d_avgscore
+e2_tags_7d_matchnum
+e2_tags_7d_maxscore
+e2_tags_7d_avgscore
+e2_tags_14d_matchnum
+e2_tags_14d_maxscore
+e2_tags_14d_avgscore
+d1_feature_3h_ctr
+d1_feature_3h_ctcvr
+d1_feature_3h_cvr
+d1_feature_3h_conver
+d1_feature_3h_ecpm
+d1_feature_6h_ctr
+d1_feature_6h_ctcvr
+d1_feature_6h_cvr
+d1_feature_6h_conver
+d1_feature_6h_ecpm
+d1_feature_12h_ctr
+d1_feature_12h_ctcvr
+d1_feature_12h_cvr
+d1_feature_12h_conver
+d1_feature_12h_ecpm
+d1_feature_1d_ctr
+d1_feature_1d_ctcvr
+d1_feature_1d_cvr
+d1_feature_1d_conver
+d1_feature_1d_ecpm
+d1_feature_3d_ctr
+d1_feature_3d_ctcvr
+d1_feature_3d_cvr
+d1_feature_3d_conver
+d1_feature_3d_ecpm
+d1_feature_7d_ctr
+d1_feature_7d_ctcvr
+d1_feature_7d_cvr
+d1_feature_7d_conver
+d1_feature_7d_ecpm
+vid_rank_ctr_1d
+vid_rank_ctr_3d
+vid_rank_ctr_7d
+vid_rank_ctr_14d
+vid_rank_ctcvr_1d
+vid_rank_ctcvr_3d
+vid_rank_ctcvr_7d
+vid_rank_ctcvr_14d
+vid_rank_ecpm_1d
+vid_rank_ecpm_3d
+vid_rank_ecpm_7d
+vid_rank_ecpm_14d

+ 38 - 9
recommend-model-produce/src/main/scala/com/tzld/piaoquan/recommend/model/train_01_xgb_ad_20240808.scala

@@ -3,12 +3,14 @@ package com.tzld.piaoquan.recommend.model
 import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
 import org.apache.commons.lang.math.NumberUtils
 import org.apache.commons.lang3.StringUtils
+import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
 import org.apache.spark.ml.feature.VectorAssembler
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.types.{DataTypes, StructField}
 import org.apache.spark.sql.{Dataset, Row, RowFactory, SparkSession}
 
 import java.util
+import scala.io.Source
 
 object train_01_xgb_ad_20240808{
   def main(args: Array[String]): Unit = {
@@ -17,10 +19,26 @@ object train_01_xgb_ad_20240808{
       .appName(this.getClass.getName)
       .getOrCreate()
     val sc = spark.sparkContext
-    val features = Array("cpa", "b2_12h_ctr", "b2_12h_ctcvr", "b2_12h_cvr", "b2_12h_conver", "b2_12h_click", "b2_12h_conver*log(view)", "b2_12h_conver*ctcvr", "b2_7d_ctr", "b2_7d_ctcvr", "b2_7d_cvr", "b2_7d_conver", "b2_7d_click", "b2_7d_conver*log(view)", "b2_7d_conver*ctcvr")
+//    val features = Array("cpa", "b2_12h_ctr", "b2_12h_ctcvr", "b2_12h_cvr", "b2_12h_conver", "b2_12h_click", "b2_12h_conver*log(view)", "b2_12h_conver*ctcvr", "b2_7d_ctr", "b2_7d_ctcvr", "b2_7d_cvr", "b2_7d_conver", "b2_7d_click", "b2_7d_conver*log(view)", "b2_7d_conver*ctcvr")
+
+    // Read the feature names (one per line) from the bundled resource; a missing resource yields no features.
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240703_ad_feature_name.txt")
+    val content =
+      if (resourceUrl != null) {
+        // Close the Source that was actually read; opening a second one just to close it leaks the first.
+        val source = Source.fromURL(resourceUrl)
+        try source.getLines().mkString("\n") finally source.close()
+      } else {
+        ""
+      }
+    println(content)
+    val features = content.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", "")).filter(r => r.nonEmpty)
+
 
     val trainData = createData(
-      sc.textFile("/dw/recommend/model/33_ad_train_data_v4/20240726/part-00099.gz"),
+      sc.textFile("/dw/recommend/model/33_ad_train_data_v4/20240724"),
       features
     )
     println("train data size:" + trainData.count())
@@ -32,26 +50,29 @@ object train_01_xgb_ad_20240808{
     val trainDataSet: Dataset[Row] = spark.createDataFrame(trainData, schema)
     val vectorAssembler = new VectorAssembler().setInputCols(features).setOutputCol("features")
     val xgbInput = vectorAssembler.transform(trainDataSet).select("features","label")
-    val xgbParam = Map("eta" -> 0.01f,
-      "max_depth" -> 5,
-      "objective" -> "binary:logistic",
-      "num_class" -> 3)
+//    val xgbParam = Map("eta" -> 0.01f,
+//      "max_depth" -> 5,
+//      "objective" -> "binary:logistic",
+//      "num_class" -> 3)
     val xgbClassifier = new XGBoostClassifier()
       .setEta(0.01f)
       .setMissing(0.0f)
       .setMaxDepth(5)
-      .setNumRound(100)
+      .setNumRound(1000)
+      .setSubsample(0.8)
+      .setColsampleBytree(0.8)
+      .setScalePosWeight(1)
       .setObjective("binary:logistic")
       .setEvalMetric("auc")
       .setFeaturesCol("features")
       .setLabelCol("label")
-      .setNthread(1)
+      .setNthread(8)
       .setNumWorkers(1)
     val model = xgbClassifier.fit(xgbInput)
 
 
     val testData = createData(
-      sc.textFile("/dw/recommend/model/33_ad_train_data_v4/20240726/part-00098.gz"),
+      sc.textFile("/dw/recommend/model/33_ad_train_data_v4/20240725"),
       features
     )
     val testDataSet = spark.createDataFrame(testData, schema)
@@ -63,6 +84,14 @@ object train_01_xgb_ad_20240808{
         (r.get(0), r.get(1), r.get(2), r.get(3), r.get(4)).productIterator.mkString("\t")
     })
     saveData.repartition(1).saveAsTextFile("/dw/recommend/model/checkpoint_xgbtest")
+
+
+    val evaluator = new BinaryClassificationEvaluator()
+      .setLabelCol("label")
+      .setRawPredictionCol("probability")
+      .setMetricName("areaUnderROC")
+    val auc = evaluator.evaluate(predictions.select("label", "probability"))
+    println("zhangbo:auc:" + auc)
   }