Browse Source

scala train

zhangbo 8 months ago
parent
commit
42ca7b7f44

+ 13 - 4
recommend-model-produce/src/main/scala/com/tzld/piaoquan/recommend/model/train_01_xgb_ad_20240808.scala

@@ -61,11 +61,15 @@ object train_01_xgb_ad_20240808{
     )
     )
     println("zhangbo:train data size:" + trainData.count())
     println("zhangbo:train data size:" + trainData.count())
 
 
-    val fields = Array(
+    var fields = Array(
       DataTypes.createStructField("label", DataTypes.IntegerType, true)
       DataTypes.createStructField("label", DataTypes.IntegerType, true)
 //      DataTypes.createStructField("logKey", DataTypes.IntegerType, true)
 //      DataTypes.createStructField("logKey", DataTypes.IntegerType, true)
 
 
     ) ++ features.map(f => DataTypes.createStructField(f, DataTypes.DoubleType, true))
     ) ++ features.map(f => DataTypes.createStructField(f, DataTypes.DoubleType, true))
+
+    fields = fields ++ Array(
+      DataTypes.createStructField("logKey", DataTypes.StringType, true)
+    )
     val schema = DataTypes.createStructType(fields)
     val schema = DataTypes.createStructType(fields)
     val trainDataSet: Dataset[Row] = spark.createDataFrame(trainData, schema)
     val trainDataSet: Dataset[Row] = spark.createDataFrame(trainData, schema)
     val vectorAssembler = new VectorAssembler().setInputCols(features).setOutputCol("features")
     val vectorAssembler = new VectorAssembler().setInputCols(features).setOutputCol("features")
@@ -100,9 +104,9 @@ object train_01_xgb_ad_20240808{
     val testDataSetTrans = vectorAssembler.transform(testDataSet).select("features","label")
     val testDataSetTrans = vectorAssembler.transform(testDataSet).select("features","label")
     val predictions = model.transform(testDataSetTrans)
     val predictions = model.transform(testDataSetTrans)
 
 
-    val saveData = predictions.select("label", "rawPrediction", "probability").rdd
+    val saveData = predictions.select("label", "rawPrediction", "probability", "logKey").rdd
       .map(r =>{
       .map(r =>{
-        (r.get(0), r.get(1), r.get(2)).productIterator.mkString("\t")
+        (r.get(0), r.get(1), r.get(2), r.get(3)).productIterator.mkString("\t")
     })
     })
     val hdfsPath = savePath
     val hdfsPath = savePath
     if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
     if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
@@ -148,16 +152,21 @@ object train_01_xgb_ad_20240808{
 val line: Array[String] = StringUtils.split(r, '\t')
 val line: Array[String] = StringUtils.split(r, '\t')
       val label: Int = NumberUtils.toInt(line(0))
       val label: Int = NumberUtils.toInt(line(0))
       val map: util.Map[String, Double] = new util.HashMap[String, Double]
       val map: util.Map[String, Double] = new util.HashMap[String, Double]
+      var cid = "-1"
       for (i <- 1 until line.length) {
       for (i <- 1 until line.length) {
         val fv: Array[String] = StringUtils.split(line(i), ':')
         val fv: Array[String] = StringUtils.split(line(i), ':')
         map.put(fv(0), NumberUtils.toDouble(fv(1), 0.0))
         map.put(fv(0), NumberUtils.toDouble(fv(1), 0.0))
+        if(fv(0).startsWith("cid_")){
+          cid = fv(0).split("_")(1)
+        }
       }
       }
 
 
-      val v: Array[Any] = new Array[Any](features.length + 1)
+      val v: Array[Any] = new Array[Any](features.length + 2)
       v(0) = label
       v(0) = label
       for (i <- 0 until features.length) {
       for (i <- 0 until features.length) {
         v(i + 1) = map.getOrDefault(features(i), 0.0d)
         v(i + 1) = map.getOrDefault(features(i), 0.0d)
       }
       }
+      v(features.length + 1) = cid
       Row(v: _*)
       Row(v: _*)
     })
     })
   }
   }