丁云鹏 8 months ago
parent
commit
63d8ca9fc5

+ 175 - 0
recommend-model-produce/src/main/java/com/tzld/piaoquan/recommend/model/produce/service/XGBoostService.java

@@ -0,0 +1,175 @@
+package com.tzld.piaoquan.recommend.model.produce.service;
+
+import com.tzld.piaoquan.recommend.model.produce.util.CompressUtil;
+import lombok.extern.slf4j.Slf4j;
+import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel;
+import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier;
+import org.apache.commons.lang.math.NumberUtils;
+import org.apache.commons.lang3.RandomUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator;
+import org.apache.spark.ml.feature.VectorAssembler;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Trains and evaluates an XGBoost binary classifier on Spark.
+ *
+ * <p>{@link #train(String[])} fits a model on one sample file, writes it to local disk,
+ * compresses the model directory and uploads the archive through {@link OSSService}.
+ * {@link #predict(String[])} downloads that archive, loads the model, scores a second
+ * sample file and logs the area under the ROC curve.
+ *
+ * <p>Each entry point obtains a local {@link SparkSession} via {@code getOrCreate()} and
+ * closes it before returning, so a session that was already active in this JVM is
+ * stopped as well.
+ *
+ * @author dyp
+ */
+@Slf4j
+public class XGBoostService {
+
+    /** Sample file the model is trained on. */
+    private static final String TRAIN_DATA_PATH =
+            "/dw/recommend/model/33_ad_train_data_v4/20240726/part-00099.gz";
+
+    /** Sample file the downloaded model is evaluated on. */
+    private static final String TEST_DATA_PATH =
+            "/dw/recommend/model/33_ad_train_data_v4/20240726/part-00098.gz";
+
+    /** OSS bucket used by both {@code train} (upload) and {@code predict} (download). */
+    private static final String BUCKET_NAME = "art-test-video";
+
+    /** OSS object key of the model archive; must be identical for upload and download. */
+    private static final String MODEL_OBJECT_NAME = "test/model.tar.gz";
+
+    /** Feature names picked from each sample line, in the order they are assembled. */
+    private static final String[] FEATURES = {"cpa",
+            "b2_12h_ctr",
+            "b2_12h_ctcvr",
+            "b2_12h_cvr",
+            "b2_12h_conver",
+            "b2_12h_click",
+            "b2_12h_conver*log(view)",
+            "b2_12h_conver*ctcvr",
+            "b2_7d_ctr",
+            "b2_7d_ctcvr",
+            "b2_7d_cvr",
+            "b2_7d_conver",
+            "b2_7d_click",
+            "b2_7d_conver*log(view)",
+            "b2_7d_conver*ctcvr"
+    };
+
+    /**
+     * Trains the classifier on the training sample file, saves the model under
+     * {@code /root/recommend-model/modeltrain}, compresses that directory and uploads
+     * the archive to OSS.
+     *
+     * <p>Any failure is logged and not rethrown.
+     *
+     * @param args command-line arguments; currently not read
+     */
+    public void train(String[] args) {
+        // try-with-resources stops the Spark session even when training or upload fails.
+        try (SparkSession spark = localSession()) {
+            Dataset<Row> assembledData = dataset(spark, TRAIN_DATA_PATH);
+            log.info("训练样本 show");
+            assembledData.show();
+
+            // Configure the XGBoost classifier.
+            XGBoostClassifier xgbClassifier = new XGBoostClassifier()
+                    .setEta(0.01f)
+                    .setSubsample(0.8)
+                    .setColsampleBytree(0.8)
+                    .setScalePosWeight(1)
+                    .setSeed(2024)
+                    .setMissing(0.0f)
+                    .setFeaturesCol("features")
+                    .setLabelCol("label")
+                    .setMaxDepth(5)
+                    .setObjective("binary:logistic")
+                    .setNthread(1)
+                    .setNumRound(100)
+                    .setNumWorkers(1);
+
+            // Train the model.
+            XGBoostClassificationModel model = xgbClassifier.fit(assembledData);
+
+            // Save the model locally, compress it and upload the archive.
+            String path = "/root/recommend-model/modeltrain";
+            model.write().overwrite().save("file://" + path);
+            String outputPath = "/root/recommend-model/model.tar.gz";
+            CompressUtil.compressDirectoryToGzip(path, outputPath);
+            OSSService ossService = new OSSService();
+            ossService.upload(BUCKET_NAME, outputPath, MODEL_OBJECT_NAME);
+
+        } catch (Throwable e) {
+            log.error("", e);
+        }
+    }
+
+    /**
+     * Downloads the model archive from OSS, unpacks it under
+     * {@code /root/recommend-model/modelpredict}, scores the test sample file with it,
+     * prints up to 500 predictions and logs the AUC.
+     *
+     * <p>Any failure is logged and not rethrown.
+     *
+     * @param args command-line arguments; currently not read
+     */
+    public void predict(String[] args) {
+        // try-with-resources stops the Spark session even when download or scoring fails.
+        try (SparkSession spark = localSession()) {
+            Dataset<Row> assembledData = dataset(spark, TEST_DATA_PATH);
+            log.info("测试样本 show");
+            assembledData.show();
+
+            // Download the model archive and unpack it.
+            OSSService ossService = new OSSService();
+            String destPath = "/root/recommend-model/model2.tar.gz";
+            ossService.download(BUCKET_NAME, destPath, MODEL_OBJECT_NAME);
+            String destDir = "/root/recommend-model/modelpredict";
+            CompressUtil.decompressGzFile(destPath, destDir);
+
+            // Load the model and show its predictions.
+            XGBoostClassificationModel model2 = XGBoostClassificationModel.load("file://" + destDir);
+            Dataset<Row> predictions = model2.transform(assembledData);
+            predictions.select("label", "prediction", "rawPrediction", "probability", "features").show(500);
+
+            // Compute the AUC.
+            Dataset<Row> selected = predictions.select("label", "rawPrediction");
+            BinaryClassificationEvaluator evaluator = new BinaryClassificationEvaluator()
+                    .setLabelCol("label")
+                    .setRawPredictionCol("rawPrediction")
+                    .setMetricName("areaUnderROC");
+            double auc = evaluator.evaluate(selected);
+            log.info("AUC: {}", auc);
+
+        } catch (Throwable e) {
+            log.error("", e);
+        }
+    }
+
+    /** Returns the local-mode Spark session, creating it if none is active. */
+    private static SparkSession localSession() {
+        return SparkSession.builder()
+                .appName("XGBoostTrain")
+                .master("local")
+                .getOrCreate();
+    }
+
+    /**
+     * Reads a sample file and returns it as a dataset with an integer {@code label}
+     * column, one double column per entry of {@link #FEATURES}, and a {@code features}
+     * vector column assembled from them.
+     *
+     * <p>Lines that contain no tab-separated token are dropped.
+     *
+     * @param spark session used to read the file and build the dataset
+     * @param path  location of the sample file
+     * @return the assembled dataset
+     */
+    private static Dataset<Row> dataset(SparkSession spark, String path) {
+        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
+        JavaRDD<String> rdd = jsc.textFile(path);
+
+        // toRow yields null for lines without any token; drop those before building the frame.
+        JavaRDD<Row> rowRDD = rdd.map(XGBoostService::toRow).filter(row -> row != null);
+
+        log.info("rowRDD count {}", rowRDD.count());
+        // Convert the JavaRDD<Row> into a Dataset<Row>.
+        List<StructField> fields = new ArrayList<>();
+        fields.add(DataTypes.createStructField("label", DataTypes.IntegerType, true));
+        for (String f : FEATURES) {
+            fields.add(DataTypes.createStructField(f, DataTypes.DoubleType, true));
+        }
+        StructType schema = DataTypes.createStructType(fields);
+        Dataset<Row> dataset = spark.createDataFrame(rowRDD, schema);
+
+        VectorAssembler assembler = new VectorAssembler()
+                .setInputCols(FEATURES)
+                .setOutputCol("features");
+
+        Dataset<Row> assembledData = assembler.transform(dataset);
+        assembledData.show();
+        return assembledData;
+    }
+
+    /**
+     * Parses one sample line of the form {@code label<TAB>name:value<TAB>name:value...}.
+     *
+     * <p>The label falls back to {@code 0} and a feature value to {@code 0.0} when they
+     * are not parseable numbers; features missing from the line are also {@code 0.0}.
+     * Tokens that do not split into a name and a value are ignored.
+     *
+     * @param s raw line
+     * @return a row holding the label followed by the {@link #FEATURES} values, or
+     *         {@code null} when the line contains no token at all
+     */
+    private static Row toRow(String s) {
+        String[] line = StringUtils.split(s, '\t');
+        if (line == null || line.length == 0) {
+            // Empty or tab-only line: there is no label to read.
+            return null;
+        }
+        int label = NumberUtils.toInt(line[0]);
+
+        // Collect every name:value pair present on the line.
+        Map<String, Double> map = new HashMap<>();
+        for (int i = 1; i < line.length; i++) {
+            String[] fv = StringUtils.split(line[i], ':');
+            if (fv == null || fv.length < 2) {
+                // Token without both a name and a value; nothing usable to store.
+                continue;
+            }
+            map.put(fv[0], NumberUtils.toDouble(fv[1], 0.0));
+        }
+
+        Object[] v = new Object[FEATURES.length + 1];
+        v[0] = label;
+        for (int i = 0; i < FEATURES.length; i++) {
+            v[i + 1] = map.getOrDefault(FEATURES[i], 0.0d);
+        }
+
+        return RowFactory.create(v);
+    }
+}

+ 3 - 128
recommend-model-produce/src/main/java/com/tzld/piaoquan/recommend/model/produce/xgboost/XGBoostPredict.java

@@ -1,28 +1,7 @@
 package com.tzld.piaoquan.recommend.model.produce.xgboost;
 
-import com.tzld.piaoquan.recommend.model.produce.service.OSSService;
-import com.tzld.piaoquan.recommend.model.produce.util.CompressUtil;
+import com.tzld.piaoquan.recommend.model.produce.service.XGBoostService;
 import lombok.extern.slf4j.Slf4j;
-import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel;
-import org.apache.commons.lang.math.NumberUtils;
-import org.apache.commons.lang3.RandomUtils;
-import org.apache.commons.lang3.StringUtils;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator;
-import org.apache.spark.ml.feature.VectorAssembler;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
 
 /**
  * @author dyp
@@ -31,111 +10,7 @@ import java.util.Map;
 public class XGBoostPredict {
 
     public static void main(String[] args) {
-        try {
-
-            String[] features = {"cpa",
-                    "b2_1h_ctr",
-                    "b2_1h_ctcvr",
-                    "b2_1h_cvr",
-                    "b2_1h_conver",
-                    "b2_1h_click",
-                    "b2_1h_conver*log(view)",
-                    "b2_1h_conver*ctcvr",
-                    "b2_2h_ctr",
-                    "b2_2h_ctcvr",
-                    "b2_2h_cvr",
-                    "b2_2h_conver",
-                    "b2_2h_click",
-                    "b2_2h_conver*log(view)",
-                    "b2_2h_conver*ctcvr",
-                    "b2_3h_ctr",
-                    "b2_3h_ctcvr",
-                    "b2_3h_cvr",
-                    "b2_3h_conver",
-                    "b2_3h_click",
-                    "b2_3h_conver*log(view)",
-                    "b2_3h_conver*ctcvr",
-                    "b2_6h_ctr",
-                    "b2_6h_ctcvr"
-            };
-
-
-            SparkSession spark = SparkSession.builder()
-                    .appName("XGBoostTrain")
-                    .master("local")
-                    .getOrCreate();
-
-            JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
-            String file = "/dw/recommend/model/33_ad_train_data_v4/20240726/part-00098.gz";
-            JavaRDD<String> rdd = jsc.textFile(file);
-
-            JavaRDD<Row> rowRDD = rdd.map(s -> {
-                String[] line = StringUtils.split(s, '\t');
-                int label = NumberUtils.toInt(line[0]);
-                // 选特征
-                Map<String, Double> map = new HashMap<>();
-                for (int i = 1; i < line.length; i++) {
-                    String[] fv = StringUtils.split(line[i], ':');
-                    map.put(fv[0], NumberUtils.toDouble(fv[1], 0.0));
-                }
-
-                Object[] v = new Object[features.length + 1];
-                v[0] = label;
-                v[0] = RandomUtils.nextInt(0, 2);
-                for (int i = 0; i < features.length; i++) {
-                    v[i + 1] = map.getOrDefault(features[i], 0.0d);
-                }
-
-                return RowFactory.create(v);
-            });
-
-            log.info("rowRDD count {}", rowRDD.count());
-            // 将 JavaRDD<Row> 转换为 Dataset<Row>
-            List<StructField> fields = new ArrayList<>();
-            fields.add(DataTypes.createStructField("label", DataTypes.IntegerType, true));
-            for (String f : features) {
-                fields.add(DataTypes.createStructField(f, DataTypes.DoubleType, true));
-            }
-            StructType schema = DataTypes.createStructType(fields);
-            Dataset<Row> dataset = spark.createDataFrame(rowRDD, schema);
-
-            VectorAssembler assembler = new VectorAssembler()
-                    .setInputCols(features)
-                    .setOutputCol("features");
-
-            Dataset<Row> assembledData = assembler.transform(dataset);
-            assembledData.show();
-
-            // 保存模型
-            String bucketName = "art-test-video";
-            String objectName = "test/model.tar.gz";
-            OSSService ossService = new OSSService();
-
-            String destPath = "/root/recommend-model/model2.tar.gz";
-            ossService.download(bucketName, destPath, objectName);
-            String destDir = "/root/recommend-model/modelpredict";
-            CompressUtil.decompressGzFile(destPath, destDir);
-
-            // 显示预测结果
-
-            XGBoostClassificationModel model2 = XGBoostClassificationModel.load("file://" + destDir);
-            Dataset<Row> predictions = model2.transform(assembledData);
-            predictions.select("label", "prediction", "rawPrediction", "probability", "features").show(500);
-
-            // 计算AUC
-            Dataset<Row> selected = predictions.select("label", "rawPrediction");
-            BinaryClassificationEvaluator evaluator = new BinaryClassificationEvaluator()
-                    .setLabelCol("label")
-                    .setRawPredictionCol("rawPrediction")
-                    .setMetricName("areaUnderROC");
-            double auc = evaluator.evaluate(selected);
-
-            log.info("AUC: {}", auc);
-
-            spark.close();
-
-        } catch (Throwable e) {
-            log.error("", e);
-        }
+        // Prediction is implemented in XGBoostService; this class is only the launcher.
+        new XGBoostService().predict(args);
     }
 }

+ 3 - 132
recommend-model-produce/src/main/java/com/tzld/piaoquan/recommend/model/produce/xgboost/XGBoostTrain.java

@@ -1,28 +1,7 @@
 package com.tzld.piaoquan.recommend.model.produce.xgboost;
 
-import com.tzld.piaoquan.recommend.model.produce.service.OSSService;
-import com.tzld.piaoquan.recommend.model.produce.util.CompressUtil;
+import com.tzld.piaoquan.recommend.model.produce.service.XGBoostService;
 import lombok.extern.slf4j.Slf4j;
-import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel;
-import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier;
-import org.apache.commons.lang.math.NumberUtils;
-import org.apache.commons.lang3.RandomUtils;
-import org.apache.commons.lang3.StringUtils;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.ml.feature.VectorAssembler;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
-import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
 
 /**
  * @author dyp
@@ -31,115 +10,7 @@ import java.util.Map;
 public class XGBoostTrain {
 
     public static void main(String[] args) {
-        try {
-
-            String[] features = {"cpa",
-                    "b2_1h_ctr",
-                    "b2_1h_ctcvr",
-                    "b2_1h_cvr",
-                    "b2_1h_conver",
-                    "b2_1h_click",
-                    "b2_1h_conver*log(view)",
-                    "b2_1h_conver*ctcvr",
-                    "b2_2h_ctr",
-                    "b2_2h_ctcvr",
-                    "b2_2h_cvr",
-                    "b2_2h_conver",
-                    "b2_2h_click",
-                    "b2_2h_conver*log(view)",
-                    "b2_2h_conver*ctcvr",
-                    "b2_3h_ctr",
-                    "b2_3h_ctcvr",
-                    "b2_3h_cvr",
-                    "b2_3h_conver",
-                    "b2_3h_click",
-                    "b2_3h_conver*log(view)",
-                    "b2_3h_conver*ctcvr",
-                    "b2_6h_ctr",
-                    "b2_6h_ctcvr"
-            };
-
-
-            SparkSession spark = SparkSession.builder()
-                    .appName("XGBoostTrain")
-                    .master("local")
-                    .getOrCreate();
-
-            JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
-            String file = "/dw/recommend/model/33_ad_train_data_v4/20240726/part-00099.gz";
-            JavaRDD<String> rdd = jsc.textFile(file);
-
-            JavaRDD<Row> rowRDD = rdd.map(s -> {
-                String[] line = StringUtils.split(s, '\t');
-                int label = NumberUtils.toInt(line[0]);
-                // 选特征
-                Map<String, Double> map = new HashMap<>();
-                for (int i = 1; i < line.length; i++) {
-                    String[] fv = StringUtils.split(line[i], ':');
-                    map.put(fv[0], NumberUtils.toDouble(fv[1], 0.0));
-                }
-
-                Object[] v = new Object[features.length + 1];
-                v[0] = label;
-                v[0] = RandomUtils.nextInt(0, 2);
-                for (int i = 0; i < features.length; i++) {
-                    v[i + 1] = map.getOrDefault(features[i], 0.0d);
-                }
-
-                return RowFactory.create(v);
-            });
-
-            log.info("rowRDD count {}", rowRDD.count());
-            // 将 JavaRDD<Row> 转换为 Dataset<Row>
-            List<StructField> fields = new ArrayList<>();
-            fields.add(DataTypes.createStructField("label", DataTypes.IntegerType, true));
-            for (String f : features) {
-                fields.add(DataTypes.createStructField(f, DataTypes.DoubleType, true));
-            }
-            StructType schema = DataTypes.createStructType(fields);
-            Dataset<Row> dataset = spark.createDataFrame(rowRDD, schema);
-
-            VectorAssembler assembler = new VectorAssembler()
-                    .setInputCols(features)
-                    .setOutputCol("features");
-
-            Dataset<Row> assembledData = assembler.transform(dataset);
-            assembledData.show();
-
-            // 创建 XGBoostClassifier 对象
-            XGBoostClassifier xgbClassifier = new XGBoostClassifier()
-                    .setEta(0.01f)
-                    .setSubsample(0.8)
-                    .setColsampleBytree(0.8)
-                    .setScalePosWeight(1)
-                    .setSeed(2024)
-                    .setMissing(0.0f)
-                    .setFeaturesCol("features")
-                    .setLabelCol("label")
-                    .setMaxDepth(5)
-                    .setObjective("binary:logistic")
-                    .setNthread(1)
-                    .setNumRound(100)
-                    .setNumWorkers(1);
-
-
-            // 训练模型
-            XGBoostClassificationModel model = xgbClassifier.fit(assembledData);
-
-            // 保存模型
-            String path = "/root/recommend-model/modeltrain";
-            model.write().overwrite().save("file://" + path);
-            String outputPath = "/root/recommend-model/model.tar.gz";
-            CompressUtil.compressDirectoryToGzip(path, outputPath);
-            String bucketName = "art-test-video";
-            String ossPath = "test/model.tar.gz";
-            OSSService ossService = new OSSService();
-            ossService.upload(bucketName, outputPath, ossPath);
-
-            spark.close();
-
-        } catch (Throwable e) {
-            log.error("", e);
-        }
+        // Training is implemented in XGBoostService; this class is only the launcher.
+        new XGBoostService().train(args);
     }
 }

+ 0 - 3
recommend-model-produce/src/main/java/com/tzld/piaoquan/recommend/model/produce/xgboost/XGBoostTrainLocalTest.java

@@ -1,12 +1,9 @@
 package com.tzld.piaoquan.recommend.model.produce.xgboost;
 
-import com.tzld.piaoquan.recommend.model.produce.service.OSSService;
-import com.tzld.piaoquan.recommend.model.produce.util.CompressUtil;
 import lombok.extern.slf4j.Slf4j;
 import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel;
 import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier;
 import org.apache.commons.lang.math.NumberUtils;
-import org.apache.commons.lang3.RandomUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;