丁云鹏 8 月之前
父節點
當前提交
e515ad6c3d

+ 0 - 5
recommend-model-produce/pom.xml

@@ -167,11 +167,6 @@
             <artifactId>guava</artifactId>
             <version>14.0.1</version>
         </dependency>
-<!--        <dependency>-->
-<!--            <groupId>io.netty</groupId>-->
-<!--            <artifactId>netty-all</artifactId>-->
-<!--            <version>4.1.17.Final</version>-->
-<!--        </dependency>-->
         <dependency>
             <groupId>org.scala-lang</groupId>
             <artifactId>scala-library</artifactId>

+ 44 - 6
recommend-model-produce/src/main/java/com/tzld/piaoquan/recommend/model/produce/service/OSSService.java

@@ -2,8 +2,9 @@ package com.tzld.piaoquan.recommend.model.produce.service;
 
 import com.aliyun.oss.OSS;
 import com.aliyun.oss.OSSClientBuilder;
+import com.aliyun.oss.model.GetObjectRequest;
 import com.aliyun.oss.model.PutObjectRequest;
-import com.aliyun.oss.model.PutObjectResult;
+import com.tzld.piaoquan.recommend.model.produce.util.CompressUtil;
 import lombok.extern.slf4j.Slf4j;
 
 import java.io.File;
@@ -16,18 +17,55 @@ import java.io.Serializable;
 public class OSSService implements Serializable {
     private String accessId = "LTAI5tHMkNaRhpiDB1yWMZPn";
     private String accessKey = "XLi5YUJusVwbbQOaGeGsaRJ1Qyzbui";
-    private String endpoint = "https://oss-cn-hangzhou-internal.aliyuncs.com";
+    // Before this change the "-internal" variant of this endpoint (https://oss-cn-hangzhou-internal.aliyuncs.com) was used.
+    private String endpoint = "https://oss-cn-hangzhou.aliyuncs.com"; // NOTE(review): accessId/accessKey above are hard-coded in source - consider moving them to configuration.
 
-    public void upload(String bucketName, String srcPath, String orcPath) {
+    /**
+     * Uploads a local file to OSS under the given object key.
+     *
+     * <p>Failures are logged and not rethrown, so the caller gets no signal when the upload fails.
+     *
+     * @param bucketName OSS bucket that receives the object
+     * @param localFile  path of the local file to upload
+     * @param objectName key the object is stored under inside the bucket
+     */
+    public void upload(String bucketName, String localFile, String objectName) {
         OSS ossClient = new OSSClientBuilder().build(endpoint, accessId, accessKey);
         try {
-            PutObjectRequest request = new PutObjectRequest(bucketName, orcPath, new File(srcPath));
-            PutObjectResult result = ossClient.putObject(request);
+            PutObjectRequest request = new PutObjectRequest(bucketName, objectName, new File(localFile));
+            ossClient.putObject(request);
         } catch (Exception e) {
-            log.error("upload error bucketName {}, srcPath {}, orcPath {}", bucketName, srcPath, orcPath, e);
-        }
-        if (ossClient != null) {
+            log.error("upload error bucketName {}, localFile {}, objectName {}", bucketName, localFile, objectName, e);
+        } finally {
+            // Release the client even when something that is not an Exception (an Error) escapes the catch above.
             ossClient.shutdown();
         }
     }
+
+    public void download(String bucketName, String localFile, String objectName) {
+        OSS ossClient = new OSSClientBuilder().build(endpoint, accessId, accessKey);
+        try {
+            GetObjectRequest request = new GetObjectRequest(bucketName, objectName);
+            ossClient.getObject(request, new File(localFile));
+            System.out.println("");
+        } catch (Exception e) {
+            log.error("download error bucketName {}, localFile {}, objectName {}", bucketName, localFile, objectName, e);
+        }
+        if (ossClient != null) {
+            ossClient.shutdown();
+        }
+    }
+
+    /**
+     * Manual smoke test: downloads the model archive from OSS to a developer machine
+     * and unpacks it next to the archive.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        final String bucket = "art-test-video";
+        final String remoteObject = "test/model.tar.gz";
+        final String localArchive = "/Users/dingyunpeng/Desktop/model2.tar.gz";
+        final String extractDir = "/Users/dingyunpeng/Desktop/model2";
+
+        // The opposite direction (CompressUtil.compressDirectoryToGzip followed by upload)
+        // is not exercised here.
+        OSSService service = new OSSService();
+        service.download(bucket, localArchive, remoteObject);
+        CompressUtil.decompressGzFile(localArchive, extractDir);
+    }
 }

+ 87 - 0
recommend-model-produce/src/main/java/com/tzld/piaoquan/recommend/model/produce/util/CompressUtil.java

@@ -0,0 +1,87 @@
+package com.tzld.piaoquan.recommend.model.produce.util;
+
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
+
+import java.io.*;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.util.stream.Stream;
+
+/**
+ * Helpers for packing a directory into a gzip-compressed tar archive and unpacking it again.
+ *
+ * @author dyp
+ */
+public class CompressUtil {
+    /**
+     * Packs every regular file under {@code sourceDirPath} into a {@code .tar.gz} archive.
+     *
+     * <p>Entry names are the file paths relative to {@code sourceDirPath}; directories
+     * themselves are not written as entries.
+     *
+     * @param sourceDirPath  directory whose files are archived
+     * @param outputFilePath path of the archive file to create
+     * @throws UncheckedIOException if the directory cannot be read or the archive cannot be
+     *                              written; a half-written archive is never reported as success
+     */
+    public static void compressDirectoryToGzip(String sourceDirPath, String outputFilePath) {
+        Path sourceDir = Paths.get(sourceDirPath);
+        // Files.walk keeps directory handles open, so it is closed together with the output streams.
+        try (OutputStream out = new FileOutputStream(outputFilePath);
+             GzipCompressorOutputStream gzipOut = new GzipCompressorOutputStream(out);
+             TarArchiveOutputStream taos = new TarArchiveOutputStream(gzipOut);
+             Stream<Path> paths = Files.walk(sourceDir)) {
+
+            // Use the GNU long-name mode for entry names.
+            taos.setLongFileMode(TarArchiveOutputStream.LONGFILE_GNU);
+
+            Stream<Path> regularFiles = paths.filter(Files::isRegularFile);
+            for (Path file : (Iterable<Path>) regularFiles::iterator) {
+                // Relativize instead of cutting by string length, so a trailing separator
+                // on sourceDirPath cannot eat the first character of the entry name.
+                String entryName = sourceDir.relativize(file).toString().replace(File.separatorChar, '/');
+                TarArchiveEntry entry = new TarArchiveEntry(file.toFile(), entryName);
+                taos.putArchiveEntry(entry);
+                Files.copy(file, taos);
+                taos.closeArchiveEntry();
+            }
+        } catch (IOException e) {
+            throw new UncheckedIOException(
+                    "Failed to compress " + sourceDirPath + " to " + outputFilePath, e);
+        }
+    }
+
+    /**
+     * Unpacks a {@code .tar.gz} archive into {@code destDirPath}, creating directories as needed
+     * and overwriting files that already exist.
+     *
+     * @param gzipFilePath path of the archive to read
+     * @param destDirPath  directory the entries are extracted into
+     * @throws UncheckedIOException if the archive cannot be read, a file cannot be written, or an
+     *                              entry name would resolve outside {@code destDirPath}
+     */
+    public static void decompressGzFile(String gzipFilePath, String destDirPath) {
+        Path destDir = Paths.get(destDirPath).toAbsolutePath().normalize();
+        try (InputStream fileIn = new FileInputStream(gzipFilePath);
+             GzipCompressorInputStream gzIn = new GzipCompressorInputStream(fileIn);
+             TarArchiveInputStream tais = new TarArchiveInputStream(gzIn)) {
+
+            Files.createDirectories(destDir);
+            TarArchiveEntry entry;
+            while ((entry = tais.getNextTarEntry()) != null) {
+                // Reject names such as "../../x" that would write outside the destination.
+                Path target = destDir.resolve(entry.getName()).normalize();
+                if (!target.startsWith(destDir)) {
+                    throw new IOException("Tar entry escapes destination directory: " + entry.getName());
+                }
+                if (entry.isDirectory()) {
+                    Files.createDirectories(target);
+                } else {
+                    Path parent = target.getParent();
+                    if (parent != null) {
+                        Files.createDirectories(parent);
+                    }
+                    // The tar stream reports end-of-stream at the end of the current entry,
+                    // so this copies exactly one file.
+                    Files.copy(tais, target, StandardCopyOption.REPLACE_EXISTING);
+                }
+            }
+        } catch (IOException e) {
+            throw new UncheckedIOException(
+                    "Failed to decompress " + gzipFilePath + " into " + destDirPath, e);
+        }
+    }
+}

+ 127 - 0
recommend-model-produce/src/main/java/com/tzld/piaoquan/recommend/model/produce/xgboost/XGBoostPredict.java

@@ -0,0 +1,127 @@
+package com.tzld.piaoquan.recommend.model.produce.xgboost;
+
+import com.tzld.piaoquan.recommend.model.produce.service.OSSService;
+import com.tzld.piaoquan.recommend.model.produce.util.CompressUtil;
+import lombok.extern.slf4j.Slf4j;
+import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel;
+import org.apache.commons.lang.math.NumberUtils;
+import org.apache.commons.lang3.RandomUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.VectorAssembler;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * @author dyp
+ */
+@Slf4j
+public class XGBoostPredict {
+
+    public static void main(String[] args) {
+        try {
+
+            String[] features = {"cpa",
+                    "b2_12h_ctr",
+                    "b2_12h_ctcvr",
+                    "b2_12h_cvr",
+                    "b2_12h_conver",
+                    "b2_12h_click",
+                    "b2_12h_conver*log(view)",
+                    "b2_12h_conver*ctcvr",
+                    "b2_7d_ctr",
+                    "b2_7d_ctcvr",
+                    "b2_7d_cvr",
+                    "b2_7d_conver",
+                    "b2_7d_click",
+                    "b2_7d_conver*log(view)",
+                    "b2_7d_conver*ctcvr"
+            };
+
+
+            SparkSession spark = SparkSession.builder()
+                    .appName("XGBoostTrain")
+                    .master("local")
+                    .getOrCreate();
+
+            JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
+            String file = "/dw/recommend/model/33_ad_train_data_v4/20240726/part-00098.gz";
+            JavaRDD<String> rdd = jsc.textFile(file);
+
+            JavaRDD<Row> rowRDD = rdd.map(s -> {
+                String[] line = StringUtils.split(s, '\t');
+                int label = NumberUtils.toInt(line[0]);
+                // 选特征
+                Map<String, Double> map = new HashMap<>();
+                for (int i = 1; i < line.length; i++) {
+                    String[] fv = StringUtils.split(line[i], ':');
+                    map.put(fv[0], NumberUtils.toDouble(fv[1], 0.0));
+                }
+
+                Object[] v = new Object[features.length + 1];
+                v[0] = label;
+                v[0] = RandomUtils.nextInt(0, 2);
+                for (int i = 0; i < features.length; i++) {
+                    v[i + 1] = map.getOrDefault(features[i], 0.0d);
+                }
+
+                return RowFactory.create(v);
+            });
+
+            log.info("rowRDD count {}", rowRDD.count());
+            // 将 JavaRDD<Row> 转换为 Dataset<Row>
+            List<StructField> fields = new ArrayList<>();
+            fields.add(DataTypes.createStructField("label", DataTypes.IntegerType, true));
+            for (String f : features) {
+                fields.add(DataTypes.createStructField(f, DataTypes.DoubleType, true));
+            }
+            StructType schema = DataTypes.createStructType(fields);
+            Dataset<Row> dataset = spark.createDataFrame(rowRDD, schema);
+
+            VectorAssembler assembler = new VectorAssembler()
+                    .setInputCols(features)
+                    .setOutputCol("features");
+
+            Dataset<Row> assembledData = assembler.transform(dataset);
+            assembledData.show();
+            // 划分训练集和测试集
+            Dataset<Row>[] splits = assembledData.randomSplit(new double[]{0.7, 0.3});
+            Dataset<Row> trainData = splits[0];
+            trainData.show(500);
+            Dataset<Row> testData = splits[1];
+            testData.show(500);
+
+            // 保存模型
+            String bucketName = "art-test-video";
+            String objectName = "test/model.tar.gz";
+            OSSService ossService = new OSSService();
+
+            String destPath = "/root/recommend-model/model2.tar.gz";
+            ossService.download(bucketName, destPath, objectName);
+            String destDir = "/root/recommend-model/modelpredict";
+            CompressUtil.decompressGzFile(destPath, destDir);
+
+            // 显示预测结果
+
+            XGBoostClassificationModel model2 = XGBoostClassificationModel.load(destDir);
+            Dataset<Row> predictions = model2.transform(assembledData);
+            predictions.select("label", "prediction", "features", "rawPrediction", "probability").show(500);
+
+            spark.close();
+
+        } catch (Throwable e) {
+            log.error("", e);
+        }
+    }
+}

+ 8 - 21
recommend-model-produce/src/main/java/com/tzld/piaoquan/recommend/model/produce/xgboost/XGBoostTrain.java

@@ -1,6 +1,7 @@
 package com.tzld.piaoquan.recommend.model.produce.xgboost;
 
 import com.tzld.piaoquan.recommend.model.produce.service.OSSService;
+import com.tzld.piaoquan.recommend.model.produce.util.CompressUtil;
 import lombok.extern.slf4j.Slf4j;
 import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel;
 import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier;
@@ -57,7 +58,6 @@ public class XGBoostTrain {
 
             JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
             String file = "/dw/recommend/model/33_ad_train_data_v4/20240726/part-00099.gz";
-            //file = "/Users/dingyunpeng/Desktop/part-00099.gz";
             JavaRDD<String> rdd = jsc.textFile(file);
 
             JavaRDD<Row> rowRDD = rdd.map(s -> {
@@ -96,15 +96,6 @@ public class XGBoostTrain {
 
             Dataset<Row> assembledData = assembler.transform(dataset);
             assembledData.show();
-            // 划分训练集和测试集
-            Dataset<Row>[] splits = assembledData.randomSplit(new double[]{0.7, 0.3});
-            Dataset<Row> trainData = splits[0];
-            trainData.show(500);
-            Dataset<Row> testData = splits[1];
-            testData.show(500);
-
-            // 参数
-
 
             // 创建 XGBoostClassifier 对象
             XGBoostClassifier xgbClassifier = new XGBoostClassifier()
@@ -123,18 +114,14 @@ public class XGBoostTrain {
             XGBoostClassificationModel model = xgbClassifier.fit(assembledData);
 
             // 保存模型
-            String path = "/root/recommend-model/model";
+            String path = "/root/recommend-model/modeltrain";
             model.save(path);
-
-//            OSSService ossService = new OSSService();
-//            String bucketName = "";
-//            String ossPath = "";
-//            ossService.upload(bucketName, path, ossPath);
-
-            // 显示预测结果
-            XGBoostClassificationModel model2 = XGBoostClassificationModel.load(path);
-            Dataset<Row> predictions = model2.transform(assembledData);
-            predictions.select("label", "prediction", "features", "rawPrediction", "probability").show(500);
+            String outputPath = "/root/recommend-model/model.tar.gz";
+            CompressUtil.compressDirectoryToGzip(path, outputPath);
+            String bucketName = "art-test-video";
+            String ossPath = "test/model.tar.gz";
+            OSSService ossService = new OSSService();
+            ossService.upload(bucketName, outputPath, ossPath);
 
             spark.close();
 

+ 7 - 4
recommend-model-produce/src/main/java/com/tzld/piaoquan/recommend/model/produce/xgboost/XGBoostTrainLocalTest.java

@@ -1,6 +1,7 @@
 package com.tzld.piaoquan.recommend.model.produce.xgboost;
 
 import com.tzld.piaoquan.recommend.model.produce.service.OSSService;
+import com.tzld.piaoquan.recommend.model.produce.util.CompressUtil;
 import lombok.extern.slf4j.Slf4j;
 import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel;
 import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier;
@@ -124,12 +125,14 @@ public class XGBoostTrainLocalTest {
 
             // 保存模型
             String path = "/Users/dingyunpeng/Desktop/model";
-            model.save(path);
+            model.write().overwrite().save(path);
 
+            String outputPath = "/Users/dingyunpeng/Desktop/model.tar.gz";
+            CompressUtil.compressDirectoryToGzip(path, outputPath);
+            String bucketName = "art-test-video";
+            String ossPath = "test/model.tar.gz";
             OSSService ossService = new OSSService();
-            String bucketName = "";
-            String ossPath = "";
-            ossService.upload(bucketName, path, ossPath);
+            ossService.upload(bucketName, outputPath, ossPath);
 
             // 显示预测结果
             XGBoostClassificationModel model2 = XGBoostClassificationModel.load(path);