
Update how output files are saved

zhangbo, 1 year ago, commit 98d6afc5de

+ 3 - 3
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_13_originData_20240529.scala

@@ -26,13 +26,12 @@ object makedata_13_originData_20240529 {
     // 1 读取参数
     val param = ParamUtils.parseArgs(args)
     val tablePart = param.getOrElse("tablePart", "64").toInt
-    val partitionPrefix = param.getOrElse("partitionPrefix", "dt={},hh={}")
     val beginStr = param.getOrElse("beginStr", "2023010100")
     val endStr = param.getOrElse("endStr", "2023010123")
     val savePath = param.getOrElse("savePath", "/dw/recommend/model/13_sample_data/")
     val project = param.getOrElse("project", "loghubods")
     val table = param.getOrElse("table", "XXXX")
-    val repartition = param.getOrElse("repartition", "20").toInt
+    val repartition = param.getOrElse("repartition", "10").toInt
 
     // 2 读取odps+表信息
     val odpsOps = env.getODPS(sc)
@@ -242,7 +241,8 @@ object makedata_13_originData_20240529 {
         })
 
       // 4 保存数据到hdfs
-      val hdfsPath = savePath + "/" + partition
+      val savePartition = dt + hh
+      val hdfsPath = savePath + "/" + savePartition
       if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")){
         println("删除路径并开始数据写入:" + hdfsPath)
         MyHdfsUtils.delete_hdfs_path(hdfsPath)
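
For reference on the makedata_13 change above: the output path is no longer derived from the partitionPrefix template ("dt={},hh={}") but is built directly from the partition's date and hour, giving one directory per hour such as .../13_sample_data/2024060800. A minimal sketch of that save step, assuming dt and hh are the partition's date and hour strings and MyHdfsUtils is the project's HDFS helper used above (saveHourly is an illustrative name, not code from the repository):

    import org.apache.hadoop.io.compress.GzipCodec
    import org.apache.spark.rdd.RDD

    // Illustrative only: per-hour save with the same overwrite-and-guard behaviour as the diff.
    def saveHourly(rows: RDD[String], savePath: String, dt: String, hh: String, repartition: Int): Unit = {
      val savePartition = dt + hh                      // e.g. "2024060800"
      val hdfsPath = savePath + "/" + savePartition    // e.g. /dw/recommend/model/13_sample_data/2024060800
      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
        MyHdfsUtils.delete_hdfs_path(hdfsPath)         // drop any previous output for this hour first
        rows.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
      } else {
        println("Refusing to write outside /dw/recommend/model/: " + hdfsPath)
      }
    }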

+ 42 - 38
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_14_valueData_20240608.scala

@@ -1,7 +1,7 @@
 package com.aliyun.odps.spark.examples.makedata
 
 import com.alibaba.fastjson.JSON
-import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils}
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
 import org.apache.hadoop.io.compress.GzipCodec
 import org.apache.spark.sql.SparkSession
 
@@ -41,49 +41,53 @@ object makedata_14_valueData_20240608 {
 
     // 1 读取参数
     val param = ParamUtils.parseArgs(args)
-    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=20240607,hh=00")
-    val date = param.getOrElse("date", "20240607")
+    val beginStr = param.getOrElse("beginStr", "20230101")
+    val endStr = param.getOrElse("endStr", "20230101")
     val readPath = param.getOrElse("readPath", "/dw/recommend/model/13_sample_data/")
     val savePath = param.getOrElse("savePath", "/dw/recommend/model/14_feature_data/")
     val repartition = param.getOrElse("repartition", "200").toInt
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      val data = sc.textFile(readPath + "/" + date + "*")
+      val data1 = data.map(r => {
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val featureKey = rList(2)
+        (logKey, labelKey, featureKey)
+      }).filter(r =>
+        r._1.split(",")(6).equals("0")
+      ).mapPartitions(row => {
+        val result = new ArrayBuffer[String]()
+        val contentList = contentList_bc.value
+        row.foreach {
+          case (logKey, labelKey, featureKey) =>
+            val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
+            val featureJson = JSON.parseObject(featureKey)
 
-    val data = sc.textFile(readPath + partitionPrefix + "*")
-    val data1 = data.map(r => {
-      val rList = r.split("\t")
-      val logKey = rList(0)
-      val labelKey = rList(1)
-      val featureKey = rList(2)
-      (logKey, labelKey, featureKey)
-    }).filter(r=>
-      r._1.split(",")(6).equals("0")
-    ).mapPartitions(row => {
-      val result = new ArrayBuffer[String]()
-      val contentList = contentList_bc.value
-      row.foreach{
-        case (logKey, labelKey, featureKey) =>
-          val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
-          val featureJson = JSON.parseObject(featureKey)
+            val featureValues = contentList.map(key => {
+              if (featureJson.containsKey(key)) {
+                featureJson.getDouble(key)
+              } else {
+                0.0
+              }
+            })
+            result.add(label + "\t" + featureValues.mkString(","))
+        }
+        result.iterator
+      })
 
-          val featureValues = contentList.map(key=>{
-            if (featureJson.containsKey(key)){
-              featureJson.getDouble(key)
-            }else{
-              0.0
-            }
-          })
-          result.add(label + "\t" + featureValues.mkString(","))
+      // 4 保存数据到hdfs
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("删除路径并开始数据写入:" + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data1.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("路径不合法,无法写入:" + hdfsPath)
       }
-      result.iterator
-    })
-
-    // 4 保存数据到hdfs
-    val hdfsPath = savePath + "/" + date
-    if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
-      println("删除路径并开始数据写入:" + hdfsPath)
-      MyHdfsUtils.delete_hdfs_path(hdfsPath)
-      data1.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
-    } else {
-      println("路径不合法,无法写入:" + hdfsPath)
     }
+
+
   }
 }
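
The rewrite of makedata_14 above wraps the job in a loop over MyDateUtils.getDateRange(beginStr, endStr), reading readPath + "/" + date + "*" and saving to savePath + "/" + date once per day; the per-record transform itself is unchanged. A self-contained sketch of that transform, assuming contentList_bc broadcasts the ordered list of feature names (toLabeledVector is an illustrative helper name, not code from the repository):

    import com.alibaba.fastjson.JSON

    // One input line is "logKey<TAB>labelJson<TAB>featureJson"; the output is "label<TAB>v1,v2,...".
    def toLabeledVector(line: String, contentList: Seq[String]): Option[String] = {
      val parts = line.split("\t")
      if (parts.length < 3) return None
      // logKey is comma-separated; only rows whose 7th field equals "0" are kept, as in the filter above
      if (!parts(0).split(",")(6).equals("0")) return None
      val label = JSON.parseObject(parts(1)).getOrDefault("is_return", "0").toString
      val featureJson = JSON.parseObject(parts(2))
      val values = contentList.map { key =>
        if (featureJson.containsKey(key)) featureJson.getDoubleValue(key) else 0.0
      }
      Some(label + "\t" + values.mkString(","))
    }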

+ 4 - 5
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_15_bucket_20240608.scala

@@ -38,13 +38,12 @@ object makedata_15_bucket_20240608 {
 
     // 1 读取参数
     val param = ParamUtils.parseArgs(args)
-    val partitionPrefix = param.getOrElse("partitionPrefix", "dt=20240607,hh=00")
-    val date = param.getOrElse("date", "20240607")
-    val readPath = param.getOrElse("readPath", "/dw/recommend/model/14_feature_data/")
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/14_feature_data/20240607")
     val savePath = param.getOrElse("savePath", "/dw/recommend/model/15_bucket_data/")
+    val fileName = param.getOrElse("fileName", "20240607_200")
     val bucketNum = param.getOrElse("bucketNum", "200").toInt
 
-    val data = sc.textFile(readPath + partitionPrefix)
+    val data = sc.textFile(readPath)
     val data1 = data.map(r => {
       val rList = r.split("\t")
       val doubles = rList(1).split(",").map(_.toDouble)
@@ -84,7 +83,7 @@ object makedata_15_bucket_20240608 {
 
 
     // 4 保存数据到hdfs
-    val hdfsPath = savePath + "/" + date
+    val hdfsPath = savePath + "/" + fileName
     if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
       println("删除路径并开始数据写入:" + hdfsPath)
       MyHdfsUtils.delete_hdfs_path(hdfsPath)
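
makedata_15 now takes the fully qualified readPath (date included) plus a separate fileName for the output directory, instead of the partitionPrefix/date pair. The hunks above do not show the bucketing step itself; purely as a hypothetical illustration of what the bucketNum parameter presumably controls, an equal-frequency boundary computation for a single feature column might look like this (not the repository's implementation):

    // Hypothetical sketch: split one feature column into `bucketNum` equal-frequency buckets
    // and return the interior boundaries (at most bucketNum - 1 distinct values).
    def bucketBoundaries(values: Array[Double], bucketNum: Int): Array[Double] = {
      require(values.nonEmpty && bucketNum > 1)
      val sorted = values.sorted
      (1 until bucketNum)
        .map(i => sorted(((i.toLong * sorted.length) / bucketNum).toInt.min(sorted.length - 1)))
        .distinct
        .toArray
    }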

+ 1 - 1
zhangbo/01_train.sh

@@ -11,5 +11,5 @@ HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
 $HADOOP fs -text ${train_path}/${day}/* | /root/sunmingze/alphaFM/bin/fm_train -m model/${model_name}_${day}.txt -dim ${bias} -core 8
 # -v_l1 ${v_l1} -v_l2 ${v_l2}
 
-# nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_fuck 1,1,0 >p1_model_fuck.log 2>&1 &
+# nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_fuck 1,1,8 >p1_model_fuck.log 2>&1 &
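
The only change to 01_train.sh is the -dim argument in the example command, 1,1,0 to 1,1,8. Assuming this alphaFM build follows the common convention that the three values are the bias term, the first-order weights, and the dimension k of the second-order latent factors, the new setting enables 8-dimensional pairwise interactions in the factorization machine:

    \hat{y}(x) = w_0 + \sum_i w_i x_i + \sum_{i<j} \langle \mathbf{v}_i, \mathbf{v}_j \rangle \, x_i x_j,
    \qquad \mathbf{v}_i \in \mathbb{R}^{k},\ k = 8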
 

+ 1 - 1
zhangbo/03_predict.sh

@@ -11,4 +11,4 @@ HADOOP="/opt/apps/HADOOP-COMMON/hadoop-common-current/bin/hadoop"
 $HADOOP fs -text ${train_path}/${day}/* | /root/sunmingze/alphaFM/bin/fm_predict -m model/$model_name -dim ${bias} -core 8 -out predict/${output_file}_$day.txt
 cat predict/${output_file}_$day.txt | /root/sunmingze/AUC/AUC
 
-# nohup sh 03_predict.sh 20240607 /dw/recommend/model/16_train_data/ model_fuck_20240606.txt model_fuck_20240606 0 >p3_model_fuck.log 2>&1 &
+# nohup sh 03_predict.sh 20240607 /dw/recommend/model/16_train_data/ model_fuck_20240606.txt model_fuck_20240606 8 >p3_model_fuck.log 2>&1 &
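
Correspondingly, the 03_predict.sh example raises the dimension argument from 0 to 8, presumably so that fm_predict reads back the 8-dimensional factor weights written by the 1,1,8 training run above; if the two settings disagree, the saved model would not be loaded with the intended latent dimension.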