
Merge branch 'feature/zhangbo_makedata_v2' into feature/qiao_makedata_v2

Joe 9 months ago
parent
commit
5b552c7d13

File diff suppressed because it is too large
+ 0 - 2
src/main/resources/20240609_bucket_274_old.txt


File diff suppressed because it is too large
+ 0 - 0
src/main/resources/20240609_bucket_314.txt


File diff suppressed because it is too large
+ 1 - 0
src/main/resources/20240704_ad_bucket_351.txt


+ 0 - 0
src/main/resources/20240709_recsys_bucket_314.txt


+ 314 - 0
src/main/resources/20240709_recsys_feature_name_314.txt

@@ -0,0 +1,314 @@
+b123_1h_STR
+b123_1h_log(share)
+b123_1h_ROV
+b123_1h_log(return)
+b123_1h_ROV*log(return)
+b123_1h_ROS
+b123_2h_STR
+b123_2h_log(share)
+b123_2h_ROV
+b123_2h_log(return)
+b123_2h_ROV*log(return)
+b123_2h_ROS
+b123_3h_STR
+b123_3h_log(share)
+b123_3h_ROV
+b123_3h_log(return)
+b123_3h_ROV*log(return)
+b123_3h_ROS
+b123_4h_STR
+b123_4h_log(share)
+b123_4h_ROV
+b123_4h_log(return)
+b123_4h_ROV*log(return)
+b123_4h_ROS
+b123_12h_STR
+b123_12h_log(share)
+b123_12h_ROV
+b123_12h_log(return)
+b123_12h_ROV*log(return)
+b123_12h_ROS
+b123_1d_STR
+b123_1d_log(share)
+b123_1d_ROV
+b123_1d_log(return)
+b123_1d_ROV*log(return)
+b123_1d_ROS
+b123_3d_STR
+b123_3d_log(share)
+b123_3d_ROV
+b123_3d_log(return)
+b123_3d_ROV*log(return)
+b123_3d_ROS
+b123_7d_STR
+b123_7d_log(share)
+b123_7d_ROV
+b123_7d_log(return)
+b123_7d_ROV*log(return)
+b123_7d_ROS
+b167_1h_STR
+b167_1h_log(share)
+b167_1h_ROV
+b167_1h_log(return)
+b167_1h_ROV*log(return)
+b167_1h_ROS
+b167_2h_STR
+b167_2h_log(share)
+b167_2h_ROV
+b167_2h_log(return)
+b167_2h_ROV*log(return)
+b167_2h_ROS
+b167_3h_STR
+b167_3h_log(share)
+b167_3h_ROV
+b167_3h_log(return)
+b167_3h_ROV*log(return)
+b167_3h_ROS
+b167_4h_STR
+b167_4h_log(share)
+b167_4h_ROV
+b167_4h_log(return)
+b167_4h_ROV*log(return)
+b167_4h_ROS
+b167_12h_STR
+b167_12h_log(share)
+b167_12h_ROV
+b167_12h_log(return)
+b167_12h_ROV*log(return)
+b167_12h_ROS
+b167_1d_STR
+b167_1d_log(share)
+b167_1d_ROV
+b167_1d_log(return)
+b167_1d_ROV*log(return)
+b167_1d_ROS
+b167_3d_STR
+b167_3d_log(share)
+b167_3d_ROV
+b167_3d_log(return)
+b167_3d_ROV*log(return)
+b167_3d_ROS
+b167_7d_STR
+b167_7d_log(share)
+b167_7d_ROV
+b167_7d_log(return)
+b167_7d_ROV*log(return)
+b167_7d_ROS
+b8910_1h_STR
+b8910_1h_log(share)
+b8910_1h_ROV
+b8910_1h_log(return)
+b8910_1h_ROV*log(return)
+b8910_1h_ROS
+b8910_2h_STR
+b8910_2h_log(share)
+b8910_2h_ROV
+b8910_2h_log(return)
+b8910_2h_ROV*log(return)
+b8910_2h_ROS
+b8910_3h_STR
+b8910_3h_log(share)
+b8910_3h_ROV
+b8910_3h_log(return)
+b8910_3h_ROV*log(return)
+b8910_3h_ROS
+b8910_4h_STR
+b8910_4h_log(share)
+b8910_4h_ROV
+b8910_4h_log(return)
+b8910_4h_ROV*log(return)
+b8910_4h_ROS
+b8910_12h_STR
+b8910_12h_log(share)
+b8910_12h_ROV
+b8910_12h_log(return)
+b8910_12h_ROV*log(return)
+b8910_12h_ROS
+b8910_1d_STR
+b8910_1d_log(share)
+b8910_1d_ROV
+b8910_1d_log(return)
+b8910_1d_ROV*log(return)
+b8910_1d_ROS
+b8910_3d_STR
+b8910_3d_log(share)
+b8910_3d_ROV
+b8910_3d_log(return)
+b8910_3d_ROV*log(return)
+b8910_3d_ROS
+b8910_7d_STR
+b8910_7d_log(share)
+b8910_7d_ROV
+b8910_7d_log(return)
+b8910_7d_ROV*log(return)
+b8910_7d_ROS
+b111213_1h_STR
+b111213_1h_log(share)
+b111213_1h_ROV
+b111213_1h_log(return)
+b111213_1h_ROV*log(return)
+b111213_1h_ROS
+b111213_2h_STR
+b111213_2h_log(share)
+b111213_2h_ROV
+b111213_2h_log(return)
+b111213_2h_ROV*log(return)
+b111213_2h_ROS
+b111213_3h_STR
+b111213_3h_log(share)
+b111213_3h_ROV
+b111213_3h_log(return)
+b111213_3h_ROV*log(return)
+b111213_3h_ROS
+b111213_4h_STR
+b111213_4h_log(share)
+b111213_4h_ROV
+b111213_4h_log(return)
+b111213_4h_ROV*log(return)
+b111213_4h_ROS
+b111213_12h_STR
+b111213_12h_log(share)
+b111213_12h_ROV
+b111213_12h_log(return)
+b111213_12h_ROV*log(return)
+b111213_12h_ROS
+b111213_1d_STR
+b111213_1d_log(share)
+b111213_1d_ROV
+b111213_1d_log(return)
+b111213_1d_ROV*log(return)
+b111213_1d_ROS
+b111213_3d_STR
+b111213_3d_log(share)
+b111213_3d_ROV
+b111213_3d_log(return)
+b111213_3d_ROV*log(return)
+b111213_3d_ROS
+b111213_7d_STR
+b111213_7d_log(share)
+b111213_7d_ROV
+b111213_7d_log(return)
+b111213_7d_ROV*log(return)
+b111213_7d_ROS
+b171819_1h_STR
+b171819_1h_log(share)
+b171819_1h_ROV
+b171819_1h_log(return)
+b171819_1h_ROV*log(return)
+b171819_1h_ROS
+b171819_2h_STR
+b171819_2h_log(share)
+b171819_2h_ROV
+b171819_2h_log(return)
+b171819_2h_ROV*log(return)
+b171819_2h_ROS
+b171819_3h_STR
+b171819_3h_log(share)
+b171819_3h_ROV
+b171819_3h_log(return)
+b171819_3h_ROV*log(return)
+b171819_3h_ROS
+b171819_4h_STR
+b171819_4h_log(share)
+b171819_4h_ROV
+b171819_4h_log(return)
+b171819_4h_ROV*log(return)
+b171819_4h_ROS
+b171819_12h_STR
+b171819_12h_log(share)
+b171819_12h_ROV
+b171819_12h_log(return)
+b171819_12h_ROV*log(return)
+b171819_12h_ROS
+b171819_1d_STR
+b171819_1d_log(share)
+b171819_1d_ROV
+b171819_1d_log(return)
+b171819_1d_ROV*log(return)
+b171819_1d_ROS
+b171819_3d_STR
+b171819_3d_log(share)
+b171819_3d_ROV
+b171819_3d_log(return)
+b171819_3d_ROV*log(return)
+b171819_3d_ROS
+b171819_7d_STR
+b171819_7d_log(share)
+b171819_7d_ROV
+b171819_7d_log(return)
+b171819_7d_ROV*log(return)
+b171819_7d_ROS
+total_time
+bit_rate
+playcnt_6h
+playcnt_1d
+playcnt_3d
+playcnt_7d
+share_pv_12h
+share_pv_1d
+share_pv_3d
+share_pv_7d
+return_uv_12h
+return_uv_1d
+return_uv_3d
+return_uv_7d
+c3_feature_tags_1d_matchnum
+c3_feature_tags_1d_maxscore
+c3_feature_tags_1d_avgscore
+c3_feature_tags_3d_matchnum
+c3_feature_tags_3d_maxscore
+c3_feature_tags_3d_avgscore
+c3_feature_tags_7d_matchnum
+c3_feature_tags_7d_maxscore
+c3_feature_tags_7d_avgscore
+c4_feature_tags_1d_matchnum
+c4_feature_tags_1d_maxscore
+c4_feature_tags_1d_avgscore
+c4_feature_tags_3d_matchnum
+c4_feature_tags_3d_maxscore
+c4_feature_tags_3d_avgscore
+c4_feature_tags_7d_matchnum
+c4_feature_tags_7d_maxscore
+c4_feature_tags_7d_avgscore
+c5_feature_tags_1d_matchnum
+c5_feature_tags_1d_maxscore
+c5_feature_tags_1d_avgscore
+c5_feature_tags_3d_matchnum
+c5_feature_tags_3d_maxscore
+c5_feature_tags_3d_avgscore
+c5_feature_tags_7d_matchnum
+c5_feature_tags_7d_maxscore
+c5_feature_tags_7d_avgscore
+c6_feature_tags_1d_matchnum
+c6_feature_tags_1d_maxscore
+c6_feature_tags_1d_avgscore
+c6_feature_tags_3d_matchnum
+c6_feature_tags_3d_maxscore
+c6_feature_tags_3d_avgscore
+c6_feature_tags_7d_matchnum
+c6_feature_tags_7d_maxscore
+c6_feature_tags_7d_avgscore
+c7_feature_tags_1d_matchnum
+c7_feature_tags_1d_maxscore
+c7_feature_tags_1d_avgscore
+c7_feature_tags_3d_matchnum
+c7_feature_tags_3d_maxscore
+c7_feature_tags_3d_avgscore
+c7_feature_tags_7d_matchnum
+c7_feature_tags_7d_maxscore
+c7_feature_tags_7d_avgscore
+c8_feature_share_score
+c8_feature_share_num
+c8_feature_share_rank
+c8_feature_return_score
+c8_feature_return_num
+c8_feature_return_rank
+c9_feature_share_score
+c9_feature_share_num
+c9_feature_share_rank
+c9_feature_return_score
+c9_feature_return_num
+c9_feature_return_rank
+d1_exp
+d1_return_n
+d1_rovn
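
For reference, the 314 names above decompose as: 5 bucket groups (b123, b167, b8910, b111213, b171819) x 8 time windows x 6 ratio metrics = 240, plus total_time and bit_rate (2), playcnt windows (4), share_pv/return_uv windows (8), c3-c7 tag-similarity features 5 x 3 x 3 = 45, c8/c9 CF features 2 x 2 x 3 = 12, and d1 features (3): 240 + 2 + 4 + 8 + 45 + 12 + 3 = 314.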

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_33_bucketData_20240622.scala

@@ -24,7 +24,7 @@ object makedata_ad_33_bucketData_20240622 {
 
     val loader = getClass.getClassLoader
 
-    val resourceUrlBucket = loader.getResource("20240703_ad_bucket_351.txt")
+    val resourceUrlBucket = loader.getResource("20240704_ad_bucket_351.txt")
     val buckets =
       if (resourceUrlBucket != null) {
         val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")

+ 280 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_41_originData_20240709.scala

@@ -0,0 +1,280 @@
+package com.aliyun.odps.spark.examples.makedata_recsys
+
+import com.alibaba.fastjson.{JSON, JSONObject}
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils, env}
+import examples.extractor.RankExtractorFeature_20240530
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+import org.xm.Similarity
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+/*
+   20240608 feature extraction
+ */
+
+object makedata_recsys_41_originData_20240709 {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val tablePart = param.getOrElse("tablePart", "64").toInt
+    val beginStr = param.getOrElse("beginStr", "2023010100")
+    val endStr = param.getOrElse("endStr", "2023010123")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/41_sample_data/")
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "XXXX")
+    val repartition = param.getOrElse("repartition", "32").toInt
+
+    // 2 Read ODPS table info
+    val odpsOps = env.getODPS(sc)
+
+    // 3 Loop over the time range and generate data
+    val timeRange = MyDateUtils.getDateHourRange(beginStr, endStr)
+    for (dt_hh <- timeRange) {
+      val dt = dt_hh.substring(0, 8)
+      val hh = dt_hh.substring(8, 10)
+      val partition = s"dt=$dt,hh=$hh"
+      println("Start processing partition: " + partition)
+      val odpsData = odpsOps.readTable(project = project,
+          table = table,
+          partition = partition,
+          transfer = func,
+          numPartition = tablePart)
+        .map(record => {
+
+          val featureMap = new JSONObject()
+
+          // a Video features
+          val b1: JSONObject = if (record.isNull("b1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b1_feature"))
+          val b2: JSONObject = if (record.isNull("b2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b2_feature"))
+          val b3: JSONObject = if (record.isNull("b3_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b3_feature"))
+          val b6: JSONObject = if (record.isNull("b6_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b6_feature"))
+          val b7: JSONObject = if (record.isNull("b7_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b7_feature"))
+
+          val b8: JSONObject = if (record.isNull("b8_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b8_feature"))
+          val b9: JSONObject = if (record.isNull("b9_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b9_feature"))
+          val b10: JSONObject = if (record.isNull("b10_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b10_feature"))
+          val b11: JSONObject = if (record.isNull("b11_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b11_feature"))
+          val b12: JSONObject = if (record.isNull("b12_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b12_feature"))
+          val b13: JSONObject = if (record.isNull("b13_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b13_feature"))
+          val b17: JSONObject = if (record.isNull("b17_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b17_feature"))
+          val b18: JSONObject = if (record.isNull("b18_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b18_feature"))
+          val b19: JSONObject = if (record.isNull("b19_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("b19_feature"))
+
+
+          val origin_data = List(
+            (b1, b2, b3, "b123"), (b1, b6, b7, "b167"),
+            (b8, b9, b10, "b8910"), (b11, b12, b13, "b111213"),
+            (b17, b18, b19, "b171819")
+          )
+          for ((b_1, b_2, b_3, prefix1) <- origin_data) {
+            for (prefix2 <- List(
+              "1h", "2h", "3h", "4h", "12h", "1d", "3d", "7d"
+            )) {
+              val exp = if (b_1.isEmpty) 0D else b_1.getIntValue("exp_pv_" + prefix2).toDouble
+              val share = if (b_2.isEmpty) 0D else b_2.getIntValue("share_pv_" + prefix2).toDouble
+              val returns = if (b_3.isEmpty) 0D else b_3.getIntValue("return_uv_" + prefix2).toDouble
+              val f1 = RankExtractorFeature_20240530.calDiv(share, exp)
+              val f2 = RankExtractorFeature_20240530.calLog(share)
+              val f3 = RankExtractorFeature_20240530.calDiv(returns, exp)
+              val f4 = RankExtractorFeature_20240530.calLog(returns)
+              val f5 = f3 * f4
+              val f6 = RankExtractorFeature_20240530.calDiv(returns, share)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "STR", f1)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "log(share)", f2)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROV", f3)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "log(return)", f4)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROV*log(return)", f5)
+              featureMap.put(prefix1 + "_" + prefix2 + "_" + "ROS", f6)
+            }
+          }
+
+          val video_info: JSONObject = if (record.isNull("t_v_info_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("t_v_info_feature"))
+          featureMap.put("total_time", if (video_info.containsKey("total_time")) video_info.getIntValue("total_time").toDouble else 0D)
+          featureMap.put("bit_rate", if (video_info.containsKey("bit_rate")) video_info.getIntValue("bit_rate").toDouble else 0D)
+
+          val c1: JSONObject = if (record.isNull("c1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("c1_feature"))
+          if (c1.nonEmpty) {
+            featureMap.put("playcnt_6h", if (c1.containsKey("playcnt_6h")) c1.getIntValue("playcnt_6h").toDouble else 0D)
+            featureMap.put("playcnt_1d", if (c1.containsKey("playcnt_1d")) c1.getIntValue("playcnt_1d").toDouble else 0D)
+            featureMap.put("playcnt_3d", if (c1.containsKey("playcnt_3d")) c1.getIntValue("playcnt_3d").toDouble else 0D)
+            featureMap.put("playcnt_7d", if (c1.containsKey("playcnt_7d")) c1.getIntValue("playcnt_7d").toDouble else 0D)
+          }
+          val c2: JSONObject = if (record.isNull("c2_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("c2_feature"))
+          if (c2.nonEmpty) {
+            featureMap.put("share_pv_12h", if (c2.containsKey("share_pv_12h")) c2.getIntValue("share_pv_12h").toDouble else 0D)
+            featureMap.put("share_pv_1d", if (c2.containsKey("share_pv_1d")) c2.getIntValue("share_pv_1d").toDouble else 0D)
+            featureMap.put("share_pv_3d", if (c2.containsKey("share_pv_3d")) c2.getIntValue("share_pv_3d").toDouble else 0D)
+            featureMap.put("share_pv_7d", if (c2.containsKey("share_pv_7d")) c2.getIntValue("share_pv_7d").toDouble else 0D)
+            featureMap.put("return_uv_12h", if (c2.containsKey("return_uv_12h")) c2.getIntValue("return_uv_12h").toDouble else 0D)
+            featureMap.put("return_uv_1d", if (c2.containsKey("return_uv_1d")) c2.getIntValue("return_uv_1d").toDouble else 0D)
+            featureMap.put("return_uv_3d", if (c2.containsKey("return_uv_3d")) c2.getIntValue("return_uv_3d").toDouble else 0D)
+            featureMap.put("return_uv_7d", if (c2.containsKey("return_uv_7d")) c2.getIntValue("return_uv_7d").toDouble else 0D)
+          }
+
+          val title = if (video_info.containsKey("title")) video_info.getString("title") else ""
+          if (!title.equals("")) {
+            for (key_feature <- List("c3_feature", "c4_feature", "c5_feature", "c6_feature", "c7_feature")) {
+              val c34567: JSONObject = if (record.isNull(key_feature)) new JSONObject() else
+                JSON.parseObject(record.getString(key_feature))
+              for (key_time <- List("tags_1d", "tags_3d", "tags_7d")) {
+                val tags = if (c34567.containsKey(key_time)) c34567.getString(key_time) else ""
+                if (!tags.equals("")) {
+                  val (f1, f2, f3, f4) = funcC34567ForTags(tags, title)
+                  featureMap.put(key_feature + "_" + key_time + "_matchnum", f1)
+                  featureMap.put(key_feature + "_" + key_time + "_maxscore", f3)
+                  featureMap.put(key_feature + "_" + key_time + "_avgscore", f4)
+                }
+              }
+            }
+          }
+
+          val vid = if (record.isNull("vid")) "" else record.getString("vid")
+          if (!vid.equals("")) {
+            for (key_feature <- List("c8_feature", "c9_feature")) {
+              val c89: JSONObject = if (record.isNull(key_feature)) new JSONObject() else
+                JSON.parseObject(record.getString(key_feature))
+              for (key_action <- List("share", "return")) {
+                val cfListStr = if (c89.containsKey(key_action)) c89.getString(key_action) else ""
+                if (!cfListStr.equals("")) {
+                  val cfMap = cfListStr.split(",").map(r => {
+                    val rList = r.split(":")
+                    (rList(0), (rList(1), rList(2), rList(3)))
+                  }).toMap
+                  if (cfMap.contains(vid)) {
+                    val (score, num, rank) = cfMap(vid)
+                    featureMap.put(key_feature + "_" + key_action + "_score", score.toDouble)
+                    featureMap.put(key_feature + "_" + key_action + "_num", num.toDouble)
+                    featureMap.put(key_feature + "_" + key_action + "_rank", 1.0 / rank.toDouble)
+                  }
+                }
+              }
+            }
+          }
+
+          val d1: JSONObject = if (record.isNull("d1_feature")) new JSONObject() else
+            JSON.parseObject(record.getString("d1_feature"))
+          if (d1.nonEmpty) {
+            featureMap.put("d1_exp", if (d1.containsKey("exp")) d1.getString("exp").toDouble else 0D)
+            featureMap.put("d1_return_n", if (d1.containsKey("return_n")) d1.getString("return_n").toDouble else 0D)
+            featureMap.put("d1_rovn", if (d1.containsKey("rovn")) d1.getString("rovn").toDouble else 0D)
+          }
+
+
+          /*
+
+
+          Video:
+          exposure uses pv, share uses pv, return uses uv --> 1h 2h 3h 4h 12h 1d 3d 7d
+          STR log(share) ROV log(return) ROV*log(return)
+          40 feature combinations
+          overall, overall-exposure-matched, recommend non-cold-start root, recommend cold-start root, per-province root
+          200 feature values
+
+          Video:
+          video duration, bit rate
+
+          User:
+          play count --> 6h 1d 3d 7d --> 4 features
+          share pv and return uv brought back --> 12h 1d 3d 7d --> 8 features
+          User + vid-title:
+          play point / return point / share point / cumulative share / cumulative return --> 1d 3d 7d --> match count, max semantic similarity score, avg semantic similarity score --> 45 features
+          User + vid-cf:
+          based on share behavior / based on return behavior --> "share cf" + "return-click cf": similarity score, similar count, reciprocal of similar rank --> 12 features
+
+          Head videos:
+          exposure, return, ROVn: 3 features
+
+          Context:
+          hour, weekday, apptype, city, province, pagesource, device model
+           */
+
+
+          //4 Process label info.
+          val labels = new JSONObject
+          for (labelKey <- List(
+            "is_play", "is_share", "is_return", "noself_is_return", "return_uv", "noself_return_uv", "total_return_uv",
+            "share_pv", "total_share_uv"
+          )) {
+            if (!record.isNull(labelKey)) {
+              labels.put(labelKey, record.getString(labelKey))
+            }
+          }
+          //5 Build the log key header.
+          val apptype = record.getString("apptype")
+          val pagesource = record.getString("pagesource")
+          val mid = record.getString("mid")
+          // vid was already extracted above
+          val ts = record.getString("ts")
+          val abcode = record.getString("abcode")
+          val level = if (record.isNull("level")) "0" else record.getString("level")
+          val logKey = (apptype, pagesource, mid, vid, ts, abcode, level).productIterator.mkString(",")
+          val labelKey = labels.toString()
+          val featureKey = featureMap.toString()
+          //6 Concatenate and save.
+          logKey + "\t" + labelKey + "\t" + featureKey
+
+        })
+
+      // 4 Save data to HDFS
+      val savePartition = dt + hh
+      val hdfsPath = savePath + "/" + savePartition
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and starting write: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        odpsData.coalesce(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, cannot write: " + hdfsPath)
+      }
+    }
+  }
+
+  def func(record: Record, schema: TableSchema): Record = {
+    record
+  }
+
+  def funcC34567ForTags(tags: String, title: String): Tuple4[Double, String, Double, Double] = {
+    // match count, matched tags, max semantic similarity score, avg semantic similarity score
+    val tagsList = tags.split(",")
+    var d1 = 0.0
+    val d2 = new ArrayBuffer[String]()
+    var d3 = 0.0
+    var d4 = 0.0
+    for (tag <- tagsList) {
+      if (title.contains(tag)) {
+        d1 = d1 + 1.0
+        d2.add(tag)
+      }
+      val score = Similarity.conceptSimilarity(tag, title)
+      d3 = if (score > d3) score else d3
+      d4 = d4 + score
+    }
+    d4 = if (tagsList.nonEmpty) d4 / tagsList.size else d4
+    (d1, d2.mkString(","), d3, d4)
+  }
+}
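
Not part of the commit: a minimal standalone sketch of the per-window ratio features produced by the loop above, assuming RankExtractorFeature_20240530.calDiv guards against a zero denominator and calLog computes log(x + 1); the real implementations may differ.

// Sketch only: mirrors the STR/ROV/ROS block in makedata_recsys_41_originData_20240709.
object RatioFeatureSketch {
  def calDiv(a: Double, b: Double): Double = if (b > 0) a / b else 0.0 // assumed semantics
  def calLog(x: Double): Double = math.log(x + 1.0)                    // assumed semantics

  // exposure pv, share pv, return uv for one bucket group and one time window
  def ratioFeatures(exp: Double, share: Double, returns: Double): Map[String, Double] = Map(
    "STR" -> calDiv(share, exp),                              // share-through rate
    "log(share)" -> calLog(share),
    "ROV" -> calDiv(returns, exp),                            // return-over-view
    "log(return)" -> calLog(returns),
    "ROV*log(return)" -> calDiv(returns, exp) * calLog(returns),
    "ROS" -> calDiv(returns, share)                           // return-over-share
  )

  def main(args: Array[String]): Unit = {
    // e.g. b123_1h with 1000 exposures, 20 shares, 5 returns
    ratioFeatures(1000, 20, 5).foreach { case (k, v) => println(s"b123_1h_$k = $v") }
  }
}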

+ 103 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_42_bucket_20240709.scala

@@ -0,0 +1,103 @@
+package com.aliyun.odps.spark.examples.makedata_recsys
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+
+ */
+
+object makedata_recsys_42_bucket_20240709 {
+  def main(args: Array[String]): Unit = {
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20240709_recsys_feature_name_314.txt")
+    val content =
+      if (resourceUrl != null) {
+        val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
+        Source.fromURL(resourceUrl).close()
+        content
+      } else {
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r=> r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r=> r.nonEmpty).toList
+
+
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/41_recsys_sample_data_v1/20240705*")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/41_recsys_bucket/")
+    val fileName = param.getOrElse("fileName", "20240705_314_200")
+    val sampleRate = param.getOrElse("sampleRate", "1.0").toDouble
+    val bucketNum = param.getOrElse("bucketNum", "200").toInt
+
+    val data = sc.textFile(readPath)
+    println("Malformed row count: " + data.filter(r => r.split("\t").length != 3).count())
+    val data1 = data.map(r => {
+      val rList = r.split("\t")
+      val jsons = JSON.parseObject(rList(2))
+      val doubles = scala.collection.mutable.Map[String, Double]()
+      jsons.foreach(r =>{
+        doubles.put(r._1, jsons.getDoubleValue(r._1))
+      })
+      doubles
+    }).sample(false, sampleRate ).repartition(20)
+
+    val result = new ArrayBuffer[String]()
+
+    for (i <- contentList.indices){
+      println("Feature: " + contentList(i))
+      val data2 = data1.map(r => r.getOrDefault(contentList(i), 0D)).filter(_ > 1E-8).collect().sorted
+      val len = data2.length
+      if (len == 0){
+        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + "0")
+      }else{
+        val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // ensure each bucket holds at least one element
+        val buffers = new ArrayBuffer[Double]()
+
+        var lastBucketValue = data2(0) // record the previous split point
+        for (j <- 0 until len by oneBucketNum) {
+          val d = data2(j)
+          if (j > 0 && d != lastBucketValue) {
+            // save the current split point only if it differs from the previous one
+            buffers += d
+          }
+          lastBucketValue = d // update the previous split point
+        }
+
+        // the last bucket must end at the array's final element
+        if (!buffers.contains(data2.last)) {
+          buffers += data2.last
+        }
+        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
+      }
+    }
+    val data3 = sc.parallelize(result)
+
+
+    // 4 Save data to HDFS
+    val hdfsPath = savePath + "/" + fileName
+    if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+      println("Deleting path and starting write: " + hdfsPath)
+      MyHdfsUtils.delete_hdfs_path(hdfsPath)
+      data3.repartition(1).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+    } else {
+      println("Invalid path, cannot write: " + hdfsPath)
+    }
+  }
+}
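
Not part of the commit: the split-point selection above, extracted into a standalone sketch so it can be checked locally (equal-frequency bucketing over the sorted non-zero feature values, with duplicate split points collapsed).

// Sketch only: boundary selection as in makedata_recsys_42_bucket_20240709.
object BucketBoundarySketch {
  import scala.collection.mutable.ArrayBuffer

  // data must be sorted ascending; bucketNum matches the job parameter (e.g. 200)
  def bucketBoundaries(data: Array[Double], bucketNum: Int): Array[Double] = {
    if (data.isEmpty) return Array.empty[Double]
    val step = (data.length - 1) / (bucketNum - 1) + 1 // at least one element per bucket
    val buffers = ArrayBuffer[Double]()
    var last = data(0)
    for (j <- 0 until data.length by step) {
      val d = data(j)
      if (j > 0 && d != last) buffers += d // keep a split point only when the value changes
      last = d
    }
    if (!buffers.contains(data.last)) buffers += data.last // close the final bucket
    buffers.toArray
  }

  def main(args: Array[String]): Unit =
    // e.g. 100 evenly spread values into 10 buckets -> roughly every 12th value plus the max
    println(bucketBoundaries((1 to 100).map(_.toDouble).toArray, 10).mkString(","))
}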

+ 130 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys/makedata_recsys_43_bucketData_20240709.scala

@@ -0,0 +1,130 @@
+package com.aliyun.odps.spark.examples.makedata_recsys
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUtils}
+import examples.extractor.ExtractorUtils
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+/*
+
+ */
+
+object makedata_recsys_43_bucketData_20240709 {
+  def main(args: Array[String]): Unit = {
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/41_recsys_sample_data_v1/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/43_recsys_train_data_v1/")
+    val beginStr = param.getOrElse("beginStr", "20240703")
+    val endStr = param.getOrElse("endStr", "20240703")
+    val repartition = param.getOrElse("repartition", "100").toInt
+    val filterNames = param.getOrElse("filterNames", "").split(",").toSet
+    val whatLabel = param.getOrElse("whatLabel", "is_return")
+    val whatApps = param.getOrElse("whatApps", "0,4,5,21,3,6").split(",").toSet
+    val fileName = param.getOrElse("fileName", "20240709_recsys_bucket_314.txt")
+
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    val loader = getClass.getClassLoader
+
+    val resourceUrlBucket = loader.getResource(fileName)
+    val buckets =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    println(buckets)
+    val bucketsMap = buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r =>{
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+    val bucketsMap_br = sc.broadcast(bucketsMap)
+
+
+
+
+    val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
+    for (date <- dateRange) {
+      println("Start processing: " + date)
+      val data = sc.textFile(readPath + "/" + date + "*").map(r=>{
+        val rList = r.split("\t")
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val jsons = JSON.parseObject(rList(2))
+        val features = scala.collection.mutable.Map[String, Double]()
+        jsons.foreach(r => {
+          features.put(r._1, jsons.getDoubleValue(r._1))
+        })
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            val pagesource = logKeyList(1)
+            whatApps.contains(apptype) && pagesource.endsWith("recommend")
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault(whatLabel, "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
+          val result = new ArrayBuffer[String]()
+          val bucketsMap = bucketsMap_br.value
+          row.foreach{
+            case (label, features) =>
+              val featuresBucket = features.map{
+                case (name, score) =>
+                  var ifFilter = false
+                  if (filterNames.nonEmpty){
+                    filterNames.foreach(r=> if (!ifFilter && name.startsWith(r)) {ifFilter = true} )
+                  }
+                  if (ifFilter){
+                    ""
+                  }else{
+                    if (score > 1E-8) {
+                      if (bucketsMap.contains(name)) {
+                        val (bucketsNum, buckets) = bucketsMap(name)
+                        val scoreNew = 1.0 / bucketsNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                        name + ":" + scoreNew.toString
+                      } else {
+                        name + ":" + score.toString
+                      }
+                    } else {
+                      ""
+                    }
+                  }
+              }.filter(_.nonEmpty)
+              result.add(label + "\t" + featuresBucket.mkString("\t"))
+          }
+          result.iterator
+      })
+
+      // 4 Save data to HDFS
+      val hdfsPath = savePath + "/" + date
+      if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+        println("Deleting path and starting write: " + hdfsPath)
+        MyHdfsUtils.delete_hdfs_path(hdfsPath)
+        data.repartition(repartition).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+      } else {
+        println("Invalid path, cannot write: " + hdfsPath)
+      }
+    }
+  }
+}
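
Not part of the commit: a sketch of how makedata_recsys_43_bucketData_20240709 maps a raw feature value to a normalized bucket score. ExtractorUtils.findInsertPosition is replaced here by an assumed count-based insertion index, so treat the helper's exact semantics as an assumption.

// Sketch only: per-feature bucketization as applied in the mapPartitions block above.
object BucketizeSketch {
  // Stand-in for ExtractorUtils.findInsertPosition: assumed to return the number of split
  // points <= score (a binary-search insertion index); the real helper may differ slightly.
  def findInsertPosition(buckets: Array[Double], score: Double): Int =
    buckets.count(_ <= score)

  def bucketize(name: String, score: Double,
                bucketsMap: Map[String, (Double, Array[Double])]): String = {
    if (score <= 1e-8) "" // near-zero features are dropped upstream
    else bucketsMap.get(name) match {
      case Some((bucketNum, buckets)) =>
        val scoreNew = 1.0 / bucketNum * (findInsertPosition(buckets, score).toDouble + 1.0)
        name + ":" + scoreNew // roughly normalized into (0, 1]
      case None => name + ":" + score // no learned boundaries: keep the raw value
    }
  }

  def main(args: Array[String]): Unit = {
    // hypothetical boundaries for one feature: 4 buckets with split points 0.01, 0.05, 0.2
    val bucketsMap = Map("b123_1h_STR" -> (4.0, Array(0.01, 0.05, 0.2)))
    println(bucketize("b123_1h_STR", 0.07, bucketsMap)) // prints b123_1h_STR:0.75
  }
}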

+ 7 - 9
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-广告

@@ -1,18 +1,14 @@
 
-
-
 nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
 --class com.aliyun.odps.spark.examples.makedata_ad.makedata_ad_31_originData_20240620 \
 --master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 tablePart:64 repartition:16 \
-beginStr:2024062008 endStr:2024062123 \
+beginStr:2024070108 endStr:2024070323 \
 savePath:/dw/recommend/model/31_ad_sample_data_v3/ \
 table:alg_recsys_ad_sample_all filterHours:00,01,02,03,04,05,06,07 \
 idDefaultValue:0.01 \
-> p31_2024062008.log 2>&1 &
-
-
+> p31_2024070108.log 2>&1 &
 
 
 nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
@@ -32,12 +28,14 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 readPath:/dw/recommend/model/31_ad_sample_data_v3/ \
 savePath:/dw/recommend/model/33_ad_train_data_v3/ \
-beginStr:20240620 endStr:20240621 repartition:100 \
+beginStr:20240703 endStr:20240703 repartition:100 \
 filterNames:adid_,targeting_conversion_ \
-> p33_data.log 2>&1 &
+> p33_20240703_.log 2>&1 &
 
+filterNames:adid_,targeting_conversion_ \
 filterNames:cid_,adid_,adverid_,targeting_conversion_ \
-savePath:/dw/recommend/model/33_ad_train_data_nosparse/ \
+filterNames:"XXXXXX,adid_,targeting_conversion_,b2_3h_click,b2_3h_conver*log(view),b2_3h_conver*ctcvr,b2_6h_click,b2_6h_conver*log(view),b2_6h_conver*ctcvr,b2_12h_click,b2_12h_conver*log(view),b2_12h_conver*ctcvr,b2_1d_click,b2_1d_conver*log(view),b2_1d_conver*ctcvr,b2_3d_click,b2_3d_conver*log(view),b2_3d_conver*ctcvr,b2_7d_click,b2_7d_conver*log(view),b2_7d_conver*ctcvr,b3_3h_click,b3_3h_conver*log(view),b3_3h_conver*ctcvr,b3_6h_click,b3_6h_conver*log(view),b3_6h_conver*ctcvr,b3_12h_click,b3_12h_conver*log(view),b3_12h_conver*ctcvr,b3_1d_click,b3_1d_conver*log(view),b3_1d_conver*ctcvr,b3_3d_click,b3_3d_conver*log(view),b3_3d_conver*ctcvr,b3_7d_click,b3_7d_conver*log(view),b3_7d_conver*ctcvr,b4_3h_click,b4_3h_conver*log(view),b4_3h_conver*ctcvr,b4_6h_click,b4_6h_conver*log(view),b4_6h_conver*ctcvr,b4_12h_click,b4_12h_conver*log(view),b4_12h_conver*ctcvr,b4_1d_click,b4_1d_conver*log(view),b4_1d_conver*ctcvr,b4_3d_click,b4_3d_conver*log(view),b4_3d_conver*ctcvr,b4_7d_click,b4_7d_conver*log(view),b4_7d_conver*ctcvr,b5_3h_click,b5_3h_conver*log(view),b5_3h_conver*ctcvr,b5_6h_click,b5_6h_conver*log(view),b5_6h_conver*ctcvr,b5_12h_click,b5_12h_conver*log(view),b5_12h_conver*ctcvr,b5_1d_click,b5_1d_conver*log(view),b5_1d_conver*ctcvr,b5_3d_click,b5_3d_conver*log(view),b5_3d_conver*ctcvr,b5_7d_click,b5_7d_conver*log(view),b5_7d_conver*ctcvr,b8_3h_click,b8_3h_conver*log(view),b8_3h_conver*ctcvr,b8_6h_click,b8_6h_conver*log(view),b8_6h_conver*ctcvr,b8_12h_click,b8_12h_conver*log(view),b8_12h_conver*ctcvr,b8_1d_click,b8_1d_conver*log(view),b8_1d_conver*ctcvr,b8_3d_click,b8_3d_conver*log(view),b8_3d_conver*ctcvr,b8_7d_click,b8_7d_conver*log(view),b8_7d_conver*ctcvr,b6_7d_click,b6_7d_conver*log(view),b6_7d_conver*ctcvr,b6_14d_click,b6_14d_conver*log(view),b6_14d_conver*ctcvr,b7_7d_click,b7_7d_conver*log(view),b7_7d_conver*ctcvr,b7_14d_click,b7_14d_conver*log(view),b7_14d_conver*ctcvr,XXXXXX" \
+
 
 
 /dw/recommend/model/31_ad_sample_data/

+ 49 - 3
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本-推荐

@@ -64,8 +64,14 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 savePath:/dw/recommend/model/04_str_data/ beginStr:20240311 endStr:20240312 featureVersion:v4 ifRepart:100 \
 > p7.log 2>&1 &
-
----
+---------------------------------------------------------------------------------------------
+---------------------------------------------------------------------------------------------
+------------------------------ the commands below are the ------------------------------------
+------------------------------ authoritative / current ones ----------------------------------
+---------------------------------------------------------------------------------------------
+---------------------------------------------------------------------------------------------
+---------------------------------------------------------------------------------------------
+---------------------------------------------------------------------------------------------
 
 nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
 --class com.aliyun.odps.spark.examples.makedata.makedata_13_originData_20240529 \
@@ -160,4 +166,44 @@ beginStr:2024062600 endStr:2024062623 \
 readDate:20240626 \
 > p17_20240626.log 2>&1 &
 
-/dw/recommend/model/17_for_check/
+/dw/recommend/model/17_for_check/
+
+
+------------------------------------------------------------------------------------------------------------------------
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_41_originData_20240709 \
+--master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+tablePart:64 repartition:32 \
+beginStr:2024070508 endStr:2024070508 \
+savePath:/dw/recommend/model/41_recsys_sample_data/ \
+table:alg_recsys_sample_all \
+> p41_2024070508.log 2>&1 &
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_42_bucket_20240709 \
+--master yarn --driver-memory 16G --executor-memory 1G --executor-cores 1 --num-executors 16 \
+--conf spark.driver.maxResultSize=16G \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:/dw/recommend/model/41_recsys_sample_data_v1/20240705* \
+savePath:/dw/recommend/model/42_recsys_bucket/ \
+fileName:20240705_314_200 \
+bucketNum:200 sampleRate:1.0 \
+> p42.log 2>&1 &
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata_recsys.makedata_recsys_43_bucketData_20240709 \
+--master yarn --driver-memory 2G --executor-memory 4G --executor-cores 1 --num-executors 16 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+readPath:/dw/recommend/model/41_recsys_sample_data/ \
+savePath:/dw/recommend/model/43_recsys_train_data/ \
+beginStr:20240705 endStr:20240705 repartition:100 \
+filterNames:XXXXXXXXX \
+fileName:20240609_bucket_314.txt \
+whatLabel:is_return whatApps:0,4,21,3,6,17,23 \
+> p43_20240705.log 2>&1 &
+
+------------- 20240709_recsys_bucket_314.txt ------------ 20240609_bucket_274.txt -------------
+------------- filterNames:b123_1h_ROS,b123_2h_ROS,b123_3h_ROS,b123_4h_ROS,b123_12h_ROS,b123_1d_ROS,b123_3d_ROS,b123_7d_ROS,b167_1h_ROS,b167_2h_ROS,b167_3h_ROS,b167_4h_ROS,b167_12h_ROS,b167_1d_ROS,b167_3d_ROS,b167_7d_ROS,b8910_1h_ROS,b8910_2h_ROS,b8910_3h_ROS,b8910_4h_ROS,b8910_12h_ROS,b8910_1d_ROS,b8910_3d_ROS,b8910_7d_ROS,b111213_1h_ROS,b111213_2h_ROS,b111213_3h_ROS,b111213_4h_ROS,b111213_12h_ROS,b111213_1d_ROS,b111213_3d_ROS,b111213_7d_ROS,b171819_1h_ROS,b171819_2h_ROS,b171819_3h_ROS,b171819_4h_ROS,b171819_12h_ROS,b171819_1d_ROS,b171819_3d_ROS,b171819_7d_ROS \
+------------- filterNames:XXXXXXXXX \

+ 2 - 2
zhangbo/01_train.sh

@@ -16,8 +16,8 @@ $HADOOP fs -text ${train_path}/${day}/* | /root/sunmingze/alphaFM/bin/fm_train -
 # nohup sh 01_train.sh 20240606 /dw/recommend/model/16_train_data/ model_aka4 1,1,4 >p1_model_aka4.log 2>&1 &
 
 
-# nohup sh 01_train.sh 20240623 /dw/recommend/model/33_ad_train_data_nosparse/ model_test 1,1,0 >p1_model_bkb0.log 2>&1 &
-# nohup sh 01_train.sh 20240620 /dw/recommend/model/33_ad_train_data/ model_bkb8_2 1,1,8 >p1_model_bkb8_2.log 2>&1 &
+# nohup sh 01_train.sh 20240623 /dw/recommend/model/33_ad_train_data_v3/ model_bkb0_v3_23 1,1,0 >p1_model_bkb0_v3_23.log 2>&1 &
+# nohup sh 01_train.sh 20240623 /dw/recommend/model/33_ad_train_data_v3/ model_bkb8_v3_23 1,1,8 >p1_model_bkb8_v3_23.log 2>&1 &
 # nohup sh 01_train.sh 20240620 /dw/recommend/model/33_ad_train_data/ model_bkb4 1,1,4 >p1_model_bkb4.log 2>&1 &
 # nohup sh 01_train.sh 20240620 /dw/recommend/model/33_ad_train_data/ model_bkb12 1,1,12 >p1_model_bkb12.log 2>&1 &
 # nohup sh 01_train.sh 20240620 /dw/recommend/model/33_ad_train_data/ model_bkb16 1,1,16 >p1_model_bkb16.log 2>&1 &

+ 1 - 1
zhangbo/02_train_go.sh

@@ -27,4 +27,4 @@ done
 
 # nohup sh 02_train_go.sh 20240623 20240624 model_bkb8 /dw/recommend/model/33_ad_train_data/ 1,1,8 >p2_model_bkb8.log 2>&1 &
 
-# nohup sh 02_train_go.sh 20240621 20240623 model_bkb0_3 /dw/recommend/model/33_ad_train_data_nosparse/ 1,1,0 >p2_model_bkb0.log 2>&1 &
+# nohup sh 02_train_go.sh 20240703 20240704 model_bkb8_v3 /dw/recommend/model/33_ad_train_data_v3/ 1,1,8 >p2_model_bkb8.log 2>&1 &

+ 1 - 1
zhangbo/03_predict.sh

@@ -17,7 +17,7 @@ cat predict/${output_file}_$day.txt | /root/sunmingze/AUC/AUC
 # nohup sh 03_predict.sh 20240613 /dw/recommend/model/16_train_data/ model_aka8_20240612.txt model_aka8_20240612 8 >p3_model_aka8_12.log 2>&1 &
 
 
-# nohup sh 03_predict.sh 20240624 /dw/recommend/model/33_ad_train_data_nosparse/ model_test_20240623.txt model_test_20240623 0 >p3_model_aka8_on.log 2>&1 &
+# nohup sh 03_predict.sh 20240703 /dw/recommend/model/33_ad_train_data_v3/ model_bkb0_v3_20240702.txt model_bkb0_v3_20240702 0 >p3_model_bkb0_v3.log 2>&1 &
 
 
 

+ 17 - 0
zhangbo/04_upload.sh

@@ -31,3 +31,20 @@ dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_aka8_202406
 
 cat /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_bkb0_20240622.txt | awk -F " " '{print $1"\t"$2}' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_bkb0_20240622_change.txt
 dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_bkb0_20240622_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/model_bkb0.txt
+
+cat /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_bkb8_v3_20240702.txt |
+awk -F " " '{
+    if (NR == 1) {
+        print $1"\t"$2
+    } else {
+        split($0, fields, " ");
+        OFS="\t";
+        line=""
+        for (i = 1; i <= 10 && i <= length(fields); i++) {
+            line = (line ? line "\t" : "") fields[i];
+        }
+        print line
+    }
+}' > /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_bkb8_v3_20240702_change.txt
+
+dfs -put /root/zhangbo/recommend-emr-dataprocess/zhangbo/model/model_bkb8_v3_20240702_change.txt oss://art-recommend.oss-cn-hangzhou.aliyuncs.com/zhangbo/model_bkb8_v3.txt
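
For clarity (not part of the commit): the awk block above keeps the first two fields of the header line and truncates every other line to its first 10 space-separated fields, tab-joined. A hedged Scala equivalent of that transform, assuming the model file is space-separated:

// Sketch only: mirrors the awk conversion in 04_upload.sh.
object ModelLineConvertSketch {
  def convertModelLine(line: String, isHeader: Boolean): String = {
    val fields = line.split(" ")
    if (isHeader) fields.take(2).mkString("\t") // first line: keep the first two fields
    else fields.take(10).mkString("\t")         // other lines: keep at most the first 10 fields
  }

  def main(args: Array[String]): Unit = {
    // hypothetical weight line with 12 columns
    val line = "feat_x " + (1 to 11).map(i => s"0.$i").mkString(" ")
    println(convertModelLine(line, isHeader = false)) // 10 tab-separated columns
  }
}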

Some files were not shown because too many files changed in this diff