zhangbo 10 months ago
parent
commit
6e1bf37a60

+ 2 - 2
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_14_valueData_20240608.scala

@@ -62,7 +62,7 @@ object makedata_14_valueData_20240608 {
         val contentList = contentList_bc.value
         row.foreach {
           case (logKey, labelKey, featureKey) =>
-            val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
+//            val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
             val featureJson = JSON.parseObject(featureKey)
 
             val featureValues = contentList.map(key => {
@@ -72,7 +72,7 @@ object makedata_14_valueData_20240608 {
                 0.0
               }
             })
-            result.add(label + "\t" + featureValues.mkString(","))
+            result.add(logKey + "\t" + labelKey + "\t" + featureValues.mkString(","))
         }
         result.iterator
       })
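
With this change the rows written by makedata_14_valueData_20240608 carry the raw logKey and labelKey columns instead of a pre-extracted label, so label extraction is deferred to downstream jobs. A minimal standalone sketch of reading one such row (the object name and the sample row are invented for illustration; only the fastjson call mirrors the code above):

import com.alibaba.fastjson.JSON

object ValueRowSketch {
  def main(args: Array[String]): Unit = {
    // New three-column layout: logKey \t labelKey JSON \t comma-joined feature values.
    val row = "4,recommend,abc\t{\"is_return\":\"1\"}\t0.0,1.5,3.0"
    val Array(logKey, labelKey, featureStr) = row.split("\t")
    // The label is no longer materialized in the file; parse it on demand.
    val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
    val features = featureStr.split(",").map(_.toDouble)
    println(s"logKey=$logKey label=$label features=${features.mkString(",")}")
  }
}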

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_15_bucket_20240608.scala

@@ -46,7 +46,7 @@ object makedata_15_bucket_20240608 {
     val data = sc.textFile(readPath)
     val data1 = data.map(r => {
       val rList = r.split("\t")
-      val doubles = rList(1).split(",").map(_.toDouble)
+      val doubles = rList(2).split(",").map(_.toDouble)
       doubles
     })
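
Because logKey and labelKey are now prepended to each row, the feature vector sits at column index 2 rather than 1, which is all this one-line change accounts for. A minimal sketch with an invented sample row:

object BucketInputSketch {
  def main(args: Array[String]): Unit = {
    val r = "someLogKey\t{\"is_return\":\"0\"}\t0.1,0.2,0.3"
    val rList = r.split("\t")
    val doubles = rList(2).split(",").map(_.toDouble) // rList(1) now holds the label JSON
    println(doubles.mkString(","))
  }
}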
 

+ 19 - 4
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_16_bucketData_20240609.scala

@@ -5,6 +5,7 @@ import examples.extractor.ExtractorUtils
 import org.apache.hadoop.io.compress.GzipCodec
 import org.apache.spark.sql.SparkSession
 
+import com.alibaba.fastjson.JSON
 import scala.collection.JavaConversions._
 import scala.collection.mutable.ArrayBuffer
 import scala.io.Source
@@ -70,10 +71,24 @@ object makedata_16_bucketData_20240609 {
       println("开始执行:" + date)
       val data = sc.textFile(readPath + date).map(r=>{
         val rList = r.split("\t")
-        val label = rList(0)
-        val features = rList(1).split(",").map(_.toDouble)
-        (label, features)
-      }).mapPartitions(row => {
+        val logKey = rList(0)
+        val labelKey = rList(1)
+        val features = rList(2).split(",").map(_.toDouble)
+        (logKey, labelKey, features)
+      })
+        .filter{
+          case (logKey, labelKey, features) =>
+            val logKeyList = logKey.split(",")
+            val apptype = logKeyList(0)
+            val pagesource = logKeyList(1)
+            Set("0", "4", "5", "21", "3", "6").contains(apptype) && pagesource.endsWith("recommend")
+        }
+        .map{
+          case (logKey, labelKey, features) =>
+            val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
+            (label, features)
+        }
+        .mapPartitions(row => {
         val result = new ArrayBuffer[String]()
         val contentList = contentList_br.value
         val bucketsMap = bucketsMap_br.value
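
The bucketing job now filters on fields taken from logKey (apptype and pagesource) before extracting the is_return label from the labelKey JSON. A standalone sketch of that filter-then-label step on plain Scala collections (the sample logKey/labelKey values are invented; the predicate and the fastjson call mirror the diff above):

import com.alibaba.fastjson.JSON

object BucketFilterSketch {
  def main(args: Array[String]): Unit = {
    val rows = Seq(
      ("4,homerecommend,xyz", "{\"is_return\":\"1\"}", Array(0.1, 0.2)),
      ("9,search,xyz",        "{\"is_return\":\"0\"}", Array(0.3, 0.4))
    )
    val kept = rows
      .filter { case (logKey, _, _) =>
        val logKeyList = logKey.split(",")
        val apptype = logKeyList(0)
        val pagesource = logKeyList(1)
        // Keep whitelisted app types on "recommend" pages, as in the new filter.
        Set("0", "4", "5", "21", "3", "6").contains(apptype) && pagesource.endsWith("recommend")
      }
      .map { case (_, labelKey, features) =>
        val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
        (label, features)
      }
    kept.foreach { case (label, features) => println(label + "\t" + features.mkString(",")) }
  }
}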