Browse Source

feature check

丁云鹏 1 year ago
parent
commit
5ae9787062

+ 1 - 1
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_13_originData_20240529.scala

@@ -234,7 +234,7 @@ object makedata_13_originData_20240529 {
           val logKey = (apptype, pagesource, mid, vid, ts, abcode, level).productIterator.mkString(",")
           val labelKey = labels.toString()
           val featureKey = featureMap.toString()
-          val allFeatureKey = record.getString("allfeaturemap")
+          val allFeatureKey = if (record.isNull("allfeaturemap")) "{}" else record.getString("allfeaturemap")
           //6 拼接数据,保存。
           logKey + "\t" + labelKey + "\t" + featureKey + "\t" + allFeatureKey
 

+ 4 - 3
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_14_valueData_20240608.scala

@@ -54,14 +54,15 @@ object makedata_14_valueData_20240608 {
         val logKey = rList(0)
         val labelKey = rList(1)
         val featureKey = rList(2)
-        (logKey, labelKey, featureKey)
+        val allFeature = rList(3)
+        (logKey, labelKey, featureKey, allFeature)
       }).filter(r =>
         r._1.split(",")(6).equals("0")
       ).mapPartitions(row => {
         val result = new ArrayBuffer[String]()
         val contentList = contentList_bc.value
         row.foreach {
-          case (logKey, labelKey, featureKey) =>
+          case (logKey, labelKey, featureKey, allFeature) =>
             val featureJson = JSON.parseObject(featureKey)
 
             val featureValues = contentList.map(key => {
@@ -71,7 +72,7 @@ object makedata_14_valueData_20240608 {
                 0.0
               }
             })
-            result.add(logKey + "\t" + labelKey + "\t" + featureValues.mkString(","))
+            result.add(logKey + "\t" + labelKey + "\t" + featureValues.mkString(",") + "\t" + allFeature)
         }
         result.iterator
       })

+ 12 - 7
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_16_bucketData_20240609.scala

@@ -4,8 +4,8 @@ import com.aliyun.odps.spark.examples.myUtils.{MyDateUtils, MyHdfsUtils, ParamUt
 import examples.extractor.ExtractorUtils
 import org.apache.hadoop.io.compress.GzipCodec
 import org.apache.spark.sql.SparkSession
+import com.alibaba.fastjson.{JSON, JSONObject}
 
-import com.alibaba.fastjson.JSON
 import scala.collection.JavaConversions._
 import scala.collection.mutable.ArrayBuffer
 import scala.io.Source
@@ -76,32 +76,37 @@ object makedata_16_bucketData_20240609 {
         val features = rList(2).split(",").map(_.toDouble)
         val allFeature: JSONObject = if (rList(3).equals("\\\\N")) new JSONObject() else
                                      JSON.parseObject(rList(3))
-        (logKey, labelKey, features, allFeatureKey)
+        (logKey, labelKey, features, allFeature)
       })
         .filter{
-          case (logKey, labelKey, features) =>
+          case (logKey, labelKey, features, allFeature) =>
             val logKeyList = logKey.split(",")
             val apptype = logKeyList(0)
             val pagesource = logKeyList(1)
             Set("0", "4", "5", "21", "3", "6").contains(apptype) && pagesource.endsWith("recommend")
         }
         .map{
-          case (logKey, labelKey, features) =>
+          case (logKey, labelKey, features, allFeature) =>
             val label = JSON.parseObject(labelKey).getOrDefault("is_return", "0").toString
-            (label, features)
+            (label, features, allFeature)
         }
         .mapPartitions(row => {
         val result = new ArrayBuffer[String]()
         val contentList = contentList_br.value
         val bucketsMap = bucketsMap_br.value
         row.foreach{
-          case (label, features) =>
+          case (label, features, allFeature) =>
             val featuresBucket = contentList.indices.map(i =>{
               val featureName = contentList(i)
               val score = features(i)
               // 用户
               if (featureName.startsWith("c")) {
-                allFeature.getOrDefault(featureName, "0").toString
+                val scoreNew = allFeature.getOrDefault(featureName, "").toString
+                if (scoreNew.equals("")) {
+                  ""
+                } else {
+                  featureName + ":" + scoreNew.toString
+                }
               } else {
                 if (score > 1E-8) {
                   val (bucketNum, buckets) = bucketsMap(featureName)