Ver código fonte

特征验证

丁云鹏 1 ano atrás
pai
commit
390ff54904

+ 2 - 2
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_13_originData_20240529.scala

@@ -183,7 +183,6 @@ object makedata_13_originData_20240529 {
             featureMap.put("d1_rovn", if (d1.containsKey("rovn")) d1.getString("rovn").toDouble else 0D)
           }
 
-
           /*
 
 
@@ -235,8 +234,9 @@ object makedata_13_originData_20240529 {
           val logKey = (apptype, pagesource, mid, vid, ts, abcode, level).productIterator.mkString(",")
           val labelKey = labels.toString()
           val featureKey = featureMap.toString()
+          val allFeatureKey = record.getString("allfeaturemap")
           //6 拼接数据,保存。
-          logKey + "\t" + labelKey + "\t" + featureKey
+          logKey + "\t" + labelKey + "\t" + featureKey + "\t" + allFeatureKey
 
         })
 

+ 14 - 7
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_16_bucketData_20240609.scala

@@ -74,7 +74,9 @@ object makedata_16_bucketData_20240609 {
         val logKey = rList(0)
         val labelKey = rList(1)
         val features = rList(2).split(",").map(_.toDouble)
-        (logKey, labelKey, features)
+        val allFeature: JSONObject = if (rList(3).equals("\\\\N")) new JSONObject() else
+                                     JSON.parseObject(rList(3))
+        (logKey, labelKey, features, allFeatureKey)
       })
         .filter{
           case (logKey, labelKey, features) =>
@@ -97,12 +99,17 @@ object makedata_16_bucketData_20240609 {
             val featuresBucket = contentList.indices.map(i =>{
               val featureName = contentList(i)
               val score = features(i)
-              if (score > 1E-8){
-                val (bucketNum, buckets) = bucketsMap(featureName)
-                val scoreNew = 1.0 / bucketNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
-                featureName + ":" + scoreNew.toString
-              }else{
-                ""
+              // 用户
+              if (featureName.startsWith("c")) {
+                allFeature.getOrDefault(featureName, "0").toString
+              } else {
+                if (score > 1E-8) {
+                  val (bucketNum, buckets) = bucketsMap(featureName)
+                  val scoreNew = 1.0 / bucketNum * (ExtractorUtils.findInsertPosition(buckets, score).toDouble + 1.0)
+                  featureName + ":" + scoreNew.toString
+                } else {
+                  ""
+                }
               }
             }).filter(_.nonEmpty)
             result.add(label + "\t" + featuresBucket.mkString("\t"))