
Feature selection for rov and nor samples

jch, 4 months ago
parent
commit
3942f7ceb7

+ 40 - 29
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys_r_rate/makedata_recsys_61_nor_sample_20241209.scala

@@ -21,11 +21,11 @@ object makedata_recsys_61_nor_sample_20241209 {
     val beginStr = param.getOrElse("beginStr", "20241210")
     val endStr = param.getOrElse("endStr", "20241210")
     val repartition = param.getOrElse("repartition", "100").toInt
-    val filterNames = param.getOrElse("filterNames", "XXXXXXXXXX").split(",").filter(_.nonEmpty).toSet
     val whatLabel = param.getOrElse("whatLabel", "total_return_uv_new")
     val whatApps = param.getOrElse("whatApps", "0,4,5,21,3,6").split(",").toSet
     val fuSampleRate = param.getOrElse("fuSampleRate", "-1.0").toDouble
-    val fileName = param.getOrElse("fileName", "20241209_recsys_nor_bucket.txt")
+    val featureNameFile = param.getOrElse("featureName", "20241209_recsys_nor_name.txt")
+    val featureBucketFile = param.getOrElse("featureBucket", "20241209_recsys_nor_bucket.txt")
 
     val spark = SparkSession
       .builder()
@@ -34,26 +34,9 @@ object makedata_recsys_61_nor_sample_20241209 {
     val sc = spark.sparkContext
 
     val loader = getClass.getClassLoader
-
-    val resourceUrlBucket = loader.getResource(fileName)
-    val buckets =
-      if (resourceUrlBucket != null) {
-        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
-        Source.fromURL(resourceUrlBucket).close()
-        buckets
-      } else {
-        ""
-      }
-    println(buckets)
-    val bucketsMap = buckets.split("\n")
-      .map(r => r.replace(" ", "").replaceAll("\n", ""))
-      .filter(r => r.nonEmpty)
-      .map(r => {
-        val rList = r.split("\t")
-        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
-      }).toMap
-    val bucketsMap_br = sc.broadcast(bucketsMap)
-
+    val featureNameSet = loadUseFeatureNames(loader, featureNameFile)
+    val featureBucketMap = loadUseFeatureBuckets(loader, featureBucketFile)
+    val bucketsMap_br = sc.broadcast(featureBucketMap)
 
     val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
     for (date <- dateRange) {
@@ -92,13 +75,7 @@ object makedata_recsys_61_nor_sample_20241209 {
             case (label, features) =>
               val featuresBucket = features.map {
                 case (name, score) =>
-                  var ifFilter = false
-                  if (filterNames.nonEmpty) {
-                    filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
-                      ifFilter = true
-                    })
-                  }
-                  if (ifFilter) {
+                  if (!featureNameSet.contains(name)) {
                     ""
                   } else {
                     if (score > 1E-8) {
@@ -130,4 +107,38 @@ object makedata_recsys_61_nor_sample_20241209 {
       }
     }
   }
+
+  private def loadFileData(loader: ClassLoader, nameFile: String): String = {
+    val resourceUrlBucket = loader.getResource(nameFile)
+    val data =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    data
+  }
+
+  private def loadUseFeatureNames(loader: ClassLoader, nameFile: String): Set[String] = {
+    val names = loadFileData(loader, nameFile)
+    println(names)
+    names.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .toSet
+  }
+
+  private def loadUseFeatureBuckets(loader: ClassLoader, nameFile: String): Map[String, (Double, Array[Double])] = {
+    val buckets = loadFileData(loader, nameFile)
+    println(buckets)
+    buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+  }
 }
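
For context, the change above replaces the substring blacklist (filterNames, matched with name.contains) with an exact-match whitelist loaded from the featureName resource file. Below is a minimal sketch of the new selection behaviour; the feature names and scores are invented for illustration and are not part of this commit.

object FeatureWhitelistSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical whitelist, as loadUseFeatureNames would return it (names are made up).
    val featureNameSet: Set[String] = Set("b1_1h_ctr", "b2_3h_rov")

    // Hypothetical (name, score) pairs for one sample.
    val features: Map[String, Double] = Map(
      "b1_1h_ctr" -> 0.12,
      "b9_7d_str" -> 0.33, // not in the whitelist, dropped by the new exact-match check
      "b2_3h_rov" -> 0.0   // whitelisted, but below the 1E-8 threshold, so not emitted
    )

    // New behaviour: keep a feature only if its name is exactly in the whitelist;
    // the old code dropped a feature if its name merely contained a blacklist token.
    val kept = features.collect {
      case (name, score) if featureNameSet.contains(name) && score > 1E-8 =>
        s"$name:$score"
    }
    println(kept.mkString(",")) // b1_1h_ctr:0.12
  }
}

Exact matching means a whitelist entry selects precisely one feature, whereas the old contains-based filter could drop whole families of features that share a common substring.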

+ 40 - 29
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys_r_rate/makedata_recsys_61_rov_sample_20241209.scala

@@ -25,11 +25,11 @@ object makedata_recsys_61_rov_sample_20241209 {
     val beginStr = param.getOrElse("beginStr", "20241210")
     val endStr = param.getOrElse("endStr", "20241210")
     val repartition = param.getOrElse("repartition", "100").toInt
-    val filterNames = param.getOrElse("filterNames", "XXXXXXXXXX").split(",").filter(_.nonEmpty).toSet
     val whatLabel = param.getOrElse("whatLabel", "is_return")
     val whatApps = param.getOrElse("whatApps", "0,4,5,21,3,6").split(",").toSet
     val fuSampleRate = param.getOrElse("fuSampleRate", "1.0").toDouble
-    val fileName = param.getOrElse("fileName", "20241209_recsys_rov_bucket.txt")
+    val featureNameFile = param.getOrElse("featureName", "20241209_recsys_rov_name.txt")
+    val featureBucketFile = param.getOrElse("featureBucket", "20241209_recsys_rov_bucket.txt")
 
     val spark = SparkSession
       .builder()
@@ -38,26 +38,9 @@ object makedata_recsys_61_rov_sample_20241209 {
     val sc = spark.sparkContext
 
     val loader = getClass.getClassLoader
-
-    val resourceUrlBucket = loader.getResource(fileName)
-    val buckets =
-      if (resourceUrlBucket != null) {
-        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
-        Source.fromURL(resourceUrlBucket).close()
-        buckets
-      } else {
-        ""
-      }
-    println(buckets)
-    val bucketsMap = buckets.split("\n")
-      .map(r => r.replace(" ", "").replaceAll("\n", ""))
-      .filter(r => r.nonEmpty)
-      .map(r => {
-        val rList = r.split("\t")
-        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
-      }).toMap
-    val bucketsMap_br = sc.broadcast(bucketsMap)
-
+    val featureNameSet = loadUseFeatureNames(loader, featureNameFile)
+    val featureBucketMap = loadUseFeatureBuckets(loader, featureBucketFile)
+    val bucketsMap_br = sc.broadcast(featureBucketMap)
 
     val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
     for (date <- dateRange) {
@@ -96,13 +79,7 @@ object makedata_recsys_61_rov_sample_20241209 {
             case (label, features) =>
               val featuresBucket = features.map {
                 case (name, score) =>
-                  var ifFilter = false
-                  if (filterNames.nonEmpty) {
-                    filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
-                      ifFilter = true
-                    })
-                  }
-                  if (ifFilter) {
+                  if (!featureNameSet.contains(name)) {
                     ""
                   } else {
                     if (score > 1E-8) {
@@ -134,4 +111,38 @@ object makedata_recsys_61_rov_sample_20241209 {
       }
     }
   }
+
+  private def loadFileData(loader: ClassLoader, nameFile: String): String = {
+    val resourceUrlBucket = loader.getResource(nameFile)
+    val data =
+      if (resourceUrlBucket != null) {
+        val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
+        Source.fromURL(resourceUrlBucket).close()
+        buckets
+      } else {
+        ""
+      }
+    data
+  }
+
+  private def loadUseFeatureNames(loader: ClassLoader, nameFile: String): Set[String] = {
+    val names = loadFileData(loader, nameFile)
+    println(names)
+    names.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .toSet
+  }
+
+  private def loadUseFeatureBuckets(loader: ClassLoader, nameFile: String): Map[String, (Double, Array[Double])] = {
+    val buckets = loadFileData(loader, nameFile)
+    println(buckets)
+    buckets.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty)
+      .map(r => {
+        val rList = r.split("\t")
+        (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
+      }).toMap
+  }
 }
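
The helpers added to both jobs expect two classpath resources: the featureName file with one feature name per line, and the featureBucket file with tab-separated name, scale, and comma-separated bucket bounds. The sketch below is a self-contained illustration of that parsing; the file contents are invented, while the real resources are the 20241209_recsys_*_name.txt and 20241209_recsys_*_bucket.txt files referenced in the parameter defaults.

object FeatureResourceParseSketch {
  def main(args: Array[String]): Unit = {
    // Invented featureName content: one feature name per line.
    val namesRaw = "b1_1h_ctr\nb2_3h_rov\n"

    // Invented featureBucket content: name \t scale \t comma-separated bucket bounds.
    val bucketsRaw =
      "b1_1h_ctr\t10.0\t0.1,0.2,0.5\n" +
      "b2_3h_rov\t5.0\t0.05,0.1,0.3\n"

    // Same cleaning as loadUseFeatureNames: strip spaces, drop empty lines, build a Set.
    val featureNameSet: Set[String] = namesRaw.split("\n")
      .map(_.replace(" ", ""))
      .filter(_.nonEmpty)
      .toSet

    // Same shape as loadUseFeatureBuckets: name -> (scale, bucket bounds).
    val featureBucketMap: Map[String, (Double, Array[Double])] = bucketsRaw.split("\n")
      .map(_.replace(" ", ""))
      .filter(_.nonEmpty)
      .map { line =>
        val cols = line.split("\t")
        (cols(0), (cols(1).toDouble, cols(2).split(",").map(_.toDouble)))
      }.toMap

    println(featureNameSet) // Set(b1_1h_ctr, b2_3h_rov)
    featureBucketMap.foreach { case (name, (scale, bounds)) =>
      println(s"$name scale=$scale bounds=${bounds.mkString(",")}")
    }
  }
}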