jch 1 napja
szülő
commit
fc8096bc88

+ 12 - 5
recommend-model-produce/src/main/scala/com/tzld/piaoquan/recommend/model/pred_profile_gender_xgb_20251114.scala

@@ -27,6 +27,7 @@ object pred_profile_gender_xgb_20251114 {
     val testPath = param.getOrElse("testPath", "")
     val featureFile = param.getOrElse("featureFile", "20241209_recsys_rov_name.txt")
     val minCnt = param.getOrElse("minCnt", "10").toDouble
+    val minFeatCnt = param.getOrElse("minFeatCnt", "1").toDouble
     val savePath = param.getOrElse("savePath", "/dw/recommend/model/user_profile/gender/result")
     val repartition = param.getOrElse("repartition", "20").toInt
 
@@ -45,6 +46,7 @@ object pred_profile_gender_xgb_20251114 {
     model.setMissing(0.0f).setFeaturesCol("features")
     val testData = createData(
       minCnt,
+      minFeatCnt,
       sc.textFile(testPath),
       features
     )
@@ -76,26 +78,31 @@ object pred_profile_gender_xgb_20251114 {
     println("---------------------------------\n")
   }
 
-  def createData(minCnt: Double, data: RDD[String], features: Array[String]): RDD[Row] = {
+  def createData(minCnt: Double, minFeatCnt: Double, data: RDD[String], features: Array[String]): RDD[Row] = {
+    val featureSet = features.toSet
     data
       .map(row => {
         val cells: Array[String] = StringUtils.split(row, '\t')
         val mid = cells(0)
         val label = NumberUtils.toInt(cells(1))
         val featureMap: util.Map[String, Double] = new util.HashMap[String, Double]
+        var featCnt = 0
         for (i <- 2 until cells.length) {
           val fv: Array[String] = StringUtils.split(cells(i), ':')
           featureMap.put(fv(0), NumberUtils.toDouble(fv(1), 0.0))
+          if (featureSet.contains(fv(0))) {
+            featCnt += 1
+          }
         }
-        (mid, label, featureMap)
+        (mid, label, featureMap, featCnt)
       })
       .filter {
-        case (mid, label, featureMap) =>
+        case (mid, label, featureMap, featCnt) =>
           val cnt = featureMap.getOrDefault("cnt", 0.0d)
-          cnt >= minCnt
+          cnt >= minCnt && featCnt >= minFeatCnt
       }
       .map {
-        case (mid, label, featureMap) =>
+        case (mid, label, featureMap, featCnt) =>
           val v: Array[Any] = new Array[Any](features.length + 3)
           v(0) = label
           for (i <- features.indices) {

+ 12 - 5
recommend-model-produce/src/main/scala/com/tzld/piaoquan/recommend/model/train_profile_gender_xgb_20251114.scala

@@ -23,6 +23,7 @@ object train_profile_gender_xgb_20251114 {
     val trainPath = param.getOrElse("trainPath", "/dw/recommend/model/user_profile/gender/sample/train/2025_y")
     val featureFile = param.getOrElse("featureFile", "20241209_recsys_nor_name.txt")
     val minCnt = param.getOrElse("minCnt", "10").toDouble
+    val minFeatCnt = param.getOrElse("minFeatCnt", "1").toDouble
     val eta = param.getOrElse("eta", "0.01").toDouble
     val gamma = param.getOrElse("gamma", "0.0").toDouble
     val max_depth = param.getOrElse("max_depth", "5").toInt
@@ -35,6 +36,7 @@ object train_profile_gender_xgb_20251114 {
     val features = loadFeatureNames(featureFile)
     val trainData = createData(
       minCnt,
+      minFeatCnt,
       sc.textFile(trainPath),
       features
     )
@@ -71,26 +73,31 @@ object train_profile_gender_xgb_20251114 {
     }
   }
 
-  def createData(minCnt: Double, data: RDD[String], features: Array[String]): RDD[Row] = {
+  def createData(minCnt: Double, minFeatCnt: Double, data: RDD[String], features: Array[String]): RDD[Row] = {
+    val featureSet = features.toSet
     data
       .map(row => {
         val cells: Array[String] = StringUtils.split(row, '\t')
         val mid = cells(0)
         val label = NumberUtils.toInt(cells(1))
         val featureMap: util.Map[String, Double] = new util.HashMap[String, Double]
+        var featCnt = 0
         for (i <- 2 until cells.length) {
           val fv: Array[String] = StringUtils.split(cells(i), ':')
           featureMap.put(fv(0), NumberUtils.toDouble(fv(1), 0.0))
+          if (featureSet.contains(fv(0))) {
+            featCnt += 1
+          }
         }
-        (mid, label, featureMap)
+        (mid, label, featureMap, featCnt)
       })
       .filter {
-        case (mid, label, featureMap) =>
+        case (mid, label, featureMap, featCnt) =>
           val cnt = featureMap.getOrDefault("cnt", 0.0d)
-          cnt >= minCnt
+          cnt >= minCnt && featCnt >= minFeatCnt
       }
       .map {
-        case (mid, label, featureMap) =>
+        case (mid, label, featureMap, featCnt) =>
           val v: Array[Any] = new Array[Any](features.length + 1)
           v(0) = label
           for (i <- features.indices) {