|
@@ -23,6 +23,7 @@ object train_profile_gender_xgb_20251114 {
|
|
|
val trainPath = param.getOrElse("trainPath", "/dw/recommend/model/user_profile/gender/sample/train/2025_y")
|
|
val trainPath = param.getOrElse("trainPath", "/dw/recommend/model/user_profile/gender/sample/train/2025_y")
|
|
|
val featureFile = param.getOrElse("featureFile", "20241209_recsys_nor_name.txt")
|
|
val featureFile = param.getOrElse("featureFile", "20241209_recsys_nor_name.txt")
|
|
|
val minCnt = param.getOrElse("minCnt", "10").toDouble
|
|
val minCnt = param.getOrElse("minCnt", "10").toDouble
|
|
|
|
|
+ val minFeatCnt = param.getOrElse("minFeatCnt", "1").toDouble
|
|
|
val eta = param.getOrElse("eta", "0.01").toDouble
|
|
val eta = param.getOrElse("eta", "0.01").toDouble
|
|
|
val gamma = param.getOrElse("gamma", "0.0").toDouble
|
|
val gamma = param.getOrElse("gamma", "0.0").toDouble
|
|
|
val max_depth = param.getOrElse("max_depth", "5").toInt
|
|
val max_depth = param.getOrElse("max_depth", "5").toInt
|
|
@@ -35,6 +36,7 @@ object train_profile_gender_xgb_20251114 {
|
|
|
val features = loadFeatureNames(featureFile)
|
|
val features = loadFeatureNames(featureFile)
|
|
|
val trainData = createData(
|
|
val trainData = createData(
|
|
|
minCnt,
|
|
minCnt,
|
|
|
|
|
+ minFeatCnt,
|
|
|
sc.textFile(trainPath),
|
|
sc.textFile(trainPath),
|
|
|
features
|
|
features
|
|
|
)
|
|
)
|
|
@@ -71,26 +73,31 @@ object train_profile_gender_xgb_20251114 {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- def createData(minCnt: Double, data: RDD[String], features: Array[String]): RDD[Row] = {
|
|
|
|
|
|
|
+ def createData(minCnt: Double, minFeatCnt: Double, data: RDD[String], features: Array[String]): RDD[Row] = {
|
|
|
|
|
+ val featureSet = features.toSet
|
|
|
data
|
|
data
|
|
|
.map(row => {
|
|
.map(row => {
|
|
|
val cells: Array[String] = StringUtils.split(row, '\t')
|
|
val cells: Array[String] = StringUtils.split(row, '\t')
|
|
|
val mid = cells(0)
|
|
val mid = cells(0)
|
|
|
val label = NumberUtils.toInt(cells(1))
|
|
val label = NumberUtils.toInt(cells(1))
|
|
|
val featureMap: util.Map[String, Double] = new util.HashMap[String, Double]
|
|
val featureMap: util.Map[String, Double] = new util.HashMap[String, Double]
|
|
|
|
|
+ var featCnt = 0
|
|
|
for (i <- 2 until cells.length) {
|
|
for (i <- 2 until cells.length) {
|
|
|
val fv: Array[String] = StringUtils.split(cells(i), ':')
|
|
val fv: Array[String] = StringUtils.split(cells(i), ':')
|
|
|
featureMap.put(fv(0), NumberUtils.toDouble(fv(1), 0.0))
|
|
featureMap.put(fv(0), NumberUtils.toDouble(fv(1), 0.0))
|
|
|
|
|
+ if (featureSet.contains(fv(0))) {
|
|
|
|
|
+ featCnt += 1
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
- (mid, label, featureMap)
|
|
|
|
|
|
|
+ (mid, label, featureMap, featCnt)
|
|
|
})
|
|
})
|
|
|
.filter {
|
|
.filter {
|
|
|
- case (mid, label, featureMap) =>
|
|
|
|
|
|
|
+ case (mid, label, featureMap, featCnt) =>
|
|
|
val cnt = featureMap.getOrDefault("cnt", 0.0d)
|
|
val cnt = featureMap.getOrDefault("cnt", 0.0d)
|
|
|
- cnt >= minCnt
|
|
|
|
|
|
|
+ cnt >= minCnt && featCnt >= minFeatCnt
|
|
|
}
|
|
}
|
|
|
.map {
|
|
.map {
|
|
|
- case (mid, label, featureMap) =>
|
|
|
|
|
|
|
+ case (mid, label, featureMap, featCnt) =>
|
|
|
val v: Array[Any] = new Array[Any](features.length + 1)
|
|
val v: Array[Any] = new Array[Any](features.length + 1)
|
|
|
v(0) = label
|
|
v(0) = label
|
|
|
for (i <- features.indices) {
|
|
for (i <- features.indices) {
|