|
@@ -18,7 +18,9 @@ object makedata_profile_gender_sample_20251114 {
|
|
|
val suffixSet = param.getOrElse("suffix", "y,8,4,0,e,a,c,k,o,w,g,s,u,q,i,m").split(",").toSet
|
|
val suffixSet = param.getOrElse("suffix", "y,8,4,0,e,a,c,k,o,w,g,s,u,q,i,m").split(",").toSet
|
|
|
val whatLabel = param.getOrElse("whatLabel", "gender")
|
|
val whatLabel = param.getOrElse("whatLabel", "gender")
|
|
|
val classSet = param.getOrElse("class", "1,2").split(",").toSet
|
|
val classSet = param.getOrElse("class", "1,2").split(",").toSet
|
|
|
|
|
+ val notUseBucket = param.getOrElse("notUseBucket", "1").toInt
|
|
|
val featureFile = param.getOrElse("featureFile", "20241209_recsys_nor_name.txt")
|
|
val featureFile = param.getOrElse("featureFile", "20241209_recsys_nor_name.txt")
|
|
|
|
|
+ val featureBucketFile = param.getOrElse("featureBucket", "20241209_recsys_nor_bucket.txt")
|
|
|
val minCnt = param.getOrElse("minCnt", "10").toDouble
|
|
val minCnt = param.getOrElse("minCnt", "10").toDouble
|
|
|
val repartition = param.getOrElse("repartition", "100").toInt
|
|
val repartition = param.getOrElse("repartition", "100").toInt
|
|
|
val savePath = param.getOrElse("savePath", "/dw/recommend/model/user_profile/gender/sample/")
|
|
val savePath = param.getOrElse("savePath", "/dw/recommend/model/user_profile/gender/sample/")
|
|
@@ -33,7 +35,7 @@ object makedata_profile_gender_sample_20251114 {
|
|
|
// 3. 处理数据
|
|
// 3. 处理数据
|
|
|
val loader = getClass.getClassLoader
|
|
val loader = getClass.getClassLoader
|
|
|
val featureSet = loadFeatureNames(featureFile)
|
|
val featureSet = loadFeatureNames(featureFile)
|
|
|
- val featureBucketMap = DataUtils.loadUseFeatureBuckets(loader, 1, "")
|
|
|
|
|
|
|
+ val featureBucketMap = DataUtils.loadUseFeatureBuckets(loader, notUseBucket, featureBucketFile)
|
|
|
val bucketsMap_br = sc.broadcast(featureBucketMap)
|
|
val bucketsMap_br = sc.broadcast(featureBucketMap)
|
|
|
for (suffix <- suffixSet) {
|
|
for (suffix <- suffixSet) {
|
|
|
val partition = "%s_%s".format(year, suffix)
|
|
val partition = "%s_%s".format(year, suffix)
|