|
|
@@ -33,9 +33,8 @@ object makedata_profile_gender_sample_20251114 {
|
|
|
val sc = spark.sparkContext
|
|
|
|
|
|
// 3. 处理数据
|
|
|
- val loader = getClass.getClassLoader
|
|
|
val featureSet = loadFeatureNames(featureFile)
|
|
|
- val featureBucketMap = DataUtils.loadUseFeatureBuckets(loader, notUseBucket, featureBucketFile)
|
|
|
+ val featureBucketMap = loadUseFeatureBuckets(notUseBucket, featureBucketFile)
|
|
|
val bucketsMap_br = sc.broadcast(featureBucketMap)
|
|
|
for (suffix <- suffixSet) {
|
|
|
val partition = "%s_%s".format(year, suffix)
|
|
|
@@ -101,4 +100,24 @@ object makedata_profile_gender_sample_20251114 {
|
|
|
println(featSet)
|
|
|
featSet
|
|
|
}
|
|
|
+
|
|
|
/**
 * Loads a feature-bucket table from a local tab-separated text file.
 *
 * Every non-empty line must contain at least three tab-separated columns:
 * a key, a number, and a comma-separated list of numbers. All space
 * characters are stripped from a line before it is split, blank lines are
 * skipped, and columns after the third are ignored. When the same key occurs
 * more than once, the last occurrence wins.
 *
 * NOTE(review): the third column presumably holds bucket boundaries and the
 * second a per-feature scalar — confirm against the producer of the file.
 *
 * @param notUseBucket when greater than 0 the file is not read at all and an
 *                     empty map is returned
 * @param bucketFile   path of the file to read
 * @return map from the first column to (second column as Double,
 *         third column parsed into an Array[Double])
 * @throws IllegalArgumentException if a line has fewer than three columns, or
 *                                  (as NumberFormatException) if a numeric
 *                                  column cannot be parsed
 */
def loadUseFeatureBuckets(notUseBucket: Int, bucketFile: String): Map[String, (Double, Array[Double])] = {
  if (notUseBucket > 0) {
    Map.empty[String, (Double, Array[Double])]
  } else {
    val source = Source.fromFile(bucketFile)
    // Materialise the lines before closing; the finally block guarantees the
    // file handle is released even when reading fails part-way through.
    val rows = try source.getLines().toList finally source.close()

    val bucketMap = rows
      .map(_.replace(" ", ""))
      .filter(_.nonEmpty)
      .map { row =>
        row.split("\t") match {
          case Array(key, number, values, _*) =>
            (key, (number.toDouble, values.split(",").map(_.toDouble)))
          case _ =>
            // Fail with the offending file and line instead of a bare index error.
            throw new IllegalArgumentException(
              "Malformed bucket line in " + bucketFile +
                " (expected at least 3 tab-separated columns): " + row)
        }
      }
      .toMap

    println("bucketMap.size=" + bucketMap.size)
    println(bucketMap)
    bucketMap
  }
}
|
|
|
}
|