|
@@ -21,11 +21,11 @@ object makedata_recsys_61_nor_sample_20241209 {
|
|
|
val beginStr = param.getOrElse("beginStr", "20241210")
|
|
|
val endStr = param.getOrElse("endStr", "20241210")
|
|
|
val repartition = param.getOrElse("repartition", "100").toInt
|
|
|
- val filterNames = param.getOrElse("filterNames", "XXXXXXXXXX").split(",").filter(_.nonEmpty).toSet
|
|
|
val whatLabel = param.getOrElse("whatLabel", "total_return_uv_new")
|
|
|
val whatApps = param.getOrElse("whatApps", "0,4,5,21,3,6").split(",").toSet
|
|
|
val fuSampleRate = param.getOrElse("fuSampleRate", "-1.0").toDouble
|
|
|
- val fileName = param.getOrElse("fileName", "20241209_recsys_nor_bucket.txt")
|
|
|
+ val featureNameFile = param.getOrElse("featureName", "20241209_recsys_nor_name.txt")
|
|
|
+ val featureBucketFile = param.getOrElse("featureBucket", "20241209_recsys_nor_bucket.txt")
|
|
|
|
|
|
val spark = SparkSession
|
|
|
.builder()
|
|
@@ -34,26 +34,9 @@ object makedata_recsys_61_nor_sample_20241209 {
|
|
|
val sc = spark.sparkContext
|
|
|
|
|
|
val loader = getClass.getClassLoader
|
|
|
-
|
|
|
- val resourceUrlBucket = loader.getResource(fileName)
|
|
|
- val buckets =
|
|
|
- if (resourceUrlBucket != null) {
|
|
|
- val buckets = Source.fromURL(resourceUrlBucket).getLines().mkString("\n")
|
|
|
- Source.fromURL(resourceUrlBucket).close()
|
|
|
- buckets
|
|
|
- } else {
|
|
|
- ""
|
|
|
- }
|
|
|
- println(buckets)
|
|
|
- val bucketsMap = buckets.split("\n")
|
|
|
- .map(r => r.replace(" ", "").replaceAll("\n", ""))
|
|
|
- .filter(r => r.nonEmpty)
|
|
|
- .map(r => {
|
|
|
- val rList = r.split("\t")
|
|
|
- (rList(0), (rList(1).toDouble, rList(2).split(",").map(_.toDouble)))
|
|
|
- }).toMap
|
|
|
- val bucketsMap_br = sc.broadcast(bucketsMap)
|
|
|
-
|
|
|
+ val featureNameSet = loadUseFeatureNames(loader, featureNameFile)
|
|
|
+ val featureBucketMap = loadUseFeatureBuckets(loader, featureBucketFile)
|
|
|
+ val bucketsMap_br = sc.broadcast(featureBucketMap)
|
|
|
|
|
|
val dateRange = MyDateUtils.getDateRange(beginStr, endStr)
|
|
|
for (date <- dateRange) {
|
|
@@ -92,13 +75,7 @@ object makedata_recsys_61_nor_sample_20241209 {
|
|
|
case (label, features) =>
|
|
|
val featuresBucket = features.map {
|
|
|
case (name, score) =>
|
|
|
- var ifFilter = false
|
|
|
- if (filterNames.nonEmpty) {
|
|
|
- filterNames.foreach(r => if (!ifFilter && name.contains(r)) {
|
|
|
- ifFilter = true
|
|
|
- })
|
|
|
- }
|
|
|
- if (ifFilter) {
|
|
|
+ if (!featureNameSet.contains(name)) {
|
|
|
""
|
|
|
} else {
|
|
|
if (score > 1E-8) {
|
|
@@ -130,4 +107,38 @@ object makedata_recsys_61_nor_sample_20241209 {
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
/**
 * Reads a classpath resource into a single string.
 *
 * The resource is read line by line and the lines are re-joined with "\n".
 *
 * @param loader   class loader used to look the resource up
 * @param nameFile resource name passed to `ClassLoader.getResource`
 * @return the resource content, or an empty string when the resource cannot be found
 */
private def loadFileData(loader: ClassLoader, nameFile: String): String = {
  // getResource returns null for a missing resource; Option(...) turns that into None.
  Option(loader.getResource(nameFile)) match {
    case Some(resourceUrl) =>
      // Keep a handle on the one Source we actually read from, so that the same
      // stream is closed afterwards (even if reading fails part-way through).
      val source = Source.fromURL(resourceUrl)
      try {
        source.getLines().mkString("\n")
      } finally {
        source.close()
      }
    case None =>
      ""
  }
}
|
|
|
+
|
|
|
/**
 * Loads the set of feature names listed in a classpath resource, one name per line.
 *
 * The raw file content is printed to stdout. Every space character is removed
 * from each line and lines that end up empty are dropped.
 *
 * @param loader   class loader used to look the resource up
 * @param nameFile resource name of the feature-name list
 * @return the distinct non-empty names; empty when `loadFileData` returns an empty string
 */
private def loadUseFeatureNames(loader: ClassLoader, nameFile: String): Set[String] = {
  val content = loadFileData(loader, nameFile)
  println(content)
  val cleanedNames = for {
    rawLine <- content.split("\n")
    name = rawLine.replace(" ", "").replaceAll("\n", "")
    if name.nonEmpty
  } yield name
  cleanedNames.toSet
}
|
|
|
+
|
|
|
/**
 * Loads per-feature bucket definitions from a classpath resource.
 *
 * The raw file content is printed to stdout. Each non-empty line (after all
 * space characters are removed) is split on tabs into at least three fields:
 * field 0 is the map key, field 1 is parsed as a Double, and field 2 is a
 * comma-separated list parsed as an Array[Double].
 * NOTE(review): field 2 presumably holds bucket boundaries and field 1 a
 * per-feature scalar; the meaning is not visible here, confirm against the consumer.
 *
 * A line with fewer than three tab-separated fields, or with a non-numeric
 * value, fails with the underlying ArrayIndexOutOfBoundsException or
 * NumberFormatException. If a key appears more than once the last line wins.
 *
 * @param loader   class loader used to look the resource up
 * @param nameFile resource name of the bucket file
 * @return map from the first field to (parsed second field, parsed third field)
 */
private def loadUseFeatureBuckets(loader: ClassLoader, nameFile: String): Map[String, (Double, Array[Double])] = {
  val content = loadFileData(loader, nameFile)
  println(content)
  val entries = for {
    rawLine <- content.split("\n")
    line = rawLine.replace(" ", "").replaceAll("\n", "")
    if line.nonEmpty
  } yield {
    val fields = line.split("\t")
    // Fields are read in index order so a malformed line fails on the first bad field.
    val featureName = fields(0)
    val scalar = fields(1).toDouble
    val values = fields(2).split(",").map(_.toDouble)
    featureName -> (scalar, values)
  }
  entries.toMap
}
|
|
|
}
|