|
@@ -33,7 +33,7 @@ object makedata_ad_32_bucket_20240718 {
|
|
|
|
|
|
val loader = getClass.getClassLoader
|
|
|
val resourceUrl = loader.getResource(featureNameFile)
|
|
|
- val content =
|
|
|
+ val featureNameContent =
|
|
|
if (resourceUrl != null) {
|
|
|
val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
|
|
|
Source.fromURL(resourceUrl).close()
|
|
@@ -41,15 +41,13 @@ object makedata_ad_32_bucket_20240718 {
|
|
|
} else {
|
|
|
""
|
|
|
}
|
|
|
- println(content)
|
|
|
- val contentList = content.split("\n")
|
|
|
+ println(featureNameContent)
|
|
|
+ val featureNames = featureNameContent.split("\n")
|
|
|
.map(r=> r.replace(" ", "").replaceAll("\n", ""))
|
|
|
.filter(r=> r.nonEmpty).toList
|
|
|
|
|
|
-
|
|
|
-
|
|
|
val data = sc.textFile(readPath)
|
|
|
- println("问题数据数量:" + data.filter(r=>r.split("\t").length != 3).count())
|
|
|
+
|
|
|
val data1 = data.map(r => {
|
|
|
val rList = r.split("\t")
|
|
|
val jsons = JSON.parseObject(rList(2))
|
|
@@ -58,16 +56,16 @@ object makedata_ad_32_bucket_20240718 {
|
|
|
doubles.put(r._1, jsons.getDoubleValue(r._1))
|
|
|
})
|
|
|
doubles
|
|
|
- }).sample(false, sampleRate ).repartition(20)
|
|
|
+ }).sample(false, sampleRate)
|
|
|
|
|
|
val result = new ArrayBuffer[String]()
|
|
|
|
|
|
- for (i <- contentList.indices){
|
|
|
- println("特征:" + contentList(i))
|
|
|
- val data2 = data1.map(r => r.getOrDefault(contentList(i), 0D)).filter(_ > 1E-8).collect().sorted
|
|
|
+ for (i <- featureNames.indices){
|
|
|
+ println("特征:" + featureNames(i))
|
|
|
+ val data2 = data1.map(r => r.getOrDefault(featureNames(i), 0D)).filter(_ > 1E-8).collect().sorted
|
|
|
val len = data2.length
|
|
|
if (len == 0){
|
|
|
- result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + "0")
|
|
|
+ result.add(featureNames(i) + "\t" + bucketNum.toString + "\t" + "0")
|
|
|
}else{
|
|
|
val oneBucketNum = (len - 1) / (bucketNum - 1) + 1
|
|
|
val buffers = new ArrayBuffer[Double]()
|
|
@@ -86,7 +84,7 @@ object makedata_ad_32_bucket_20240718 {
|
|
|
if (!buffers.contains(data2.last)) {
|
|
|
buffers += data2.last
|
|
|
}
|
|
|
- result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
|
|
|
+ result.add(featureNames(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
|
|
|
}
|
|
|
}
|
|
|
val data3 = sc.parallelize(result)
|