|  | @@ -33,7 +33,7 @@ object makedata_ad_32_bucket_20240718 {
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      val loader = getClass.getClassLoader
 | 
	
		
			
				|  |  |      val resourceUrl = loader.getResource(featureNameFile)
 | 
	
		
			
				|  |  | -    val content =
 | 
	
		
			
				|  |  | +    val featureNameContent =
 | 
	
		
			
				|  |  |        if (resourceUrl != null) {
 | 
	
		
			
				|  |  |          val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
 | 
	
		
			
				|  |  |          Source.fromURL(resourceUrl).close()
 | 
	
	
		
			
				|  | @@ -41,15 +41,13 @@ object makedata_ad_32_bucket_20240718 {
 | 
	
		
			
				|  |  |        } else {
 | 
	
		
			
				|  |  |          ""
 | 
	
		
			
				|  |  |        }
 | 
	
		
			
				|  |  | -    println(content)
 | 
	
		
			
				|  |  | -    val contentList = content.split("\n")
 | 
	
		
			
				|  |  | +    println(featureNameContent)
 | 
	
		
			
				|  |  | +    val featureNames = featureNameContent.split("\n")
 | 
	
		
			
				|  |  |        .map(r=> r.replace(" ", "").replaceAll("\n", ""))
 | 
	
		
			
				|  |  |        .filter(r=> r.nonEmpty).toList
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  |      val data = sc.textFile(readPath)
 | 
	
		
			
				|  |  | -    println("问题数据数量:" + data.filter(r=>r.split("\t").length != 3).count())
 | 
	
		
			
				|  |  | +    // println("问题数据数量:" + data.filter(r=>r.split("\t").length != 3).count())
 | 
	
		
			
				|  |  |      val data1 = data.map(r => {
 | 
	
		
			
				|  |  |        val rList = r.split("\t")
 | 
	
		
			
				|  |  |        val jsons = JSON.parseObject(rList(2))
 | 
	
	
		
			
				|  | @@ -58,16 +56,16 @@ object makedata_ad_32_bucket_20240718 {
 | 
	
		
			
				|  |  |          doubles.put(r._1, jsons.getDoubleValue(r._1))
 | 
	
		
			
				|  |  |        })
 | 
	
		
			
				|  |  |        doubles
 | 
	
		
			
				|  |  | -    }).sample(false, sampleRate ).repartition(20)
 | 
	
		
			
				|  |  | +    }).sample(false, sampleRate)
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      val result = new ArrayBuffer[String]()
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -    for (i <- contentList.indices){
 | 
	
		
			
				|  |  | -      println("特征:" + contentList(i))
 | 
	
		
			
				|  |  | -      val data2 = data1.map(r => r.getOrDefault(contentList(i), 0D)).filter(_ > 1E-8).collect().sorted
 | 
	
		
			
				|  |  | +    for (i <- featureNames.indices){
 | 
	
		
			
				|  |  | +      println("特征:" + featureNames(i))
 | 
	
		
			
				|  |  | +      val data2 = data1.map(r => r.getOrDefault(featureNames(i), 0D)).filter(_ > 1E-8).collect().sorted
 | 
	
		
			
				|  |  |        val len = data2.length
 | 
	
		
			
				|  |  |        if (len == 0){
 | 
	
		
			
				|  |  | -        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + "0")
 | 
	
		
			
				|  |  | +        result.add(featureNames(i) + "\t" + bucketNum.toString + "\t" + "0")
 | 
	
		
			
				|  |  |        }else{
 | 
	
		
			
				|  |  |          val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // 确保每个桶至少有一个元素
 | 
	
		
			
				|  |  |          val buffers = new ArrayBuffer[Double]()
 | 
	
	
		
			
				|  | @@ -86,7 +84,7 @@ object makedata_ad_32_bucket_20240718 {
 | 
	
		
			
				|  |  |          if (!buffers.contains(data2.last)) {
 | 
	
		
			
				|  |  |            buffers += data2.last
 | 
	
		
			
				|  |  |          }
 | 
	
		
			
				|  |  | -        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
 | 
	
		
			
				|  |  | +        result.add(featureNames(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
 | 
	
		
			
				|  |  |        }
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  |      val data3 = sc.parallelize(result)
 |