| 
					
				 | 
			
			
				@@ -41,6 +41,7 @@ object makedata_15_bucket_20240608 { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     val readPath = param.getOrElse("readPath", "/dw/recommend/model/14_feature_data/20240607") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     val savePath = param.getOrElse("savePath", "/dw/recommend/model/15_bucket_data/") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     val fileName = param.getOrElse("fileName", "20240607_200") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    val sampleRate = param.getOrElse("sampleRate", "0.1").toDouble 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     val bucketNum = param.getOrElse("bucketNum", "200").toInt 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     val data = sc.textFile(readPath) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -48,37 +49,33 @@ object makedata_15_bucket_20240608 { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       val rList = r.split("\t") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       val doubles = rList(2).split(",").map(_.toDouble) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       doubles 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-    }) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    }).sample(false, sampleRate).collect() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     val result = new ArrayBuffer[String]() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     for (i <- contentList.indices){ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       println("特征:" + contentList(i)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-      val data2 = data1.map(r => r(i)).filter(_ > 1E-8).collect().sorted 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-//      if (data2.map(_.toString).toSet.size < bucketNum*10){ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-//        println("无法分桶:" + data2.map(_.toString).toSet.size.toString) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-//      }else{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        val len = data2.length 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // 确保每个桶至少有一个元素 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        val buffers = new ArrayBuffer[Double]() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      val data2 = data1.map(r => r(i)).filter(_ > 1E-8).sorted 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      val len = data2.length 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // 确保每个桶至少有一个元素 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      val buffers = new ArrayBuffer[Double]() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        var lastBucketValue = data2(0) // 记录上一个桶的切分点 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        for (j <- 0 until len by oneBucketNum) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-          val d = data2(j) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-          if (j > 0 && d != lastBucketValue) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            // 如果当前切分点不同于上一个切分点,则保存当前切分点 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-            buffers += d 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-          } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-          lastBucketValue = d // 更新上一个桶的切分点 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      var lastBucketValue = data2(0) // 记录上一个桶的切分点 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      for (j <- 0 until len by oneBucketNum) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        val d = data2(j) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if (j > 0 && d != lastBucketValue) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          // 如果当前切分点不同于上一个切分点,则保存当前切分点 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+          buffers += d 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        lastBucketValue = d // 更新上一个桶的切分点 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        // 最后一个桶的结束点应该是数组的最后一个元素 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        if (!buffers.contains(data2.last)) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-          buffers += data2.last 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(",")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      // 最后一个桶的结束点应该是数组的最后一个元素 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      if (!buffers.contains(data2.last)) { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        buffers += data2.last 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				       } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-//    } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+      result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(",")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				     val data3 = sc.parallelize(result) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 |