| 
														
															@@ -41,6 +41,7 @@ object makedata_15_bucket_20240608 { 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     val readPath = param.getOrElse("readPath", "/dw/recommend/model/14_feature_data/20240607") 
														 | 
														
														 | 
														
															     val readPath = param.getOrElse("readPath", "/dw/recommend/model/14_feature_data/20240607") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     val savePath = param.getOrElse("savePath", "/dw/recommend/model/15_bucket_data/") 
														 | 
														
														 | 
														
															     val savePath = param.getOrElse("savePath", "/dw/recommend/model/15_bucket_data/") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     val fileName = param.getOrElse("fileName", "20240607_200") 
														 | 
														
														 | 
														
															     val fileName = param.getOrElse("fileName", "20240607_200") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    val sampleRate = param.getOrElse("sampleRate", "0.1").toDouble 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     val bucketNum = param.getOrElse("bucketNum", "200").toInt 
														 | 
														
														 | 
														
															     val bucketNum = param.getOrElse("bucketNum", "200").toInt 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     val data = sc.textFile(readPath) 
														 | 
														
														 | 
														
															     val data = sc.textFile(readPath) 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -48,37 +49,33 @@ object makedata_15_bucket_20240608 { 
														 | 
													
												
											
												
													
														| 
														 | 
														
															       val rList = r.split("\t") 
														 | 
														
														 | 
														
															       val rList = r.split("\t") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															       val doubles = rList(2).split(",").map(_.toDouble) 
														 | 
														
														 | 
														
															       val doubles = rList(2).split(",").map(_.toDouble) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															       doubles 
														 | 
														
														 | 
														
															       doubles 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    }) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    }).sample(false, sampleRate).collect() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     val result = new ArrayBuffer[String]() 
														 | 
														
														 | 
														
															     val result = new ArrayBuffer[String]() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     for (i <- contentList.indices){ 
														 | 
														
														 | 
														
															     for (i <- contentList.indices){ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															       println("特征:" + contentList(i)) 
														 | 
														
														 | 
														
															       println("特征:" + contentList(i)) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-      val data2 = data1.map(r => r(i)).filter(_ > 1E-8).collect().sorted 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-//      if (data2.map(_.toString).toSet.size < bucketNum*10){ 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-//        println("无法分桶:" + data2.map(_.toString).toSet.size.toString) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-//      }else{ 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        val len = data2.length 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // 确保每个桶至少有一个元素 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        val buffers = new ArrayBuffer[Double]() 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+      val data2 = data1.map(r => r(i)).filter(_ > 1E-8).sorted 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+      val len = data2.length 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+      val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // 确保每个桶至少有一个元素 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+      val buffers = new ArrayBuffer[Double]() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        var lastBucketValue = data2(0) // 记录上一个桶的切分点 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        for (j <- 0 until len by oneBucketNum) { 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-          val d = data2(j) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-          if (j > 0 && d != lastBucketValue) { 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            // 如果当前切分点不同于上一个切分点,则保存当前切分点 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            buffers += d 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-          } 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-          lastBucketValue = d // 更新上一个桶的切分点 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+      var lastBucketValue = data2(0) // 记录上一个桶的切分点 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+      for (j <- 0 until len by oneBucketNum) { 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        val d = data2(j) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        if (j > 0 && d != lastBucketValue) { 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+          // 如果当前切分点不同于上一个切分点,则保存当前切分点 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+          buffers += d 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         } 
														 | 
														
														 | 
														
															         } 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        lastBucketValue = d // 更新上一个桶的切分点 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+      } 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        // 最后一个桶的结束点应该是数组的最后一个元素 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        if (!buffers.contains(data2.last)) { 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-          buffers += data2.last 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        } 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(",")) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+      // 最后一个桶的结束点应该是数组的最后一个元素 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+      if (!buffers.contains(data2.last)) { 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        buffers += data2.last 
														 | 
													
												
											
												
													
														| 
														 | 
														
															       } 
														 | 
														
														 | 
														
															       } 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-//    } 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+      result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(",")) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    } 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     val data3 = sc.parallelize(result) 
														 | 
														
														 | 
														
															     val data3 = sc.parallelize(result) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 |