|
@@ -41,6 +41,7 @@ object makedata_15_bucket_20240608 {
|
|
|
val readPath = param.getOrElse("readPath", "/dw/recommend/model/14_feature_data/20240607")
|
|
|
val savePath = param.getOrElse("savePath", "/dw/recommend/model/15_bucket_data/")
|
|
|
val fileName = param.getOrElse("fileName", "20240607_200")
|
|
|
+ val sampleRate = param.getOrElse("sampleRate", "0.1").toDouble
|
|
|
val bucketNum = param.getOrElse("bucketNum", "200").toInt
|
|
|
|
|
|
val data = sc.textFile(readPath)
|
|
@@ -48,37 +49,33 @@ object makedata_15_bucket_20240608 {
|
|
|
val rList = r.split("\t")
|
|
|
val doubles = rList(2).split(",").map(_.toDouble)
|
|
|
doubles
|
|
|
- })
|
|
|
+ }).sample(false, sampleRate).collect()
|
|
|
|
|
|
val result = new ArrayBuffer[String]()
|
|
|
|
|
|
for (i <- contentList.indices){
|
|
|
println("特征:" + contentList(i))
|
|
|
- val data2 = data1.map(r => r(i)).filter(_ > 1E-8).collect().sorted
|
|
|
-// if (data2.map(_.toString).toSet.size < bucketNum*10){
|
|
|
-// println("无法分桶:" + data2.map(_.toString).toSet.size.toString)
|
|
|
-// }else{
|
|
|
- val len = data2.length
|
|
|
- val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // 确保每个桶至少有一个元素
|
|
|
- val buffers = new ArrayBuffer[Double]()
|
|
|
+ val data2 = data1.map(r => r(i)).filter(_ > 1E-8).sorted
|
|
|
+ val len = data2.length
|
|
|
+ val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // 确保每个桶至少有一个元素
|
|
|
+ val buffers = new ArrayBuffer[Double]()
|
|
|
|
|
|
- var lastBucketValue = data2(0) // 记录上一个桶的切分点
|
|
|
- for (j <- 0 until len by oneBucketNum) {
|
|
|
- val d = data2(j)
|
|
|
- if (j > 0 && d != lastBucketValue) {
|
|
|
- // 如果当前切分点不同于上一个切分点,则保存当前切分点
|
|
|
- buffers += d
|
|
|
- }
|
|
|
- lastBucketValue = d // 更新上一个桶的切分点
|
|
|
+ var lastBucketValue = data2(0) // 记录上一个桶的切分点
|
|
|
+ for (j <- 0 until len by oneBucketNum) {
|
|
|
+ val d = data2(j)
|
|
|
+ if (j > 0 && d != lastBucketValue) {
|
|
|
+ // 如果当前切分点不同于上一个切分点,则保存当前切分点
|
|
|
+ buffers += d
|
|
|
}
|
|
|
+ lastBucketValue = d // 更新上一个桶的切分点
|
|
|
+ }
|
|
|
|
|
|
- // 最后一个桶的结束点应该是数组的最后一个元素
|
|
|
- if (!buffers.contains(data2.last)) {
|
|
|
- buffers += data2.last
|
|
|
- }
|
|
|
- result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
|
|
|
+ // 最后一个桶的结束点应该是数组的最后一个元素
|
|
|
+ if (!buffers.contains(data2.last)) {
|
|
|
+ buffers += data2.last
|
|
|
}
|
|
|
-// }
|
|
|
+ result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
|
|
|
+ }
|
|
|
val data3 = sc.parallelize(result)
|
|
|
|
|
|
|