zhangbo hace 10 meses
padre
commit
414b50c057

+ 19 - 15
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_32_bucket_20240622.scala

@@ -64,24 +64,28 @@ object makedata_32_bucket_20240622 {
       println("特征:" + contentList(i))
       val data2 = data1.map(r => r.getOrDefault(contentList(i), 0D)).filter(_ > 1E-8).collect().sorted
       val len = data2.length
-      val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // 确保每个桶至少有一个元素
-      val buffers = new ArrayBuffer[Double]()
-
-      var lastBucketValue = data2(0) // 记录上一个桶的切分点
-      for (j <- 0 until len by oneBucketNum) {
-        val d = data2(j)
-        if (j > 0 && d != lastBucketValue) {
-          // 如果当前切分点不同于上一个切分点,则保存当前切分点
-          buffers += d
+      if (len == 0){
+        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + "0")
+      }else{
+        val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // 确保每个桶至少有一个元素
+        val buffers = new ArrayBuffer[Double]()
+
+        var lastBucketValue = data2(0) // 记录上一个桶的切分点
+        for (j <- 0 until len by oneBucketNum) {
+          val d = data2(j)
+          if (j > 0 && d != lastBucketValue) {
+            // 如果当前切分点不同于上一个切分点,则保存当前切分点
+            buffers += d
+          }
+          lastBucketValue = d // 更新上一个桶的切分点
         }
-        lastBucketValue = d // 更新上一个桶的切分点
-      }
 
-      // 最后一个桶的结束点应该是数组的最后一个元素
-      if (!buffers.contains(data2.last)) {
-        buffers += data2.last
+        // 最后一个桶的结束点应该是数组的最后一个元素
+        if (!buffers.contains(data2.last)) {
+          buffers += data2.last
+        }
+        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
       }
-      result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
     }
     val data3 = sc.parallelize(result)