浏览代码

Update makedata_ad_32_bucket_20240718: clean up code

StrayWarrior 4 月之前
父节点
当前提交
c285a43943

+ 10 - 12
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/makedata_ad_32_bucket_20240718.scala

@@ -33,7 +33,7 @@ object makedata_ad_32_bucket_20240718 {
 
     val loader = getClass.getClassLoader
     val resourceUrl = loader.getResource(featureNameFile)
-    val content =
+    val featureNameContent =
       if (resourceUrl != null) {
         val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
         Source.fromURL(resourceUrl).close()
@@ -41,15 +41,13 @@ object makedata_ad_32_bucket_20240718 {
       } else {
         ""
       }
-    println(content)
-    val contentList = content.split("\n")
+    println(featureNameContent)
+    val featureNames = featureNameContent.split("\n")
       .map(r=> r.replace(" ", "").replaceAll("\n", ""))
       .filter(r=> r.nonEmpty).toList
 
-
-
     val data = sc.textFile(readPath)
-    println("问题数据数量:" + data.filter(r=>r.split("\t").length != 3).count())
+    // println("问题数据数量:" + data.filter(r=>r.split("\t").length != 3).count())
     val data1 = data.map(r => {
       val rList = r.split("\t")
       val jsons = JSON.parseObject(rList(2))
@@ -58,16 +56,16 @@ object makedata_ad_32_bucket_20240718 {
         doubles.put(r._1, jsons.getDoubleValue(r._1))
       })
       doubles
-    }).sample(false, sampleRate ).repartition(20)
+    }).sample(false, sampleRate)
 
     val result = new ArrayBuffer[String]()
 
-    for (i <- contentList.indices){
-      println("特征:" + contentList(i))
-      val data2 = data1.map(r => r.getOrDefault(contentList(i), 0D)).filter(_ > 1E-8).collect().sorted
+    for (i <- featureNames.indices){
+      println("特征:" + featureNames(i))
+      val data2 = data1.map(r => r.getOrDefault(featureNames(i), 0D)).filter(_ > 1E-8).collect().sorted
       val len = data2.length
       if (len == 0){
-        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + "0")
+        result.add(featureNames(i) + "\t" + bucketNum.toString + "\t" + "0")
       }else{
         val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // 确保每个桶至少有一个元素
         val buffers = new ArrayBuffer[Double]()
@@ -86,7 +84,7 @@ object makedata_ad_32_bucket_20240718 {
         if (!buffers.contains(data2.last)) {
           buffers += data2.last
         }
-        result.add(contentList(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
+        result.add(featureNames(i) + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
       }
     }
     val data3 = sc.parallelize(result)