浏览代码

推荐样本生产-特征分桶

zhangbo 10 个月前
父节点
当前提交
076ab409c7

+ 4 - 4
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_15_bucket_20240608.scala

@@ -56,9 +56,9 @@ object makedata_15_bucket_20240608 {
     for (i <- contentList.indices){
       println("特征:" + contentList(i))
       val data2 = data1.map(r => r(i)).filter(_ > 1E-8).collect().sorted
-      if (data2.map(_.toString).toSet.size < bucketNum*10){
-        println("无法分桶:" + data2.map(_.toString).toSet.size.toString)
-      }else{
+//      if (data2.map(_.toString).toSet.size < bucketNum*10){
+//        println("无法分桶:" + data2.map(_.toString).toSet.size.toString)
+//      }else{
         val len = data2.length
         val oneBucketNum = (len - 1) / (bucketNum - 1) + 1 // 确保每个桶至少有一个元素
         val buffers = new ArrayBuffer[Double]()
@@ -79,7 +79,7 @@ object makedata_15_bucket_20240608 {
         }
         result.add(contentList(i) + "\t" + buffers.mkString(","))
       }
-    }
+//    }
     val data3 = sc.parallelize(result)
 
 

+ 9 - 1
src/main/scala/com/aliyun/odps/spark/examples/临时记录的脚本

@@ -82,4 +82,12 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 --master yarn --driver-memory 1G --executor-memory 1G --executor-cores 1 --num-executors 32 \
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 partitionPrefix:dt=20240607 date:20240607 \
-> p14_data.log 2>&1 &
+> p14_data.log 2>&1 &
+
+
+nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.spark.deploy.SparkSubmit \
+--class com.aliyun.odps.spark.examples.makedata.makedata_15_bucket_20240608 \
+--master yarn --driver-memory 16G --executor-memory 1G --executor-cores 1 --num-executors 32 \
+./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
+partitionPrefix:20240607 date:20240607_bucket \
+> p15_data.log 2>&1 &