|
@@ -0,0 +1,52 @@
|
|
|
+package com.aliyun.odps.spark.examples.makedata_recsys.v20250218
|
|
|
+
|
|
|
+import com.aliyun.odps.spark.examples.myUtils.{FileUtils, ParamUtils}
|
|
|
+import examples.utils.StatisticsUtil
|
|
|
+import org.apache.spark.sql.SparkSession
|
|
|
+
|
|
|
+import java.util
|
|
|
+import scala.collection.JavaConverters._
|
|
|
+
|
|
|
/**
 * Spark job: computes feature coverage statistics over LibSVM-formatted
 * training data and prints the aggregated per-feature counts.
 *
 * Reads (via `ParamUtils.parseArgs`):
 *   - readPath:        glob of input text files (default: 43_recsys_train_data_20250218)
 *   - featureNameFile: classpath resource listing one feature name per line
 */
object makedata_recsys_45_feature_cover_degree_libsvm {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getName)
      .getOrCreate()
    val sc = spark.sparkContext

    val param = ParamUtils.parseArgs(args)
    val readPath = param.getOrElse("readPath", "/dw/recommend/model/43_recsys_train_data_20250218/*")
    val featureNameFile = param.getOrElse("featureNameFile", "feature_name_20250218.txt")

    // The feature-name list ships on the classpath; read it once on the driver.
    val resource = getClass.getClassLoader.getResource(featureNameFile)
    val fileContent = FileUtils.readFile(resource)

    // Broadcast the feature-name set so each executor deserializes it once
    // instead of shipping it with every task closure.
    val featureNameSet = fileContent.split("\n").map(_.trim).filter(_.nonEmpty).toSet
    val featureNameSet_br = sc.broadcast(featureNameSet)

    // Emit (featureName, count) pairs per input line. A direct flatMap replaces
    // the previous map{ Seq(...).iterator }.flatMap(identity) round-trip, and the
    // redundant ("allMap", key) tag is dropped: only one statistics map exists,
    // so tagging and later filtering on the tag was dead work.
    val featureStatsRDD = sc.textFile(readPath).flatMap { line =>
      val allMap = new util.HashMap[String, java.lang.Long]()
      // StatisticsUtil accumulates per-line coverage counts into allMap.
      StatisticsUtil.featureCoverRateByLibSvm(line, featureNameSet_br.value.asJava, allMap)
      // Unbox to scala.Long so reduceByKey sums primitives, not boxed java.lang.Long.
      allMap.asScala.toSeq.map { case (k, v) => (k, v.longValue()) }
    }

    // reduceByKey aggregates on the workers (map-side combine) before the
    // shuffle, so only one record per feature reaches the driver.
    val allMap = featureStatsRDD
      .reduceByKey(_ + _)
      .collectAsMap()

    // Single println of the joined result to avoid per-entry print overhead.
    println(s"AllMap 结果:\n${allMap.mkString("\n")}")

    // Release cluster resources; the original job leaked the SparkSession.
    spark.stop()
  }
}
|