@@ -0,0 +1,61 @@
+package com.aliyun.odps.spark.examples.makedata_recsys.v20250218
+
+import com.aliyun.odps.spark.examples.myUtils.{FileUtils, ParamUtils}
+import examples.utils.StatisticsUtil
+import org.apache.spark.sql.SparkSession
+
+import java.util
+import scala.collection.JavaConverters._
+
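+/**
+ * Feature coverage statistics over libsvm training rows: for every feature
+ * name listed in `featureNameFile`, accumulate how often it appears in the
+ * rows under `readPath`, and print the per-feature totals.
+ */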
+object makedata_recsys_45_feature_cover_degree_libsvm {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // Job parameters; defaults apply when a key is absent.
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/43_recsys_train_data_20250218/*")
+    val featureNameFile = param.getOrElse("featureNameFile", "feature_name_20250218.txt")
+
+    // Load the feature-name list (one name per line) from the classpath and
+    // broadcast it so each executor holds a single read-only copy.
+    val resource = getClass.getClassLoader.getResource(featureNameFile)
+    val fileContent = FileUtils.readFile(resource)
+    val featureNameSet = fileContent.split("\n").map(_.trim).filter(_.nonEmpty).toSet
+    val featureNameSet_br = sc.broadcast(featureNameSet)
+
+    // For each libsvm row, StatisticsUtil fills `allMap` with counts for the
+    // watched features; emit one ((group, featureName), count) pair per entry.
+    val featureStatsRDD = sc.textFile(readPath).flatMap { line =>
+      val allMap = new util.HashMap[String, java.lang.Long]()
+      StatisticsUtil.featureCoverRateByLibSvm(line, featureNameSet_br.value.asJava, allMap)
+      allMap.asScala.toSeq.map { case (k, v) => (("allMap", k), v.longValue()) }
+    }
+
+    // Sum the per-row counts into one global count per feature.
+    val aggregatedFeatureStatsRDD = featureStatsRDD
+      .reduceByKey(_ + _)
+
+    val allMap = aggregatedFeatureStatsRDD
+      .filter(_._1._1 == "allMap")
+      .map { case ((_, key), value) => (key, value) }
+      .collectAsMap()
+
+    println(s"AllMap result:\n${allMap.mkString("\n")}")
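+
+    // A possible follow-up (a sketch, not part of the original job): turn the
+    // raw counts into coverage rates by dividing by the total row count.
+    // `totalLines` and `coverRates` are illustrative names introduced here.
+    // val totalLines = sc.textFile(readPath).count().toDouble
+    // val coverRates = allMap.mapValues(_ / totalLines)
+    // println(s"Cover rate:\n${coverRates.mkString("\n")}")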
+  }
+}