@@ -8,6 +8,8 @@ import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.SparkSession
 import java.util.Base64
+import scala.collection.JavaConverters.asScalaSetConverter
+import scala.collection.mutable
@@ -73,60 +75,199 @@ object diff_data_20250319 {

     val joinedRDD: RDD[(String, (JSONObject, JSONObject))] = rdd1Pairs.join(rdd2Pairs)

-    val firstElement = joinedRDD.first()
-    firstElement match {
-      case (logkey, (map1, map2)) =>
-        println(logkey)
-        println(map1)
-        println(map2)
-    }
+    // Per-feature accumulators, filled on the driver while iterating the joined records.
+    val featureDiffSum = mutable.Map[String, Double]()
+    val featureCount = mutable.Map[String, Int]()
+
+    val featureDiffRates = mutable.Map[String, mutable.Map[String, Double]]()
+
+    // Iterate on the driver via toLocalIterator: a plain joinedRDD.foreach would run on the
+    // executors and only mutate serialized copies of the maps above, leaving them empty here.
+    joinedRDD.toLocalIterator.foreach { case (logkey, (map1, map2)) =>
+      val keys = map1.keySet().asScala.toSet ++ map2.keySet().asScala.toSet
+
+      keys.foreach { key =>
+        if (map1.containsKey(key) && map2.containsKey(key)) {
+          val value1 = map1.get(key)
+          val value2 = map2.get(key)
+          (value1, value2) match {
+            case (num1: java.lang.Number, num2: java.lang.Number) =>
+              val diff = math.abs(num1.doubleValue() - num2.doubleValue())
+              if (diff != 0) {
+                featureDiffSum(key) = featureDiffSum.getOrElse(key, 0.0) + diff
+                featureCount(key) = featureCount.getOrElse(key, 0) + 1
+              }
+            case (str1: java.lang.String, str2: java.lang.String) =>
+              if (str1 != str2) {
+                featureCount(key) = featureCount.getOrElse(key, 0) + 1
+              }
+            case _ =>
+          }
+        }
+      }
+    }
+
+    println("Average diff per feature:")
+    featureDiffSum.foreach { case (feature, sum) =>
+      val count = featureCount(feature)
+      val averageDiff = sum / count
+      println(s"  Feature: $feature, Average Diff: $averageDiff")
+    }
+
+    val total = joinedRDD.count()
+    println(s"Total joined records compared: $total")
+    println("Diff rate per feature:")
+    featureCount.foreach { case (feature, cnt) =>
+      // Double division: integer division would truncate every rate below 1 to 0.
+      val rateDiff = cnt.toDouble / total
+      println(s"  Feature: $feature, Rate Diff: $rateDiff")
+    }
 }
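
Note (not part of the change itself): the accumulation above streams every joined record through the driver. A fully distributed variant is sketched below. It is a minimal sketch under the assumptions that the JSONObject type used in this file exposes keySet/containsKey/get as it does above, that the JavaConverters import added in this diff is in scope, and that names like perFeature are illustrative only. Each record emits (feature -> (absDiffSum, diffCount)) pairs and reduceByKey merges them on the executors, so only one small tuple per feature reaches the driver.

    val perFeature = joinedRDD
      .flatMap { case (_, (map1, map2)) =>
        val keys = map1.keySet().asScala.toSet ++ map2.keySet().asScala.toSet
        keys.iterator.flatMap { key =>
          if (map1.containsKey(key) && map2.containsKey(key)) {
            (map1.get(key), map2.get(key)) match {
              case (n1: java.lang.Number, n2: java.lang.Number) =>
                val diff = math.abs(n1.doubleValue() - n2.doubleValue())
                if (diff != 0) Some(key -> (diff, 1)) else None
              case (s1: java.lang.String, s2: java.lang.String) =>
                // String features contribute 0.0 to the diff sum, only to the diff count.
                if (s1 != s2) Some(key -> (0.0, 1)) else None
              case _ => None
            }
          } else None
        }
      }
      .reduceByKey { case ((d1, c1), (d2, c2)) => (d1 + d2, c1 + c2) }
      .collect()

    val totalJoined = joinedRDD.count().toDouble
    perFeature.foreach { case (feature, (diffSum, diffCount)) =>
      // String-only features print an Average Diff of 0.0 by construction.
      println(s"  Feature: $feature, Average Diff: ${diffSum / diffCount}, Rate Diff: ${diffCount / totalJoined}")
    }

The trade-off versus the driver-side loop: the per-record work and the merge both stay on the executors, at the cost of losing the single mutable-map view of intermediate state.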