|
@@ -60,79 +60,129 @@ object diff_data_20250319 {
|
|
|
println(rdd2.getString("logkey"))
|
|
|
println(rdd2.toString)
|
|
|
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- val rdd1Pairs: RDD[(String, JSONObject)] = odpsData1.map(map => (map.getString("logkey"), map))
|
|
|
- val rdd2Pairs: RDD[(String, JSONObject)] = odpsData2.map(map => (map.getString("logkey"), map))
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- val joinedRDD: RDD[(String, (JSONObject, JSONObject))] = rdd1Pairs.join(rdd2Pairs)
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
+ val keys = rdd1.keySet().asScala.toSet ++ rdd2.keySet().asScala.toSet
|
|
|
+ println("keys")
|
|
|
+ println(keys)
|
|
|
|
|
|
|
|
|
val featureDiffSum = mutable.Map[String, Double]()
|
|
|
val featureCount = mutable.Map[String, Int]()
|
|
|
-
|
|
|
- val featureDiffRates = mutable.Map[String, mutable.Map[String, Double]]()
|
|
|
-
|
|
|
- joinedRDD.foreach { case (logkey, (map1, map2)) =>
|
|
|
- val keys = map1.keySet().asScala.toSet ++ map2.keySet().asScala.toSet
|
|
|
-
|
|
|
-
|
|
|
- keys.foreach { key =>
|
|
|
- if (map1.containsKey(key) && map2.containsKey(key)) {
|
|
|
- val value1 = map1.getString(key)
|
|
|
- val value2 = map2.getString(key)
|
|
|
-
|
|
|
- def tryToNumber(value: Any): Option[java.lang.Number] = {
|
|
|
- value match {
|
|
|
- case num: java.lang.Number => Some(num)
|
|
|
- case str: String =>
|
|
|
- try {
|
|
|
- Some(str.toDouble)
|
|
|
- } catch {
|
|
|
- case _: NumberFormatException => None
|
|
|
- }
|
|
|
- case _ => None
|
|
|
- }
|
|
|
- }
|
|
|
|
|
|
- (tryToNumber(value1), tryToNumber(value2)) match {
|
|
|
- case (Some(num1), Some(num2)) =>
|
|
|
- val diff = math.abs(num1.doubleValue() - num2.doubleValue())
|
|
|
- if (diff > 0) {
|
|
|
- featureDiffSum(key) = featureDiffSum.getOrElse(key, 0.0) + diff
|
|
|
- featureCount(key) = featureCount.getOrElse(key, 0) + 1
|
|
|
- }
|
|
|
- case _ =>
|
|
|
- val str1 = if (value1 != null) value1 else ""
|
|
|
- val str2 = if (value2 != null) value2 else ""
|
|
|
- if (str1 != str2) {
|
|
|
- featureCount(key) = featureCount.getOrElse(key, 0) + 1
|
|
|
+ keys.foreach { key =>
|
|
|
+ if (rdd1.containsKey(key) && rdd2.containsKey(key)) {
|
|
|
+ val value1 = rdd1.getString(key)
|
|
|
+ val value2 = rdd2.getString(key)
|
|
|
+
|
|
|
+ def tryToNumber(value: Any): Option[java.lang.Number] = {
|
|
|
+ value match {
|
|
|
+ case num: java.lang.Number => Some(num)
|
|
|
+ case str: String =>
|
|
|
+ try {
|
|
|
+ Some(str.toDouble)
|
|
|
+ } catch {
|
|
|
+ case _: NumberFormatException => None
|
|
|
}
|
|
|
+ case _ => None
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+ (tryToNumber(value1), tryToNumber(value2)) match {
|
|
|
+ case (Some(num1), Some(num2)) =>
|
|
|
+ val diff = math.abs(num1.doubleValue() - num2.doubleValue())
|
|
|
+ if (diff > 0) {
|
|
|
+ featureDiffSum(key) = featureDiffSum.getOrElse(key, 0.0) + diff
|
|
|
+ featureCount(key) = featureCount.getOrElse(key, 0) + 1
|
|
|
+ }
|
|
|
+ case _ =>
|
|
|
+ val str1 = if (value1 != null) value1 else ""
|
|
|
+ val str2 = if (value2 != null) value2 else ""
|
|
|
+ if (str1 != str2) {
|
|
|
+ featureCount(key) = featureCount.getOrElse(key, 0) + 1
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
-
|
|
|
}
|
|
|
|
|
|
+
|
|
|
+ println("每个特征的平均差异:")
|
|
|
+ println(featureDiffSum.size)
|
|
|
+ featureDiffSum.foreach { case (feature, sum) =>
|
|
|
+ val count = featureCount(feature)
|
|
|
+ val averageDiff = sum / count
|
|
|
+ println(s" Feature: $feature, Average Diff: $averageDiff")
|
|
|
+ }
|
|
|
+
|
|
|
+ val count = 1
|
|
|
+ println(s"对比总数: $count")
|
|
|
+ println("每个特征的差异率:")
|
|
|
+ println(featureCount.size)
|
|
|
+ featureCount.foreach { case (feature, sum) =>
|
|
|
+ val rateDiff = sum / count
|
|
|
+ println(s" Feature: $feature, Rate Diff: $rateDiff")
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
|
|
|
|
|
|
|
|
@@ -141,23 +191,23 @@ object diff_data_20250319 {
|
|
|
|
|
|
|
|
|
|
|
|
-
|
|
|
- println("每个特征的平均差异:")
|
|
|
- println(featureDiffSum.size)
|
|
|
- featureDiffSum.foreach { case (feature, sum) =>
|
|
|
- val count = featureCount(feature)
|
|
|
- val averageDiff = sum / count
|
|
|
- println(s" Feature: $feature, Average Diff: $averageDiff")
|
|
|
- }
|
|
|
-
|
|
|
- val count = joinedRDD.count()
|
|
|
- println(s"对比总数: $count")
|
|
|
- println("每个特征的差异率:")
|
|
|
- println(featureCount.size)
|
|
|
- featureCount.foreach { case (feature, sum) =>
|
|
|
- val rateDiff = sum / count
|
|
|
- println(s" Feature: $feature, Rate Diff: $rateDiff")
|
|
|
- }
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
}
|
|
|
|
|
|
|