|
@@ -50,12 +50,36 @@ object diff_data_20250319 {
|
|
|
allfeaturemap
|
|
|
})
|
|
|
|
|
|
- val rdd1: JSONObject = odpsData1.first()
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ val rdd1Pairs: RDD[(String, JSONObject)] = odpsData1.map(map => (map.getString("logkey"), map))
|
|
|
+ val rdd2Pairs: RDD[(String, JSONObject)] = odpsData2.map(map => (map.getString("logkey"), map))
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ val joinedRDD: RDD[(String, (JSONObject, JSONObject))] = rdd1Pairs.join(rdd2Pairs)
|
|
|
+
|
|
|
+
|
|
|
+ val featureDiffSum = mutable.Map[String, Double]()
|
|
|
+ val featureCount = mutable.Map[String, Int]()
|
|
|
+
|
|
|
+ val featureDiffRates = mutable.Map[String, mutable.Map[String, Double]]()
|
|
|
+
|
|
|
+ val tuple = joinedRDD.first()
|
|
|
+ val value1 = tuple._1
|
|
|
+ val value2 = tuple._2
|
|
|
+ val rdd1 = value2._1
|
|
|
+ val rdd2 = value2._2
|
|
|
+
|
|
|
+
|
|
|
println("rdd1")
|
|
|
println(rdd1.get("logkey").toString)
|
|
|
println(rdd1.toString)
|
|
|
|
|
|
- val rdd2: JSONObject = odpsData2.first()
|
|
|
+
|
|
|
println("rdd2")
|
|
|
println(rdd2.getString("logkey"))
|
|
|
println(rdd2.toString)
|
|
@@ -65,8 +89,8 @@ object diff_data_20250319 {
|
|
|
println(keys)
|
|
|
|
|
|
|
|
|
- val featureDiffSum = mutable.Map[String, Double]()
|
|
|
- val featureCount = mutable.Map[String, Int]()
|
|
|
+ val featureDiffSum1 = mutable.Map[String, Double]()
|
|
|
+ val featureCount1 = mutable.Map[String, Int]()
|
|
|
|
|
|
keys.foreach { key =>
|
|
|
if (rdd1.containsKey(key) && rdd2.containsKey(key)) {
|
|
@@ -90,259 +114,105 @@ object diff_data_20250319 {
|
|
|
case (Some(num1), Some(num2)) =>
|
|
|
val diff = math.abs(num1.doubleValue() - num2.doubleValue())
|
|
|
if (diff > 0) {
|
|
|
- featureDiffSum(key) = featureDiffSum.getOrElse(key, 0.0) + diff
|
|
|
- featureCount(key) = featureCount.getOrElse(key, 0) + 1
|
|
|
+ featureDiffSum1(key) = featureDiffSum1.getOrElse(key, 0.0) + diff
|
|
|
+ featureCount1(key) = featureCount1.getOrElse(key, 0) + 1
|
|
|
}
|
|
|
case _ =>
|
|
|
val str1 = if (value1 != null) value1 else ""
|
|
|
val str2 = if (value2 != null) value2 else ""
|
|
|
if (str1 != str2) {
|
|
|
- featureCount(key) = featureCount.getOrElse(key, 0) + 1
|
|
|
+ featureCount1(key) = featureCount1.getOrElse(key, 0) + 1
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
|
|
|
|
|
|
- println("每个特征的平均差异:")
|
|
|
- println(featureDiffSum.size)
|
|
|
- featureDiffSum.foreach { case (feature, sum) =>
|
|
|
- val count = featureCount(feature)
|
|
|
- val averageDiff = sum / count
|
|
|
- println(s" Feature: $feature, Average Diff: $averageDiff")
|
|
|
- }
|
|
|
+ println("每个特征的平均差异:")
|
|
|
+ println(featureDiffSum1.size)
|
|
|
+ featureDiffSum1.foreach { case (feature, sum) =>
|
|
|
+ val count = featureCount1(feature)
|
|
|
+ val averageDiff = sum / count
|
|
|
+ println(s" Feature: $feature, Average Diff: $averageDiff")
|
|
|
+ }
|
|
|
+
|
|
|
+ val count1 = 1
|
|
|
+ println(s"对比总数: $count1")
|
|
|
+ println("每个特征的差异率:")
|
|
|
+ println(featureCount1.size)
|
|
|
+ featureCount1.foreach { case (feature, sum) =>
|
|
|
+ val rateDiff = sum / count1
|
|
|
+ println(s" Feature: $feature, Rate Diff: $rateDiff")
|
|
|
+ }
|
|
|
|
|
|
- val count = 1
|
|
|
- println(s"对比总数: $count")
|
|
|
- println("每个特征的差异率:")
|
|
|
- println(featureCount.size)
|
|
|
- featureCount.foreach { case (feature, sum) =>
|
|
|
- val rateDiff = sum / count
|
|
|
- println(s" Feature: $feature, Rate Diff: $rateDiff")
|
|
|
+ println("=====================================================================")
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+    // Compare every joined record pair on the executors and merge the per-feature
+    // totals with reduceByKey; only the aggregated result is collected to the driver.
+    // Updating featureDiffSum / featureCount directly inside an RDD closure is not
+    // reliable: Spark serializes the closure, so tasks mutate their own copies of the
+    // maps and the driver-side maps may never see the updates.
+    // Value layout per feature: (sum of numeric diffs, numeric mismatches, all mismatches).
+    joinedRDD.flatMap { case (logkey, (map1, map2)) =>
+      // Parses a string as a number; None for null or non-numeric text.
+      def tryToNumber(value: String): Option[Double] =
+        if (value == null) None
+        else {
+          try {
+            Some(value.toDouble)
+          } catch {
+            case _: NumberFormatException => None
+          }
+        }
+
+      // Returns the mismatch contribution of one feature, or None when both sides agree.
+      def compareFeature(key: String): Option[(String, (Double, Int, Int))] = {
+        val value1 = map1.getString(key)
+        val value2 = map2.getString(key)
+        (tryToNumber(value1), tryToNumber(value2)) match {
+          case (Some(num1), Some(num2)) =>
+            val diff = math.abs(num1 - num2)
+            if (diff > 0) Some((key, (diff, 1, 1))) else None
+          case _ =>
+            // At least one side is not numeric: fall back to string comparison, null as "".
+            val str1 = if (value1 != null) value1 else ""
+            val str2 = if (value2 != null) value2 else ""
+            if (str1 != str2) Some((key, (0.0, 0, 1))) else None
+        }
+      }
+
+      println(logkey) // debug output; printed wherever the task runs
+      // Only features present on both sides are compared.
+      val sharedKeys = map1.keySet().asScala.toSeq.filter(key => map2.containsKey(key))
+      sharedKeys.flatMap(key => compareFeature(key))
+    }.reduceByKey { case ((sum1, num1, all1), (sum2, num2, all2)) =>
+      (sum1 + sum2, num1 + num2, all1 + all2)
+    }.collect().foreach { case (key, (diffSum, numericHits, hits)) =>
+      // Runs on the driver over the collected array, so the maps are updated safely.
+      featureCount(key) = featureCount.getOrElse(key, 0) + hits
+      // Keep featureDiffSum limited to features that had a numeric mismatch.
+      if (numericHits > 0) {
+        featureDiffSum(key) = featureDiffSum.getOrElse(key, 0.0) + diffSum
+      }
     }
|
|
|
|
|
|
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
+
|
|
|
+ println("每个特征的平均差异:")
|
|
|
+ println(featureDiffSum.size)
|
|
|
+ featureDiffSum.foreach { case (feature, sum) =>
|
|
|
+ val count = featureCount(feature)
|
|
|
+ val averageDiff = sum / count
|
|
|
+ println(s" Feature: $feature, Average Diff: $averageDiff")
|
|
|
+ }
|
|
|
+
|
|
|
+ val count = joinedRDD.count()
|
|
|
+ println(s"对比总数: $count")
|
|
|
+ println("每个特征的差异率:")
|
|
|
+ println(featureCount.size)
|
|
|
+    // Difference rate per feature: number of compared records in which the feature
+    // differed, divided by the total number of compared records. The division is done
+    // in floating point; an Int / Long division would truncate every rate below 1 to 0.
+    featureCount.foreach { case (feature, diffCount) =>
+      val rateDiff = diffCount.toDouble / count
+      println(s" Feature: $feature, Rate Diff: $rateDiff")
+    }
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
+
|
|
|
def func(record: Record, schema: TableSchema): JSONObject = {
|
|
|
val featureMap = new JSONObject()
|
|
|
val columns = schema.getColumns
|