@@ -6,6 +6,8 @@ import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils, env}
 import org.apache.hadoop.io.compress.GzipCodec
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.SparkSession
+import org.json4s.DefaultFormats
+import org.json4s.jackson.JsonMethods._
@@ -31,7 +33,7 @@ object diff_data_20250319 {
     val odpsData2 = odpsOps.readTable(project = "loghubods",
       table = "alg_recsys_ad_sample_all",
       partition = "dt=20250319,hh=12",
-      transfer = func,
+      transfer = func1,
       numPartition = 64)
@@ -51,27 +53,27 @@ object diff_data_20250319 {
     }

-    var result: List[String] = List.empty
-
-    result = result :+ "ad_easyrec_eval_data_v3_sampled size =" + odpsData1.count();
-
-    result = result :+ "alg_recsys_ad_sample_all size =" + odpsData2.count();
-
-
-    val rdd1Pairs: RDD[(String, Map[String, String])] = odpsData1.map(map => (map("logkey"), map))
-    val rdd2Pairs: RDD[(String, Map[String, String])] = odpsData2.map(map => ((map("apptype"), map("mid"), map("cid"), map("ts"), map("headvideoid")).productIterator.mkString(","), map))
-
-
-
-    val joinedRDD: RDD[(String, (Map[String, String], Map[String, String]))] = rdd1Pairs.join(rdd2Pairs)
-
-    val firstElement = joinedRDD.first()
-    firstElement match {
-      case (logkey, (map1, map2)) =>
-        println(logkey)
-        println(map1)
-        println(map2)
-    }
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -131,6 +133,27 @@ object diff_data_20250319 {
     map
   }

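+  // Transfer function for alg_recsys_ad_sample_all: flattens the
+  // allfeaturemap JSON column into a Map[String, String] and attaches the
+  // composite "logkey" (apptype, mid, cid, ts, headvideoid) that
+  // ad_easyrec_eval_data_v3_sampled rows are keyed by.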
+  def func1(record: Record, schema: TableSchema): Map[String, String] = {
+    implicit val formats: DefaultFormats.type = DefaultFormats
+
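+    // Join the five key columns with ","; a null column is rendered as an
+    // empty string so the key keeps a fixed shape.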
+    val logKeyColumns = List("apptype", "mid", "cid", "ts", "headvideoid")
+    val logKey = logKeyColumns.map { columnName =>
+      Option(record.get(columnName)).map(_.toString).getOrElse("")
+    }.mkString(",")
+
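+    // Parse the allfeaturemap column as a flat JSON object of string
+    // key/value pairs; a missing column falls back to an empty object.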
+    val allFeatureMapJson = Option(record.get("allfeaturemap")).map(_.toString).getOrElse("{}")
+    val allFeatureMap = parse(allFeatureMapJson).extract[Map[String, String]]
+
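+    // Return the feature map with the join key stored under "logkey".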
+    val updatedMap = allFeatureMap + ("logkey" -> logKey)
+    updatedMap
+  }
+
   private def processString(input: String): Map[String, String] = {

     val parts = input.trim.split("\t")