@@ -0,0 +1,53 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.aliyun.odps.TableSchema
+import com.aliyun.odps.data.Record
+import com.aliyun.odps.spark.examples.myUtils.{ParamUtils, env}
+import org.apache.spark.sql.SparkSession
+
+/*
+ diff data: print a random sample of rows for manual comparison
+ */
+object diff_data_20240718 {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1 Parse job parameters (project/table overridable from the command line)
+    val param = ParamUtils.parseArgs(args)
+    val project = param.getOrElse("project", "loghubods")
+    val table = param.getOrElse("table", "ad_easyrec_train_data_v1")
+    val partition = "dt=20250101"
+
+    // 2 Read the ODPS table partition; each Record is converted to a Map by func
+    val odpsOps = env.getODPS(sc)
+    val odpsData = odpsOps.readTable(project = project,
+      table = table,
+      partition = partition,
+      transfer = func,
+      numPartition = 64)
+
+    // Sample 10 rows without replacement and print them for inspection
+    val randomRows = odpsData.takeSample(withReplacement = false, num = 10)
+    for (row <- randomRows) {
+      println(row)
+    }
+  }
+
+  // Convert an ODPS Record into a column-name -> string-value map
+  def func(record: Record, schema: TableSchema): Map[String, String] = {
+    var map: Map[String, String] = Map.empty
+    val columns = schema.getColumns
+    for (i <- 0 until columns.size()) {
+      val column = columns.get(i)
+      val name = column.getName
+      // record.get returns Object; stringify (null-safe) to fit Map[String, String]
+      val value = String.valueOf(record.get(name))
+      map += (name -> value)
+    }
+    map
+  }
+}
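
A possible invocation, assuming ParamUtils.parseArgs treats each trailing argument as a key=value pair and env.getODPS picks up ODPS credentials from the cluster configuration (the jar name here is hypothetical; the partition is hardcoded in the job):

spark-submit \
  --class com.aliyun.odps.spark.examples.makedata_ad.v20240718.diff_data_20240718 \
  spark-examples.jar \
  project=loghubods table=ad_easyrec_train_data_v1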
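
For reference, the Record-to-Map conversion in func can be written as a single expression; a minimal sketch, assuming the same odps-sdk Record/TableSchema API and pre-2.13 Scala collections:

import scala.collection.JavaConverters._
import com.aliyun.odps.TableSchema
import com.aliyun.odps.data.Record

// Build the column-name -> string-value map in one pass over the schema.
def funcIdiomatic(record: Record, schema: TableSchema): Map[String, String] =
  schema.getColumns.asScala
    .map(c => c.getName -> String.valueOf(record.get(c.getName)))
    .toMap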