Browse Source

增加测试

xueyiming 4 days ago
parent
commit
1a0ac56bf5

+ 76 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/v20240718/test.scala

@@ -0,0 +1,76 @@
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
+
+import com.aliyun.odps.spark.examples.myUtils.{ParamUtils, env}
+import org.apache.spark.sql.SparkSession
+
+import scala.io.Source
+import scala.language.postfixOps
+
+/**
+ * Ad-hoc consistency check between an ODPS table schema and the feature lists
+ * known to this job.
+ *
+ * Prints, one per line, every column of the target table that is neither a
+ * dense feature (a key of the bundled bucket file) nor one of the hard-coded
+ * sparse feature names.
+ */
+object test {
+  // NOTE(review): these three constants are not referenced inside this object;
+  // presumably smoothing factors shared with sibling jobs - confirm before removing.
+  val CTR_SMOOTH_BETA_FACTOR = 25
+  val CVR_SMOOTH_BETA_FACTOR = 10
+  val CTCVR_SMOOTH_BETA_FACTOR = 100
+
+  /**
+   * Entry point.
+   *
+   * Recognised arguments (as parsed by `ParamUtils.parseArgs`):
+   *  - `project`     : ODPS project, defaults to `loghubods`
+   *  - `outputTable` : table whose schema is inspected, defaults to
+   *                    `ad_easyrec_train_realtime_data_v3_sampled`
+   *
+   * @param args raw command-line arguments
+   */
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // 1. Read parameters
+    val param = ParamUtils.parseArgs(args)
+
+    val project = param.getOrElse("project", "loghubods")
+    val outputTable = param.getOrElse("outputTable", "ad_easyrec_train_realtime_data_v3_sampled")
+
+    // Only the feature names are needed for the schema check; the bucket values are
+    // still parsed so that a malformed bucket file fails fast.
+    val bucketsMap = parseBuckets(readResource("20250217_ad_bucket_688.txt"))
+    val denseFeatureNames = bucketsMap.keySet
+    val sparseFeatureNames = Set(
+      "cid", "adid", "adverid", "targeting_conversion",
+      "region", "city", "brand",
+      "vid", "cate1", "cate2",
+      "user_cid_click_list", "user_cid_conver_list",
+      "user_vid_return_tags_2h", "user_vid_return_tags_1d", "user_vid_return_tags_3d", "user_vid_return_tags_7d",
+      "user_vid_return_tags_14d", "apptype", "ts", "mid", "pqtid", "hour", "hour_quarter", "root_source_scene",
+      "root_source_channel", "is_first_layer", "title_split", "profession", "user_vid_share_tags_1d", "user_vid_share_tags_14d",
+      "user_vid_return_cate1_14d", "user_vid_return_cate2_14d", "user_vid_share_cate1_14d", "user_vid_share_cate2_14d",
+      "creative_type", "creative_hook_embedding", "creative_why_embedding", "creative_action_embedding", "user_has_conver_1y",
+      "user_adverid_view_3d", "user_adverid_view_7d", "user_adverid_view_30d",
+      "user_adverid_click_3d", "user_adverid_click_7d", "user_adverid_click_30d",
+      "user_adverid_conver_3d", "user_adverid_conver_7d", "user_adverid_conver_30d",
+      "user_skuid_view_3d", "user_skuid_view_7d", "user_skuid_view_30d",
+      "user_skuid_click_3d", "user_skuid_click_7d", "user_skuid_click_30d",
+      "user_skuid_conver_3d", "user_skuid_conver_7d", "user_skuid_conver_30d",
+      "is_weekday", "day_of_the_week", "user_conver_ad_class")
+
+    // 2. Read ODPS + table info
+    val odpsOps = env.getODPS(sc)
+    val tableSchema = odpsOps.getTableSchema(project, outputTable, isPartition = false)
+    for (t <- tableSchema) {
+      if (!denseFeatureNames.contains(t._1) && !sparseFeatureNames.contains(t._1)) {
+        // println rather than printf: the column name must not be interpreted as a
+        // format string, and each unknown column gets its own line.
+        println(t._1)
+      }
+    }
+  }
+
+  /**
+   * Reads a classpath resource fully into a string, joining lines with `\n`.
+   *
+   * @param name resource name, resolved through this object's class loader
+   * @return the resource content, or an empty string when the resource is not found
+   */
+  private def readResource(name: String): String =
+    Option(getClass.getClassLoader.getResource(name)).fold("") { url =>
+      val source = Source.fromURL(url)
+      // Close the same Source that was read, even if reading fails.
+      try source.getLines().mkString("\n") finally source.close()
+    }
+
+  /**
+   * Parses bucket definitions, one feature per line, in the form
+   * `name<TAB>number<TAB>comma-separated numbers`.
+   *
+   * Spaces are stripped and blank lines skipped. In the feature name, `*` becomes
+   * `_x_` and `(view)` becomes `_view`.
+   *
+   * @param text raw bucket file content
+   * @return feature name mapped to (second field as Double, third field as Doubles)
+   * @throws IllegalArgumentException if a non-blank line has fewer than three fields
+   * @throws NumberFormatException    if a numeric field cannot be parsed
+   */
+  private def parseBuckets(text: String): Map[String, (Double, Array[Double])] =
+    text.split("\n")
+      .map(_.replace(" ", ""))
+      .filter(_.nonEmpty)
+      .map { line =>
+        val fields = line.split("\t")
+        require(fields.length >= 3, s"Malformed bucket line (expected 3 tab-separated fields): $line")
+        val featureName = fields(0).replace("*", "_x_").replace("(view)", "_view")
+        (featureName, (fields(1).toDouble, fields(2).split(",").map(_.toDouble)))
+      }.toMap
+}