|
@@ -0,0 +1,76 @@
|
|
|
+package com.aliyun.odps.spark.examples.makedata_ad.v20240718
|
|
|
+
|
|
|
+import com.aliyun.odps.spark.examples.myUtils.{ParamUtils, env}
|
|
|
+import org.apache.spark.sql.SparkSession
|
|
|
+
|
|
|
+import scala.io.Source
|
|
|
+import scala.language.postfixOps
|
|
|
+
|
|
|
/**
 * Diagnostic entry point that lists the columns of an ODPS table which are
 * covered neither by the dense-feature bucket file bundled on the classpath
 * nor by the hard-coded sparse feature list below.
 *
 * Recognised arguments (as parsed by `ParamUtils.parseArgs`):
 *  - `project`     ODPS project name, defaults to `loghubods`
 *  - `outputTable` table whose schema is inspected, defaults to
 *                  `ad_easyrec_train_realtime_data_v3_sampled`
 */
object test {
  // Smoothing factors; not referenced inside this object, kept as public members.
  val CTR_SMOOTH_BETA_FACTOR = 25
  val CVR_SMOOTH_BETA_FACTOR = 10
  val CTCVR_SMOOTH_BETA_FACTOR = 100

  /** Classpath resource that holds the dense-feature bucket definitions. */
  private val BucketResourceName = "20250217_ad_bucket_688.txt"

  /**
   * Prints, one per line, every column of the target table that is neither a
   * dense feature (present in the bucket file) nor a known sparse feature.
   *
   * @param args command-line arguments, handed to `ParamUtils.parseArgs`
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getName)
      .getOrCreate()
    val sc = spark.sparkContext

    // 1. Read parameters
    val param = ParamUtils.parseArgs(args)
    val project = param.getOrElse("project", "loghubods")
    val outputTable = param.getOrElse("outputTable", "ad_easyrec_train_realtime_data_v3_sampled")

    // Dense features are exactly the feature names defined in the bucket file.
    val bucketsMap = parseBuckets(readResourceText(BucketResourceName))
    val denseFeatureNames = bucketsMap.keySet
    val sparseFeatureNames = Set(
      "cid", "adid", "adverid", "targeting_conversion",
      "region", "city", "brand",
      "vid", "cate1", "cate2",
      "user_cid_click_list", "user_cid_conver_list",
      "user_vid_return_tags_2h", "user_vid_return_tags_1d", "user_vid_return_tags_3d", "user_vid_return_tags_7d",
      "user_vid_return_tags_14d", "apptype", "ts", "mid", "pqtid", "hour", "hour_quarter", "root_source_scene",
      "root_source_channel", "is_first_layer", "title_split", "profession", "user_vid_share_tags_1d", "user_vid_share_tags_14d",
      "user_vid_return_cate1_14d", "user_vid_return_cate2_14d", "user_vid_share_cate1_14d", "user_vid_share_cate2_14d",
      "creative_type", "creative_hook_embedding", "creative_why_embedding", "creative_action_embedding", "user_has_conver_1y",
      "user_adverid_view_3d", "user_adverid_view_7d", "user_adverid_view_30d",
      "user_adverid_click_3d", "user_adverid_click_7d", "user_adverid_click_30d",
      "user_adverid_conver_3d", "user_adverid_conver_7d", "user_adverid_conver_30d",
      "user_skuid_view_3d", "user_skuid_view_7d", "user_skuid_view_30d",
      "user_skuid_click_3d", "user_skuid_click_7d", "user_skuid_click_30d",
      "user_skuid_conver_3d", "user_skuid_conver_7d", "user_skuid_conver_30d",
      "is_weekday", "day_of_the_week", "user_conver_ad_class")

    // 2. Read ODPS + table information
    val odpsOps = env.getODPS(sc)
    val tableSchema = odpsOps.getTableSchema(project, outputTable, isPartition = false)
    for (t <- tableSchema) {
      val columnName = t._1
      if (!denseFeatureNames.contains(columnName) && !sparseFeatureNames.contains(columnName)) {
        // println rather than printf: the column name is data, not a format
        // string, and each uncovered column should appear on its own line.
        println(columnName)
      }
    }
  }

  /**
   * Reads a classpath resource fully into a string with lines joined by "\n".
   *
   * @param name resource name, resolved through this class's class loader
   * @return the resource content, or "" when the resource does not exist
   */
  private def readResourceText(name: String): String =
    Option(getClass.getClassLoader.getResource(name)).map { url =>
      val source = Source.fromURL(url)
      // Close the very source that was read, even if reading throws.
      try source.getLines().mkString("\n")
      finally source.close()
    }.getOrElse("")

  /**
   * Parses bucket definitions, one feature per non-empty line, in the form
   * `name<TAB>number<TAB>comma-separated numbers`. Spaces are stripped from
   * each line before it is split on tabs.
   *
   * In the feature name, `*` is rewritten to `_x_` and `(view)` to `_view`.
   * A line with fewer than three tab-separated fields or a non-numeric value
   * fails with the underlying exception.
   *
   * @param text raw bucket file content
   * @return feature name -> (second field, third field as an array of doubles);
   *         presumably a bucket count and the bucket boundaries -- TODO confirm
   */
  private def parseBuckets(text: String): Map[String, (Double, Array[Double])] =
    text.split("\n")
      .map(_.replace(" ", ""))
      .filter(_.nonEmpty)
      .map { line =>
        val fields = line.split("\t")
        val featureName = fields(0).replace("*", "_x_").replace("(view)", "_view")
        (featureName, (fields(1).toDouble, fields(2).split(",").map(_.toDouble)))
      }
      .toMap
}
|