Parcourir la source

feat:特征分桶文件生成脚本修改

zhaohaipeng il y a 9 mois
Parent
commit
5d1e879d39

+ 688 - 0
src/main/resources/20240718_ad_feature_name.txt

@@ -0,0 +1,688 @@
+cpa
+b2_1h_ctr
+b2_1h_ctcvr
+b2_1h_cvr
+b2_1h_conver
+b2_1h_ecpm
+b2_1h_click
+b2_1h_conver*log(view)
+b2_1h_conver*ctcvr
+b2_2h_ctr
+b2_2h_ctcvr
+b2_2h_cvr
+b2_2h_conver
+b2_2h_ecpm
+b2_2h_click
+b2_2h_conver*log(view)
+b2_2h_conver*ctcvr
+b2_3h_ctr
+b2_3h_ctcvr
+b2_3h_cvr
+b2_3h_conver
+b2_3h_ecpm
+b2_3h_click
+b2_3h_conver*log(view)
+b2_3h_conver*ctcvr
+b2_4h_ctr
+b2_4h_ctcvr
+b2_4h_cvr
+b2_4h_conver
+b2_4h_ecpm
+b2_4h_click
+b2_4h_conver*log(view)
+b2_4h_conver*ctcvr
+b2_5h_ctr
+b2_5h_ctcvr
+b2_5h_cvr
+b2_5h_conver
+b2_5h_ecpm
+b2_5h_click
+b2_5h_conver*log(view)
+b2_5h_conver*ctcvr
+b2_6h_ctr
+b2_6h_ctcvr
+b2_6h_cvr
+b2_6h_conver
+b2_6h_ecpm
+b2_6h_click
+b2_6h_conver*log(view)
+b2_6h_conver*ctcvr
+b2_12h_ctr
+b2_12h_ctcvr
+b2_12h_cvr
+b2_12h_conver
+b2_12h_ecpm
+b2_12h_click
+b2_12h_conver*log(view)
+b2_12h_conver*ctcvr
+b2_1d_ctr
+b2_1d_ctcvr
+b2_1d_cvr
+b2_1d_conver
+b2_1d_ecpm
+b2_1d_click
+b2_1d_conver*log(view)
+b2_1d_conver*ctcvr
+b2_3d_ctr
+b2_3d_ctcvr
+b2_3d_cvr
+b2_3d_conver
+b2_3d_ecpm
+b2_3d_click
+b2_3d_conver*log(view)
+b2_3d_conver*ctcvr
+b2_7d_ctr
+b2_7d_ctcvr
+b2_7d_cvr
+b2_7d_conver
+b2_7d_ecpm
+b2_7d_click
+b2_7d_conver*log(view)
+b2_7d_conver*ctcvr
+b2_today_ctr
+b2_today_ctcvr
+b2_today_cvr
+b2_today_conver
+b2_today_ecpm
+b2_today_click
+b2_today_conver*log(view)
+b2_today_conver*ctcvr
+b2_yesterday_ctr
+b2_yesterday_ctcvr
+b2_yesterday_cvr
+b2_yesterday_conver
+b2_yesterday_ecpm
+b2_yesterday_click
+b2_yesterday_conver*log(view)
+b2_yesterday_conver*ctcvr
+b3_1h_ctr
+b3_1h_ctcvr
+b3_1h_cvr
+b3_1h_conver
+b3_1h_ecpm
+b3_1h_click
+b3_1h_conver*log(view)
+b3_1h_conver*ctcvr
+b3_2h_ctr
+b3_2h_ctcvr
+b3_2h_cvr
+b3_2h_conver
+b3_2h_ecpm
+b3_2h_click
+b3_2h_conver*log(view)
+b3_2h_conver*ctcvr
+b3_3h_ctr
+b3_3h_ctcvr
+b3_3h_cvr
+b3_3h_conver
+b3_3h_ecpm
+b3_3h_click
+b3_3h_conver*log(view)
+b3_3h_conver*ctcvr
+b3_4h_ctr
+b3_4h_ctcvr
+b3_4h_cvr
+b3_4h_conver
+b3_4h_ecpm
+b3_4h_click
+b3_4h_conver*log(view)
+b3_4h_conver*ctcvr
+b3_5h_ctr
+b3_5h_ctcvr
+b3_5h_cvr
+b3_5h_conver
+b3_5h_ecpm
+b3_5h_click
+b3_5h_conver*log(view)
+b3_5h_conver*ctcvr
+b3_6h_ctr
+b3_6h_ctcvr
+b3_6h_cvr
+b3_6h_conver
+b3_6h_ecpm
+b3_6h_click
+b3_6h_conver*log(view)
+b3_6h_conver*ctcvr
+b3_12h_ctr
+b3_12h_ctcvr
+b3_12h_cvr
+b3_12h_conver
+b3_12h_ecpm
+b3_12h_click
+b3_12h_conver*log(view)
+b3_12h_conver*ctcvr
+b3_1d_ctr
+b3_1d_ctcvr
+b3_1d_cvr
+b3_1d_conver
+b3_1d_ecpm
+b3_1d_click
+b3_1d_conver*log(view)
+b3_1d_conver*ctcvr
+b3_3d_ctr
+b3_3d_ctcvr
+b3_3d_cvr
+b3_3d_conver
+b3_3d_ecpm
+b3_3d_click
+b3_3d_conver*log(view)
+b3_3d_conver*ctcvr
+b3_7d_ctr
+b3_7d_ctcvr
+b3_7d_cvr
+b3_7d_conver
+b3_7d_ecpm
+b3_7d_click
+b3_7d_conver*log(view)
+b3_7d_conver*ctcvr
+b3_today_ctr
+b3_today_ctcvr
+b3_today_cvr
+b3_today_conver
+b3_today_ecpm
+b3_today_click
+b3_today_conver*log(view)
+b3_today_conver*ctcvr
+b3_yesterday_ctr
+b3_yesterday_ctcvr
+b3_yesterday_cvr
+b3_yesterday_conver
+b3_yesterday_ecpm
+b3_yesterday_click
+b3_yesterday_conver*log(view)
+b3_yesterday_conver*ctcvr
+b4_1h_ctr
+b4_1h_ctcvr
+b4_1h_cvr
+b4_1h_conver
+b4_1h_ecpm
+b4_1h_click
+b4_1h_conver*log(view)
+b4_1h_conver*ctcvr
+b4_2h_ctr
+b4_2h_ctcvr
+b4_2h_cvr
+b4_2h_conver
+b4_2h_ecpm
+b4_2h_click
+b4_2h_conver*log(view)
+b4_2h_conver*ctcvr
+b4_3h_ctr
+b4_3h_ctcvr
+b4_3h_cvr
+b4_3h_conver
+b4_3h_ecpm
+b4_3h_click
+b4_3h_conver*log(view)
+b4_3h_conver*ctcvr
+b4_4h_ctr
+b4_4h_ctcvr
+b4_4h_cvr
+b4_4h_conver
+b4_4h_ecpm
+b4_4h_click
+b4_4h_conver*log(view)
+b4_4h_conver*ctcvr
+b4_5h_ctr
+b4_5h_ctcvr
+b4_5h_cvr
+b4_5h_conver
+b4_5h_ecpm
+b4_5h_click
+b4_5h_conver*log(view)
+b4_5h_conver*ctcvr
+b4_6h_ctr
+b4_6h_ctcvr
+b4_6h_cvr
+b4_6h_conver
+b4_6h_ecpm
+b4_6h_click
+b4_6h_conver*log(view)
+b4_6h_conver*ctcvr
+b4_12h_ctr
+b4_12h_ctcvr
+b4_12h_cvr
+b4_12h_conver
+b4_12h_ecpm
+b4_12h_click
+b4_12h_conver*log(view)
+b4_12h_conver*ctcvr
+b4_1d_ctr
+b4_1d_ctcvr
+b4_1d_cvr
+b4_1d_conver
+b4_1d_ecpm
+b4_1d_click
+b4_1d_conver*log(view)
+b4_1d_conver*ctcvr
+b4_3d_ctr
+b4_3d_ctcvr
+b4_3d_cvr
+b4_3d_conver
+b4_3d_ecpm
+b4_3d_click
+b4_3d_conver*log(view)
+b4_3d_conver*ctcvr
+b4_7d_ctr
+b4_7d_ctcvr
+b4_7d_cvr
+b4_7d_conver
+b4_7d_ecpm
+b4_7d_click
+b4_7d_conver*log(view)
+b4_7d_conver*ctcvr
+b4_today_ctr
+b4_today_ctcvr
+b4_today_cvr
+b4_today_conver
+b4_today_ecpm
+b4_today_click
+b4_today_conver*log(view)
+b4_today_conver*ctcvr
+b4_yesterday_ctr
+b4_yesterday_ctcvr
+b4_yesterday_cvr
+b4_yesterday_conver
+b4_yesterday_ecpm
+b4_yesterday_click
+b4_yesterday_conver*log(view)
+b4_yesterday_conver*ctcvr
+b5_1h_ctr
+b5_1h_ctcvr
+b5_1h_cvr
+b5_1h_conver
+b5_1h_ecpm
+b5_1h_click
+b5_1h_conver*log(view)
+b5_1h_conver*ctcvr
+b5_2h_ctr
+b5_2h_ctcvr
+b5_2h_cvr
+b5_2h_conver
+b5_2h_ecpm
+b5_2h_click
+b5_2h_conver*log(view)
+b5_2h_conver*ctcvr
+b5_3h_ctr
+b5_3h_ctcvr
+b5_3h_cvr
+b5_3h_conver
+b5_3h_ecpm
+b5_3h_click
+b5_3h_conver*log(view)
+b5_3h_conver*ctcvr
+b5_4h_ctr
+b5_4h_ctcvr
+b5_4h_cvr
+b5_4h_conver
+b5_4h_ecpm
+b5_4h_click
+b5_4h_conver*log(view)
+b5_4h_conver*ctcvr
+b5_5h_ctr
+b5_5h_ctcvr
+b5_5h_cvr
+b5_5h_conver
+b5_5h_ecpm
+b5_5h_click
+b5_5h_conver*log(view)
+b5_5h_conver*ctcvr
+b5_6h_ctr
+b5_6h_ctcvr
+b5_6h_cvr
+b5_6h_conver
+b5_6h_ecpm
+b5_6h_click
+b5_6h_conver*log(view)
+b5_6h_conver*ctcvr
+b5_12h_ctr
+b5_12h_ctcvr
+b5_12h_cvr
+b5_12h_conver
+b5_12h_ecpm
+b5_12h_click
+b5_12h_conver*log(view)
+b5_12h_conver*ctcvr
+b5_1d_ctr
+b5_1d_ctcvr
+b5_1d_cvr
+b5_1d_conver
+b5_1d_ecpm
+b5_1d_click
+b5_1d_conver*log(view)
+b5_1d_conver*ctcvr
+b5_3d_ctr
+b5_3d_ctcvr
+b5_3d_cvr
+b5_3d_conver
+b5_3d_ecpm
+b5_3d_click
+b5_3d_conver*log(view)
+b5_3d_conver*ctcvr
+b5_7d_ctr
+b5_7d_ctcvr
+b5_7d_cvr
+b5_7d_conver
+b5_7d_ecpm
+b5_7d_click
+b5_7d_conver*log(view)
+b5_7d_conver*ctcvr
+b5_today_ctr
+b5_today_ctcvr
+b5_today_cvr
+b5_today_conver
+b5_today_ecpm
+b5_today_click
+b5_today_conver*log(view)
+b5_today_conver*ctcvr
+b5_yesterday_ctr
+b5_yesterday_ctcvr
+b5_yesterday_cvr
+b5_yesterday_conver
+b5_yesterday_ecpm
+b5_yesterday_click
+b5_yesterday_conver*log(view)
+b5_yesterday_conver*ctcvr
+b8_1h_ctr
+b8_1h_ctcvr
+b8_1h_cvr
+b8_1h_conver
+b8_1h_ecpm
+b8_1h_click
+b8_1h_conver*log(view)
+b8_1h_conver*ctcvr
+b8_2h_ctr
+b8_2h_ctcvr
+b8_2h_cvr
+b8_2h_conver
+b8_2h_ecpm
+b8_2h_click
+b8_2h_conver*log(view)
+b8_2h_conver*ctcvr
+b8_3h_ctr
+b8_3h_ctcvr
+b8_3h_cvr
+b8_3h_conver
+b8_3h_ecpm
+b8_3h_click
+b8_3h_conver*log(view)
+b8_3h_conver*ctcvr
+b8_4h_ctr
+b8_4h_ctcvr
+b8_4h_cvr
+b8_4h_conver
+b8_4h_ecpm
+b8_4h_click
+b8_4h_conver*log(view)
+b8_4h_conver*ctcvr
+b8_5h_ctr
+b8_5h_ctcvr
+b8_5h_cvr
+b8_5h_conver
+b8_5h_ecpm
+b8_5h_click
+b8_5h_conver*log(view)
+b8_5h_conver*ctcvr
+b8_6h_ctr
+b8_6h_ctcvr
+b8_6h_cvr
+b8_6h_conver
+b8_6h_ecpm
+b8_6h_click
+b8_6h_conver*log(view)
+b8_6h_conver*ctcvr
+b8_12h_ctr
+b8_12h_ctcvr
+b8_12h_cvr
+b8_12h_conver
+b8_12h_ecpm
+b8_12h_click
+b8_12h_conver*log(view)
+b8_12h_conver*ctcvr
+b8_1d_ctr
+b8_1d_ctcvr
+b8_1d_cvr
+b8_1d_conver
+b8_1d_ecpm
+b8_1d_click
+b8_1d_conver*log(view)
+b8_1d_conver*ctcvr
+b8_3d_ctr
+b8_3d_ctcvr
+b8_3d_cvr
+b8_3d_conver
+b8_3d_ecpm
+b8_3d_click
+b8_3d_conver*log(view)
+b8_3d_conver*ctcvr
+b8_7d_ctr
+b8_7d_ctcvr
+b8_7d_cvr
+b8_7d_conver
+b8_7d_ecpm
+b8_7d_click
+b8_7d_conver*log(view)
+b8_7d_conver*ctcvr
+b8_today_ctr
+b8_today_ctcvr
+b8_today_cvr
+b8_today_conver
+b8_today_ecpm
+b8_today_click
+b8_today_conver*log(view)
+b8_today_conver*ctcvr
+b8_yesterday_ctr
+b8_yesterday_ctcvr
+b8_yesterday_cvr
+b8_yesterday_conver
+b8_yesterday_ecpm
+b8_yesterday_click
+b8_yesterday_conver*log(view)
+b8_yesterday_conver*ctcvr
+b9_1h_ctr
+b9_1h_ctcvr
+b9_1h_cvr
+b9_1h_conver
+b9_1h_ecpm
+b9_1h_click
+b9_1h_conver*log(view)
+b9_1h_conver*ctcvr
+b9_2h_ctr
+b9_2h_ctcvr
+b9_2h_cvr
+b9_2h_conver
+b9_2h_ecpm
+b9_2h_click
+b9_2h_conver*log(view)
+b9_2h_conver*ctcvr
+b9_3h_ctr
+b9_3h_ctcvr
+b9_3h_cvr
+b9_3h_conver
+b9_3h_ecpm
+b9_3h_click
+b9_3h_conver*log(view)
+b9_3h_conver*ctcvr
+b9_4h_ctr
+b9_4h_ctcvr
+b9_4h_cvr
+b9_4h_conver
+b9_4h_ecpm
+b9_4h_click
+b9_4h_conver*log(view)
+b9_4h_conver*ctcvr
+b9_5h_ctr
+b9_5h_ctcvr
+b9_5h_cvr
+b9_5h_conver
+b9_5h_ecpm
+b9_5h_click
+b9_5h_conver*log(view)
+b9_5h_conver*ctcvr
+b9_6h_ctr
+b9_6h_ctcvr
+b9_6h_cvr
+b9_6h_conver
+b9_6h_ecpm
+b9_6h_click
+b9_6h_conver*log(view)
+b9_6h_conver*ctcvr
+b9_12h_ctr
+b9_12h_ctcvr
+b9_12h_cvr
+b9_12h_conver
+b9_12h_ecpm
+b9_12h_click
+b9_12h_conver*log(view)
+b9_12h_conver*ctcvr
+b9_1d_ctr
+b9_1d_ctcvr
+b9_1d_cvr
+b9_1d_conver
+b9_1d_ecpm
+b9_1d_click
+b9_1d_conver*log(view)
+b9_1d_conver*ctcvr
+b9_3d_ctr
+b9_3d_ctcvr
+b9_3d_cvr
+b9_3d_conver
+b9_3d_ecpm
+b9_3d_click
+b9_3d_conver*log(view)
+b9_3d_conver*ctcvr
+b9_7d_ctr
+b9_7d_ctcvr
+b9_7d_cvr
+b9_7d_conver
+b9_7d_ecpm
+b9_7d_click
+b9_7d_conver*log(view)
+b9_7d_conver*ctcvr
+b9_today_ctr
+b9_today_ctcvr
+b9_today_cvr
+b9_today_conver
+b9_today_ecpm
+b9_today_click
+b9_today_conver*log(view)
+b9_today_conver*ctcvr
+b9_yesterday_ctr
+b9_yesterday_ctcvr
+b9_yesterday_cvr
+b9_yesterday_conver
+b9_yesterday_ecpm
+b9_yesterday_click
+b9_yesterday_conver*log(view)
+b9_yesterday_conver*ctcvr
+b6_7d_ctr
+b6_7d_ctcvr
+b6_7d_cvr
+b6_7d_conver
+b6_7d_ecpm
+b6_7d_click
+b6_7d_conver*log(view)
+b6_7d_conver*ctcvr
+b6_14d_ctr
+b6_14d_ctcvr
+b6_14d_cvr
+b6_14d_conver
+b6_14d_ecpm
+b6_14d_click
+b6_14d_conver*log(view)
+b6_14d_conver*ctcvr
+b7_7d_ctr
+b7_7d_ctcvr
+b7_7d_cvr
+b7_7d_conver
+b7_7d_ecpm
+b7_7d_click
+b7_7d_conver*log(view)
+b7_7d_conver*ctcvr
+b7_14d_ctr
+b7_14d_ctcvr
+b7_14d_cvr
+b7_14d_conver
+b7_14d_ecpm
+b7_14d_click
+b7_14d_conver*log(view)
+b7_14d_conver*ctcvr
+viewAll
+clickAll
+converAll
+incomeAll
+ctr_all
+ctcvr_all
+cvr_all
+ecpm_all
+timediff_view
+timediff_click
+timediff_conver
+actionstatic_view
+actionstatic_click
+actionstatic_conver
+actionstatic_income
+actionstatic_ctr
+actionstatic_ctcvr
+actionstatic_cvr
+e1_tags_3d_matchnum
+e1_tags_3d_maxscore
+e1_tags_3d_avgscore
+e1_tags_7d_matchnum
+e1_tags_7d_maxscore
+e1_tags_7d_avgscore
+e1_tags_14d_matchnum
+e1_tags_14d_maxscore
+e1_tags_14d_avgscore
+e2_tags_3d_matchnum
+e2_tags_3d_maxscore
+e2_tags_3d_avgscore
+e2_tags_7d_matchnum
+e2_tags_7d_maxscore
+e2_tags_7d_avgscore
+e2_tags_14d_matchnum
+e2_tags_14d_maxscore
+e2_tags_14d_avgscore
+d1_feature_3h_ctr
+d1_feature_3h_ctcvr
+d1_feature_3h_cvr
+d1_feature_3h_conver
+d1_feature_3h_ecpm
+d1_feature_6h_ctr
+d1_feature_6h_ctcvr
+d1_feature_6h_cvr
+d1_feature_6h_conver
+d1_feature_6h_ecpm
+d1_feature_12h_ctr
+d1_feature_12h_ctcvr
+d1_feature_12h_cvr
+d1_feature_12h_conver
+d1_feature_12h_ecpm
+d1_feature_1d_ctr
+d1_feature_1d_ctcvr
+d1_feature_1d_cvr
+d1_feature_1d_conver
+d1_feature_1d_ecpm
+d1_feature_3d_ctr
+d1_feature_3d_ctcvr
+d1_feature_3d_cvr
+d1_feature_3d_conver
+d1_feature_3d_ecpm
+d1_feature_7d_ctr
+d1_feature_7d_ctcvr
+d1_feature_7d_cvr
+d1_feature_7d_conver
+d1_feature_7d_ecpm
+vid_rank_ctr_1d
+vid_rank_ctr_3d
+vid_rank_ctr_7d
+vid_rank_ctr_14d
+vid_rank_ctcvr_1d
+vid_rank_ctcvr_3d
+vid_rank_ctcvr_7d
+vid_rank_ctcvr_14d
+vid_rank_ecpm_1d
+vid_rank_ecpm_3d
+vid_rank_ecpm_7d
+vid_rank_ecpm_14d
+ctitle_vtitle_similarity

+ 11 - 9
src/main/scala/com/aliyun/odps/spark/examples/makedata_ad/makedata_ad_32_bucket_20240718.scala

@@ -21,8 +21,18 @@ object makedata_ad_32_bucket_20240718 {
       .getOrCreate()
     val sc = spark.sparkContext
 
+    // 1 读取参数
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/20240620*")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/32_bucket_data/")
+    val fileName = param.getOrElse("fileName", "20240620_100")
+    val sampleRate = param.getOrElse("sampleRate", "1.0").toDouble
+    val bucketNum = param.getOrElse("bucketNum", "100").toInt
+    val featureNameFile = param.getOrElse("featureNameFile", "20240718_ad_feature_name.txt");
+
+
     val loader = getClass.getClassLoader
-    val resourceUrl = loader.getResource("20240703_ad_feature_name.txt")
+    val resourceUrl = loader.getResource(featureNameFile)
     val content =
       if (resourceUrl != null) {
         val content = Source.fromURL(resourceUrl).getLines().mkString("\n")
@@ -38,14 +48,6 @@ object makedata_ad_32_bucket_20240718 {
 
 
 
-    // 1 读取参数
-    val param = ParamUtils.parseArgs(args)
-    val readPath = param.getOrElse("readPath", "/dw/recommend/model/31_ad_sample_data/20240620*")
-    val savePath = param.getOrElse("savePath", "/dw/recommend/model/32_bucket_data/")
-    val fileName = param.getOrElse("fileName", "20240620_100")
-    val sampleRate = param.getOrElse("sampleRate", "1.0").toDouble
-    val bucketNum = param.getOrElse("bucketNum", "100").toInt
-
     val data = sc.textFile(readPath)
     println("问题数据数量:" + data.filter(r=>r.split("\t").length != 3).count())
     val data1 = data.map(r => {

+ 2 - 2
src/main/scala/com/aliyun/odps/spark/zhp/临时记录的脚本-广告

@@ -6,8 +6,8 @@ nohup /opt/apps/SPARK2/spark-2.4.8-hadoop3.2-1.0.8/bin/spark-class2 org.apache.s
 --master yarn --driver-memory 1G --executor-memory 2G --executor-cores 1 --num-executors 16 \
 ./target/spark-examples-1.0.0-SNAPSHOT-shaded.jar \
 tablePart:64 repartition:32 \
-beginStr:2024071200 endStr:2024071209 \
-savePath:/dw/recommend/model/33_ad_train_data_v4/ \
+beginStr:2024071200 endStr:2024071408 \
+savePath:/dw/recommend/model/31_ad_sample_data_v4 \
 table:alg_recsys_ad_sample_all \
 > logs/p31_2024062008.log 2>&1 &