Browse Source

新样本数据生产

zhangbo 1 year ago
parent
commit
478f07e8a4

+ 14 - 10
src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_06_originData_v2.scala → src/main/scala/com/aliyun/odps/spark/examples/makedata/makedata_06_originData_v3.scala

@@ -17,7 +17,7 @@ import scala.collection.mutable
    注意:所有的构造特征,原始值为0.0时,当作无意义,不保留; 如果经过change变换,得到0.0,保留。
  */
 
-object makedata_06_originData_v2 {
+object makedata_06_originData_v3 {
   def main(args: Array[String]) {
     val spark = SparkSession
       .builder()
@@ -31,9 +31,9 @@ object makedata_06_originData_v2 {
     val partitionPrefix = param.getOrElse("partitionPrefix", "dt=")
     val beginStr = param.getOrElse("beginStr", "20230101")
     val endStr = param.getOrElse("endStr", "20230101")
-    val savePath = param.getOrElse("savePath", "/dw/recommend/model/00_sample_data/")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/00_sample_data_v3/")
     val project = param.getOrElse("project", "loghubods")
-    val table = param.getOrElse("table", "alg_recsys_view_sample_v2")
+    val table = param.getOrElse("table", "alg_recsys_view_sample_v3")
 
 
     // 2 读取odps+表信息
@@ -65,7 +65,8 @@ object makedata_06_originData_v2 {
             "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
             "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
             "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
-            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt"
+            "i_3month_exp_cnt", "i_3month_click_cnt", "i_3month_share_cnt", "i_3month_return_cnt",
+            "video_recommend"
           )
           val originFeatureMap = getFeatureFromSet(originFeatureName, record)
 
@@ -97,7 +98,7 @@ object makedata_06_originData_v2 {
             "apptype", "logtimestamp", "clientip", "ctx_day", "ctx_week", "ctx_hour", "ctx_region", "ctx_city",
             "gender", "machineinfo_brand", "machineinfo_model", "machineinfo_platform", "machineinfo_sdkversion",
             "machineinfo_system", "machineinfo_wechatversion", "gmt_create_user",
-            "title", "tags"
+            "title", "tags", "video_recommend"
           ), record)
           val f2 = RankExtractorUserFeature.getUserRateFeature(originFeatureMap)
           val f3 = RankExtractorUserFeature.cntFeatureChange(originFeatureMap,
@@ -157,7 +158,7 @@ object makedata_06_originData_v2 {
             "u_7day_exp_cnt", "u_7day_click_cnt", "u_7day_share_cnt", "u_7day_return_cnt",
             "u_3month_exp_cnt", "u_3month_click_cnt", "u_3month_share_cnt", "u_3month_return_cnt",
 
-            "title", "tags", "total_time", "play_count_total",
+            "title", "tags", "total_time", "play_count_total", "video_recommend",
             "i_1day_exp_cnt", "i_1day_click_cnt", "i_1day_share_cnt", "i_1day_return_cnt",
             "i_3day_exp_cnt", "i_3day_click_cnt", "i_3day_share_cnt", "i_3day_return_cnt",
             "i_7day_exp_cnt", "i_7day_click_cnt", "i_7day_share_cnt", "i_7day_return_cnt",
@@ -199,9 +200,11 @@ object makedata_06_originData_v2 {
           })
           //2: label聚合到map中
           val labels = Set(
-            "is_share", "is_return", "playtime",
-            "is_play",
-            "share_ts", "share_ts_list", "return_mid_ts_list"
+            "pagesource", "recommend_page_type", "pagesource_change",
+            "abcode",
+            "is_play", "playtime",
+            "is_share", "share_cnt_pv", "share_ts_list",
+            "is_return", "return_cnt_pv", "return_cnt_uv", "return_mid_ts_list"
           )
           val labelNew = new JSONObject
           val labelMap = getFeatureFromSet(labels, record)
@@ -215,8 +218,9 @@ object makedata_06_originData_v2 {
           val videoid = record.getString("videoid")
           val logtimestamp = record.getString("logtimestamp")
           val sessionid = record.getString("sessionid")
+          val apptype = record.getString("apptype")
 
-          val logKey = (mid, videoid, logtimestamp, sessionid).productIterator.mkString(":")
+          val logKey = (mid, videoid, logtimestamp, sessionid, apptype).productIterator.mkString(":")
           val labelKey = labelNew.toString()
           val featureKey = resultNew.toString()