浏览代码

rov和nor特征分桶

jch 4 月之前
父节点
当前提交
45bf3f9a08

+ 756 - 0
src/main/resources/20241209_recsys_feature_name_756.txt

@@ -0,0 +1,756 @@
+b111213_12h_ROS
+b111213_12h_ROV
+b111213_12h_ROV*log(return)
+b111213_12h_STR
+b111213_12h_log(return)
+b111213_12h_log(share)
+b111213_1d_ROS
+b111213_1d_ROV
+b111213_1d_ROV*log(return)
+b111213_1d_STR
+b111213_1d_log(return)
+b111213_1d_log(share)
+b111213_1h_ROS
+b111213_1h_ROV
+b111213_1h_ROV*log(return)
+b111213_1h_STR
+b111213_1h_log(return)
+b111213_1h_log(share)
+b111213_2h_ROS
+b111213_2h_ROV
+b111213_2h_ROV*log(return)
+b111213_2h_STR
+b111213_2h_log(return)
+b111213_2h_log(share)
+b111213_3d_ROS
+b111213_3d_ROV
+b111213_3d_ROV*log(return)
+b111213_3d_STR
+b111213_3d_log(return)
+b111213_3d_log(share)
+b111213_3h_ROS
+b111213_3h_ROV
+b111213_3h_ROV*log(return)
+b111213_3h_STR
+b111213_3h_log(return)
+b111213_3h_log(share)
+b111213_4h_ROS
+b111213_4h_ROV
+b111213_4h_ROV*log(return)
+b111213_4h_STR
+b111213_4h_log(return)
+b111213_4h_log(share)
+b111213_7d_ROS
+b111213_7d_ROV
+b111213_7d_ROV*log(return)
+b111213_7d_STR
+b111213_7d_log(return)
+b111213_7d_log(share)
+b123_12h_ROS
+b123_12h_ROV
+b123_12h_ROV*log(return)
+b123_12h_STR
+b123_12h_log(return)
+b123_12h_log(share)
+b123_1d_ROS
+b123_1d_ROV
+b123_1d_ROV*log(return)
+b123_1d_STR
+b123_1d_log(return)
+b123_1d_log(share)
+b123_1h_ROS
+b123_1h_ROV
+b123_1h_ROV*log(return)
+b123_1h_STR
+b123_1h_log(return)
+b123_1h_log(share)
+b123_2h_ROS
+b123_2h_ROV
+b123_2h_ROV*log(return)
+b123_2h_STR
+b123_2h_log(return)
+b123_2h_log(share)
+b123_3d_ROS
+b123_3d_ROV
+b123_3d_ROV*log(return)
+b123_3d_STR
+b123_3d_log(return)
+b123_3d_log(share)
+b123_3h_ROS
+b123_3h_ROV
+b123_3h_ROV*log(return)
+b123_3h_STR
+b123_3h_log(return)
+b123_3h_log(share)
+b123_4h_ROS
+b123_4h_ROV
+b123_4h_ROV*log(return)
+b123_4h_STR
+b123_4h_log(return)
+b123_4h_log(share)
+b123_7d_ROS
+b123_7d_ROV
+b123_7d_ROV*log(return)
+b123_7d_STR
+b123_7d_log(return)
+b123_7d_log(share)
+b167_12h_ROS
+b167_12h_ROV
+b167_12h_ROV*log(return)
+b167_12h_STR
+b167_12h_log(return)
+b167_12h_log(share)
+b167_1d_ROS
+b167_1d_ROV
+b167_1d_ROV*log(return)
+b167_1d_STR
+b167_1d_log(return)
+b167_1d_log(share)
+b167_1h_ROS
+b167_1h_ROV
+b167_1h_ROV*log(return)
+b167_1h_STR
+b167_1h_log(return)
+b167_1h_log(share)
+b167_2h_ROS
+b167_2h_ROV
+b167_2h_ROV*log(return)
+b167_2h_STR
+b167_2h_log(return)
+b167_2h_log(share)
+b167_3d_ROS
+b167_3d_ROV
+b167_3d_ROV*log(return)
+b167_3d_STR
+b167_3d_log(return)
+b167_3d_log(share)
+b167_3h_ROS
+b167_3h_ROV
+b167_3h_ROV*log(return)
+b167_3h_STR
+b167_3h_log(return)
+b167_3h_log(share)
+b167_4h_ROS
+b167_4h_ROV
+b167_4h_ROV*log(return)
+b167_4h_STR
+b167_4h_log(return)
+b167_4h_log(share)
+b167_7d_ROS
+b167_7d_ROV
+b167_7d_ROV*log(return)
+b167_7d_STR
+b167_7d_log(return)
+b167_7d_log(share)
+b171819_12h_ROS
+b171819_12h_ROV
+b171819_12h_ROV*log(return)
+b171819_12h_STR
+b171819_12h_log(return)
+b171819_12h_log(share)
+b171819_1d_ROS
+b171819_1d_ROV
+b171819_1d_ROV*log(return)
+b171819_1d_STR
+b171819_1d_log(return)
+b171819_1d_log(share)
+b171819_1h_ROS
+b171819_1h_ROV
+b171819_1h_ROV*log(return)
+b171819_1h_STR
+b171819_1h_log(return)
+b171819_1h_log(share)
+b171819_2h_ROS
+b171819_2h_ROV
+b171819_2h_ROV*log(return)
+b171819_2h_STR
+b171819_2h_log(return)
+b171819_2h_log(share)
+b171819_3d_ROS
+b171819_3d_ROV
+b171819_3d_ROV*log(return)
+b171819_3d_STR
+b171819_3d_log(return)
+b171819_3d_log(share)
+b171819_3h_ROS
+b171819_3h_ROV
+b171819_3h_ROV*log(return)
+b171819_3h_STR
+b171819_3h_log(return)
+b171819_3h_log(share)
+b171819_4h_ROS
+b171819_4h_ROV
+b171819_4h_ROV*log(return)
+b171819_4h_STR
+b171819_4h_log(return)
+b171819_4h_log(share)
+b171819_7d_ROS
+b171819_7d_ROV
+b171819_7d_ROV*log(return)
+b171819_7d_STR
+b171819_7d_log(return)
+b171819_7d_log(share)
+b20_12h_r_cnt
+b20_12h_r_cnt4s
+b20_12h_r_rate
+b20_12h_return
+b20_12h_ros
+b20_12h_rov
+b20_12h_share
+b20_12h_share_hasreturn
+b20_12h_str
+b20_12h_view_hasreturn
+b20_1h_r_cnt
+b20_1h_r_cnt4s
+b20_1h_r_rate
+b20_1h_return
+b20_1h_ros
+b20_1h_rov
+b20_1h_share
+b20_1h_share_hasreturn
+b20_1h_str
+b20_1h_view_hasreturn
+b20_24h_r_cnt
+b20_24h_r_cnt4s
+b20_24h_r_rate
+b20_24h_return
+b20_24h_ros
+b20_24h_rov
+b20_24h_share
+b20_24h_share_hasreturn
+b20_24h_str
+b20_24h_view_hasreturn
+b20_2h_r_cnt
+b20_2h_r_cnt4s
+b20_2h_r_rate
+b20_2h_return
+b20_2h_ros
+b20_2h_rov
+b20_2h_share
+b20_2h_share_hasreturn
+b20_2h_str
+b20_2h_view_hasreturn
+b20_4h_r_cnt
+b20_4h_r_cnt4s
+b20_4h_r_rate
+b20_4h_return
+b20_4h_ros
+b20_4h_rov
+b20_4h_share
+b20_4h_share_hasreturn
+b20_4h_str
+b20_4h_view_hasreturn
+b20_6h_r_cnt
+b20_6h_r_cnt4s
+b20_6h_r_rate
+b20_6h_return
+b20_6h_ros
+b20_6h_rov
+b20_6h_share
+b20_6h_share_hasreturn
+b20_6h_str
+b20_6h_view_hasreturn
+b20_7d_r_cnt
+b20_7d_r_cnt4s
+b20_7d_r_rate
+b20_7d_return
+b20_7d_ros
+b20_7d_rov
+b20_7d_share
+b20_7d_share_hasreturn
+b20_7d_str
+b20_7d_view_hasreturn
+b21_12h_r_cnt
+b21_12h_r_cnt4s
+b21_12h_r_rate
+b21_12h_return
+b21_12h_ros
+b21_12h_rov
+b21_12h_share
+b21_12h_share_hasreturn
+b21_12h_str
+b21_12h_view_hasreturn
+b21_1h_r_cnt
+b21_1h_r_cnt4s
+b21_1h_r_rate
+b21_1h_return
+b21_1h_ros
+b21_1h_rov
+b21_1h_share
+b21_1h_share_hasreturn
+b21_1h_str
+b21_1h_view_hasreturn
+b21_24h_r_cnt
+b21_24h_r_cnt4s
+b21_24h_r_rate
+b21_24h_return
+b21_24h_ros
+b21_24h_rov
+b21_24h_share
+b21_24h_share_hasreturn
+b21_24h_str
+b21_24h_view_hasreturn
+b21_2h_r_cnt
+b21_2h_r_cnt4s
+b21_2h_r_rate
+b21_2h_return
+b21_2h_ros
+b21_2h_rov
+b21_2h_share
+b21_2h_share_hasreturn
+b21_2h_str
+b21_2h_view_hasreturn
+b21_4h_r_cnt
+b21_4h_r_cnt4s
+b21_4h_r_rate
+b21_4h_return
+b21_4h_ros
+b21_4h_rov
+b21_4h_share
+b21_4h_share_hasreturn
+b21_4h_str
+b21_4h_view_hasreturn
+b21_6h_r_cnt
+b21_6h_r_cnt4s
+b21_6h_r_rate
+b21_6h_return
+b21_6h_ros
+b21_6h_rov
+b21_6h_share
+b21_6h_share_hasreturn
+b21_6h_str
+b21_6h_view_hasreturn
+b21_7d_r_cnt
+b21_7d_r_cnt4s
+b21_7d_r_rate
+b21_7d_return
+b21_7d_ros
+b21_7d_rov
+b21_7d_share
+b21_7d_share_hasreturn
+b21_7d_str
+b21_7d_view_hasreturn
+b22_12h_r_cnt
+b22_12h_r_cnt4s
+b22_12h_r_rate
+b22_12h_return
+b22_12h_ros
+b22_12h_rov
+b22_12h_share
+b22_12h_share_hasreturn
+b22_12h_str
+b22_12h_view_hasreturn
+b22_1h_r_cnt
+b22_1h_r_cnt4s
+b22_1h_r_rate
+b22_1h_return
+b22_1h_ros
+b22_1h_rov
+b22_1h_share
+b22_1h_share_hasreturn
+b22_1h_str
+b22_1h_view_hasreturn
+b22_24h_r_cnt
+b22_24h_r_cnt4s
+b22_24h_r_rate
+b22_24h_return
+b22_24h_ros
+b22_24h_rov
+b22_24h_share
+b22_24h_share_hasreturn
+b22_24h_str
+b22_24h_view_hasreturn
+b22_2h_r_cnt
+b22_2h_r_cnt4s
+b22_2h_r_rate
+b22_2h_return
+b22_2h_ros
+b22_2h_rov
+b22_2h_share
+b22_2h_share_hasreturn
+b22_2h_str
+b22_2h_view_hasreturn
+b22_4h_r_cnt
+b22_4h_r_cnt4s
+b22_4h_r_rate
+b22_4h_return
+b22_4h_ros
+b22_4h_rov
+b22_4h_share
+b22_4h_share_hasreturn
+b22_4h_str
+b22_4h_view_hasreturn
+b22_6h_r_cnt
+b22_6h_r_cnt4s
+b22_6h_r_rate
+b22_6h_return
+b22_6h_ros
+b22_6h_rov
+b22_6h_share
+b22_6h_share_hasreturn
+b22_6h_str
+b22_6h_view_hasreturn
+b22_7d_r_cnt
+b22_7d_r_cnt4s
+b22_7d_r_rate
+b22_7d_return
+b22_7d_ros
+b22_7d_rov
+b22_7d_share
+b22_7d_share_hasreturn
+b22_7d_str
+b22_7d_view_hasreturn
+b23_14d_r_cnt
+b23_14d_r_cnt4s
+b23_14d_r_rate
+b23_14d_return
+b23_14d_ros
+b23_14d_rov
+b23_14d_share
+b23_14d_share_hasreturn
+b23_14d_str
+b23_14d_view_hasreturn
+b23_30d_r_cnt
+b23_30d_r_cnt4s
+b23_30d_r_rate
+b23_30d_return
+b23_30d_ros
+b23_30d_rov
+b23_30d_share
+b23_30d_share_hasreturn
+b23_30d_str
+b23_30d_view_hasreturn
+b24_14d_r_cnt
+b24_14d_r_cnt4s
+b24_14d_r_rate
+b24_14d_return
+b24_14d_ros
+b24_14d_rov
+b24_14d_share
+b24_14d_share_hasreturn
+b24_14d_str
+b24_14d_view_hasreturn
+b24_30d_r_cnt
+b24_30d_r_cnt4s
+b24_30d_r_rate
+b24_30d_return
+b24_30d_ros
+b24_30d_rov
+b24_30d_share
+b24_30d_share_hasreturn
+b24_30d_str
+b24_30d_view_hasreturn
+b25_14d_r_cnt
+b25_14d_r_cnt4s
+b25_14d_r_rate
+b25_14d_return
+b25_14d_ros
+b25_14d_rov
+b25_14d_share
+b25_14d_share_hasreturn
+b25_14d_str
+b25_14d_view_hasreturn
+b25_30d_r_cnt
+b25_30d_r_cnt4s
+b25_30d_r_rate
+b25_30d_return
+b25_30d_ros
+b25_30d_rov
+b25_30d_share
+b25_30d_share_hasreturn
+b25_30d_str
+b25_30d_view_hasreturn
+b26_35d_r_cnt
+b26_35d_r_cnt4s
+b26_35d_r_rate
+b26_35d_return
+b26_35d_ros
+b26_35d_rov
+b26_35d_share
+b26_35d_share_hasreturn
+b26_35d_str
+b26_35d_view_hasreturn
+b26_365d_r_cnt
+b26_365d_r_cnt4s
+b26_365d_r_rate
+b26_365d_return
+b26_365d_ros
+b26_365d_rov
+b26_365d_share
+b26_365d_share_hasreturn
+b26_365d_str
+b26_365d_view_hasreturn
+b26_7d_r_cnt
+b26_7d_r_cnt4s
+b26_7d_r_rate
+b26_7d_return
+b26_7d_ros
+b26_7d_rov
+b26_7d_share
+b26_7d_share_hasreturn
+b26_7d_str
+b26_7d_view_hasreturn
+b26_90d_r_cnt
+b26_90d_r_cnt4s
+b26_90d_r_rate
+b26_90d_return
+b26_90d_ros
+b26_90d_rov
+b26_90d_share
+b26_90d_share_hasreturn
+b26_90d_str
+b26_90d_view_hasreturn
+b27_35d_r_cnt
+b27_35d_r_cnt4s
+b27_35d_r_rate
+b27_35d_return
+b27_35d_ros
+b27_35d_rov
+b27_35d_share
+b27_35d_share_hasreturn
+b27_35d_str
+b27_35d_view_hasreturn
+b27_365d_r_cnt
+b27_365d_r_cnt4s
+b27_365d_r_rate
+b27_365d_return
+b27_365d_ros
+b27_365d_rov
+b27_365d_share
+b27_365d_share_hasreturn
+b27_365d_str
+b27_365d_view_hasreturn
+b27_7d_r_cnt
+b27_7d_r_cnt4s
+b27_7d_r_rate
+b27_7d_return
+b27_7d_ros
+b27_7d_rov
+b27_7d_share
+b27_7d_share_hasreturn
+b27_7d_str
+b27_7d_view_hasreturn
+b27_90d_r_cnt
+b27_90d_r_cnt4s
+b27_90d_r_rate
+b27_90d_return
+b27_90d_ros
+b27_90d_rov
+b27_90d_share
+b27_90d_share_hasreturn
+b27_90d_str
+b27_90d_view_hasreturn
+b28_12h_r_cnt
+b28_12h_r_cnt4s
+b28_12h_r_rate
+b28_12h_return
+b28_12h_ros
+b28_12h_rov
+b28_12h_share
+b28_12h_share_hasreturn
+b28_12h_str
+b28_12h_view_hasreturn
+b28_1h_r_cnt
+b28_1h_r_cnt4s
+b28_1h_r_rate
+b28_1h_return
+b28_1h_ros
+b28_1h_rov
+b28_1h_share
+b28_1h_share_hasreturn
+b28_1h_str
+b28_1h_view_hasreturn
+b28_24h_r_cnt
+b28_24h_r_cnt4s
+b28_24h_r_rate
+b28_24h_return
+b28_24h_ros
+b28_24h_rov
+b28_24h_share
+b28_24h_share_hasreturn
+b28_24h_str
+b28_24h_view_hasreturn
+b28_2h_r_cnt
+b28_2h_r_cnt4s
+b28_2h_r_rate
+b28_2h_return
+b28_2h_ros
+b28_2h_rov
+b28_2h_share
+b28_2h_share_hasreturn
+b28_2h_str
+b28_2h_view_hasreturn
+b28_4h_r_cnt
+b28_4h_r_cnt4s
+b28_4h_r_rate
+b28_4h_return
+b28_4h_ros
+b28_4h_rov
+b28_4h_share
+b28_4h_share_hasreturn
+b28_4h_str
+b28_4h_view_hasreturn
+b28_6h_r_cnt
+b28_6h_r_cnt4s
+b28_6h_r_rate
+b28_6h_return
+b28_6h_ros
+b28_6h_rov
+b28_6h_share
+b28_6h_share_hasreturn
+b28_6h_str
+b28_6h_view_hasreturn
+b28_7d_r_cnt
+b28_7d_r_cnt4s
+b28_7d_r_rate
+b28_7d_return
+b28_7d_ros
+b28_7d_rov
+b28_7d_share
+b28_7d_share_hasreturn
+b28_7d_str
+b28_7d_view_hasreturn
+b8910_12h_ROS
+b8910_12h_ROV
+b8910_12h_ROV*log(return)
+b8910_12h_STR
+b8910_12h_log(return)
+b8910_12h_log(share)
+b8910_1d_ROS
+b8910_1d_ROV
+b8910_1d_ROV*log(return)
+b8910_1d_STR
+b8910_1d_log(return)
+b8910_1d_log(share)
+b8910_1h_ROS
+b8910_1h_ROV
+b8910_1h_ROV*log(return)
+b8910_1h_STR
+b8910_1h_log(return)
+b8910_1h_log(share)
+b8910_2h_ROS
+b8910_2h_ROV
+b8910_2h_ROV*log(return)
+b8910_2h_STR
+b8910_2h_log(return)
+b8910_2h_log(share)
+b8910_3d_ROS
+b8910_3d_ROV
+b8910_3d_ROV*log(return)
+b8910_3d_STR
+b8910_3d_log(return)
+b8910_3d_log(share)
+b8910_3h_ROS
+b8910_3h_ROV
+b8910_3h_ROV*log(return)
+b8910_3h_STR
+b8910_3h_log(return)
+b8910_3h_log(share)
+b8910_4h_ROS
+b8910_4h_ROV
+b8910_4h_ROV*log(return)
+b8910_4h_STR
+b8910_4h_log(return)
+b8910_4h_log(share)
+b8910_7d_ROS
+b8910_7d_ROV
+b8910_7d_ROV*log(return)
+b8910_7d_STR
+b8910_7d_log(return)
+b8910_7d_log(share)
+bit_rate
+c3_feature_tags_1d_avgscore
+c3_feature_tags_1d_matchnum
+c3_feature_tags_1d_maxscore
+c3_feature_tags_3d_avgscore
+c3_feature_tags_3d_matchnum
+c3_feature_tags_3d_maxscore
+c3_feature_tags_7d_avgscore
+c3_feature_tags_7d_matchnum
+c3_feature_tags_7d_maxscore
+c4_feature_tags_1d_avgscore
+c4_feature_tags_1d_matchnum
+c4_feature_tags_1d_maxscore
+c4_feature_tags_3d_avgscore
+c4_feature_tags_3d_matchnum
+c4_feature_tags_3d_maxscore
+c4_feature_tags_7d_avgscore
+c4_feature_tags_7d_matchnum
+c4_feature_tags_7d_maxscore
+c5_feature_tags_1d_avgscore
+c5_feature_tags_1d_matchnum
+c5_feature_tags_1d_maxscore
+c5_feature_tags_3d_avgscore
+c5_feature_tags_3d_matchnum
+c5_feature_tags_3d_maxscore
+c5_feature_tags_7d_avgscore
+c5_feature_tags_7d_matchnum
+c5_feature_tags_7d_maxscore
+c6_feature_tags_1d_avgscore
+c6_feature_tags_1d_matchnum
+c6_feature_tags_1d_maxscore
+c6_feature_tags_3d_avgscore
+c6_feature_tags_3d_matchnum
+c6_feature_tags_3d_maxscore
+c6_feature_tags_7d_avgscore
+c6_feature_tags_7d_matchnum
+c6_feature_tags_7d_maxscore
+c7_feature_tags_1d_avgscore
+c7_feature_tags_1d_matchnum
+c7_feature_tags_1d_maxscore
+c7_feature_tags_3d_avgscore
+c7_feature_tags_3d_matchnum
+c7_feature_tags_3d_maxscore
+c7_feature_tags_7d_avgscore
+c7_feature_tags_7d_matchnum
+c7_feature_tags_7d_maxscore
+c8_feature_return_num
+c8_feature_return_rank
+c8_feature_return_score
+c8_feature_share_num
+c8_feature_share_rank
+c8_feature_share_score
+c9_feature_return_num
+c9_feature_return_rank
+c9_feature_return_score
+c9_feature_share_num
+c9_feature_share_rank
+c9_feature_share_score
+d1_exp
+d1_return_n
+d1_rovn
+d2_exp
+d2_return_n
+d2_rosn
+d3_exp
+d3_return_n
+d3_rosn
+d4_exp
+d4_return_n
+d4_rovn
+d5_exp
+d5_return_n
+d5_rovn
+d6
+playcnt_1d
+playcnt_3d
+playcnt_6h
+playcnt_7d
+return_uv_12h
+return_uv_1d
+return_uv_3d
+return_uv_7d
+share_pv_12h
+share_pv_1d
+share_pv_3d
+share_pv_7d
+total_time
+video_sim_cate1_list
+video_sim_cate2
+video_sim_cate2_list
+video_sim_keywords
+video_sim_style
+video_sim_theme
+video_sim_title
+video_sim_topic
+video_sim_user_value

+ 102 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys_r_rate/makedata_recsys_61_bucket_20241209.scala

@@ -0,0 +1,102 @@
+package com.aliyun.odps.spark.examples.makedata_recsys
+
+import com.alibaba.fastjson.JSON
+import com.aliyun.odps.spark.examples.myUtils.{MyHdfsUtils, ParamUtils}
+import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.spark.sql.SparkSession
+
+import scala.collection.JavaConversions._
+import scala.collection.mutable.ArrayBuffer
+import scala.io.Source
+
+/**
+ * Spark job that derives per-feature bucket boundaries for the recsys model.
+ *
+ * Steps, as implemented below:
+ *  1. Load the feature-name list from the classpath resource
+ *     `20241209_recsys_feature_name_756.txt` (one name per line, spaces stripped,
+ *     empty lines dropped).
+ *  2. Read tab-separated text rows from `readPath` and parse the third column as a
+ *     JSON object whose values are read as doubles.
+ *  3. For every feature name, collect the values greater than 1e-8 to the driver,
+ *     sort them, and pick equal-frequency cut points (consecutive duplicate cut
+ *     points are skipped; the maximum value is always the last boundary).
+ *  4. Write one line per feature, `name \t bucketNum \t comma-separated boundaries`
+ *     (or `0` when the feature has no value above the threshold), gzip-compressed,
+ *     to `savePath/fileName`.
+ *
+ * Parameters are read through `ParamUtils.parseArgs(args)` with these keys and
+ * defaults: `readPath`, `savePath`, `fileName`, `sampleRate` (1.0), `bucketNum` (200).
+ */
+object makedata_recsys_61_bucket_20241209 {
+  /**
+   * Job entry point.
+   *
+   * @param args command-line arguments, interpreted by `ParamUtils.parseArgs`
+   * @throws IllegalArgumentException if `bucketNum` is smaller than 1
+   */
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession
+      .builder()
+      .appName(this.getClass.getName)
+      .getOrCreate()
+    val sc = spark.sparkContext
+
+    // Load the feature-name list bundled on the classpath.
+    val loader = getClass.getClassLoader
+    val resourceUrl = loader.getResource("20241209_recsys_feature_name_756.txt")
+    val content =
+      if (resourceUrl != null) {
+        // Close the very Source that was read; opening a second one just to close it
+        // would leave the first stream open.
+        val source = Source.fromURL(resourceUrl)
+        try source.getLines().mkString("\n") finally source.close()
+      } else {
+        // NOTE(review): a missing resource yields an empty feature list, so the job
+        // still overwrites the output path with an empty result — confirm intended.
+        ""
+      }
+    println(content)
+    val contentList = content.split("\n")
+      .map(r => r.replace(" ", "").replaceAll("\n", ""))
+      .filter(r => r.nonEmpty).toList
+
+
+    // 1 Read parameters
+    val param = ParamUtils.parseArgs(args)
+    val readPath = param.getOrElse("readPath", "/dw/recommend/model/61_origin_data/20241210*")
+    val savePath = param.getOrElse("savePath", "/dw/recommend/model/61_recsys_bucket/")
+    val fileName = param.getOrElse("fileName", "20241209_756_200")
+    val sampleRate = param.getOrElse("sampleRate", "1.0").toDouble
+    val bucketNum = param.getOrElse("bucketNum", "200").toInt
+    // bucketNum <= 0 would produce a zero or negative step in the cut-point loop.
+    require(bucketNum >= 1, "bucketNum must be >= 1, got " + bucketNum)
+
+    val data = sc.textFile(readPath)
+    println("问题数据数量:" + data.filter(r => r.split("\t").length != 3).count())
+    // NOTE(review): data1 is not persisted, so each per-feature action below
+    // recomputes it from readPath; consider persisting if runtime matters.
+    val data1 = data
+      .map(r => r.split("\t"))
+      // Rows without a third column cannot be parsed; skip them instead of failing
+      // the whole job with ArrayIndexOutOfBoundsException.
+      .filter(rList => rList.length > 2)
+      .map(rList => {
+        val jsons = JSON.parseObject(rList(2))
+        val doubles = scala.collection.mutable.Map[String, Double]()
+        // Explicit Java iteration avoids relying on implicit JavaConversions.
+        val keys = jsons.keySet().iterator()
+        while (keys.hasNext) {
+          val key = keys.next()
+          doubles.put(key, jsons.getDoubleValue(key))
+        }
+        doubles
+      }).sample(false, sampleRate).repartition(20)
+
+    val result = new ArrayBuffer[String]()
+
+    // Iterate over the names directly: indexing a List inside the RDD closure is
+    // O(i) per record and captures the whole list in the serialized closure.
+    for (featureName <- contentList) {
+      println("特征:" + featureName)
+      val data2 = data1.map(r => r.getOrElse(featureName, 0D)).filter(_ > 1E-8).collect().sorted
+      val len = data2.length
+      if (len == 0) {
+        result += (featureName + "\t" + bucketNum.toString + "\t" + "0")
+      } else {
+        // Step between cut points; at least 1 so every bucket holds an element.
+        // With a single bucket there is no interior cut point, so step over everything
+        // (also avoids dividing by bucketNum - 1 == 0).
+        val oneBucketNum = if (bucketNum > 1) (len - 1) / (bucketNum - 1) + 1 else len
+        val buffers = new ArrayBuffer[Double]()
+
+        var lastBucketValue = data2(0) // value at the previous cut position
+        for (j <- 0 until len by oneBucketNum) {
+          val d = data2(j)
+          if (j > 0 && d != lastBucketValue) {
+            // Keep the cut point only when it differs from the previous one.
+            buffers += d
+          }
+          lastBucketValue = d // remember the previous cut position's value
+        }
+
+        // The last bucket must end at the largest observed value.
+        if (!buffers.contains(data2.last)) {
+          buffers += data2.last
+        }
+        result += (featureName + "\t" + bucketNum.toString + "\t" + buffers.mkString(","))
+      }
+    }
+    val data3 = sc.parallelize(result)
+
+
+    // 4 Save the result to HDFS
+    val hdfsPath = savePath + "/" + fileName
+    // Only paths under the model directory may be deleted and rewritten.
+    if (hdfsPath.nonEmpty && hdfsPath.startsWith("/dw/recommend/model/")) {
+      println("删除路径并开始数据写入:" + hdfsPath)
+      MyHdfsUtils.delete_hdfs_path(hdfsPath)
+      data3.repartition(1).saveAsTextFile(hdfsPath, classOf[GzipCodec])
+    } else {
+      println("路径不合法,无法写入:" + hdfsPath)
+    }
+  }
+}

+ 1 - 0
src/main/scala/com/aliyun/odps/spark/examples/makedata_recsys_r_rate/makedata_recsys_61_originData_20241209.scala

@@ -289,6 +289,7 @@ object makedata_recsys_61_originData_20241209 {
             ---------------------------------------------------------------
             视频特征:(4*7+3*2+2*4)*10 = 420个
             CF: 13个
+            视频相似特征: 9个
 
 
              */