feature.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
# -*- coding: utf-8 -*-
  2. import sys
  3. import pandas as pd
  4. from tqdm import tqdm
  5. from collections import defaultdict
# Module-level mapping whose missing keys default to the empty string (str()).
# NOTE(review): nothing in the visible part of this file reads or writes it —
# confirm it is still used before relying on it.
types = defaultdict(str)
  7. item_sparse_conf = [
  8. # 基础特征_视频
  9. 'i_id',
  10. 'i_up_id',
  11. # 'i_tag',
  12. # 'i_title',
  13. 'i_title_len',
  14. 'i_play_len',
  15. 'i_days_since_upload',
  16. # 统计特征_视频
  17. 'i_1day_exp_cnt',
  18. 'i_1day_click_cnt',
  19. 'i_1day_share_cnt',
  20. 'i_1day_return_cnt',
  21. 'i_3day_exp_cnt',
  22. 'i_3day_click_cnt',
  23. 'i_3day_share_cnt',
  24. 'i_3day_return_cnt',
  25. 'i_7day_exp_cnt',
  26. 'i_7day_click_cnt',
  27. 'i_7day_share_cnt',
  28. 'i_7day_return_cnt',
  29. 'i_3month_exp_cnt',
  30. 'i_3month_click_cnt',
  31. 'i_3month_share_cnt',
  32. 'i_3month_return_cnt',
  33. ]
  34. item_dense_conf = [
  35. 'i_ctr_1day',
  36. 'i_str_1day',
  37. 'i_rov_1day',
  38. 'i_ros_1day',
  39. 'i_ctr_3day',
  40. 'i_str_3day',
  41. 'i_rov_3day',
  42. 'i_ros_3day',
  43. 'i_ctr_7day',
  44. 'i_str_7day',
  45. 'i_rov_7day',
  46. 'i_ros_7day',
  47. 'i_ctr_3month',
  48. 'i_str_3month',
  49. 'i_rov_3month',
  50. 'i_ros_3month',
  51. ]
  52. user_sparse_conf = [
  53. 'u_brand',
  54. 'u_device',
  55. 'u_system',
  56. 'u_system_ver',
  57. 'ctx_region',
  58. 'ctx_city',
  59. # 统计特征_用户
  60. 'u_cycle_bucket_7days',
  61. 'u_cycle_bucket_30days',
  62. 'u_share_bucket_30days',
  63. 'u_1day_exp_cnt',
  64. 'u_1day_click_cnt',
  65. 'u_1day_share_cnt',
  66. 'u_1day_return_cnt',
  67. 'u_3day_exp_cnt',
  68. 'u_3day_click_cnt',
  69. 'u_3day_share_cnt',
  70. 'u_3day_return_cnt',
  71. 'u_7day_exp_cnt',
  72. 'u_7day_click_cnt',
  73. 'u_7day_share_cnt',
  74. 'u_7day_return_cnt',
  75. 'u_3month_exp_cnt',
  76. 'u_3month_click_cnt',
  77. 'u_3month_share_cnt',
  78. 'u_3month_return_cnt',
  79. ]
  80. user_dense_conf = [
  81. 'u_ctr_1day',
  82. 'u_str_1day',
  83. 'u_rov_1day',
  84. 'u_ros_1day',
  85. 'u_ctr_3day',
  86. 'u_str_3day',
  87. 'u_rov_3day',
  88. 'u_ros_3day',
  89. 'u_ctr_7day',
  90. 'u_str_7day',
  91. 'u_rov_7day',
  92. 'u_ros_7day',
  93. 'u_ctr_3month',
  94. 'u_str_3month',
  95. 'u_rov_3month',
  96. 'u_ros_3month',
  97. ]
  98. def format_x(x):
  99. if x is None:
  100. x = ''
  101. return str(x).replace(' ', '').replace(':', '_')
  102. def sparse_fea_2_feature(v, k):
  103. f_k = format_x(k)
  104. f_v = format_x(v)
  105. if len(f_v) < 1:
  106. return (None, None)
  107. return ('#'.join([f_k, f_v]), 1.0)
  108. def dense_fea_2_feature(v, k):
  109. f_k = format_x(k)
  110. f_v = format_x(v)
  111. if len(f_v) < 1:
  112. return (None, None)
  113. return (f_k, float(f_v))
  114. def get_features(sparse_conf, dense_conf, row):
  115. features = dict(map(lambda k:sparse_fea_2_feature(row[k], k), sparse_conf))
  116. dense_features = dict(map(lambda k:dense_fea_2_feature(row[k], k), dense_conf))
  117. features.update(dense_features)
  118. if None in features:
  119. del(features[None])
  120. return features
  121. def get_item_features(row):
  122. return get_features(item_sparse_conf, item_dense_conf, row)
  123. def get_user_features(row):
  124. return get_features(user_sparse_conf, user_dense_conf, row)
  125. label_col = 'ui_is_out'
  126. sparse_fea_cols = [
  127. # 'u_id',
  128. # 基础特征_场景
  129. #'ctx_day','
  130. 'ctx_apptype',
  131. 'ctx_week',
  132. 'ctx_hour',
  133. # 基础特征_交叉
  134. #'ui_is_out',
  135. #'playtime',
  136. #'ui_root_id',
  137. #'ui_share_id',
  138. ]
  139. dense_fea_cols = [
  140. ]