get_ad_out_sample_v2_user.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. #coding utf-8
  2. from tqdm import tqdm
  3. import sys
  4. import json
  5. import traceback
  6. from threading import Timer
  7. from tqdm import tqdm
  8. from utils import RedisHelper, data_check, get_feature_data, send_msg_to_feishu
  9. from config import set_config
  10. from log import Log
  11. config_, _ = set_config()
  12. log_ = Log()
  13. redis_helper = RedisHelper()
  14. from feature import get_user_features
  15. from lr_model import LrModel
  16. from utils import exe_sql
  17. if __name__ == "__main__":
  18. project = 'loghubods'
  19. datetime = sys.argv[1]
  20. sql = """
  21. --odps sql
  22. --********************************************************************--
  23. --author:研发
  24. --create time:2023-12-01 15:48:17
  25. --********************************************************************--
  26. with candidate as (
  27. select
  28. -- 基础特征_用户
  29. mid AS u_id
  30. ,machineinfo_brand AS u_brand
  31. ,machineinfo_model AS u_device
  32. ,SPLIT(machineinfo_system,' ')[0] AS u_system
  33. ,machineinfo_system AS u_system_ver
  34. -- 基础特征_视频
  35. ,videoid AS i_id
  36. ,i_up_id AS i_up_id
  37. ,tags as i_tag
  38. ,title as i_title
  39. ,ceil(log2(i_title_len + 1)) as i_title_len
  40. ,ceil(log2(total_time + 1)) as i_play_len
  41. ,ceil(log2(i_days_since_upload + 1)) as i_days_since_upload -- 发布时间(距离现在天数)
  42. -- 基础特征_场景
  43. ,ctx_day AS ctx_day
  44. ,ctx_week AS ctx_week
  45. ,ctx_hour AS ctx_hour
  46. ,ctx_region as ctx_region
  47. ,ctx_city as ctx_city
  48. -- 基础特征_交叉
  49. ,ui_is_out as ui_is_out
  50. ,i_play_len as playtime
  51. -- ,IF(i_play_len > 1,'0','1') AS ui_is_out_new
  52. ,rootmid AS ui_root_id
  53. ,shareid AS ui_share_id
  54. -- 统计特征_用户
  55. ,ceil(log2(u_1day_exp_cnt + 1)) as u_1day_exp_cnt
  56. ,ceil(log2(u_1day_click_cnt + 1)) as u_1day_click_cnt
  57. ,ceil(log2(u_1day_share_cnt + 1)) as u_1day_share_cnt
  58. ,ceil(log2(u_1day_return_cnt + 1)) as u_1day_return_cnt
  59. ,ceil(log2(u_3day_exp_cnt + 1)) as u_3day_exp_cnt
  60. ,ceil(log2(u_3day_click_cnt + 1)) as u_3day_click_cnt
  61. ,ceil(log2(u_3day_share_cnt + 1)) as u_3day_share_cnt
  62. ,ceil(log2(u_3day_return_cnt + 1)) as u_3day_return_cnt
  63. ,ceil(log2(u_7day_exp_cnt + 1)) as u_7day_exp_cnt
  64. ,ceil(log2(u_7day_click_cnt + 1)) as u_7day_click_cnt
  65. ,ceil(log2(u_7day_share_cnt + 1)) as u_7day_share_cnt
  66. ,ceil(log2(u_7day_return_cnt + 1)) as u_7day_return_cnt
  67. ,ceil(log2(u_3month_exp_cnt + 1)) as u_3month_exp_cnt
  68. ,ceil(log2(u_3month_click_cnt + 1)) as u_3month_click_cnt
  69. ,ceil(log2(u_3month_share_cnt + 1)) as u_3month_share_cnt
  70. ,ceil(log2(u_3month_return_cnt + 1)) as u_3month_return_cnt
  71. ,round(if(u_ctr_1day > 10.0, 10.0, u_ctr_1day) / 10.0, 6) as u_ctr_1day
  72. ,round(if(u_str_1day > 10.0, 10.0, u_str_1day) / 10.0, 6) as u_str_1day
  73. ,round(if(u_rov_1day > 10.0, 10.0, u_rov_1day) / 10.0, 6) as u_rov_1day
  74. ,round(if(u_ros_1day > 10.0, 10.0, u_ros_1day) / 10.0, 6) as u_ros_1day
  75. ,round(if(u_ctr_3day > 10.0, 10.0, u_ctr_3day) / 10.0, 6) as u_ctr_3day
  76. ,round(if(u_str_3day > 10.0, 10.0, u_str_3day) / 10.0, 6) as u_str_3day
  77. ,round(if(u_rov_3day > 10.0, 10.0, u_rov_3day) / 10.0, 6) as u_rov_3day
  78. ,round(if(u_ros_3day > 10.0, 10.0, u_ros_3day) / 10.0, 6) as u_ros_3day
  79. ,round(if(u_ctr_7day > 10.0, 10.0, u_ctr_7day) / 10.0, 6) as u_ctr_7day
  80. ,round(if(u_str_7day > 10.0, 10.0, u_str_7day) / 10.0, 6) as u_str_7day
  81. ,round(if(u_rov_7day > 10.0, 10.0, u_rov_7day) / 10.0, 6) as u_rov_7day
  82. ,round(if(u_ros_7day > 10.0, 10.0, u_ros_7day) / 10.0, 6) as u_ros_7day
  83. ,round(if(u_ctr_3month > 10.0, 10.0, u_ctr_3month) / 10.0, 6) as u_ctr_3month
  84. ,round(if(u_str_3month > 10.0, 10.0, u_str_3month) / 10.0, 6) as u_str_3month
  85. ,round(if(u_rov_3month > 10.0, 10.0, u_rov_3month) / 10.0, 6) as u_rov_3month
  86. ,round(if(u_ros_3month > 10.0, 10.0, u_ros_3month) / 10.0, 6) as u_ros_3month
  87. -- 统计特征_视频
  88. ,ceil(log2(i_1day_exp_cnt + 1)) as i_1day_exp_cnt
  89. ,ceil(log2(i_1day_click_cnt + 1)) as i_1day_click_cnt
  90. ,ceil(log2(i_1day_share_cnt + 1)) as i_1day_share_cnt
  91. ,ceil(log2(i_1day_return_cnt + 1)) as i_1day_return_cnt
  92. ,ceil(log2(i_3day_exp_cnt + 1)) as i_3day_exp_cnt
  93. ,ceil(log2(i_3day_click_cnt + 1)) as i_3day_click_cnt
  94. ,ceil(log2(i_3day_share_cnt + 1)) as i_3day_share_cnt
  95. ,ceil(log2(i_3day_return_cnt + 1)) as i_3day_return_cnt
  96. ,ceil(log2(i_7day_exp_cnt + 1)) as i_7day_exp_cnt
  97. ,ceil(log2(i_7day_click_cnt + 1)) as i_7day_click_cnt
  98. ,ceil(log2(i_7day_share_cnt + 1)) as i_7day_share_cnt
  99. ,ceil(log2(i_7day_return_cnt + 1)) as i_7day_return_cnt
  100. ,ceil(log2(i_3month_exp_cnt + 1)) as i_3month_exp_cnt
  101. ,ceil(log2(i_3month_click_cnt + 1)) as i_3month_click_cnt
  102. ,ceil(log2(i_3month_share_cnt + 1)) as i_3month_share_cnt
  103. ,ceil(log2(i_3month_return_cnt + 1)) as i_3month_return_cnt
  104. ,round(if(i_ctr_1day > 10.0, 10.0, i_ctr_1day) / 10.0, 6) as i_ctr_1day
  105. ,round(if(i_str_1day > 10.0, 10.0, i_str_1day) / 10.0, 6) as i_str_1day
  106. ,round(if(i_rov_1day > 10.0, 10.0, i_rov_1day) / 10.0, 6) as i_rov_1day
  107. ,round(if(i_ros_1day > 10.0, 10.0, i_ros_1day) / 10.0, 6) as i_ros_1day
  108. ,round(if(i_ctr_3day > 10.0, 10.0, i_ctr_3day) / 10.0, 6) as i_ctr_3day
  109. ,round(if(i_str_3day > 10.0, 10.0, i_str_3day) / 10.0, 6) as i_str_3day
  110. ,round(if(i_rov_3day > 10.0, 10.0, i_rov_3day) / 10.0, 6) as i_rov_3day
  111. ,round(if(i_ros_3day > 10.0, 10.0, i_ros_3day) / 10.0, 6) as i_ros_3day
  112. ,round(if(i_ctr_7day > 10.0, 10.0, i_ctr_7day) / 10.0, 6) as i_ctr_7day
  113. ,round(if(i_str_7day > 10.0, 10.0, i_str_7day) / 10.0, 6) as i_str_7day
  114. ,round(if(i_rov_7day > 10.0, 10.0, i_rov_7day) / 10.0, 6) as i_rov_7day
  115. ,round(if(i_ros_7day > 10.0, 10.0, i_ros_7day) / 10.0, 6) as i_ros_7day
  116. ,round(if(i_ctr_3month > 10.0, 10.0, i_ctr_3month) / 10.0, 6) as i_ctr_3month
  117. ,round(if(i_str_3month > 10.0, 10.0, i_str_3month) / 10.0, 6) as i_str_3month
  118. ,round(if(i_rov_3month > 10.0, 10.0, i_rov_3month) / 10.0, 6) as i_rov_3month
  119. ,round(if(i_ros_3month > 10.0, 10.0, i_ros_3month) / 10.0, 6) as i_ros_3month
  120. from
  121. user_video_features_data_final
  122. where dt='{datetime}'
  123. and ad_ornot = '0'
  124. and apptype != '13'
  125. ), candidate_user as (
  126. SELECT
  127. u_id,
  128. max(u_brand) as u_brand,
  129. max(u_device) as u_device,
  130. max(u_system) as u_system,
  131. max(u_system_ver) as u_system_ver,
  132. max(ctx_region) as ctx_region,
  133. max(ctx_city) as ctx_city,
  134. max(u_1day_exp_cnt) as u_1day_exp_cnt,
  135. max(u_1day_click_cnt) as u_1day_click_cnt,
  136. max(u_1day_share_cnt) as u_1day_share_cnt,
  137. max(u_1day_return_cnt) as u_1day_return_cnt,
  138. max(u_3day_exp_cnt) as u_3day_exp_cnt,
  139. max(u_3day_click_cnt) as u_3day_click_cnt,
  140. max(u_3day_share_cnt) as u_3day_share_cnt,
  141. max(u_3day_return_cnt) as u_3day_return_cnt,
  142. max(u_7day_exp_cnt) as u_7day_exp_cnt,
  143. max(u_7day_click_cnt) as u_7day_click_cnt,
  144. max(u_7day_share_cnt) as u_7day_share_cnt,
  145. max(u_7day_return_cnt) as u_7day_return_cnt,
  146. max(u_3month_exp_cnt) as u_3month_exp_cnt,
  147. max(u_3month_click_cnt) as u_3month_click_cnt,
  148. max(u_3month_share_cnt) as u_3month_share_cnt,
  149. max(u_3month_return_cnt) as u_3month_return_cnt,
  150. max(u_ctr_1day) as u_ctr_1day,
  151. max(u_str_1day) as u_str_1day,
  152. max(u_rov_1day) as u_rov_1day,
  153. max(u_ros_1day) as u_ros_1day,
  154. max(u_ctr_3day) as u_ctr_3day,
  155. max(u_str_3day) as u_str_3day,
  156. max(u_rov_3day) as u_rov_3day,
  157. max(u_ros_3day) as u_ros_3day,
  158. max(u_ctr_7day) as u_ctr_7day,
  159. max(u_str_7day) as u_str_7day,
  160. max(u_rov_7day) as u_rov_7day,
  161. max(u_ros_7day) as u_ros_7day,
  162. max(u_ctr_3month) as u_ctr_3month,
  163. max(u_str_3month) as u_str_3month,
  164. max(u_rov_3month) as u_rov_3month,
  165. max(u_ros_3month) as u_ros_3month
  166. FROM
  167. candidate
  168. group by u_id
  169. ), candidate_item as (
  170. select
  171. i_id,
  172. max(i_up_id) as i_up_id,
  173. max(i_title_len) as i_title_len,
  174. max(i_play_len) as i_play_len,
  175. max(i_days_since_upload) as i_days_since_upload,
  176. max(i_1day_exp_cnt) as i_1day_exp_cnt,
  177. max(i_1day_click_cnt) as i_1day_click_cnt,
  178. max(i_1day_share_cnt) as i_1day_share_cnt,
  179. max(i_1day_return_cnt) as i_1day_return_cnt,
  180. max(i_3day_exp_cnt) as i_3day_exp_cnt,
  181. max(i_3day_click_cnt) as i_3day_click_cnt,
  182. max(i_3day_share_cnt) as i_3day_share_cnt,
  183. max(i_3day_return_cnt) as i_3day_return_cnt,
  184. max(i_7day_exp_cnt) as i_7day_exp_cnt,
  185. max(i_7day_click_cnt) as i_7day_click_cnt,
  186. max(i_7day_share_cnt) as i_7day_share_cnt,
  187. max(i_7day_return_cnt) as i_7day_return_cnt,
  188. max(i_3month_exp_cnt) as i_3month_exp_cnt,
  189. max(i_3month_click_cnt) as i_3month_click_cnt,
  190. max(i_3month_share_cnt) as i_3month_share_cnt,
  191. max(i_3month_return_cnt) as i_3month_return_cnt,
  192. max(i_ctr_1day) as i_ctr_1day,
  193. max(i_str_1day) as i_str_1day,
  194. max(i_rov_1day) as i_rov_1day,
  195. max(i_ros_1day) as i_ros_1day,
  196. max(i_ctr_3day) as i_ctr_3day,
  197. max(i_str_3day) as i_str_3day,
  198. max(i_rov_3day) as i_rov_3day,
  199. max(i_ros_3day) as i_ros_3day,
  200. max(i_ctr_7day) as i_ctr_7day,
  201. max(i_str_7day) as i_str_7day,
  202. max(i_rov_7day) as i_rov_7day,
  203. max(i_ros_7day) as i_ros_7day,
  204. max(i_ctr_3month) as i_ctr_3month,
  205. max(i_str_3month) as i_str_3month,
  206. max(i_rov_3month) as i_rov_3month,
  207. max(i_ros_3month) as i_ros_3month
  208. FROM
  209. candidate
  210. group by i_id
  211. )
  212. SELECT
  213. *
  214. from candidate_user
  215. """.format(datetime=datetime)
  216. # print(sql)
  217. data = exe_sql(project, sql)
  218. print('sql done')
  219. # data.to_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t')
  220. # data = pd.read_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t', dtype=str)
  221. model_key = 'ad_out_v2_model_v1.day'
  222. lr_model = LrModel('model/{}.json'.format(model_key))
  223. user_h_dict = {}
  224. k_col = 'u_id'
  225. dt = datetime
  226. key_name_prefix = f"{config_.KEY_NAME_PREFIX_AD_OUT_MODEL_SCORE_USER}{model_key}"
  227. print(key_name_prefix)
  228. mean_user_h = 0.0
  229. count_user_h = 0
  230. with data.open_reader() as reader:
  231. for row in tqdm(reader):
  232. k = str(row['u_id'])
  233. user_features = get_user_features(row)
  234. user_h = lr_model.predict_h(user_features)
  235. redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", user_h, 28 * 3600)
  236. user_h_dict[k] = user_h
  237. mean_user_h += user_h
  238. count_user_h += 1
  239. # print(user_features)
  240. # print(user_h)
  241. mean_user_h = mean_user_h / count_user_h
  242. user_h_dict['mean'] = mean_user_h
  243. print(mean_user_h)
  244. print(count_user_h)
  245. k = 'mean'
  246. redis_helper.set_data_to_redis(f"{key_name_prefix}:{k}", mean_user_h, 28 * 3600)
  247. with open('{}.json'.format(key_name_prefix), 'w') as fout:
  248. json.dump(user_h_dict, fout, indent=2, ensure_ascii=False, sort_keys=True)