get_ad_out_sample_v2_item.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265
  1. #coding utf-8
  2. from odps import ODPS
  3. import pandas as pd
  4. from collections import defaultdict
  5. from tqdm import tqdm
  6. import sys
  7. import requests
  8. import json
  9. import datetime
  10. import traceback
  11. from threading import Timer
  12. from utils import RedisHelper, data_check, get_feature_data, send_msg_to_feishu
  13. from config import set_config
  14. from log import Log
  15. config_, _ = set_config()
  16. log_ = Log()
  17. redis_helper = RedisHelper()
  18. from feature import get_item_features
  19. from lr_model import LrModel
  20. odps = ODPS(
  21. access_id='LTAIWYUujJAm7CbH',
  22. secret_access_key='RfSjdiWwED1sGFlsjXv0DlfTnZTG1P',
  23. project="loghubods",
  24. endpoint='http://service.cn.maxcompute.aliyun.com/api')
  25. def get_data_from_odps(date, project, table, connect_timeout=3000, read_timeout=500000,
  26. pool_maxsize=1000, pool_connections=1000):
  27. """
  28. :param pool_connections:
  29. :return: records
  30. """
  31. records = odps.read_table(name=table, partition='dt=%s' % date)
  32. return records
  33. def exe_sql(sql):
  34. data = []
  35. with odps.execute_sql(sql).open_reader() as reader:
  36. d = defaultdict(list) #
  37. for record in reader:
  38. for res in record:
  39. d[res[0]].append(res[1]) #
  40. #data = pd.DataFrame.from_dict(d, orient='index', dtype=str).T #
  41. data = pd.DataFrame.from_dict(d, orient='columns', dtype=str) #
  42. return data
  43. if __name__=="__main__":
  44. project = 'loghubods'
  45. datetime = sys.argv[1]
  46. sql = """
  47. --odps sql
  48. --********************************************************************--
  49. --author:研发
  50. --create time:2023-12-01 15:48:17
  51. --********************************************************************--
  52. with candidate as (
  53. select
  54. -- 基础特征_用户
  55. mid AS u_id
  56. ,machineinfo_brand AS u_brand
  57. ,machineinfo_model AS u_device
  58. ,SPLIT(machineinfo_system,' ')[0] AS u_system
  59. ,machineinfo_system AS u_system_ver
  60. -- 基础特征_视频
  61. ,videoid AS i_id
  62. ,i_up_id AS i_up_id
  63. ,tags as i_tag
  64. ,title as i_title
  65. ,ceil(log2(i_title_len + 1)) as i_title_len
  66. ,ceil(log2(total_time + 1)) as i_play_len
  67. ,ceil(log2(i_days_since_upload + 1)) as i_days_since_upload -- 发布时间(距离现在天数)
  68. -- 基础特征_场景
  69. ,ctx_day AS ctx_day
  70. ,ctx_week AS ctx_week
  71. ,ctx_hour AS ctx_hour
  72. ,ctx_region as ctx_region
  73. ,ctx_city as ctx_city
  74. -- 基础特征_交叉
  75. ,ui_is_out as ui_is_out
  76. ,i_play_len as playtime
  77. -- ,IF(i_play_len > 1,'0','1') AS ui_is_out_new
  78. ,rootmid AS ui_root_id
  79. ,shareid AS ui_share_id
  80. -- 统计特征_用户
  81. ,ceil(log2(u_1day_exp_cnt + 1)) as u_1day_exp_cnt
  82. ,ceil(log2(u_1day_click_cnt + 1)) as u_1day_click_cnt
  83. ,ceil(log2(u_1day_share_cnt + 1)) as u_1day_share_cnt
  84. ,ceil(log2(u_1day_return_cnt + 1)) as u_1day_return_cnt
  85. ,ceil(log2(u_3day_exp_cnt + 1)) as u_3day_exp_cnt
  86. ,ceil(log2(u_3day_click_cnt + 1)) as u_3day_click_cnt
  87. ,ceil(log2(u_3day_share_cnt + 1)) as u_3day_share_cnt
  88. ,ceil(log2(u_3day_return_cnt + 1)) as u_3day_return_cnt
  89. ,ceil(log2(u_7day_exp_cnt + 1)) as u_7day_exp_cnt
  90. ,ceil(log2(u_7day_click_cnt + 1)) as u_7day_click_cnt
  91. ,ceil(log2(u_7day_share_cnt + 1)) as u_7day_share_cnt
  92. ,ceil(log2(u_7day_return_cnt + 1)) as u_7day_return_cnt
  93. ,ceil(log2(u_3month_exp_cnt + 1)) as u_3month_exp_cnt
  94. ,ceil(log2(u_3month_click_cnt + 1)) as u_3month_click_cnt
  95. ,ceil(log2(u_3month_share_cnt + 1)) as u_3month_share_cnt
  96. ,ceil(log2(u_3month_return_cnt + 1)) as u_3month_return_cnt
  97. ,round(if(u_ctr_1day > 10.0, 10.0, u_ctr_1day) / 10.0, 6) as u_ctr_1day
  98. ,round(if(u_str_1day > 10.0, 10.0, u_str_1day) / 10.0, 6) as u_str_1day
  99. ,round(if(u_rov_1day > 10.0, 10.0, u_rov_1day) / 10.0, 6) as u_rov_1day
  100. ,round(if(u_ros_1day > 10.0, 10.0, u_ros_1day) / 10.0, 6) as u_ros_1day
  101. ,round(if(u_ctr_3day > 10.0, 10.0, u_ctr_3day) / 10.0, 6) as u_ctr_3day
  102. ,round(if(u_str_3day > 10.0, 10.0, u_str_3day) / 10.0, 6) as u_str_3day
  103. ,round(if(u_rov_3day > 10.0, 10.0, u_rov_3day) / 10.0, 6) as u_rov_3day
  104. ,round(if(u_ros_3day > 10.0, 10.0, u_ros_3day) / 10.0, 6) as u_ros_3day
  105. ,round(if(u_ctr_7day > 10.0, 10.0, u_ctr_7day) / 10.0, 6) as u_ctr_7day
  106. ,round(if(u_str_7day > 10.0, 10.0, u_str_7day) / 10.0, 6) as u_str_7day
  107. ,round(if(u_rov_7day > 10.0, 10.0, u_rov_7day) / 10.0, 6) as u_rov_7day
  108. ,round(if(u_ros_7day > 10.0, 10.0, u_ros_7day) / 10.0, 6) as u_ros_7day
  109. ,round(if(u_ctr_3month > 10.0, 10.0, u_ctr_3month) / 10.0, 6) as u_ctr_3month
  110. ,round(if(u_str_3month > 10.0, 10.0, u_str_3month) / 10.0, 6) as u_str_3month
  111. ,round(if(u_rov_3month > 10.0, 10.0, u_rov_3month) / 10.0, 6) as u_rov_3month
  112. ,round(if(u_ros_3month > 10.0, 10.0, u_ros_3month) / 10.0, 6) as u_ros_3month
  113. -- 统计特征_视频
  114. ,ceil(log2(i_1day_exp_cnt + 1)) as i_1day_exp_cnt
  115. ,ceil(log2(i_1day_click_cnt + 1)) as i_1day_click_cnt
  116. ,ceil(log2(i_1day_share_cnt + 1)) as i_1day_share_cnt
  117. ,ceil(log2(i_1day_return_cnt + 1)) as i_1day_return_cnt
  118. ,ceil(log2(i_3day_exp_cnt + 1)) as i_3day_exp_cnt
  119. ,ceil(log2(i_3day_click_cnt + 1)) as i_3day_click_cnt
  120. ,ceil(log2(i_3day_share_cnt + 1)) as i_3day_share_cnt
  121. ,ceil(log2(i_3day_return_cnt + 1)) as i_3day_return_cnt
  122. ,ceil(log2(i_7day_exp_cnt + 1)) as i_7day_exp_cnt
  123. ,ceil(log2(i_7day_click_cnt + 1)) as i_7day_click_cnt
  124. ,ceil(log2(i_7day_share_cnt + 1)) as i_7day_share_cnt
  125. ,ceil(log2(i_7day_return_cnt + 1)) as i_7day_return_cnt
  126. ,ceil(log2(i_3month_exp_cnt + 1)) as i_3month_exp_cnt
  127. ,ceil(log2(i_3month_click_cnt + 1)) as i_3month_click_cnt
  128. ,ceil(log2(i_3month_share_cnt + 1)) as i_3month_share_cnt
  129. ,ceil(log2(i_3month_return_cnt + 1)) as i_3month_return_cnt
  130. ,round(if(i_ctr_1day > 10.0, 10.0, i_ctr_1day) / 10.0, 6) as i_ctr_1day
  131. ,round(if(i_str_1day > 10.0, 10.0, i_str_1day) / 10.0, 6) as i_str_1day
  132. ,round(if(i_rov_1day > 10.0, 10.0, i_rov_1day) / 10.0, 6) as i_rov_1day
  133. ,round(if(i_ros_1day > 10.0, 10.0, i_ros_1day) / 10.0, 6) as i_ros_1day
  134. ,round(if(i_ctr_3day > 10.0, 10.0, i_ctr_3day) / 10.0, 6) as i_ctr_3day
  135. ,round(if(i_str_3day > 10.0, 10.0, i_str_3day) / 10.0, 6) as i_str_3day
  136. ,round(if(i_rov_3day > 10.0, 10.0, i_rov_3day) / 10.0, 6) as i_rov_3day
  137. ,round(if(i_ros_3day > 10.0, 10.0, i_ros_3day) / 10.0, 6) as i_ros_3day
  138. ,round(if(i_ctr_7day > 10.0, 10.0, i_ctr_7day) / 10.0, 6) as i_ctr_7day
  139. ,round(if(i_str_7day > 10.0, 10.0, i_str_7day) / 10.0, 6) as i_str_7day
  140. ,round(if(i_rov_7day > 10.0, 10.0, i_rov_7day) / 10.0, 6) as i_rov_7day
  141. ,round(if(i_ros_7day > 10.0, 10.0, i_ros_7day) / 10.0, 6) as i_ros_7day
  142. ,round(if(i_ctr_3month > 10.0, 10.0, i_ctr_3month) / 10.0, 6) as i_ctr_3month
  143. ,round(if(i_str_3month > 10.0, 10.0, i_str_3month) / 10.0, 6) as i_str_3month
  144. ,round(if(i_rov_3month > 10.0, 10.0, i_rov_3month) / 10.0, 6) as i_rov_3month
  145. ,round(if(i_ros_3month > 10.0, 10.0, i_ros_3month) / 10.0, 6) as i_ros_3month
  146. from
  147. user_video_features_data_final
  148. where dt='{datetime}'
  149. and ad_ornot = '0'
  150. and apptype != '13'
  151. ), candidate_user as (
  152. SELECT
  153. u_id,
  154. max(u_1day_exp_cnt) as u_1day_exp_cnt,
  155. max(u_1day_click_cnt) as u_1day_click_cnt,
  156. max(u_1day_share_cnt) as u_1day_share_cnt,
  157. max(u_1day_return_cnt) as u_1day_return_cnt,
  158. max(u_3day_exp_cnt) as u_3day_exp_cnt,
  159. max(u_3day_click_cnt) as u_3day_click_cnt,
  160. max(u_3day_share_cnt) as u_3day_share_cnt,
  161. max(u_3day_return_cnt) as u_3day_return_cnt,
  162. max(u_7day_exp_cnt) as u_7day_exp_cnt,
  163. max(u_7day_click_cnt) as u_7day_click_cnt,
  164. max(u_7day_share_cnt) as u_7day_share_cnt,
  165. max(u_7day_return_cnt) as u_7day_return_cnt,
  166. max(u_3month_exp_cnt) as u_3month_exp_cnt,
  167. max(u_3month_click_cnt) as u_3month_click_cnt,
  168. max(u_3month_share_cnt) as u_3month_share_cnt,
  169. max(u_3month_return_cnt) as u_3month_return_cnt,
  170. max(u_ctr_1day) as u_ctr_1day,
  171. max(u_str_1day) as u_str_1day,
  172. max(u_rov_1day) as u_rov_1day,
  173. max(u_ros_1day) as u_ros_1day,
  174. max(u_ctr_3day) as u_ctr_3day,
  175. max(u_str_3day) as u_str_3day,
  176. max(u_rov_3day) as u_rov_3day,
  177. max(u_ros_3day) as u_ros_3day,
  178. max(u_ctr_7day) as u_ctr_7day,
  179. max(u_str_7day) as u_str_7day,
  180. max(u_rov_7day) as u_rov_7day,
  181. max(u_ros_7day) as u_ros_7day,
  182. max(u_ctr_3month) as u_ctr_3month,
  183. max(u_str_3month) as u_str_3month,
  184. max(u_rov_3month) as u_rov_3month,
  185. max(u_ros_3month) as u_ros_3month
  186. FROM
  187. candidate
  188. group by u_id
  189. ), candidate_item as (
  190. select
  191. i_id,
  192. max(i_up_id) as i_up_id,
  193. max(i_title_len) as i_title_len,
  194. max(i_play_len) as i_play_len,
  195. max(i_days_since_upload) as i_days_since_upload,
  196. max(i_1day_exp_cnt) as i_1day_exp_cnt,
  197. max(i_1day_click_cnt) as i_1day_click_cnt,
  198. max(i_1day_share_cnt) as i_1day_share_cnt,
  199. max(i_1day_return_cnt) as i_1day_return_cnt,
  200. max(i_3day_exp_cnt) as i_3day_exp_cnt,
  201. max(i_3day_click_cnt) as i_3day_click_cnt,
  202. max(i_3day_share_cnt) as i_3day_share_cnt,
  203. max(i_3day_return_cnt) as i_3day_return_cnt,
  204. max(i_7day_exp_cnt) as i_7day_exp_cnt,
  205. max(i_7day_click_cnt) as i_7day_click_cnt,
  206. max(i_7day_share_cnt) as i_7day_share_cnt,
  207. max(i_7day_return_cnt) as i_7day_return_cnt,
  208. max(i_3month_exp_cnt) as i_3month_exp_cnt,
  209. max(i_3month_click_cnt) as i_3month_click_cnt,
  210. max(i_3month_share_cnt) as i_3month_share_cnt,
  211. max(i_3month_return_cnt) as i_3month_return_cnt,
  212. max(i_ctr_1day) as i_ctr_1day,
  213. max(i_str_1day) as i_str_1day,
  214. max(i_rov_1day) as i_rov_1day,
  215. max(i_ros_1day) as i_ros_1day,
  216. max(i_ctr_3day) as i_ctr_3day,
  217. max(i_str_3day) as i_str_3day,
  218. max(i_rov_3day) as i_rov_3day,
  219. max(i_ros_3day) as i_ros_3day,
  220. max(i_ctr_7day) as i_ctr_7day,
  221. max(i_str_7day) as i_str_7day,
  222. max(i_rov_7day) as i_rov_7day,
  223. max(i_ros_7day) as i_ros_7day,
  224. max(i_ctr_3month) as i_ctr_3month,
  225. max(i_str_3month) as i_str_3month,
  226. max(i_rov_3month) as i_rov_3month,
  227. max(i_ros_3month) as i_ros_3month
  228. FROM
  229. candidate
  230. group by i_id
  231. )
  232. SELECT
  233. *
  234. from candidate_item
  235. """.format(datetime=datetime)
  236. print(sql)
  237. data = exe_sql(sql)
  238. print('sql done')
  239. #data.to_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t')
  240. #data = pd.read_csv('./data/ad_out_sample_v2_item.{datetime}'.format(datetime=datetime), sep='\t', dtype=str)
  241. data.fillna('', inplace=True)
  242. lr_model = LrModel('model/ad_out_v2_model_v1.day.json')
  243. item_h_dict = {}
  244. k_col = 'i_id'
  245. for index, row in tqdm(data.iterrows()):
  246. k = row['i_id']
  247. item_features = get_item_features(row)
  248. item_h = lr_model.predict_h(item_features)
  249. item_h_dict[k] = item_h
  250. #print(item_features)
  251. #print(item_h)
  252. dt = datetime
  253. data_key = 'test_lr_v1'
  254. key_name = f"{config_.KEY_NAME_PREFIX_AD_GROUP}{data_key}:{dt}"
  255. print(key_name)
  256. redis_helper.add_data_with_zset(key_name=key_name, data=item_h_dict, expire_time=2 * 24 * 3600)
  257. with open('test_item.json', 'w') as fout:
  258. json.dump(item_h_dict, fout, indent=2, ensure_ascii=False, sort_keys=True)