region_rule_rank_h.py 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791
  1. # -*- coding: utf-8 -*-
  2. # @ModuleName: region_rule_rank_h
  3. # @Author: Liqian
  4. # @Time: 2022/5/5 15:54
  5. # @Software: PyCharm
  6. import multiprocessing
  7. import os
  8. import sys
  9. import traceback
  10. import gevent
  11. import datetime
  12. import pandas as pd
  13. import math
  14. from functools import reduce
  15. from odps import ODPS
  16. from threading import Timer, Thread
  17. from utils import MysqlHelper, RedisHelper, get_data_from_odps, filter_video_status, filter_shield_video, \
  18. check_table_partition_exits, filter_video_status_app, send_msg_to_feishu, filter_political_videos
  19. from config import set_config
  20. from log import Log
  21. from check_video_limit_distribute import update_limit_video_score
  22. # os.environ['NUMEXPR_MAX_THREADS'] = '16'
# Module-wide configuration object and logger shared by every function below.
config_, _ = set_config()
log_ = Log()
region_code = config_.REGION_CODE
# Column names copied from each ODPS record into the feature DataFrame
# (see get_feature_data). The first three identify a row; the others are
# the numeric features that merge_df sums per (videoid, code).
features = [
    'apptype',
    'code',
    'videoid',
    'lastonehour_preview',  # users pre-exposed in the past hour
    'lastonehour_view',  # users exposed in the past hour
    'lastonehour_play',  # users who played in the past hour
    'lastonehour_share',  # users who shared in the past hour
    'lastonehour_return',  # users who returned in the past hour from shares made in the past hour
    'lastonehour_preview_total',  # pre-exposure count in the past hour
    'lastonehour_view_total',  # exposure count in the past hour
    'lastonehour_play_total',  # play count in the past hour
    'lastonehour_share_total',  # share count in the past hour
    'platform_return',
    'lastonehour_show',  # not split by region
    'lastonehour_show_region',  # grouped by region
]
  43. def get_region_code(region):
  44. """获取省份对应的code"""
  45. mysql_helper = MysqlHelper(mysql_info=config_.MYSQL_INFO)
  46. sql = f"SELECT ad_code FROM region_adcode WHERE parent_id = 0 AND region LIKE '{region}%';"
  47. ad_code = mysql_helper.get_data(sql=sql)
  48. return ad_code[0][0]
  49. def h_data_check(project, table, now_date):
  50. """检查数据是否准备好"""
  51. odps = ODPS(
  52. access_id=config_.ODPS_CONFIG['ACCESSID'],
  53. secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
  54. project=project,
  55. endpoint=config_.ODPS_CONFIG['ENDPOINT'],
  56. connect_timeout=3000,
  57. read_timeout=500000,
  58. pool_maxsize=1000,
  59. pool_connections=1000
  60. )
  61. try:
  62. dt = datetime.datetime.strftime(now_date, '%Y%m%d%H')
  63. check_res = check_table_partition_exits(date=dt, project=project, table=table)
  64. if check_res:
  65. sql = f'select * from {project}.{table} where dt = {dt}'
  66. with odps.execute_sql(sql=sql).open_reader() as reader:
  67. data_count = reader.count
  68. else:
  69. data_count = 0
  70. except Exception as e:
  71. data_count = 0
  72. return data_count
  73. def get_rov_redis_key(now_date):
  74. """获取rov模型结果存放key"""
  75. redis_helper = RedisHelper()
  76. now_dt = datetime.datetime.strftime(now_date, '%Y%m%d')
  77. key_name = f'{config_.RECALL_KEY_NAME_PREFIX}{now_dt}'
  78. if not redis_helper.key_exists(key_name=key_name):
  79. pre_dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=1), '%Y%m%d')
  80. key_name = f'{config_.RECALL_KEY_NAME_PREFIX}{pre_dt}'
  81. return key_name
  82. def get_day_30day_videos(now_date, data_key, rule_key):
  83. """获取天级更新相对30天的视频id"""
  84. redis_helper = RedisHelper()
  85. day_30day_recall_key_prefix = config_.RECALL_KEY_NAME_PREFIX_30DAY
  86. now_dt = datetime.datetime.strftime(now_date, '%Y%m%d')
  87. day_30day_recall_key_name = f"{day_30day_recall_key_prefix}{data_key}:{rule_key}:{now_dt}"
  88. if not redis_helper.key_exists(key_name=day_30day_recall_key_name):
  89. redis_dt = datetime.datetime.strftime((now_date - datetime.timedelta(days=1)), '%Y%m%d')
  90. day_30day_recall_key_name = f"{day_30day_recall_key_prefix}{data_key}:{rule_key}:{redis_dt}"
  91. data = redis_helper.get_all_data_from_zset(key_name=day_30day_recall_key_name, with_scores=True)
  92. if data is None:
  93. return None
  94. video_ids = [int(video_id) for video_id, _ in data]
  95. return video_ids
  96. def get_feature_data(project, table, now_date):
  97. """获取特征数据"""
  98. dt = datetime.datetime.strftime(now_date, '%Y%m%d%H')
  99. # dt = '2022041310'
  100. records = get_data_from_odps(date=dt, project=project, table=table)
  101. feature_data = []
  102. for record in records:
  103. item = {}
  104. for feature_name in features:
  105. item[feature_name] = record[feature_name]
  106. feature_data.append(item)
  107. feature_df = pd.DataFrame(feature_data)
  108. return feature_df
  109. def cal_score(df, param):
  110. """
  111. 计算score
  112. :param df: 特征数据
  113. :param param: 规则参数
  114. :return:
  115. """
  116. # score计算公式: sharerate*backrate*logback*ctr
  117. # sharerate = lastonehour_share/(lastonehour_play+1000)
  118. # backrate = lastonehour_return/(lastonehour_share+10)
  119. # ctr = lastonehour_play/(lastonehour_preview+1000), 对ctr限最大值:K2 = 0.6 if ctr > 0.6 else ctr
  120. # score = sharerate * backrate * LOG(lastonehour_return+1) * K2
  121. df = df.fillna(0)
  122. df['share_rate'] = df['lastonehour_share'] / (df['lastonehour_play'] + 1000)
  123. df['back_rate'] = df['lastonehour_return'] / (df['lastonehour_share'] + 10)
  124. df['log_back'] = (df['lastonehour_return'] + 1).apply(math.log)
  125. if param.get('view_type', None) == 'video-show':
  126. df['ctr'] = df['lastonehour_play'] / (df['lastonehour_show'] + 1000)
  127. elif param.get('view_type', None) == 'video-show-region':
  128. df['ctr'] = df['lastonehour_play'] / (df['lastonehour_show_region'] + 1000)
  129. else:
  130. df['ctr'] = df['lastonehour_play'] / (df['lastonehour_preview'] + 1000)
  131. df['K2'] = df['ctr'].apply(lambda x: 0.6 if x > 0.6 else x)
  132. df['platform_return_rate'] = df['platform_return'] / df['lastonehour_return']
  133. df['score1'] = df['share_rate'] * df['back_rate'] * df['log_back'] * df['K2']
  134. click_score_rate = param.get('click_score_rate', None)
  135. back_score_rate = param.get('click_score_rate', None)
  136. if click_score_rate is not None:
  137. df['score'] = (1 - click_score_rate) * df['score1'] + click_score_rate * df['K2']
  138. elif back_score_rate is not None:
  139. df['score'] = (1 - back_score_rate) * df['score1'] + back_score_rate * df['back_rate']
  140. else:
  141. df['score'] = df['score1']
  142. df = df.sort_values(by=['score'], ascending=False)
  143. return df
  144. def add_videos(initial_df, now_date, rule_key, region, data_key):
  145. """
  146. 地域小时级数据列表中增加前6h优质视频
  147. :param initial_df: 地域小时级筛选结果
  148. :param now_date:
  149. :param data_key:
  150. :param region:
  151. :param rule_key:
  152. :return: df
  153. """
  154. redis_helper = RedisHelper()
  155. pre_h_data = []
  156. for pre_h in range(1, 7):
  157. pre_date = now_date - datetime.timedelta(hours=pre_h)
  158. pre_h_recall_key_name = f"{config_.RECALL_KEY_NAME_PREFIX_REGION_BY_H}{region}:{data_key}:{rule_key}:" \
  159. f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{pre_h}"
  160. initial_data = redis_helper.get_all_data_from_zset(key_name=pre_h_recall_key_name, with_scores=True)
  161. if initial_data is None:
  162. continue
  163. pre_h_data.extend(initial_data)
  164. pre_h_df = pd.DataFrame(data=pre_h_data, columns=['videoid', 'score'])
  165. score_list = initial_df['score'].to_list()
  166. if len(score_list) > 0:
  167. min_score = min(score_list)
  168. else:
  169. min_score = 0
  170. pre_h_df = pre_h_df[pre_h_df['score'] > min_score]
  171. df = pd.concat([initial_df, pre_h_df], ignore_index=True)
  172. # videoid去重,保留分值高
  173. df['videoid'] = df['videoid'].astype(int)
  174. df = df.sort_values(by=['score'], ascending=False)
  175. df = df.drop_duplicates(subset=['videoid'], keep="first")
  176. return df
  177. def video_rank(df, now_date, now_h, rule_key, param, region, data_key, rule_rank_h_flag, add_videos_in_20h=False):
  178. """
  179. 获取符合进入召回源条件的视频,与每日更新的rov模型结果视频列表进行合并
  180. :param df:
  181. :param now_date:
  182. :param now_h:
  183. :param rule_key: 小时级数据进入条件
  184. :param param: 小时级数据进入条件参数
  185. :param region: 所属地域
  186. :return:
  187. """
  188. redis_helper = RedisHelper()
  189. # 获取符合进入召回源条件的视频,进入条件:小时级回流>=20 && score>=0.005
  190. return_count = param.get('return_count', 1)
  191. score_value = param.get('score_rule', 0)
  192. platform_return_rate = param.get('platform_return_rate', 0)
  193. h_recall_df = df[(df['lastonehour_return'] >= return_count) & (df['score'] >= score_value)
  194. & (df['platform_return_rate'] >= platform_return_rate)]
  195. # videoid重复时,保留分值高
  196. h_recall_df = h_recall_df.sort_values(by=['score'], ascending=False)
  197. h_recall_df = h_recall_df.drop_duplicates(subset=['videoid'], keep='first')
  198. h_recall_df['videoid'] = h_recall_df['videoid'].astype(int)
  199. # 20点增加打捞的优质视频
  200. if now_h == 20 and add_videos_in_20h is True:
  201. # print(len(h_recall_df))
  202. h_recall_df = add_videos(initial_df=h_recall_df, now_date=now_date, rule_key=rule_key,
  203. region=region, data_key=data_key)
  204. # print(len(h_recall_df))
  205. h_recall_videos = h_recall_df['videoid'].to_list()
  206. # log_.info(f'h_recall videos count = {len(h_recall_videos)}')
  207. # 视频状态过滤
  208. if data_key in ['data7', ]:
  209. filtered_videos = filter_video_status_app(h_recall_videos)
  210. else:
  211. filtered_videos = filter_video_status(h_recall_videos)
  212. # log_.info('filtered_videos count = {}'.format(len(filtered_videos)))
  213. # 屏蔽视频过滤
  214. shield_key_name_list = config_.SHIELD_CONFIG.get(region, None)
  215. if shield_key_name_list is not None:
  216. filtered_videos = filter_shield_video(video_ids=filtered_videos, shield_key_name_list=shield_key_name_list)
  217. # log_.info(f"shield filtered_videos count = {len(filtered_videos)}")
  218. # 涉政视频过滤
  219. political_filter = param.get('political_filter', None)
  220. if political_filter is True:
  221. log_.info(f"political filter videos count = {len(filtered_videos)}")
  222. filtered_videos = filter_political_videos(video_ids=filtered_videos)
  223. log_.info(f"political filtered videos count = {len(filtered_videos)}")
  224. # 写入对应的redis
  225. h_video_ids = []
  226. by_30day_rule_key = param.get('30day_rule_key', None)
  227. if by_30day_rule_key is not None:
  228. # 与相对30天列表去重
  229. h_video_ids = get_day_30day_videos(now_date=now_date, data_key=data_key, rule_key=by_30day_rule_key)
  230. log_.info(f"h_video_ids count = {len(h_video_ids)}")
  231. if h_video_ids is not None:
  232. filtered_videos = [video_id for video_id in filtered_videos if int(video_id) not in h_video_ids]
  233. log_.info(f"filtered_videos count = {len(filtered_videos)}")
  234. h_recall_result = {}
  235. for video_id in filtered_videos:
  236. score = h_recall_df[h_recall_df['videoid'] == video_id]['score']
  237. # print(score)
  238. h_recall_result[int(video_id)] = float(score)
  239. h_video_ids.append(int(video_id))
  240. h_recall_key_name = \
  241. f"{config_.RECALL_KEY_NAME_PREFIX_REGION_BY_H}{region}:{data_key}:{rule_key}:" \
  242. f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  243. if len(h_recall_result) > 0:
  244. log_.info(f"h_recall_result count = {len(h_recall_result)}")
  245. redis_helper.add_data_with_zset(key_name=h_recall_key_name, data=h_recall_result, expire_time=2 * 24 * 3600)
  246. # 限流视频score调整
  247. update_limit_video_score(initial_videos=h_recall_result, key_name=h_recall_key_name)
  248. # 清空线上过滤应用列表
  249. # redis_helper.del_keys(key_name=f"{config_.REGION_H_VIDEO_FILER}{region}.{app_type}.{data_key}.{rule_key}")
  250. region_24h_rule_key = param.get('region_24h_rule_key', 'rule1')
  251. by_24h_rule_key = param.get('24h_rule_key', None)
  252. by_48h_rule_key = param.get('48h_rule_key', None)
  253. # 与其他召回视频池去重,存入对应的redis
  254. dup_to_redis(h_video_ids=h_video_ids, now_date=now_date, now_h=now_h, rule_key=rule_key,
  255. region_24h_rule_key=region_24h_rule_key, by_24h_rule_key=by_24h_rule_key, by_48h_rule_key=by_48h_rule_key,
  256. region=region, data_key=data_key, rule_rank_h_flag=rule_rank_h_flag, political_filter=political_filter)
  257. def dup_data(h_video_ids, initial_key_name, dup_key_name, region, political_filter):
  258. redis_helper = RedisHelper()
  259. if redis_helper.key_exists(key_name=initial_key_name):
  260. initial_data = redis_helper.get_all_data_from_zset(key_name=initial_key_name, with_scores=True)
  261. # 屏蔽视频过滤
  262. initial_video_ids = [int(video_id) for video_id, _ in initial_data]
  263. shield_key_name_list = config_.SHIELD_CONFIG.get(region, None)
  264. if shield_key_name_list is not None:
  265. initial_video_ids = filter_shield_video(video_ids=initial_video_ids,
  266. shield_key_name_list=shield_key_name_list)
  267. # 涉政视频过滤
  268. if political_filter is True:
  269. initial_video_ids = filter_political_videos(video_ids=initial_video_ids)
  270. dup_data = {}
  271. for video_id, score in initial_data:
  272. if int(video_id) not in h_video_ids and int(video_id) in initial_video_ids:
  273. dup_data[int(video_id)] = score
  274. h_video_ids.append(int(video_id))
  275. if len(dup_data) > 0:
  276. redis_helper.add_data_with_zset(key_name=dup_key_name, data=dup_data, expire_time=2 * 24 * 3600)
  277. # 限流视频score调整
  278. update_limit_video_score(initial_videos=dup_data, key_name=dup_key_name)
  279. return h_video_ids
  280. def dup_to_redis(h_video_ids, now_date, now_h, rule_key, region_24h_rule_key, by_24h_rule_key, by_48h_rule_key,
  281. region, data_key, rule_rank_h_flag, political_filter):
  282. """将地域分组小时级数据与其他召回视频池去重,存入对应的redis"""
  283. # ##### 去重更新地域分组小时级24h列表,并另存为redis中
  284. region_24h_key_name = \
  285. f"{config_.RECALL_KEY_NAME_PREFIX_REGION_BY_24H}{region}:{data_key}:{region_24h_rule_key}:" \
  286. f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  287. region_24h_dup_key_name = \
  288. f"{config_.RECALL_KEY_NAME_PREFIX_DUP1_REGION_24H_H}{region}:{data_key}:{rule_key}:" \
  289. f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  290. h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=region_24h_key_name,
  291. dup_key_name=region_24h_dup_key_name, region=region, political_filter=political_filter)
  292. if rule_rank_h_flag == '48h':
  293. # ##### 去重小程序相对48h更新结果,并另存为redis中
  294. h_48h_key_name = f"{config_.RECALL_KEY_NAME_PREFIX_BY_48H}{data_key}:{by_48h_rule_key}:" \
  295. f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  296. h_48h_dup_key_name = \
  297. f"{config_.RECALL_KEY_NAME_PREFIX_DUP2_REGION_48H_H}{region}:{data_key}:{rule_key}:" \
  298. f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  299. h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=h_48h_key_name,
  300. dup_key_name=h_48h_dup_key_name, region=region, political_filter=political_filter)
  301. # ##### 去重小程序相对48h 筛选后剩余数据 更新结果,并另存为redis中
  302. if by_48h_rule_key == 'rule1':
  303. other_h_48h_key_name = f"{config_.RECALL_KEY_NAME_PREFIX_BY_48H_OTHER}{data_key}:" \
  304. f"{by_48h_rule_key}:{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  305. other_h_48h_dup_key_name = \
  306. f"{config_.RECALL_KEY_NAME_PREFIX_DUP3_REGION_48H_H}{region}:{data_key}:{rule_key}:" \
  307. f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  308. h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=other_h_48h_key_name,
  309. dup_key_name=other_h_48h_dup_key_name, region=region,
  310. political_filter=political_filter)
  311. else:
  312. # ##### 去重小程序相对24h更新结果,并另存为redis中
  313. h_24h_key_name = f"{config_.RECALL_KEY_NAME_PREFIX_BY_24H}{data_key}:{by_24h_rule_key}:" \
  314. f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  315. h_24h_dup_key_name = \
  316. f"{config_.RECALL_KEY_NAME_PREFIX_DUP2_REGION_24H_H}{region}:{data_key}:{rule_key}:" \
  317. f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  318. h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=h_24h_key_name,
  319. dup_key_name=h_24h_dup_key_name, region=region, political_filter=political_filter)
  320. # ##### 去重小程序相对24h 筛选后剩余数据 更新结果,并另存为redis中
  321. # if by_24h_rule_key in ['rule3', 'rule4']:
  322. other_h_24h_key_name = f"{config_.RECALL_KEY_NAME_PREFIX_BY_24H_OTHER}{data_key}:" \
  323. f"{by_24h_rule_key}:{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  324. other_h_24h_dup_key_name = \
  325. f"{config_.RECALL_KEY_NAME_PREFIX_DUP3_REGION_24H_H}{region}:{data_key}:{rule_key}:" \
  326. f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  327. h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=other_h_24h_key_name,
  328. dup_key_name=other_h_24h_dup_key_name, region=region, political_filter=political_filter)
  329. # ##### 去重小程序模型更新结果,并另存为redis中
  330. # model_key_name = get_rov_redis_key(now_date=now_date)
  331. # model_data_dup_key_name = \
  332. # f"{config_.RECALL_KEY_NAME_PREFIX_DUP_REGION_H}{region}:{data_key}:{rule_key}:" \
  333. # f"{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  334. # h_video_ids = dup_data(h_video_ids=h_video_ids, initial_key_name=model_key_name,
  335. # dup_key_name=model_data_dup_key_name, region=region)
  336. def merge_df(df_left, df_right):
  337. """
  338. df按照videoid, code 合并,对应特征求和
  339. :param df_left:
  340. :param df_right:
  341. :return:
  342. """
  343. df_merged = pd.merge(df_left, df_right, on=['videoid', 'code'], how='outer', suffixes=['_x', '_y'])
  344. df_merged.fillna(0, inplace=True)
  345. feature_list = ['videoid', 'code']
  346. for feature in features:
  347. if feature in ['apptype', 'videoid', 'code']:
  348. continue
  349. df_merged[feature] = df_merged[f'{feature}_x'] + df_merged[f'{feature}_y']
  350. feature_list.append(feature)
  351. return df_merged[feature_list]
  352. def merge_df_with_score(df_left, df_right):
  353. """
  354. df 按照[videoid, code]合并,平台回流人数、回流人数、分数 分别求和
  355. :param df_left:
  356. :param df_right:
  357. :return:
  358. """
  359. df_merged = pd.merge(df_left, df_right, on=['videoid', 'code'], how='outer', suffixes=['_x', '_y'])
  360. df_merged.fillna(0, inplace=True)
  361. feature_list = ['videoid', 'code', 'lastonehour_return', 'platform_return', 'score']
  362. for feature in feature_list[2:]:
  363. df_merged[feature] = df_merged[f'{feature}_x'] + df_merged[f'{feature}_y']
  364. return df_merged[feature_list]
  365. def process_with_region(region, df_merged, data_key, rule_key, rule_param, now_date, now_h,
  366. rule_rank_h_flag, add_videos_in_20h):
  367. log_.info(f"region = {region} start...")
  368. # 计算score
  369. region_df = df_merged[df_merged['code'] == region]
  370. log_.info(f'region = {region}, region_df count = {len(region_df)}')
  371. score_df = cal_score(df=region_df, param=rule_param)
  372. video_rank(df=score_df, now_date=now_date, now_h=now_h, rule_key=rule_key, param=rule_param,
  373. region=region, data_key=data_key, rule_rank_h_flag=rule_rank_h_flag,
  374. add_videos_in_20h=add_videos_in_20h)
  375. log_.info(f"region = {region} end!")
  376. def process_with_region2(region, df_merged, data_key, rule_key, rule_param, now_date, now_h,
  377. rule_rank_h_flag, add_videos_in_20h):
  378. log_.info(f"region = {region} start...")
  379. region_score_df = df_merged[df_merged['code'] == region]
  380. log_.info(f'region = {region}, region_score_df count = {len(region_score_df)}')
  381. video_rank(df=region_score_df, now_date=now_date, now_h=now_h, region=region,
  382. rule_key=rule_key, param=rule_param, data_key=data_key, rule_rank_h_flag=rule_rank_h_flag,
  383. add_videos_in_20h=add_videos_in_20h)
  384. log_.info(f"region = {region} end!")
  385. def process_with_app_type(app_type, params, region_code_list, feature_df, now_date, now_h, rule_rank_h_flag):
  386. log_.info(f"app_type = {app_type} start...")
  387. data_params_item = params.get('data_params')
  388. rule_params_item = params.get('rule_params')
  389. task_list = []
  390. for param in params.get('params_list'):
  391. data_key = param.get('data')
  392. data_param = data_params_item.get(data_key)
  393. log_.info(f"data_key = {data_key}, data_param = {data_param}")
  394. df_list = [feature_df[feature_df['apptype'] == apptype] for apptype in data_param]
  395. df_merged = reduce(merge_df, df_list)
  396. rule_key = param.get('rule')
  397. rule_param = rule_params_item.get(rule_key)
  398. log_.info(f"rule_key = {rule_key}, rule_param = {rule_param}")
  399. task_list.extend(
  400. [
  401. gevent.spawn(process_with_region, region, df_merged, app_type, data_key, rule_key, rule_param,
  402. now_date, now_h, rule_rank_h_flag)
  403. for region in region_code_list
  404. ]
  405. )
  406. gevent.joinall(task_list)
  407. log_.info(f"app_type = {app_type} end!")
  408. # log_.info(f"app_type = {app_type}")
  409. # task_list = []
  410. # for data_key, data_param in params['data_params'].items():
  411. # log_.info(f"data_key = {data_key}, data_param = {data_param}")
  412. # df_list = [feature_df[feature_df['apptype'] == apptype] for apptype in data_param]
  413. # df_merged = reduce(merge_df, df_list)
  414. # for rule_key, rule_param in params['rule_params'].items():
  415. # log_.info(f"rule_key = {rule_key}, rule_param = {rule_param}")
  416. # task_list.extend(
  417. # [
  418. # gevent.spawn(process_with_region, region, df_merged, app_type, data_key, rule_key, rule_param,
  419. # now_date, now_h)
  420. # for region in region_code_list
  421. # ]
  422. # )
  423. # gevent.joinall(task_list)
  424. def copy_data_for_city(region, city_code, data_key, rule_key, now_date, now_h):
  425. """copy 对应数据到城市对应redis,并做相应屏蔽视频过滤"""
  426. log_.info(f"city_code = {city_code} start ...")
  427. redis_helper = RedisHelper()
  428. key_prefix_list = [
  429. config_.RECALL_KEY_NAME_PREFIX_REGION_BY_H, # 地域小时级
  430. config_.RECALL_KEY_NAME_PREFIX_DUP1_REGION_24H_H, # 地域相对24h
  431. config_.RECALL_KEY_NAME_PREFIX_DUP2_REGION_24H_H, # 不区分地域相对24h
  432. config_.RECALL_KEY_NAME_PREFIX_DUP3_REGION_24H_H, # 不区分地域相对24h筛选后
  433. config_.RECALL_KEY_NAME_PREFIX_DUP_REGION_H, # rov大列表
  434. ]
  435. for key_prefix in key_prefix_list:
  436. region_key = f"{key_prefix}{region}:{data_key}:{rule_key}:{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  437. city_key = f"{key_prefix}{city_code}:{data_key}:{rule_key}:{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  438. if not redis_helper.key_exists(key_name=region_key):
  439. continue
  440. region_data = redis_helper.get_all_data_from_zset(key_name=region_key, with_scores=True)
  441. if not region_data:
  442. continue
  443. # 屏蔽视频过滤
  444. region_video_ids = [int(video_id) for video_id, _ in region_data]
  445. shield_key_name_list = config_.SHIELD_CONFIG.get(city_code, None)
  446. if shield_key_name_list is not None:
  447. filtered_video_ids = filter_shield_video(video_ids=region_video_ids,
  448. shield_key_name_list=shield_key_name_list)
  449. else:
  450. filtered_video_ids = region_video_ids
  451. city_data = {}
  452. for video_id, score in region_data:
  453. if int(video_id) in filtered_video_ids:
  454. city_data[int(video_id)] = score
  455. if len(city_data) > 0:
  456. redis_helper.add_data_with_zset(key_name=city_key, data=city_data, expire_time=2 * 24 * 3600)
  457. log_.info(f"city_code = {city_code} end!")
  458. def process_with_param(param, data_params_item, rule_params_item, region_code_list, feature_df, now_date, now_h, rule_rank_h_flag):
  459. log_.info(f"param = {param} start...")
  460. data_key = param.get('data')
  461. data_param = data_params_item.get(data_key)
  462. log_.info(f"data_key = {data_key}, data_param = {data_param}")
  463. rule_key = param.get('rule')
  464. rule_param = rule_params_item.get(rule_key)
  465. log_.info(f"rule_key = {rule_key}, rule_param = {rule_param}")
  466. merge_func = rule_param.get('merge_func', None)
  467. # 是否在20点的数据中增加打捞的优质视频
  468. add_videos_in_20h = rule_param.get('add_videos_in_20h', False)
  469. if merge_func == 2:
  470. score_df_list = []
  471. for apptype, weight in data_param.items():
  472. df = feature_df[feature_df['apptype'] == apptype]
  473. # 计算score
  474. score_df = cal_score(df=df, param=rule_param)
  475. score_df['score'] = score_df['score'] * weight
  476. score_df_list.append(score_df)
  477. # 分数合并
  478. df_merged = reduce(merge_df_with_score, score_df_list)
  479. # 更新平台回流比
  480. df_merged['platform_return_rate'] = df_merged['platform_return'] / df_merged['lastonehour_return']
  481. task_list = [
  482. gevent.spawn(process_with_region2,
  483. region, df_merged, data_key, rule_key, rule_param, now_date, now_h, rule_rank_h_flag,
  484. add_videos_in_20h)
  485. for region in region_code_list
  486. ]
  487. else:
  488. df_list = [feature_df[feature_df['apptype'] == apptype] for apptype in data_param]
  489. df_merged = reduce(merge_df, df_list)
  490. task_list = [
  491. gevent.spawn(process_with_region,
  492. region, df_merged, data_key, rule_key, rule_param, now_date, now_h, rule_rank_h_flag,
  493. add_videos_in_20h)
  494. for region in region_code_list
  495. ]
  496. gevent.joinall(task_list)
  497. # 特殊城市视频数据准备
  498. for region, city_list in config_.REGION_CITY_MAPPING.items():
  499. t = [
  500. gevent.spawn(
  501. copy_data_for_city,
  502. region, city_code, data_key, rule_key, now_date, now_h
  503. )
  504. for city_code in city_list
  505. ]
  506. gevent.joinall(t)
  507. log_.info(f"param = {param} end!")
  508. def rank_by_h(project, table, now_date, now_h, rule_params, region_code_list, rule_rank_h_flag):
  509. # 获取特征数据
  510. feature_df = get_feature_data(project=project, table=table, now_date=now_date)
  511. feature_df['apptype'] = feature_df['apptype'].astype(int)
  512. data_params_item = rule_params.get('data_params')
  513. rule_params_item = rule_params.get('rule_params')
  514. params_list = rule_params.get('params_list')
  515. pool = multiprocessing.Pool(processes=len(params_list))
  516. for param in params_list:
  517. pool.apply_async(
  518. func=process_with_param,
  519. args=(param, data_params_item, rule_params_item, region_code_list, feature_df, now_date, now_h, rule_rank_h_flag)
  520. )
  521. pool.close()
  522. pool.join()
  523. # pool = multiprocessing.Pool(processes=len(config_.APP_TYPE))
  524. # for app_type, params in rule_params.items():
  525. # pool.apply_async(func=process_with_app_type,
  526. # args=(app_type, params, region_code_list, feature_df, now_date, now_h, rule_rank_h_flag))
  527. # pool.close()
  528. # pool.join()
  529. """
  530. for app_type, params in rule_params.items():
  531. log_.info(f"app_type = {app_type} start...")
  532. data_params_item = params.get('data_params')
  533. rule_params_item = params.get('rule_params')
  534. for param in params.get('params_list'):
  535. log_.info(f"param = {param} start...")
  536. data_key = param.get('data')
  537. data_param = data_params_item.get(data_key)
  538. log_.info(f"data_key = {data_key}, data_param = {data_param}")
  539. df_list = [feature_df[feature_df['apptype'] == apptype] for apptype in data_param]
  540. df_merged = reduce(merge_df, df_list)
  541. rule_key = param.get('rule')
  542. rule_param = rule_params_item.get(rule_key)
  543. log_.info(f"rule_key = {rule_key}, rule_param = {rule_param}")
  544. task_list = []
  545. for region in region_code_list:
  546. t = Thread(target=process_with_region,
  547. args=(region, df_merged, app_type, data_key, rule_key, rule_param, now_date, now_h)
  548. )
  549. t.start()
  550. task_list.append(t)
  551. for t in task_list:
  552. t.join()
  553. log_.info(f"param = {param} end!")
  554. log_.info(f"app_type = {app_type} end!")
  555. """
  556. # for app_type, params in rule_params.items():
  557. # log_.info(f"app_type = {app_type}")
  558. # for data_key, data_param in params['data_params'].items():
  559. # log_.info(f"data_key = {data_key}, data_param = {data_param}")
  560. # df_list = [feature_df[feature_df['apptype'] == apptype] for apptype in data_param]
  561. # df_merged = reduce(merge_df, df_list)
  562. # for rule_key, rule_param in params['rule_params'].items():
  563. # log_.info(f"rule_key = {rule_key}, rule_param = {rule_param}")
  564. # task_list = [
  565. # gevent.spawn(process_with_region, region, df_merged, app_type, data_key, rule_key, rule_param, now_date, now_h)
  566. # for region in region_code_list
  567. # ]
  568. # gevent.joinall(task_list)
  569. # rank
  570. # for key, value in rule_params.items():
  571. # log_.info(f"rule = {key}, param = {value}")
  572. # for region in region_code_list:
  573. # log_.info(f"region = {region}")
  574. # # 计算score
  575. # region_df = feature_df[feature_df['code'] == region]
  576. # log_.info(f'region_df count = {len(region_df)}')
  577. # score_df = cal_score(df=region_df, param=value)
  578. # video_rank(df=score_df, now_date=now_date, now_h=now_h, rule_key=key, param=value, region=region)
  579. # # to-csv
  580. # score_filename = f"score_{region}_{key}_{datetime.datetime.strftime(now_date, '%Y%m%d%H')}.csv"
  581. # score_df.to_csv(f'./data/{score_filename}')
  582. # # to-logs
  583. # log_.info({"date": datetime.datetime.strftime(now_date, '%Y%m%d%H'),
  584. # "region_code": region,
  585. # "redis_key_prefix": config_.RECALL_KEY_NAME_PREFIX_REGION_BY_H,
  586. # "rule_key": key,
  587. # # "score_df": score_df[['videoid', 'score']]
  588. # }
  589. # )
  590. def h_rank_bottom(now_date, now_h, rule_params, region_code_list, rule_rank_h_flag):
  591. """未按时更新数据,用上一小时结果作为当前小时的数据"""
  592. # 获取rov模型结果
  593. redis_helper = RedisHelper()
  594. if now_h == 0:
  595. redis_dt = datetime.datetime.strftime(now_date - datetime.timedelta(days=1), '%Y%m%d')
  596. redis_h = 23
  597. else:
  598. redis_dt = datetime.datetime.strftime(now_date, '%Y%m%d')
  599. redis_h = now_h - 1
  600. # 以上一小时的地域分组数据作为当前小时的数据
  601. key_prefix = config_.RECALL_KEY_NAME_PREFIX_REGION_BY_H
  602. rule_params_item = rule_params.get('rule_params')
  603. for param in rule_params.get('params_list'):
  604. data_key = param.get('data')
  605. rule_key = param.get('rule')
  606. rule_param = rule_params_item.get(rule_key)
  607. log_.info(f"data_key = {data_key}, rule_key = {rule_key}, rule_param = {rule_param}")
  608. region_24h_rule_key = rule_param.get('region_24h_rule_key', 'rule1')
  609. by_24h_rule_key = rule_param.get('24h_rule_key', None)
  610. by_48h_rule_key = rule_param.get('48h_rule_key', None)
  611. # 涉政视频过滤
  612. political_filter = param.get('political_filter', None)
  613. for region in region_code_list:
  614. log_.info(f"region = {region}")
  615. key_name = f"{key_prefix}{region}:{data_key}:{rule_key}:{redis_dt}:{redis_h}"
  616. initial_data = redis_helper.get_all_data_from_zset(key_name=key_name, with_scores=True)
  617. if initial_data is None:
  618. initial_data = []
  619. final_data = dict()
  620. h_video_ids = []
  621. for video_id, score in initial_data:
  622. final_data[video_id] = score
  623. h_video_ids.append(int(video_id))
  624. # 存入对应的redis
  625. final_key_name = \
  626. f"{key_prefix}{region}:{data_key}:{rule_key}:{datetime.datetime.strftime(now_date, '%Y%m%d')}:{now_h}"
  627. if len(final_data) > 0:
  628. redis_helper.add_data_with_zset(key_name=final_key_name, data=final_data, expire_time=2 * 24 * 3600)
  629. # 与其他召回视频池去重,存入对应的redis
  630. dup_to_redis(h_video_ids=h_video_ids, now_date=now_date, now_h=now_h, rule_key=rule_key,
  631. region_24h_rule_key=region_24h_rule_key, region=region,
  632. data_key=data_key, by_24h_rule_key=by_24h_rule_key,
  633. by_48h_rule_key=by_48h_rule_key, rule_rank_h_flag=rule_rank_h_flag,
  634. political_filter=political_filter)
  635. # 特殊城市视频数据准备
  636. for region, city_list in config_.REGION_CITY_MAPPING.items():
  637. t = [
  638. gevent.spawn(
  639. copy_data_for_city,
  640. region, city_code, data_key, rule_key, now_date, now_h
  641. )
  642. for city_code in city_list
  643. ]
  644. gevent.joinall(t)
  645. def h_timer_check():
  646. try:
  647. rule_rank_h_flag = sys.argv[1]
  648. if rule_rank_h_flag == '48h':
  649. rule_params = config_.RULE_PARAMS_REGION_APP_TYPE_48H
  650. else:
  651. rule_params = config_.RULE_PARAMS_REGION_APP_TYPE
  652. project = config_.PROJECT_REGION_APP_TYPE
  653. table = config_.TABLE_REGION_APP_TYPE
  654. region_code_list = [code for region, code in region_code.items()]
  655. now_date = datetime.datetime.today()
  656. log_.info(f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d%H')}, rule_rank_h_flag: {rule_rank_h_flag}")
  657. now_h = datetime.datetime.now().hour
  658. now_min = datetime.datetime.now().minute
  659. if now_h == 0:
  660. h_rank_bottom(now_date=now_date, now_h=now_h, rule_params=rule_params, region_code_list=region_code_list,
  661. rule_rank_h_flag=rule_rank_h_flag)
  662. return
  663. # 查看当前小时更新的数据是否已准备好
  664. h_data_count = h_data_check(project=project, table=table, now_date=now_date)
  665. if h_data_count > 0:
  666. log_.info(f'region_h_data_count = {h_data_count}')
  667. # 数据准备好,进行更新
  668. rank_by_h(now_date=now_date, now_h=now_h, rule_params=rule_params,
  669. project=project, table=table, region_code_list=region_code_list, rule_rank_h_flag=rule_rank_h_flag)
  670. log_.info(f"region_h_data end!")
  671. elif now_min > 50:
  672. log_.info('h_recall data is None, use bottom data!')
  673. h_rank_bottom(now_date=now_date, now_h=now_h, rule_params=rule_params, region_code_list=region_code_list,
  674. rule_rank_h_flag=rule_rank_h_flag)
  675. log_.info(f"region_h_data end!")
  676. else:
  677. # 数据没准备好,1分钟后重新检查
  678. Timer(60, h_timer_check).start()
  679. send_msg_to_feishu(
  680. webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
  681. key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
  682. msg_text=f"rov-offline{config_.ENV_TEXT} - 推荐视频数据更新完成\n"
  683. f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d')}\n"
  684. f"now_h: {now_h}\n"
  685. f"finished time: {datetime.datetime.strftime(datetime.datetime.now(), '%Y%m%d %H:%M:%S')}"
  686. )
  687. except Exception as e:
  688. log_.error(f"地域分组小时级数据更新失败, exception: {e}, traceback: {traceback.format_exc()}")
  689. send_msg_to_feishu(
  690. webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
  691. key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
  692. msg_text=f"rov-offline{config_.ENV_TEXT} - 地域分组小时级数据更新失败\n"
  693. f"exception: {e}\n"
  694. f"traceback: {traceback.format_exc()}"
  695. )
  696. if __name__ == '__main__':
  697. log_.info(f"region_h_data start...")
  698. h_timer_check()