import datetime
import logging
import multiprocessing
import time
import traceback
import gevent
import asyncio
from threading import Timer
from concurrent.futures import ThreadPoolExecutor
from utils import RedisHelper, data_check, get_feature_data, send_msg_to_feishu
from config import set_config
from log import Log

config_, _ = set_config()
log_ = Log()
redis_helper = RedisHelper()

# features = [
#     'apptype',
#     'return1mids',
#     'return2_3mids',
#     'return4_8mids',
#     'return9_24mids',
#     'return25_nmids',
#     'return0share1mids',
#     'return0share2_nmids'
# ]

# Module-level accumulators shared by the mapping worker threads below.
mid_group_mapping_global = {}
mids_global = []


def to_redis(group, mid_list, class_key_list):
    """Write group membership to Redis as one key per (class_key, mid) pair."""
    log_.info(f"group = {group} update redis start ...")
    start_time = time.time()
    log_.info(f"mid count = {len(mid_list)}")
    for class_key in class_key_list:
        for i in range(len(mid_list) // 100 + 1):
            # log_.info(f"i = {i}")
            mid_temp_list = mid_list[i * 100:(i + 1) * 100]
            # print(mid_temp_list)
            task_list = [
                gevent.spawn(redis_helper.set_data_to_redis,
                             f"{config_.KEY_NAME_PREFIX_MID_GROUP}{class_key}:{mid}", group, 28 * 3600)
                for mid in mid_temp_list
            ]
            gevent.joinall(task_list)
    log_.info(f"group = {group}, mid count = {len(mid_list)}, update redis finished! "
              f"execute time = {(time.time() - start_time) / 60}min")


def to_redis2(process_mid_list, mid_group_mapping, ad_mid_group_key_params):
    """Write each mid's {class_key: group} mapping to Redis as a single key, in batches of 100 via gevent."""
    log_.info(f"mid count = {len(process_mid_list)}")
    start_time = time.time()
    for i in range(len(process_mid_list) // 100 + 1):
        mid_temp_list = process_mid_list[i * 100:(i + 1) * 100]
        task_list = []
        for mid in mid_temp_list:
            # Default to an empty list so a mid missing from the mapping doesn't raise.
            group_list = mid_group_mapping.get(mid, [])
            mid_value = {}
            for group in group_list:
                for class_key in ad_mid_group_key_params.get(group, []):
                    mid_value[class_key] = group
            # print(f"mid={mid}, mid_value={mid_value}")
            if len(mid_value) > 0:
                task_list.append(
                    gevent.spawn(redis_helper.set_data_to_redis,
                                 f"{config_.KEY_NAME_PREFIX_MID_GROUP}{mid}", str(mid_value), 28 * 3600)
                )
        gevent.joinall(task_list)
    log_.info(f"mid count = {len(process_mid_list)}, update redis finished! "
              f"execute time = {(time.time() - start_time) / 60}min")
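

# --- Illustrative only: a minimal sketch of how a consumer might read back a value ---
# written by to_redis2 above. It is not used by this module. Assumptions: the standard
# redis-py client is available, and the host/port/db below are placeholders; in this
# project reads would normally go through utils.RedisHelper instead. Because to_redis2
# stores str(mid_value) (a Python dict repr, not JSON), ast.literal_eval is used to parse.
def _read_mid_group_sketch(mid):
    import ast
    import redis  # assumption: redis-py is installed alongside this project
    client = redis.Redis(host='127.0.0.1', port=6379, db=0)  # hypothetical connection params
    raw = client.get(f"{config_.KEY_NAME_PREFIX_MID_GROUP}{mid}")
    if raw is None:
        return {}
    # e.g. b"{'class_key_1': 'return1mids'}" -> {'class_key_1': 'return1mids'}
    return ast.literal_eval(raw.decode('utf-8'))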
" f"execute time = {(time.time() - start_time) / 60}min") def mapping_process(group, mid_list): global mids_global, mid_group_mapping_global for mid in mid_list: if mid is None: continue if mid in mids_global: mid_group_mapping_global[mid].append(group) else: mid_group_mapping_global[mid] = [group] mids_global.append(mid) def mapping_process2(mid_list, group_list, group_mid_list): global mid_group_mapping_global for mid in mid_list: if mid is None: continue mid_group = [group for group in group_list if mid in group_mid_list.get(group, [])] mid_group_mapping_global[mid] = mid_group async def get_mid_group_mapping2(feature_df, group_list): """获取mid对应的分组列表""" start_time = time.time() group_mid_list = {} mid_list_all = [] for group in group_list: start_time = time.time() mid_list = feature_df[group].tolist() mid_list = list(set(mid_list)) group_mid_list[group] = mid_list mid_list_all.extend(mid_list) log_.info(f"group = {group}, mid_list_count = {len(mid_list)}") mid_list_all = list(set(mid_list_all)) log_.info(f"mid_list_all count = {len(mid_list_all)}") global mids_global, mid_group_mapping_global mids_global = mid_list_all step = 10000 loop = asyncio.get_running_loop() executor = ThreadPoolExecutor(max_workers=20) tasks = [] for i in range(len(mid_list_all) // step + 1): log_.info(f"i = {i}") process_mid_list = mid_list_all[i * step:(i + 1) * step] tasks.append(loop.run_in_executor(executor, mapping_process2, process_mid_list, group_list, group_mid_list)) await asyncio.wait(tasks) # mid_group_mapping = {} # for mid in mid_list_all: # if mid is None: # continue # mid_group = [group for group in group_list if mid in group_mid_list.get(group, [])] # mid_group_mapping[mid] = mid_group # return mid_group_mapping, mid_list_all log_.info(f"group mid mapping finished! " f"mid_count = {len(mids_global)}, mid_group_mapping_count = {len(mid_group_mapping_global)}, " f"execute time = {(time.time() - start_time) / 60}min") async def get_mid_group_mapping(feature_df, group_list): """获取mid对应的分组列表""" for group in group_list: start_time = time.time() mid_list = feature_df[group].tolist() mid_list = list(set(mid_list)) log_.info(f"group = {group}, mid_list_count = {len(mid_list)}") # pool = multiprocessing.Pool(processes=10) # step = len(mid_list) // (10 - 1) # for i in range(10 + 1): # process_mid_list = mid_list[i * step:(i + 1) * step] # pool.apply_async(func=mapping_process, args=(group, process_mid_list)) # pool.close() # pool.join() # step = len(mid_list) // (100 - 1) step = 100 loop = asyncio.get_running_loop() executor = ThreadPoolExecutor(max_workers=20) tasks = [] for i in range(len(mid_list)//step + 1): log_.info(f"i = {i}") process_mid_list = mid_list[i * step:(i + 1) * step] tasks.append(loop.run_in_executor(executor, mapping_process, group, process_mid_list)) await asyncio.wait(tasks) global mids_global, mid_group_mapping_global log_.info(f"group = {group} mid mapping finished! 
" f"mid_count = {len(mids_global)}, mid_group_mapping_count = {len(mid_group_mapping_global)}, " f"execute time = {(time.time() - start_time) / 60}min") # for mid in mid_list: # if mid is None: # continue # if mid in mids: # mid_group_mapping[mid].append(group) # else: # mid_group_mapping[mid] = [group] # mids.append(mid) # mid_group_mapping, mids = mid_group_mapping_global, mids_global # return mid_group_mapping, mids def update_user_group_to_redis(project, table, dt, app_type_list, features, ad_mid_group_key_params): """更新mid对应分组到redis中""" # 获取用户分组数据 feature_df = get_feature_data(project=project, table=table, features=features, dt=dt) feature_df['apptype'] = feature_df['apptype'].astype(int) feature_df = feature_df[feature_df['apptype'].isin(app_type_list)] # print(len(feature_df)) group_list = [group for group in ad_mid_group_key_params] # mid_group_mapping, mids = get_mid_group_mapping(feature_df=feature_df, group_list=group_list) asyncio.run(get_mid_group_mapping2(feature_df=feature_df, group_list=group_list)) global mid_group_mapping_global, mids_global mid_group_mapping, mids = mid_group_mapping_global, mids_global pool = multiprocessing.Pool(processes=len(ad_mid_group_key_params)) step = len(mids) // (len(ad_mid_group_key_params) - 1) for i in range(len(ad_mid_group_key_params) + 1): process_mid_list = mids[i*step:(i+1)*step] pool.apply_async(func=to_redis2, args=(process_mid_list, mid_group_mapping, ad_mid_group_key_params)) # for group, class_key_list in ad_mid_group_key_params.items(): # mid_list = feature_df[group].tolist() # mid_list = list(set(mid_list)) # mid_list = [mid for mid in mid_list if mid is not None] # # class_key_list = ad_mid_group_key_params.get(group) # pool.apply_async(func=to_redis, args=(group, mid_list, class_key_list)) pool.close() pool.join() def get_group_keys_mapping(ad_mid_group): ad_mid_group_key_params = {} features = ['apptype'] for class_key, group_list in ad_mid_group.items(): for group in group_list: if group not in features: features.append(group) ad_mid_group_key_params[group] = [class_key] else: ad_mid_group_key_params[group].append(class_key) return features, ad_mid_group_key_params def timer_check(): try: app_type_list = config_.AD_APP_TYPE_LIST ad_mid_group = config_.AD_MID_GROUP project = config_.ad_model_data['user_group'].get('project') table = config_.ad_model_data['user_group'].get('table') now_date = datetime.datetime.today() dt = datetime.datetime.strftime(now_date, '%Y%m%d') log_.info(f"now_date: {dt}") now_min = datetime.datetime.now().minute # 查看当前更新的数据是否已准备好 data_count = data_check(project=project, table=table, dt=dt) if data_count > 0: log_.info(f"user group data count = {data_count}") # 获取features & 用户分组对应key features, ad_mid_group_key_params = get_group_keys_mapping(ad_mid_group=ad_mid_group) log_.info(f"features = {features}, \nad_mid_group_key_params = {ad_mid_group_key_params}") # 数据准备好,进行更新 update_user_group_to_redis(project=project, table=table, dt=dt, app_type_list=app_type_list, features=features, ad_mid_group_key_params=ad_mid_group_key_params) log_.info(f"user group data update end!") # elif now_min > 45: # log_.info('user group data is None!') # send_msg_to_feishu( # webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'), # key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'), # msg_text=f"rov-offline{config_.ENV_TEXT} - 用户分组数据未准备好!\n" # f"traceback: {traceback.format_exc()}" # ) else: # 数据没准备好,1分钟后重新检查 Timer(60, timer_check).start() except Exception as e: log_.error(f"用户分组数据更新失败, exception: {e}, 


def timer_check():
    """Check whether today's user-group data is ready; update Redis if so, otherwise retry in 1 minute."""
    try:
        app_type_list = config_.AD_APP_TYPE_LIST
        ad_mid_group = config_.AD_MID_GROUP
        project = config_.ad_model_data['user_group'].get('project')
        table = config_.ad_model_data['user_group'].get('table')
        now_date = datetime.datetime.today()
        dt = datetime.datetime.strftime(now_date, '%Y%m%d')
        log_.info(f"now_date: {dt}")
        now_min = datetime.datetime.now().minute
        # Check whether today's data is ready
        data_count = data_check(project=project, table=table, dt=dt)
        if data_count > 0:
            log_.info(f"user group data count = {data_count}")
            # Get the feature columns & the group -> class-key mapping
            features, ad_mid_group_key_params = get_group_keys_mapping(ad_mid_group=ad_mid_group)
            log_.info(f"features = {features}, \nad_mid_group_key_params = {ad_mid_group_key_params}")
            # Data is ready, run the update
            update_user_group_to_redis(project=project, table=table, dt=dt, app_type_list=app_type_list,
                                       features=features, ad_mid_group_key_params=ad_mid_group_key_params)
            log_.info("user group data update end!")
        # elif now_min > 45:
        #     log_.info('user group data is None!')
        #     send_msg_to_feishu(
        #         webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
        #         key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
        #         msg_text=f"rov-offline{config_.ENV_TEXT} - user group data is not ready!\n"
        #                  f"traceback: {traceback.format_exc()}"
        #     )
        else:
            # Data not ready yet, check again in 1 minute
            Timer(60, timer_check).start()
    except Exception as e:
        log_.error(f"user group data update failed, exception: {e}, traceback: {traceback.format_exc()}")
        send_msg_to_feishu(
            webhook=config_.FEISHU_ROBOT['server_robot'].get('webhook'),
            key_word=config_.FEISHU_ROBOT['server_robot'].get('key_word'),
            msg_text=f"rov-offline{config_.ENV_TEXT} - user group data update failed\n"
                     f"exception: {e}\n"
                     f"traceback: {traceback.format_exc()}"
        )


if __name__ == '__main__':
    timer_check()