| 
					
				 | 
			
			
				@@ -0,0 +1,177 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# -*- coding: utf-8 -*- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import os 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import sys 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import traceback 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+if root_dir not in sys.path: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    sys.path.append(root_dir) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print("******** sys.path ********") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print(sys.path) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from multiprocessing import Process 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from odps import ODPS 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from threading import Timer 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import threading 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from my_utils import RedisHelper, execute_sql_from_odps, request_post 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from my_config import set_config 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from log import Log 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import json 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from datetime import datetime 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from queue import Queue 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from tqdm import tqdm 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
# Backend endpoint that main() posts the fallback ("doudi") video list to.
DOUDI_URL = "http://recommend-common-internal.piaoquantv.com/longvideoapi/openapi/recommend/updateFallBackVideoListV2"

# set_config() returns a pair; only the first element (the config object,
# which exposes ODPS_CONFIG) is used in this script.
config_, _ = set_config()
# Shared logger used by every function in this script.
log_ = Log()
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def check_data(project, table,  date, hour, mm) -> int: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """检查数据是否准备好,输出数据条数""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    odps = ODPS( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        access_id=config_.ODPS_CONFIG['ACCESSID'], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        project=project, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        endpoint=config_.ODPS_CONFIG['ENDPOINT'], 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        connect_timeout=3000, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        read_timeout=500000, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        pool_maxsize=1000, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        pool_connections=1000 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        t = odps.get_table(name=table) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log_.info(f"检查分区是否存在-dt={date}.{hour}") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        check_res = t.exist_partition(partition_spec=f'dt={date},hour={hour},minute={hour}0000') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if check_res: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            sql = f'select * from {project}.{table} where dt = \"{date}\" and hour = \"{hour}\" and minute = \"{hour}0000\"' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log_.info(sql) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            with odps.execute_sql(sql=sql).open_reader() as reader: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                data_count = reader.count 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log_.info("表{}分区{}不存在".format(table, date + hour)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            data_count = 0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log_.error("table:{},date:{}.{} no data. return data_count=0,报错原因是:{}".format(table, date, hour, e)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        data_count = 0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return data_count 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def get_sql(project, table, date, hour, mm): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    sql = ''' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    SELECT  vid 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ,AVG(rank) AS score 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        ,SUM(1) as cnt 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+FROM    ( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            SELECT  a.dt 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    ,a.hour 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    ,a.region_provience 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    ,t1.videoid AS vid 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    ,t1.pos + 1 AS rank -- pos 从 0 开始,所以 +1 作为 rank 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            FROM    {}.{} a 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            LATERAL VIEW POSEXPLODE(SPLIT(a.videoid_arr,",")) t1 AS 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    pos 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                    ,videoid 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            WHERE   a.dt = "{}" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            AND     a.hour = "{}" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            AND     a.region_provience IN ("上海","中国","云南","内蒙古","北京","吉林","四川","天津","宁夏","安徽","山东","山西","广东","广西","新疆","江苏","江西","河北","河南","浙江","海南","湖北","湖南","甘肃","福建","西藏","贵州","辽宁","重庆","陕西","青海","黑龙江") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        )  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+GROUP BY vid 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ORDER BY AVG(rank) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    '''.format( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        project, table, date, hour 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print("sql:" + sql) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    records = execute_sql_from_odps(project=project, sql=sql) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    video_list = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    with records.open_reader() as reader: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        for record in reader: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            key = record['vid'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            score = record['score'] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            video_list.append({'videoId': key, 'rovScore': float(score)}) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return video_list 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def main(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        date = sys.argv[1] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        hour = sys.argv[2] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # mm = sys.argv[3] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        mm = "00" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    except Exception as e: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        date = datetime.now().strftime('%Y%m%d') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        hour = datetime.now().hour 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # mm = datetime.now().minute 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        mm = "00" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log_.info("没有读取到参数,采用系统时间:{}".format(e)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    log_.info("使用时间参数-日期:{},小时:{}".format(date, str(hour))) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if hour in ["23", "00", "01", "02", "03", "04", "05"]: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log_.info(f"hour={hour}不执行,直接返回。") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        return 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 1 判断上游数据表是否生产完成 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    project = "loghubods" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    table = "alg_recsys_recall_region_1h_v1" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    run_flag = True 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    begin_ts = int(time.time()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    table_data_cnt = 0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    while run_flag: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if int(time.time()) - begin_ts >= 60 * 40: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log_.info("等待上游数据超过40分钟了,认为失败退出:过了{}秒。".format(int(time.time()) - begin_ts)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            sys.exit(1) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        table_data_cnt = check_data(project, table, date, hour, mm) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if table_data_cnt == 0: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log_.info("上游数据{}未就绪{},等待...".format(table, date)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            log_.info("等待10分钟") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            time.sleep(60 * 10) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            run_flag = False 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    log_.info("上游数据就绪,count={},开始读取数据表".format(table_data_cnt)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 2 读取数据表 处理特征 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    video_list = get_sql(project, table, date, hour, mm) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 3 通过url请求,写入后端。 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    video_list = video_list[:2000] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    log_.info("video的数据量:{}".format(len(video_list))) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    log_.info("10个videos:{}".format(video_list[:10])) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    result = request_post(request_url=DOUDI_URL, request_data={'videos': video_list}) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    log_.info("请求结果result={}".format(str(result))) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if result is None: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        msg = "请求失败1" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log_.error(msg) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        sys.exit(1) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    elif result['code'] == 0: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        msg = "请求成功" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log_.info(msg) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    else: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        msg = "请求失败2" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        log_.error(msg) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        sys.exit(1) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+if __name__ == '__main__': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    log_.info("开始执行:" + datetime.now().strftime("%Y-%m-%d %H:%M:%S")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    process = Process(target=main) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    process.start() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 等待子进程完成或超时 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    timeout = 3600 * 1 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    process.join(timeout=timeout)  # 设置超时为3600秒(1小时) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    if process.is_alive(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        print("脚本执行时间超过1小时,执行失败,经过了{}秒。".format(timeout)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        process.terminate()  # 终止子进程 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        sys.exit(1)  # 直接退出主进程并返回状态码999 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    log_.info("完成执行:" + datetime.now().strftime("%Y-%m-%d %H:%M:%S")) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# cd /root/zhangbo/rov-offline 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# python alg_recsys_task_doudi_videos.py 20240731 14 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    !!!!!!!!!!!!!! 通过url,给后端传送兜底视频。每小时执行一次,方案是从loghubods.alg_recsys_recall_region_1h_v1 的单路召回源中挑选视频。 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    更改字段:table 表名 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            两段sql 各种字段 注意分区是否有“分钟” 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            record 各种字段 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if hour in ["00"]: 哪些小时不执行 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            process.join(timeout=3600) 任务超时时间3600 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            int(time.time()) - begin_ts >= 60 * 40 等待上游数据的超时时间2400秒 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+""" 
			 |