| 
					
				 | 
			
			
				@@ -0,0 +1,195 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# -*- coding: utf-8 -*- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import os 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import sys 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				# Put the parent of this script's directory on sys.path so that modules
				# located there can be imported; the resulting path is echoed once.
				_script_dir = os.path.dirname(os.path.realpath(__file__))
				root_dir = os.path.dirname(_script_dir)
				if root_dir not in sys.path:
				    sys.path.append(root_dir)
				    print("******** sys.path ********")
				    print(sys.path)
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from multiprocessing import Process 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from odps import ODPS 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from threading import Timer 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import threading 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from my_utils import RedisHelper, execute_sql_from_odps 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from my_config import set_config 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from log import Log 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import json 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from datetime import datetime 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from queue import Queue 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from tqdm import tqdm 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				# Prefix prepended to every Redis key written by this script.
				REDIS_PREFIX = "redis:vid_vovhour4rank:"
				# Expiry handed to RedisHelper for each key (8 * 3600; presumably seconds,
				# i.e. 8 hours - confirm against RedisHelper.set_data_to_redis).
				EXPIRE_TIME = 8 * 3600

				# Project configuration; the second value returned by set_config() is unused.
				config_, _ = set_config()
				# Shared logger and Redis client used by the functions below.
				log_ = Log()
				redis_helper = RedisHelper()
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def worker(queue, executor):
				    """Consume rows from ``queue`` and hand each one to ``executor``.

				    The loop runs until a ``None`` sentinel is taken from the queue.
				    ``queue.task_done()`` is called exactly once for every item taken,
				    including the sentinel and including rows for which ``executor``
				    raised, so a single failing row can neither kill the worker nor leave
				    the producer blocked in ``queue.join()``.

				    Args:
				        queue: Object providing ``get()`` and ``task_done()``
				            (e.g. ``queue.Queue``).
				        executor: Callable invoked with each non-``None`` row.
				    """
				    import traceback  # local import: only needed on the failure path

				    while True:
				        row = queue.get()
				        try:
				            if row is None:  # stop signal
				                break
				            executor(row)
				        except Exception:
				            # Report the failure on stderr (as an uncaught thread exception
				            # would) and keep processing the remaining rows.
				            traceback.print_exc()
				        finally:
				            queue.task_done()
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def records_process_for_list(records, executor, max_size=50, num_workers=10):
				    """Run ``executor`` over every item of ``records`` using worker threads.

				    Rows are fed through a bounded queue to ``num_workers`` threads running
				    ``worker``; the call returns once every thread has exited.

				    Args:
				        records: Iterable of rows; wrapped in ``tqdm`` for progress output.
				        executor: Callable applied to each row by the worker threads.
				        max_size: Capacity of the hand-off queue, which bounds how many
				            rows are buffered ahead of the workers.
				        num_workers: Number of worker threads to start; must be >= 1.

				    Raises:
				        ValueError: If ``num_workers`` is less than 1 (nothing would ever
				            drain the queue, so the call would block forever).
				    """
				    if num_workers < 1:
				        raise ValueError("num_workers must be at least 1")
				    # Thread-safe queue; maxsize limits memory use.
				    queue = Queue(maxsize=max_size)
				    # Start the worker threads.
				    threads = [
				        threading.Thread(target=worker, args=(queue, executor))
				        for _ in range(num_workers)
				    ]
				    for t in threads:
				        t.start()
				    try:
				        # Read the data and feed it to the workers.
				        for row in tqdm(records):
				            queue.put(row)
				    finally:
				        # Always send one stop signal per worker, even if iterating
				        # ``records`` failed; otherwise the non-daemon threads would block
				        # in ``queue.get()`` and keep the process alive.
				        for _ in threads:
				            queue.put(None)
				        # The queue is FIFO and each worker stops only after taking a stop
				        # signal, so once all threads have exited every row has been taken
				        # and handled.
				        for t in threads:
				            t.join()
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def process_and_store(row):
				    """Store one ``(key, value)`` row in Redis.

				    The Redis key is ``REDIS_PREFIX`` followed by ``str()`` of the row's
				    first element; the second element is written as the value with an
				    expiry of ``EXPIRE_TIME``.

				    Args:
				        row: Two-element sequence ``(table_key, json_str)``.
				    """
				    vid, payload = row
				    redis_key = REDIS_PREFIX + str(vid)
				    redis_helper.set_data_to_redis(redis_key, payload, EXPIRE_TIME)
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def check_data(project, table, date, hour, mm) -> int:
				    """Check whether the upstream partition is ready and return its row count.

				    Args:
				        project: ODPS project name.
				        table: Table name inside ``project``.
				        date: Value of the ``dt`` partition column.
				        hour: Value of the ``hh`` partition column.
				        mm: Unused by this function; kept for signature compatibility.

				    Returns:
				        The row count reported by the result reader, or 0 when the
				        partition does not exist or any error occurs (the error is logged).
				    """
				    odps = ODPS(
				        access_id=config_.ODPS_CONFIG['ACCESSID'],
				        secret_access_key=config_.ODPS_CONFIG['ACCESSKEY'],
				        project=project,
				        endpoint=config_.ODPS_CONFIG['ENDPOINT'],
				        connect_timeout=3000,
				        read_timeout=500000,
				        pool_maxsize=1000,
				        pool_connections=1000
				    )
				    try:
				        t = odps.get_table(name=table)
				        log_.info(f"检查分区是否存在-【 dt={date} hh={hour}】")
				        if not t.exist_partition(partition_spec=f'dt={date},hh={hour}'):
				            log_.info("表{}分区{}/{}不存在".format(table, date, hour))
				            return 0
				        # Quote the partition values so they are compared as strings,
				        # matching the query built in get_sql().
				        sql = f"select * from {project}.{table} where dt = '{date}' and hh = '{hour}'"
				        log_.info(sql)
				        with odps.execute_sql(sql=sql).open_reader() as reader:
				            return reader.count
				    except Exception as e:
				        # Best effort: the caller polls, so any failure is reported as
				        # "no data yet" instead of being raised.
				        log_.error("table:{},date:{},hour:{} no data. return data_count=0,报错原因是:{}".format(table, date, hour, e))
				        return 0
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def get_sql(project, table, date, hour, mm):
				    """Query ``vid``/``feature`` for one partition and return them as pairs.

				    Builds the SELECT statement, prints it, runs it through
				    ``execute_sql_from_odps`` and collects one ``[vid, feature]`` list per
				    record.

				    Args:
				        project: ODPS project name.
				        table: Table name inside ``project``.
				        date: Value of the ``dt`` partition column.
				        hour: Value of the ``hh`` partition column.
				        mm: Unused by this function; kept for signature compatibility.

				    Returns:
				        A list of ``[vid, feature]`` lists. If reading ``feature`` from a
				        record raises, the error is logged and ``"{}"`` is used instead.
				    """
				    # Assembled line by line so the statement text does not depend on how
				    # this source file is indented.
				    sql = (
				        "\n"
				        "    SELECT  vid\n"
				        "            ,feature\n"
				        "    FROM    {}.{}\n"
				        "    WHERE   dt = '{}'\n"
				        "    and     hh = '{}'\n"
				        "    "
				    ).format(project, table, date, hour)
				    print("sql:" + sql)
				    instance = execute_sql_from_odps(project=project, sql=sql)
				    rows = []
				    with instance.open_reader() as reader:
				        for rec in reader:
				            vid = rec['vid']
				            try:
				                feature_json = rec['feature']
				            except Exception as err:
				                # Fall back to an empty JSON object for this record.
				                feature_json = json.dumps({})
				                log_.error(err)
				            rows.append([vid, feature_json])
				    return rows
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				def main():
				    """Wait for the upstream partition, read it and write the rows to Redis.

				    The date and hour are taken from ``sys.argv[1]`` and ``sys.argv[2]``;
				    when either is missing, the current system time is used instead. The
				    upstream table is then polled every 5 minutes via ``check_data``; once
				    40 minutes have passed without data the process exits with status 1.
				    When data is present it is loaded with ``get_sql`` and written to
				    Redis by ``records_process_for_list`` / ``process_and_store``.
				    """
				    try:
				        date, hour = sys.argv[1], sys.argv[2]
				    except IndexError as e:
				        # One clock reading for both values, and a zero-padded hour string
				        # so the fallback has the same form as the command-line argument
				        # (e.g. "08", not 8) when used as the hh partition value.
				        now = datetime.now()
				        date = now.strftime('%Y%m%d')
				        hour = now.strftime('%H')
				        log_.info("没有读取到参数,采用系统时间:{}".format(e))
				    # No minute-level partition is used; the value is passed through as-is.
				    mm = "00"
				    log_.info("使用时间参数-日期:{},小时:{}".format(date, str(hour)))
				    # Hours listed here are skipped; the list is currently empty.
				    if hour in []:
				        log_.info(f"hour={hour}不执行,直接返回。")
				        return
				    # 1. Wait until the upstream table has produced the partition.
				    project = "loghubods"
				    table = "alg_vid_hour_vov"
				    begin_ts = int(time.time())
				    table_data_cnt = 0
				    while True:
				        elapsed = int(time.time()) - begin_ts
				        if elapsed >= 60 * 40:
				            log_.info("等待上游数据超过40分钟了,认为失败退出:过了{}秒。".format(elapsed))
				            sys.exit(1)
				        table_data_cnt = check_data(project, table, date, hour, mm)
				        if table_data_cnt == 0:
				            log_.info("上游数据{}未就绪{}/{},等待...".format(table, date, hour))
				            log_.info("等待5分钟")
				            time.sleep(60 * 5)
				        else:
				            break

				    log_.info("上游数据就绪,count={},开始读取数据表".format(table_data_cnt))
				    # 2. Read the table.
				    video_list = get_sql(project, table, date, hour, mm)
				    # 3. Write to Redis.
				    log_.info("video的数据量:{}".format(len(video_list)))
				    records_process_for_list(video_list, process_and_store, max_size=50, num_workers=8)
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				if __name__ == '__main__':
				    # Run main() in a child process so the whole job can be bounded by a
				    # hard timeout.
				    log_.info("开始执行:" + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
				    process = Process(target=main)
				    process.start()
				    # Wait for the child to finish, for at most 3600 seconds (1 hour).
				    timeout = 3600
				    process.join(timeout=timeout)
				    if process.is_alive():
				        print("脚本执行时间超过1小时,执行失败,经过了{}秒。".format(timeout))
				        process.terminate()  # stop the child process
				        process.join(timeout=10)  # reap it so no zombie is left behind
				        sys.exit(1)  # exit the parent with status code 1
				    if process.exitcode != 0:
				        # The child failed (e.g. main() called sys.exit(1) after waiting too
				        # long for upstream data, or crashed); propagate the failure
				        # instead of reporting success.
				        log_.error("child process exited with code {}".format(process.exitcode))
				        sys.exit(1)
				    log_.info("完成执行:" + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# cd /root/zhangbo/rov-offline 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+# python alg_recsys_feature_06_vovhour4rank_redis.py 20240925 20 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    !!!!!!!!!!!!!! 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    更改字段:table 表名 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            REDIS_PREFIX redis的key 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            EXPIRE_TIME redis的过期时间 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            两段sql 各种字段 注意分区是否有“分钟” 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            record 各种字段 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if hour in ["00"]: 哪些小时不执行 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            process.join(timeout=3600) 任务超时时间3600 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            int(time.time()) - begin_ts >= 60*40 等待上游数据的超时时间2400 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+""" 
			 |