sunxy 1 year ago
parent
commit
4c62a4247a
9 changed files with 112 additions and 204 deletions
  1. 1 47
      ai_tag_task.py
  2. 2 2
      ai_tag_task.sh
  3. BIN
      archive.tar.gz
  4. BIN
      asr_res.tar.gz
  5. 100 125
      asr_task.py
  6. 3 2
      asr_task.sh
  7. 1 23
      download_videos_task.py
  8. 3 3
      download_videos_task.sh
  9. 2 2
      mysql_connect.py

+ 1 - 47
ai_tag_task.py

@@ -267,50 +267,4 @@ def timer_check():
 
 
 if __name__ == '__main__':
-    # timer_check()
-    size = 10000
-    for i in range(0, 10000, size):
-        print(f"query_videos start i = {i} ...")
-        records = ODPSQueryUtil.query_videos(i, size)
-        if records is None or len(records) == 0:
-            continue
-        print(f"Got {len(records)} records")
-
-        video_info = {}
-        # 遍历 records,将每个视频的信息添加到字典中
-        for record in records:
-            # 将 video_id 从字符串转换为整数,这里假设 video_id 格式总是 "vid" 后跟数字
-            video_id = int(record['videoid'])
-            title = record['title']
-            video_path = record['video_path']
-
-            # 使用 video_id 作为键,其他信息作为值
-            video_info[video_id] = {'title': title, 'video_path': video_path}
-
-        # 打印结果查看
-        print(video_info)
-
-    asr_folder = 'asr_res'
-    retry = 0
-    while retry < 30:
-        asr_file_list = os.listdir(asr_folder)
-        if len(asr_file_list) < 1:
-            retry += 1
-            time.sleep(60)
-            continue
-        retry = 0
-        for asr_filename in asr_file_list:
-            video_id = int(asr_filename[:-4])
-            if video_id not in video_info:
-                continue
-            asr_file = os.path.join(asr_folder, asr_filename)
-            if video_info.get(video_id, None) is None:
-                os.remove(asr_file)
-            else:
-                get_video_ai_tags(
-                    video_id=video_id, asr_file=asr_file, video_info=video_info.get(video_id))
-                os.remove(asr_file)
-
-
-# get_video_ai_tags(16598277, 'aigc-test/asr_res/16598277.txt',
-#                   {'title': '九九重阳节送祝福🚩', 'video_path': '视频路径'})
+    timer_check()

+ 2 - 2
ai_tag_task.sh

@@ -2,8 +2,8 @@ ps -ef | grep ai_tag_task.py | grep -v grep | awk '{print $2}' | xargs kill -9
 
 # cd /data/aigc-test
 
-# source activate whisper
+source activate srt
 
 nohup python ai_tag_task.py > logs/ai_tag_task.log 2>&1 &
 
-# conda deactivate
+conda deactivate

BIN
archive.tar.gz


BIN
asr_res.tar.gz


+ 100 - 125
asr_task.py

@@ -13,6 +13,7 @@ from gpt_tag import request_gpt
 from config import set_config
 from audio_process import get_wav
 from log import Log
+from utils import data_check, get_feature_data
 config_ = set_config()
 log_ = Log()
 features = ['videoid', 'title', 'video_path']
@@ -33,138 +34,112 @@ def get_asr(video_id, download_folder, asr_folder):
             with open(asr_path, 'w', encoding='utf-8') as wf:
                 wf.write(asr_res_initial)
             # 将处理过的视频进行删除
-            # shutil.rmtree(os.path.join(download_folder, video_id))
+            shutil.rmtree(os.path.join(download_folder, video_id))
             break
 
 
-# def asr_process(project, table, dt, cuda_id):
-#     # 获取特征数据
-#     feature_df = get_feature_data(
-#         project=project, table=table, dt=dt, features=features)
-#     video_id_list = feature_df['videoid'].to_list()
-#     video_info = {}
-#     for video_id in video_id_list:
-#         title = feature_df[feature_df['videoid']
-#                            == video_id]['title'].values[0]
-#         if title is None:
-#             continue
-#         title = title.strip()
-#         if len(title) > 0:
-#             video_info[video_id] = {'title': title}
-#     # 获取已下载视频,做asr识别
-#     download_folder = 'videos'
-#     download_folder = f'{download_folder}_{cuda_id}'
-#     asr_folder = 'asr_res'
-#     if not os.path.exists(asr_folder):
-#         os.makedirs(asr_folder)
-#     retry = 0
-#     while retry < 3:
-#         video_folder_list = os.listdir(download_folder)
-#         if len(video_folder_list) < 1:
-#             retry += 1
-#             time.sleep(60)
-#             continue
-#         retry = 0
-#         # for video_id in video_folder_list:
-#         #     if video_id not in video_id_list:
-#         #         continue
-#         #     if video_info.get(video_id, None) is None:
-#         #         try:
-#         #             shutil.rmtree(os.path.join(download_folder, video_id))
-#         #         except:
-#         #             continue
-#         #     else:
-#         #         video_folder = os.path.join(download_folder, video_id)
-#         #         for filename in os.listdir(video_folder):
-#         #             video_type = filename.split('.')[-1]
-#         #             if video_type in ['mp4', 'm3u8']:
-#         #                 video_file = os.path.join(video_folder, filename)
-#         #                 # 1. asr识别
-#         #                 asr_res_initial = get_whisper_asr(video=video_file)
-#         #                 print(video_id, asr_res_initial)
-#         #                 # 2. 识别结果写入文件
-#         #                 asr_path = os.path.join(asr_folder, f"{video_id}.txt")
-#         #                 with open(asr_path, 'w', encoding='utf-8') as wf:
-#         #                     wf.write(asr_res_initial)
-#         #                 # 将处理过的视频进行删除
-#         #                 shutil.rmtree(os.path.join(download_folder, video_id))
-#         #                 break
-
-#         pool = multiprocessing.Pool(processes=2)
-#         for video_id in video_folder_list:
-#             if video_id not in video_id_list:
-#                 continue
-#             if video_info.get(video_id, None) is None:
-#                 try:
-#                     shutil.rmtree(os.path.join(download_folder, video_id))
-#                 except:
-#                     continue
-#             else:
-#                 pool.apply_async(
-#                     func=get_asr,
-#                     args=(video_id, download_folder, asr_folder)
-#                 )
-#         pool.close()
-#         pool.join()
-
-
-# def timer_check():
-#     try:
-#         cuda_id = sys.argv[1]
-#         project = config_.DAILY_VIDEO['project']
-#         table = config_.DAILY_VIDEO['table']
-#         now_date = datetime.datetime.today()
-#         print(f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d')}")
-#         dt = datetime.datetime.strftime(
-#             now_date-datetime.timedelta(days=1), '%Y%m%d')
-#         # 查看数据是否已准备好
-#         data_count = data_check(project=project, table=table, dt=dt)
-#         if data_count > 0:
-#             print(f'videos count = {data_count}')
-#             download_folder = 'videos'
-#             download_folder = f'{download_folder}_{cuda_id}'
-#             if not os.path.exists(download_folder):
-#                 # 视频未下载好,1分钟后重新检查
-#                 Timer(60, timer_check).start()
-#             else:
-#                 # 数据准备好,进行asr
-#                 asr_process(project=project, table=table,
-#                             dt=dt, cuda_id=cuda_id)
-#                 print(f"videos asr finished!")
-
-#         else:
-#             # 数据没准备好,1分钟后重新检查
-#             Timer(60, timer_check).start()
-#     except Exception as e:
-#         print(
-#             f"视频asr识别失败, exception: {e}, traceback: {traceback.format_exc()}")
-
-
-if __name__ == '__main__':
-    # timer_check()
-    cuda_id = sys.argv[1]
+def asr_process(project, table, dt, cuda_id):
+    # 获取特征数据
+    feature_df = get_feature_data(
+        project=project, table=table, dt=dt, features=features)
+    video_id_list = feature_df['videoid'].to_list()
+    video_info = {}
+    for video_id in video_id_list:
+        title = feature_df[feature_df['videoid']
+                           == video_id]['title'].values[0]
+        if title is None:
+            continue
+        title = title.strip()
+        if len(title) > 0:
+            video_info[video_id] = {'title': title}
+    # 获取已下载视频,做asr识别
     download_folder = 'videos'
     download_folder = f'{download_folder}_{cuda_id}'
-    if not os.path.exists(download_folder):
-        print(f"download_folder: {download_folder} not exists!")
-        exit(0)
-    # 遍历download_folder下所有的子文件夹名,即video_id list
-    video_folder_list = os.listdir(download_folder)
-    if len(video_folder_list) < 1:
-        print(f"video_folder_list is empty!")
-        exit(0)
-
     asr_folder = 'asr_res'
     if not os.path.exists(asr_folder):
         os.makedirs(asr_folder)
+    retry = 0
+    while retry < 3:
+        video_folder_list = os.listdir(download_folder)
+        if len(video_folder_list) < 1:
+            retry += 1
+            time.sleep(60)
+            continue
+        retry = 0
+        for video_id in video_folder_list:
+            if video_id not in video_id_list:
+                continue
+            if video_info.get(video_id, None) is None:
+                try:
+                    shutil.rmtree(os.path.join(download_folder, video_id))
+                except:
+                    continue
+            else:
+                video_folder = os.path.join(download_folder, video_id)
+                for filename in os.listdir(video_folder):
+                    video_type = filename.split('.')[-1]
+                    if video_type in ['mp4', 'm3u8']:
+                        video_file = os.path.join(video_folder, filename)
+                        # 1. asr识别
+                        asr_res_initial = get_whisper_asr(video=video_file)
+                        print(video_id, asr_res_initial)
+                        # 2. 识别结果写入文件
+                        asr_path = os.path.join(asr_folder, f"{video_id}.txt")
+                        with open(asr_path, 'w', encoding='utf-8') as wf:
+                            wf.write(asr_res_initial)
+                        # 将处理过的视频进行删除
+                        shutil.rmtree(os.path.join(download_folder, video_id))
+                        break
+
+        pool = multiprocessing.Pool(processes=2)
+        for video_id in video_folder_list:
+            if video_id not in video_id_list:
+                continue
+            if video_info.get(video_id, None) is None:
+                try:
+                    shutil.rmtree(os.path.join(download_folder, video_id))
+                except:
+                    continue
+            else:
+                pool.apply_async(
+                    func=get_asr,
+                    args=(video_id, download_folder, asr_folder)
+                )
+        pool.close()
+        pool.join()
+
 
-    pool = multiprocessing.Pool(processes=2)
-    for video_id in video_folder_list:
-        pool.apply_async(
-            func=get_asr,
-            args=(video_id, download_folder, asr_folder)
-        )
-    pool.close()
-    pool.join()
+def timer_check():
+    try:
+        cuda_id = sys.argv[1]
+        project = config_.DAILY_VIDEO['project']
+        table = config_.DAILY_VIDEO['table']
+        now_date = datetime.datetime.today()
+        print(f"now_date: {datetime.datetime.strftime(now_date, '%Y%m%d')}")
+        dt = datetime.datetime.strftime(
+            now_date-datetime.timedelta(days=1), '%Y%m%d')
+        # 查看数据是否已准备好
+        data_count = data_check(project=project, table=table, dt=dt)
+        if data_count > 0:
+            print(f'videos count = {data_count}')
+            download_folder = 'videos'
+            download_folder = f'{download_folder}_{cuda_id}'
+            if not os.path.exists(download_folder):
+                # 视频未下载好,1分钟后重新检查
+                Timer(60, timer_check).start()
+            else:
+                # 数据准备好,进行asr
+                asr_process(project=project, table=table,
+                            dt=dt, cuda_id=cuda_id)
+                print(f"videos asr finished!")
 
-    print(f"videos asr finished!")
+        else:
+            # 数据没准备好,1分钟后重新检查
+            Timer(60, timer_check).start()
+    except Exception as e:
+        print(
+            f"视频asr识别失败, exception: {e}, traceback: {traceback.format_exc()}")
+
+
+if __name__ == '__main__':
+    timer_check()

+ 3 - 2
asr_task.sh

@@ -2,9 +2,10 @@ ps -ef | grep asr_task.py | grep -v grep | awk '{print $2}' | xargs kill -9
 
 rm -r asr_res/
 
-# source activate aigc-test
+source activate srt
+
 nohup env CUDA_VISIBLE_DEVICES=0 python asr_task.py 0 > logs/asr_task_0.log 2>&1 &
 
 nohup env CUDA_VISIBLE_DEVICES=1 python asr_task.py 1 > logs/asr_task_1.log 2>&1 &
 
-# conda deactivate
+conda deactivate

+ 1 - 23
download_videos_task.py

@@ -99,26 +99,4 @@ def timer_check():
 
 
 if __name__ == '__main__':
-    # timer_check()
-    size = 10000
-    for i in range(0, 10000, size):
-        print(f"query_videos start i = {i} ...")
-        records = ODPSQueryUtil.query_videos(i, size)
-        if records is None or len(records) == 0:
-            continue
-        print(f"Got {len(records)} records")
-        pool = multiprocessing.Pool(processes=6)
-        for record in records:
-            try:
-                videoid = record.videoid
-                video_path = record.video_path
-                pool.apply_async(
-                    func=download_video_from_oss,
-                    args=(videoid, video_path, 'videos')
-                )
-            except Exception as e:
-                print(
-                    f"download video fail, exception: {e}, traceback: {traceback.format_exc()}")
-        print(f"query_videos end i = {i} ...")
-        pool.close()
-        pool.join()
+    timer_check()

+ 3 - 3
download_videos_task.sh

@@ -1,11 +1,11 @@
 ps -ef | grep download_videos_task.py | grep -v grep | awk '{print $2}' | xargs kill -9
 
-cd /data/aigc-test
+cd /sunxy/aigc-test 
 
 rm -r videos_0/ videos_1/
 
-source activate whisper
+source activate srt
 
-python download_videos_task.py
+nohup python download_videos_task.py > logs/download_videos_task.log 2>&1 &
 
 conda deactivate

+ 2 - 2
mysql_connect.py

@@ -3,8 +3,8 @@ from mysql.connector import Error
 import json
 
 MYSQL_CONFIG = {
-    'host': 'rm-bp1k5853td1r25g3n690.mysql.rds.aliyuncs.com',
-    'database': 'incentive',
+    'host': 'rm-bp19uc56sud25ag4o.mysql.rds.aliyuncs.com',
+    'database': 'longvideo',
     'port': 3306,
     'user': 'wx2016_longvideo',
     'password': 'wx2016_longvideoP@assword1234',