|
@@ -1,60 +1,16 @@
|
|
|
"""
|
|
|
Created on Mon Mar 18, 2024
|
|
|
@author: luojunhui
|
|
|
+Read data from ODPS and save it to a local JSON file.
|
|
|
"""
|
|
|
import os
|
|
|
import sys
|
|
|
import json
|
|
|
from tqdm import tqdm
|
|
|
-from datetime import datetime, timedelta
|
|
|
-sys.path.append(os.getcwd())
|
|
|
-
|
|
|
-from functions.odps_function import PyODPS
|
|
|
-
|
|
|
-
|
|
|
-def generate_hourly_strings(start_date, end_date):
|
|
|
- """
|
|
|
- Generate hourly date_str
|
|
|
- :param start_date:
|
|
|
- :param end_date:
|
|
|
- :return:
|
|
|
- """
|
|
|
- start = datetime.strptime(start_date, '%Y%m%d%H')
|
|
|
- end = datetime.strptime(end_date, '%Y%m%d%H')
|
|
|
- current = start
|
|
|
- date_strings = []
|
|
|
- while current <= end:
|
|
|
- date_strings.append(current.strftime('%Y%m%d%H'))
|
|
|
- current += timedelta(hours=1)
|
|
|
- return date_strings
|
|
|
-
|
|
|
-
|
|
|
-def generate_daily_strings(start_date, end_date):
|
|
|
- """
|
|
|
- Generate daily date_str
|
|
|
- :param start_date:
|
|
|
- :param end_date:
|
|
|
- :return:
|
|
|
- """
|
|
|
- start = datetime.strptime(start_date, '%Y%m%d')
|
|
|
- end = datetime.strptime(end_date, '%Y%m%d')
|
|
|
- current = start
|
|
|
- date_strings = []
|
|
|
- while current <= end:
|
|
|
- date_strings.append(current.strftime('%Y%m%d'))
|
|
|
- current += timedelta(days=1)
|
|
|
- return date_strings
|
|
|
|
|
|
+sys.path.append(os.getcwd())
|
|
|
|
|
|
-def generate_label_date(now_dt):
|
|
|
- """
|
|
|
- Generate date in 3 days
|
|
|
- :param now_dt:
|
|
|
- :return:
|
|
|
- """
|
|
|
- now_date = datetime.strptime(now_dt, "%Y%m%d%H")
|
|
|
- three_date = now_date + timedelta(days=4)
|
|
|
- return three_date.strftime("%Y%m%d")
|
|
|
+from functions import PyODPS, generate_hourly_strings, generate_daily_strings
|
|
|
|
|
|
|
|
|
class VideoDataGenerator(object):
|
|
@@ -136,19 +92,64 @@ class VideoDataGenerator(object):
|
|
|
return result
|
|
|
|
|
|
|
|
|
-if __name__ == '__main__':
|
|
|
- # date_list = generate_hourly_strings("2024010100", "2024013123")
|
|
|
- date_list = generate_daily_strings("20240101", "20240228")
|
|
|
def save_daily_data(start_date, end_date, save_path):
    """
    Fetch the daily data for every date in a range and save it to a JSON file.

    The output is a JSON object keyed by date string; each value is an object
    keyed by ``video_id`` holding the record returned for that video.

    :param start_date: range start, passed to ``generate_daily_strings``
    :param end_date: range end, passed to ``generate_daily_strings``
    :param save_path: path of the JSON file to write
    :return: None
    """
    # Create the output directory up front so a missing folder is detected
    # before the (potentially long) fetch loop rather than after it.
    os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)

    generator = VideoDataGenerator()
    daily_records = {}
    for date_str in tqdm(generate_daily_strings(start_date, end_date)):
        # Index the day's records by video id (later duplicates overwrite
        # earlier ones, as before).
        daily_records[date_str] = {
            obj['video_id']: obj
            for obj in tqdm(generator.get_daily_data(date_str))
        }

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly instead of relying on the locale default.
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(daily_records, f, ensure_ascii=False)
|
|
|
+
|
|
|
+
|
|
|
def save_hourly_data(start_date, end_date, save_path):
    """
    Fetch the hourly data for every hour in a range and save it to a JSON file.

    The output is a single flat JSON list containing the records of all hours
    in the order they were fetched.

    :param start_date: range start, passed to ``generate_hourly_strings``
    :param end_date: range end, passed to ``generate_hourly_strings``
    :param save_path: path of the JSON file to write
    :return: None
    """
    # Create the output directory up front so a missing folder is detected
    # before the (potentially long) fetch loop rather than after it.
    os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)

    generator = VideoDataGenerator()
    hourly_records = []
    for date_str in tqdm(generate_hourly_strings(start_date, end_date)):
        # Hour-level date strings must go to the hour-level query; the
        # previous code called get_daily_data here (copy-paste slip).
        # NOTE(review): get_hour_data is the method the earlier version of
        # this script paired with hourly strings - confirm it on
        # VideoDataGenerator.
        for obj in tqdm(generator.get_hour_data(date_str)):
            hourly_records.append(obj)

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly instead of relying on the locale default.
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(hourly_records, f, ensure_ascii=False)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Interactive entry point: choose hourly (1) or daily (2) export, read the
    # date range, validate its shape, and write the result under data/.
    # flag -> (date format shown in the prompts, output file prefix, exporter)
    modes = {
        1: ("yyyymmddhh", "hourly-train", save_hourly_data),
        2: ("yyyymmdd", "daily-label", save_daily_data),
    }
    try:
        flag = int(input("请输入标识符,输入 1 获取小时级数据, 输入 2 获取天级数据: \n"))
    except ValueError:
        # Non-numeric input is reported as a bad choice, not a traceback.
        flag = None

    mode = modes.get(flag)
    if mode is None:
        print("Input Error ! Make sure your input is 1 or 2!!")
    else:
        date_format, file_prefix, exporter = mode
        # Both prompts now show the full expected format; the end prompt used
        # to advertise a two-digit year although the length check needs four.
        start = input("请输入开始字符串, 格式为 {}:\n".format(date_format)).strip()
        end = input("请输入结束字符串, 格式为 {}: \n".format(date_format)).strip()
        # Require digits of exactly the expected length so malformed input is
        # rejected here instead of failing later during date parsing.
        is_valid = all(
            len(value) == len(date_format) and value.isdigit()
            for value in (start, end)
        )
        if is_valid:
            save_p = "data/{}-{}-{}.json".format(file_prefix, start, end)
            exporter(start, end, save_p)
        else:
            print("Time format is not ok")
|
|
|
|
|
|
- with open('data/jan_feb_label.json', 'w') as f:
|
|
|
- f.write(json.dumps(L, ensure_ascii=False, indent=4))
|