瀏覽代碼

更新初始化代码

罗俊辉 1 年之前
父節點
當前提交
f28043332d
共有 4 個文件被更改,包括 115 次插入55 次删除
  1. 5 0
      functions/__init__.py
  2. 49 0
      functions/date.py
  3. 6 1
      process_data.py
  4. 55 54
      read_data_from_odps.py

+ 5 - 0
functions/__init__.py

@@ -0,0 +1,5 @@
+"""
+init file for functions
+"""
+from .date import *
+from .odps_function import PyODPS

+ 49 - 0
functions/date.py

@@ -0,0 +1,49 @@
+"""
+处理时间格式
+"""
+from datetime import datetime, timedelta
+
+
+def generate_hourly_strings(start_date, end_date):
+    """
+    Build the list of hourly timestamps between two hours, both ends included.
+
+    :param start_date: first hour, formatted as ``%Y%m%d%H`` (e.g. "2024010100")
+    :param end_date: last hour, same format as ``start_date``
+    :return: list of ``%Y%m%d%H`` strings in ascending order, one per hour;
+        empty when ``start_date`` is later than ``end_date``
+    """
+    fmt = '%Y%m%d%H'
+    one_hour = timedelta(hours=1)
+    first = datetime.strptime(start_date, fmt)
+    last = datetime.strptime(end_date, fmt)
+    # Whole hours between the two ends; negative when the range is reversed,
+    # which makes range() below empty.
+    span = (last - first) // one_hour
+    return [(first + one_hour * offset).strftime(fmt) for offset in range(span + 1)]
+
+
+def generate_daily_strings(start_date, end_date):
+    """
+    Build the list of calendar days between two dates, both ends included.
+
+    :param start_date: first day, formatted as ``%Y%m%d`` (e.g. "20240101")
+    :param end_date: last day, same format as ``start_date``
+    :return: list of ``%Y%m%d`` strings in ascending order, one per day;
+        empty when ``start_date`` is later than ``end_date``
+    """
+    fmt = '%Y%m%d'
+    first = datetime.strptime(start_date, fmt)
+    last = datetime.strptime(end_date, fmt)
+    # Both ends are parsed at midnight, so the difference is a whole number of
+    # days; a negative count (reversed range) yields an empty range().
+    day_count = (last - first).days
+    return [(first + timedelta(days=offset)).strftime(fmt) for offset in range(day_count + 1)]
+
+
+def generate_label_date(now_dt, days=4):
+    """
+    Return the label date: the calendar day ``days`` days after ``now_dt``.
+
+    NOTE(review): this function was previously documented as "3 days" while
+    it actually added 4. The default keeps the value that was computed so
+    existing callers are unaffected -- confirm which offset is intended.
+
+    :param now_dt: reference hour, formatted as ``%Y%m%d%H``
+    :param days: number of days to add to ``now_dt``; defaults to 4
+    :return: the shifted date formatted as ``%Y%m%d`` (the hour is dropped)
+    """
+    now_date = datetime.strptime(now_dt, "%Y%m%d%H")
+    label_date = now_date + timedelta(days=days)
+    return label_date.strftime("%Y%m%d")

+ 6 - 1
test.py → process_data.py

@@ -1,9 +1,14 @@
+"""
+Process the data into the format LightGBM needs
+"""
 import sys
 import os
 import json
 from tqdm import tqdm
+
 sys.path.append(os.getcwd())
-from functions.generate_data import generate_label_date
+
+from functions import generate_label_date
 
 
 def generate_train_label(item, y_ori_data):

+ 55 - 54
functions/generate_data.py → read_data_from_odps.py

@@ -1,60 +1,16 @@
 """
 Created on Mon Mar 18, 2024
 @author: luojunhui
+Read data from ODPS and save it to a local JSON file
 """
 import os
 import sys
 import json
 from tqdm import tqdm
-from datetime import datetime, timedelta
-sys.path.append(os.getcwd())
-
-from functions.odps_function import PyODPS
-
-
-def generate_hourly_strings(start_date, end_date):
-    """
-    Generate hourly date_str
-    :param start_date:
-    :param end_date:
-    :return:
-    """
-    start = datetime.strptime(start_date, '%Y%m%d%H')
-    end = datetime.strptime(end_date, '%Y%m%d%H')
-    current = start
-    date_strings = []
-    while current <= end:
-        date_strings.append(current.strftime('%Y%m%d%H'))
-        current += timedelta(hours=1)
-    return date_strings
-
-
-def generate_daily_strings(start_date, end_date):
-    """
-    Generate daily date_str
-    :param start_date:
-    :param end_date:
-    :return:
-    """
-    start = datetime.strptime(start_date, '%Y%m%d')
-    end = datetime.strptime(end_date, '%Y%m%d')
-    current = start
-    date_strings = []
-    while current <= end:
-        date_strings.append(current.strftime('%Y%m%d'))
-        current += timedelta(days=1)
-    return date_strings
 
+sys.path.append(os.getcwd())
 
-def generate_label_date(now_dt):
-    """
-    Generate date in 3 days
-    :param now_dt:
-    :return:
-    """
-    now_date = datetime.strptime(now_dt, "%Y%m%d%H")
-    three_date = now_date + timedelta(days=4)
-    return three_date.strftime("%Y%m%d")
+from functions import PyODPS, generate_hourly_strings, generate_daily_strings
 
 
 class VideoDataGenerator(object):
@@ -136,19 +92,64 @@ class VideoDataGenerator(object):
         return result
 
 
-if __name__ == '__main__':
-    # date_list = generate_hourly_strings("2024010100", "2024013123")
-    date_list = generate_daily_strings("20240101", "20240228")
+def save_daily_data(start_date, end_date, save_path):
+    """
+    Fetch the data of every day in the date range and save it to a JSON file.
+
+    The file holds one object keyed by date string; each value maps a
+    record's ``video_id`` to the record itself.
+
+    :param start_date: first day, formatted as yyyymmdd
+    :param end_date: last day (inclusive), formatted as yyyymmdd
+    :param save_path: path of the JSON file to write
+    :return: None
+    """
+    date_list = generate_daily_strings(start_date, end_date)
     V = VideoDataGenerator()
     L = {}
-    # print(date_list)
     for date_str in tqdm(date_list):
         L[date_str] = {}
-        # data_list = V.get_hour_data(date_str)
         data_list = V.get_daily_data(date_str)
         for obj in tqdm(data_list):
             video_id = obj['video_id']
             L[date_str][video_id] = obj
+    # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
+    # pinned to UTF-8 instead of depending on the platform's locale default.
+    with open(save_path, "w", encoding="utf-8") as f:
+        f.write(json.dumps(L, ensure_ascii=False))
+
+
+def save_hourly_data(start_date, end_date, save_path):
+    """
+    Fetch the data of every hour in the range and save it to a JSON file.
+
+    The records of all hours are written as one flat JSON list, in hour order.
+
+    :param start_date: first hour, formatted as yyyymmddhh
+    :param end_date: last hour (inclusive), formatted as yyyymmddhh
+    :param save_path: path of the JSON file to write
+    :return: None
+    """
+    date_list = generate_hourly_strings(start_date, end_date)
+    V = VideoDataGenerator()
+    L = []
+    for date_str in tqdm(date_list):
+        # date_str is an hourly string (yyyymmddhh), so it goes to the hourly
+        # reader rather than get_daily_data.
+        # NOTE(review): confirm get_hour_data's signature on VideoDataGenerator.
+        data_list = V.get_hour_data(date_str)
+        for obj in tqdm(data_list):
+            L.append(obj)
+    # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
+    # pinned to UTF-8 instead of depending on the platform's locale default.
+    with open(save_path, "w", encoding="utf-8") as f:
+        f.write(json.dumps(L, ensure_ascii=False))
+
+
+if __name__ == '__main__':
+    # Interactive entry point: ask for the granularity (1 = hourly, 2 = daily),
+    # then for the inclusive range, and export it under data/.
+    try:
+        flag = int(input("请输入标识符,输入 1 获取小时级数据, 输入 2 获取天级数据: \n"))
+    except ValueError:
+        # Non-numeric input is reported by the final else branch instead of
+        # ending the script with a traceback.
+        flag = None
+    if flag == 1:
+        # Both ends use the 10-character yyyymmddhh format checked below.
+        start = input("请输入开始字符串, 格式为 yyyymmddhh:\n").strip()
+        end = input("请输入结束字符串, 格式为 yyyymmddhh: \n").strip()
+        save_p = "data/hourly-train-{}-{}.json".format(start, end)
+        if len(start) == 10 and len(end) == 10:
+            save_hourly_data(start, end, save_p)
+        else:
+            print("Time format is not ok")
+    elif flag == 2:
+        # Both ends use the 8-character yyyymmdd format checked below.
+        start = input("请输入开始字符串, 格式为 yyyymmdd:\n").strip()
+        end = input("请输入结束字符串, 格式为 yyyymmdd: \n").strip()
+        save_p = "data/daily-label-{}-{}.json".format(start, end)
+        if len(start) == 8 and len(end) == 8:
+            save_daily_data(start, end, save_p)
+        else:
+            print("Time format is not ok")
+    else:
+        print("Input Error ! Make sure your input is 1 or 2!!")
 
-    with open('data/jan_feb_label.json', 'w') as f:
-        f.write(json.dumps(L, ensure_ascii=False, indent=4))