浏览代码

更新初始化代码

罗俊辉 1 年之前
父节点
当前提交
f28043332d
共有 4 个文件被更改,包括 115 次插入和 55 次删除
  1. 5 0
      functions/__init__.py
  2. 49 0
      functions/date.py
  3. 6 1
      process_data.py
  4. 55 54
      read_data_from_odps.py

+ 5 - 0
functions/__init__.py

@@ -0,0 +1,5 @@
+"""
+init file for functions
+"""
+from .date import *
+from .odps_function import PyODPS

+ 49 - 0
functions/date.py

@@ -0,0 +1,49 @@
+"""
+Helpers for handling date and time string formats
+"""
+from datetime import datetime, timedelta
+
+
+def generate_hourly_strings(start_date, end_date):
+    """
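+    Generate every hourly date string from start_date to end_date (inclusive)
+    :param start_date: first hour, formatted as yyyymmddhh
+    :param end_date: last hour, formatted as yyyymmddhh
+    :return: list of yyyymmddhh strings, one per hour, in ascending order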
+    """
+    start = datetime.strptime(start_date, '%Y%m%d%H')
+    end = datetime.strptime(end_date, '%Y%m%d%H')
+    current = start
+    date_strings = []
+    while current <= end:
+        date_strings.append(current.strftime('%Y%m%d%H'))
+        current += timedelta(hours=1)
+    return date_strings
+
+
+def generate_daily_strings(start_date, end_date):
+    """
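+    Generate every daily date string from start_date to end_date (inclusive)
+    :param start_date: first day, formatted as yyyymmdd
+    :param end_date: last day, formatted as yyyymmdd
+    :return: list of yyyymmdd strings, one per day, in ascending order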
+    """
+    start = datetime.strptime(start_date, '%Y%m%d')
+    end = datetime.strptime(end_date, '%Y%m%d')
+    current = start
+    date_strings = []
+    while current <= end:
+        date_strings.append(current.strftime('%Y%m%d'))
+        current += timedelta(days=1)
+    return date_strings
+
+
+def generate_label_date(now_dt):
+    """
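+    Generate the label date: the day 4 days after now_dt
+    NOTE(review): this previously said "3 days", but the code adds 4 days - confirm which is intended
+    :param now_dt: current hour, formatted as yyyymmddhh
+    :return: label date, formatted as yyyymmdd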
+    """
+    now_date = datetime.strptime(now_dt, "%Y%m%d%H")
+    three_date = now_date + timedelta(days=4)
+    return three_date.strftime("%Y%m%d")

+ 6 - 1
test.py → process_data.py

@@ -1,9 +1,14 @@
+"""
+Process the data to meet LightGBM's input requirements
+"""
 import sys
 import sys
 import os
 import os
 import json
 import json
 from tqdm import tqdm
 from tqdm import tqdm
+
 sys.path.append(os.getcwd())
 sys.path.append(os.getcwd())
-from functions.generate_data import generate_label_date
+
+from functions import generate_label_date
 
 
 
 
 def generate_train_label(item, y_ori_data):
 def generate_train_label(item, y_ori_data):

+ 55 - 54
functions/generate_data.py → read_data_from_odps.py

@@ -1,60 +1,16 @@
 """
 """
 Created on Mon Mar 18, 2024
 Created on Mon Mar 18, 2024
 @author: luojunhui
 @author: luojunhui
+Read data from odps and save to json file in local files
 """
 """
 import os
 import os
 import sys
 import sys
 import json
 import json
 from tqdm import tqdm
 from tqdm import tqdm
-from datetime import datetime, timedelta
-sys.path.append(os.getcwd())
-
-from functions.odps_function import PyODPS
-
-
-def generate_hourly_strings(start_date, end_date):
-    """
-    Generate hourly date_str
-    :param start_date:
-    :param end_date:
-    :return:
-    """
-    start = datetime.strptime(start_date, '%Y%m%d%H')
-    end = datetime.strptime(end_date, '%Y%m%d%H')
-    current = start
-    date_strings = []
-    while current <= end:
-        date_strings.append(current.strftime('%Y%m%d%H'))
-        current += timedelta(hours=1)
-    return date_strings
-
-
-def generate_daily_strings(start_date, end_date):
-    """
-    Generate daily date_str
-    :param start_date:
-    :param end_date:
-    :return:
-    """
-    start = datetime.strptime(start_date, '%Y%m%d')
-    end = datetime.strptime(end_date, '%Y%m%d')
-    current = start
-    date_strings = []
-    while current <= end:
-        date_strings.append(current.strftime('%Y%m%d'))
-        current += timedelta(days=1)
-    return date_strings
 
 
+sys.path.append(os.getcwd())
 
 
-def generate_label_date(now_dt):
-    """
-    Generate date in 3 days
-    :param now_dt:
-    :return:
-    """
-    now_date = datetime.strptime(now_dt, "%Y%m%d%H")
-    three_date = now_date + timedelta(days=4)
-    return three_date.strftime("%Y%m%d")
+from functions import PyODPS, generate_hourly_strings, generate_daily_strings
 
 
 
 
 class VideoDataGenerator(object):
 class VideoDataGenerator(object):
@@ -136,19 +92,64 @@ class VideoDataGenerator(object):
         return result
         return result
 
 
 
 
-if __name__ == '__main__':
-    # date_list = generate_hourly_strings("2024010100", "2024013123")
-    date_list = generate_daily_strings("20240101", "20240228")
+def save_daily_data(start_date, end_date, save_path):
+    """
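+    Fetch the data for each day in the date range and save it to the given path
+    :param start_date: first day, formatted as yyyymmdd
+    :param end_date: last day, formatted as yyyymmdd
+    :param save_path: path of the JSON file to write
+    :return: None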
+    """
+    date_list = generate_daily_strings(start_date, end_date)
     V = VideoDataGenerator()
     V = VideoDataGenerator()
     L = {}
     L = {}
-    # print(date_list)
     for date_str in tqdm(date_list):
     for date_str in tqdm(date_list):
         L[date_str] = {}
         L[date_str] = {}
-        # data_list = V.get_hour_data(date_str)
         data_list = V.get_daily_data(date_str)
         data_list = V.get_daily_data(date_str)
         for obj in tqdm(data_list):
         for obj in tqdm(data_list):
             video_id = obj['video_id']
             video_id = obj['video_id']
             L[date_str][video_id] = obj
             L[date_str][video_id] = obj
+    with open(save_path, "w") as f:
+        f.write(json.dumps(L, ensure_ascii=False))
+
+
+def save_hourly_data(start_date, end_date, save_path):
+    """
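+    Fetch the data for each hourly date string in the range and save it as one JSON list
+    NOTE(review): each hourly string is passed to V.get_daily_data - confirm this is intended
+    :param start_date: first hour, formatted as yyyymmddhh
+    :param end_date: last hour, formatted as yyyymmddhh
+    :param save_path: path of the JSON file to write
+    :return: None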
+    """
+    date_list = generate_hourly_strings(start_date, end_date)
+    V = VideoDataGenerator()
+    L = []
+    for date_str in tqdm(date_list):
+        data_list = V.get_daily_data(date_str)
+        for obj in tqdm(data_list):
+            L.append(obj)
+    with open(save_path, "w") as f:
+        f.write(json.dumps(L, ensure_ascii=False))
+
+
+if __name__ == '__main__':
+    flag = int(input("请输入标识符,输入 1 获取小时级数据, 输入 2 获取天级数据: \n"))
+    if flag == 1:
+        start = str(input("请输入开始字符串, 格式为 yyyymmddhh:\n"))
+        end = str(input("请输入结束字符串, 格式为 yyyymmddhh: \n"))
+        save_p = "data/hourly-train-{}-{}.json".format(start, end)
+        if len(start) == 10 and len(end) == 10:
+            save_hourly_data(start, end, save_p)
+        else:
+            print("Time format is not ok")
+    elif flag == 2:
+        start = str(input("请输入开始字符串, 格式为 yyyymmdd:\n"))
+        end = str(input("请输入结束字符串, 格式为 yyyymmdd: \n"))
+        save_p = "data/daily-label-{}-{}.json".format(start, end)
+        if len(start) == 8 and len(end) == 8:
+            save_daily_data(start, end, save_p)
+        else:
+            print("Time format is not ok")
+    else:
+        print("Input Error ! Make sure your input is 1 or 2!!")
 
 
-    with open('data/jan_feb_label.json', 'w') as f:
-        f.write(json.dumps(L, ensure_ascii=False, indent=4))