浏览代码

更新初始化代码

罗俊辉 1 年之前
父节点
当前提交
bddeded83e
共有 2 个文件被更改，包括 29 次插入和 11 次删除
  1. +2 -3
      main.py
  2. +27 -8
      read_data_from_odps.py

+ 2 - 3
main.py

@@ -29,7 +29,7 @@ my_c = [
         "out_collection_cnt"
     ]
 
-str_cols = ["uid", "type", "channel", "mode"]
+str_cols = ["uid", "type", "channel", "mode", "out_user_id"]
 float_cols = [
         "fans",
         "view_count_user_30days",
@@ -37,7 +37,6 @@ float_cols = [
         "return_count_user_30days",
         "rov_user",
         "str_user",
-        "out_user_id",
         "out_play_cnt",
         "out_like_cnt",
         "out_share_cnt",
@@ -68,7 +67,7 @@ with open("whole_data/y_data.json") as f2:
     y_test = np.array(y__list[index_t:])
 
 # 创建LightGBM数据集
-train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=['uid', 'type', 'channel', 'mode'])
+train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=['uid', 'type', 'channel', 'mode', 'out_user_id'])
 test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
 
 # 设置模型的参数

+ 27 - 8
read_data_from_odps.py

@@ -7,6 +7,7 @@ import os
 import sys
 import json
 from tqdm import tqdm
+from concurrent.futures import ThreadPoolExecutor
 
 sys.path.append(os.getcwd())
 
@@ -85,6 +86,7 @@ class VideoDataGenerator(object):
                 "3day_view": item['flowpool_3days_distribute_view_times'],
                 "3day_share": item['flowpool_3days_share_times'],
                 "3day_return": item['flowpool_3days_return_users'],
+                "3day_up_level": item['up_level_3_days'],
                 "dt": item['dt']
 
             } for item in data
@@ -113,6 +115,22 @@ def save_daily_data(start_date, end_date, save_path):
         f.write(json.dumps(L, ensure_ascii=False))
 
 
+def download_hour_video_data(date_str):
+    """
+    获取日期参数
+    :param date_str:
+    :return:
+    """
+    V = VideoDataGenerator()
+    data_list = V.get_hour_data(date_str)
+    L = []
+    for obj in data_list:
+        L.append(obj)
+    temp_path = "data/{}.json".format(date_str)
+    with open(temp_path, "w") as f:
+        f.write(json.dumps(L, ensure_ascii=False))
+
+
 def save_hourly_data(start_date, end_date, save_path):
     """
     save hourly data
@@ -121,15 +139,16 @@ def save_hourly_data(start_date, end_date, save_path):
     :param save_path:
     :return:
     """
+    print(save_path)
     date_list = generate_hourly_strings(start_date, end_date)
-    V = VideoDataGenerator()
-    L = []
-    for date_str in tqdm(date_list):
-        data_list = V.get_hour_data(date_str)
-        for obj in tqdm(data_list):
-            L.append(obj)
-    with open(save_path, "w") as f:
-        f.write(json.dumps(L, ensure_ascii=False))
+    with ThreadPoolExecutor(max_workers=10) as Pool:
+        Pool.map(download_hour_video_data, date_list)
+    # for date_str in tqdm(date_list):
+    #     data_list = V.get_hour_data(date_str)
+    #     for obj in tqdm(data_list):
+    #         L.append(obj)
+    # with open(save_path, "w") as f:
+    #     f.write(json.dumps(L, ensure_ascii=False))
 
 
 if __name__ == '__main__':