فهرست منبع

预测模型V1 完成
代码已经用 black 规范化

罗俊辉 1 سال پیش
والد
کامیت
66e4588602
8 فایل تغییر یافته به همراه 88 افزوده شده و 81 حذف شده
  1. 7 6
      functions/date.py
  2. 0 1
      functions/mysql.py
  3. 2 1
      functions/odps_function.py
  4. 21 18
      main.py
  5. 4 1
      p_data_process.py
  6. 8 7
      process_data.py
  7. 0 1
      process_temp.py
  8. 46 46
      read_data_from_odps.py

+ 7 - 6
functions/date.py

@@ -1,6 +1,7 @@
 """
 处理时间格式
 """
+
 from datetime import datetime, timedelta
 
 
@@ -11,12 +12,12 @@ def generate_hourly_strings(start_date, end_date):
     :param end_date:
     :return:
     """
-    start = datetime.strptime(start_date, '%Y%m%d%H')
-    end = datetime.strptime(end_date, '%Y%m%d%H')
+    start = datetime.strptime(start_date, "%Y%m%d%H")
+    end = datetime.strptime(end_date, "%Y%m%d%H")
     current = start
     date_strings = []
     while current <= end:
-        date_strings.append(current.strftime('%Y%m%d%H'))
+        date_strings.append(current.strftime("%Y%m%d%H"))
         current += timedelta(hours=1)
     return date_strings
 
@@ -28,12 +29,12 @@ def generate_daily_strings(start_date, end_date):
     :param end_date:
     :return:
     """
-    start = datetime.strptime(start_date, '%Y%m%d')
-    end = datetime.strptime(end_date, '%Y%m%d')
+    start = datetime.strptime(start_date, "%Y%m%d")
+    end = datetime.strptime(end_date, "%Y%m%d")
     current = start
     date_strings = []
     while current <= end:
-        date_strings.append(current.strftime('%Y%m%d'))
+        date_strings.append(current.strftime("%Y%m%d"))
         current += timedelta(days=1)
     return date_strings
 

+ 0 - 1
functions/mysql.py

@@ -7,4 +7,3 @@ class MySQLClient(object):
     """
     MySQL Client
     """
-

+ 2 - 1
functions/odps_function.py

@@ -1,6 +1,7 @@
 """
 python ODPS class
 """
+
 from odps import ODPS
 
 
@@ -31,4 +32,4 @@ class PyODPS(object):
         with self.od.execute_sql(sql).open_reader() as reader:
             for record in reader:
                 result.append(record)
-        return result
+        return result

+ 21 - 18
main.py

@@ -35,7 +35,7 @@ class LightGBM(object):
             "out_play_cnt",
             "out_like_cnt",
             "out_share_cnt",
-            "out_collection_cnt"
+            "out_collection_cnt",
         ]
         self.str_columns = ["uid", "type", "channel", "mode", "out_user_id"]
         self.float_columns = [
@@ -48,7 +48,7 @@ class LightGBM(object):
             "out_play_cnt",
             "out_like_cnt",
             "out_share_cnt",
-            "out_collection_cnt"
+            "out_collection_cnt",
         ]
         self.split_c = 0.95
         self.yc = 0.8
@@ -66,12 +66,12 @@ class LightGBM(object):
         for key in self.str_columns:
             X_train[key] = self.label_encoder.fit_transform(X_train[key])
         for key in self.float_columns:
-            X_train[key] = pd.to_numeric(X_train[key], errors='coerce')
+            X_train[key] = pd.to_numeric(X_train[key], errors="coerce")
         X_test = pd.DataFrame(x_list[index_t:], columns=self.my_c)
         for key in self.str_columns:
             X_test[key] = self.label_encoder.fit_transform(X_test[key])
         for key in self.float_columns:
-            X_test[key] = pd.to_numeric(X_test[key], errors='coerce')
+            X_test[key] = pd.to_numeric(X_test[key], errors="coerce")
         return X_train, X_test
 
     def generate_y_data(self):
@@ -97,18 +97,21 @@ class LightGBM(object):
         """
         X_train, X_test = self.generate_x_data()
         Y_train, Y_test = self.generate_y_data()
-        train_data = lgb.Dataset(X_train, label=Y_train,
-                                 categorical_feature=['uid', 'type', 'channel', 'mode', 'out_user_id'])
+        train_data = lgb.Dataset(
+            X_train,
+            label=Y_train,
+            categorical_feature=["uid", "type", "channel", "mode", "out_user_id"],
+        )
         test_data = lgb.Dataset(X_test, label=Y_test, reference=train_data)
         params = {
-            'objective': 'binary',  # 指定二分类任务
-            'metric': 'binary_logloss',  # 评估指标为二分类的log损失
-            'num_leaves': 31,  # 叶子节点数
-            'learning_rate': 0.05,  # 学习率
-            'bagging_fraction': 0.9,  # 建树的样本采样比例
-            'feature_fraction': 0.8,  # 建树的特征选择比例
-            'bagging_freq': 5,  # k 意味着每 k 次迭代执行bagging
-            'num_threads': 4  # 线程数量
+            "objective": "binary",  # 指定二分类任务
+            "metric": "binary_logloss",  # 评估指标为二分类的log损失
+            "num_leaves": 31,  # 叶子节点数
+            "learning_rate": 0.05,  # 学习率
+            "bagging_fraction": 0.9,  # 建树的样本采样比例
+            "feature_fraction": 0.8,  # 建树的特征选择比例
+            "bagging_freq": 5,  # k 意味着每 k 次迭代执行bagging
+            "num_threads": 4,  # 线程数量
         }
         # 训练模型
         num_round = 100
@@ -136,7 +139,7 @@ class LightGBM(object):
         for key in self.str_columns:
             X_test[key] = self.label_encoder.fit_transform(X_test[key])
         for key in self.float_columns:
-            X_test[key] = pd.to_numeric(X_test[key], errors='coerce')
+            X_test[key] = pd.to_numeric(X_test[key], errors="coerce")
         bst = lgb.Booster(model_file=self.model)
         y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
         y_pred_binary = [0 if i <= 0.1613 else 1 for i in list(y_pred)]
@@ -155,11 +158,11 @@ class LightGBM(object):
         print(data_series.describe())
         # 评估模型
         accuracy = accuracy_score(Y_test, y_pred_binary)
-        print(f'Accuracy: {accuracy}')
+        print(f"Accuracy: {accuracy}")
         fw.close()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     L = LightGBM()
     # L.train_model()
-    L.evaluate_model()
+    L.evaluate_model()

+ 4 - 1
p_data_process.py

@@ -2,11 +2,14 @@
 生成预测数据,  3月14日 和 3月17日的小时级数据
              3月18日至 3月21日的daily 数据
 """
+
 import os
 import json
 from functions import generate_hourly_strings
 
-target_string_list = generate_hourly_strings(start_date="2024031400", end_date="2024031723")
+target_string_list = generate_hourly_strings(
+    start_date="2024031400", end_date="2024031723"
+)
 path = "temp_data/data"
 L = []
 for file in target_string_list:

+ 8 - 7
process_data.py

@@ -1,6 +1,7 @@
 """
 process the data to satisfy the lightgbm
 """
+
 import sys
 import os
 import json
@@ -16,8 +17,8 @@ def generate_train_label(item, y_ori_data, cate):
     生成训练数据,用 np.array矩阵的方式返回,
     :return: x_train, 训练数据, y_train, 训练 label
     """
-    video_id = item['video_id']
-    dt = item['dt']
+    video_id = item["video_id"]
+    dt = item["dt"]
     userful_features = [
         "uid",
         "type",
@@ -33,7 +34,7 @@ def generate_train_label(item, y_ori_data, cate):
         "out_play_cnt",
         "out_like_cnt",
         "out_share_cnt",
-        "out_collection_cnt"
+        "out_collection_cnt",
     ]
     item_features = [item[i] for i in userful_features]
     label_dt = generate_label_date(dt)
@@ -45,16 +46,16 @@ def generate_train_label(item, y_ori_data, cate):
     return label, item_features
 
 
-if __name__ == '__main__':
-    x_path = 'prid_data/train_0314_0317.json'
-    y_path = 'data/daily-label-20240315-20240321.json'
+if __name__ == "__main__":
+    x_path = "prid_data/train_0314_0317.json"
+    y_path = "data/daily-label-20240315-20240321.json"
 
     with open(x_path) as f:
         x_data = json.loads(f.read())
 
     with open(y_path) as f:
         y_data = json.loads(f.read())
-    cate_list = ['total_return']
+    cate_list = ["total_return"]
     for c in cate_list:
         x_list = []
         y_list = []

+ 0 - 1
process_temp.py

@@ -15,4 +15,3 @@ for file_name in path_list:
 
 with open("data/hour_train.json", "w") as f:
     f.write(json.dumps(L, ensure_ascii=False))
-

+ 46 - 46
read_data_from_odps.py

@@ -3,6 +3,7 @@ Created on Mon Mar 18, 2024
 @author: luojunhui
 Read data from odps and save to json file in local files
 """
+
 import os
 import sys
 import json
@@ -33,38 +34,38 @@ class VideoDataGenerator(object):
         result = []
         for line in hour_data:
             obj = {
-                "uid": line['uid'],
-                "video_id": line['videoid'],
-                "type": line['type'],
-                "channel": line['channel'],
-                "fst": line['flowpool_start_type'],
-                "fsl": line['flowpool_start_level'],
-                "fet": line['flowpool_end_type'],
-                "fel": line['flowpool_end_level'],
-                "f_view": line['flowpool_distribute_view_times'],
-                "f_share": line['flowpool_share_times'],
-                "f_return": line['flowpool_return_users'],
-                "f3_view": line['flowpool_3days_distribute_view_times'],
-                "f3_share": line['flowpool_3days_share_times'],
-                "f3_return": line['flowpool_3days_return_users'],
-                "ros_dms": line['ros_dms'],
-                "rov_dms": line['rov_dms'],
-                "ros_sls": line['ros_sls'],
-                "rov_sls": line['rov_sls'],
-                "fans": line['fans'],
-                "view_count_user_30days": line['view_cnt_user_30days'],
-                "share_count_user_30days": line['share_cnt_user_30days'],
-                "return_count_user_30days": line['return_cnt_user_30days'],
-                "rov_user": line['rov_user'],
-                "str_user": line['str_user'],  # share / view
-                "out_user_id": line['out_user_id'],
-                "mode": line['strategy'],
-                "out_play_cnt": line['out_play_cnt'],
-                "out_like_cnt": line['out_like_cnt'],
-                "out_share_cnt": line['out_share_cnt'],
-                "out_collection_cnt": line['out_collection_cnt'],
-                "up_level_time_hour": line['up_level_time_hour'],
-                "dt": line['dt']
+                "uid": line["uid"],
+                "video_id": line["videoid"],
+                "type": line["type"],
+                "channel": line["channel"],
+                "fst": line["flowpool_start_type"],
+                "fsl": line["flowpool_start_level"],
+                "fet": line["flowpool_end_type"],
+                "fel": line["flowpool_end_level"],
+                "f_view": line["flowpool_distribute_view_times"],
+                "f_share": line["flowpool_share_times"],
+                "f_return": line["flowpool_return_users"],
+                "f3_view": line["flowpool_3days_distribute_view_times"],
+                "f3_share": line["flowpool_3days_share_times"],
+                "f3_return": line["flowpool_3days_return_users"],
+                "ros_dms": line["ros_dms"],
+                "rov_dms": line["rov_dms"],
+                "ros_sls": line["ros_sls"],
+                "rov_sls": line["rov_sls"],
+                "fans": line["fans"],
+                "view_count_user_30days": line["view_cnt_user_30days"],
+                "share_count_user_30days": line["share_cnt_user_30days"],
+                "return_count_user_30days": line["return_cnt_user_30days"],
+                "rov_user": line["rov_user"],
+                "str_user": line["str_user"],  # share / view
+                "out_user_id": line["out_user_id"],
+                "mode": line["strategy"],
+                "out_play_cnt": line["out_play_cnt"],
+                "out_like_cnt": line["out_like_cnt"],
+                "out_share_cnt": line["out_share_cnt"],
+                "out_collection_cnt": line["out_collection_cnt"],
+                "up_level_time_hour": line["up_level_time_hour"],
+                "dt": line["dt"],
             }
             result.append(obj)
         return result
@@ -79,17 +80,17 @@ class VideoDataGenerator(object):
         data = self.oo.select(sql)
         result = [
             {
-                "video_id": item['videoid'],
-                "total_view": item['flowpool_distribute_view_times'],
-                "total_share": item['flowpool_share_times'],
-                "total_return": item['flowpool_return_users'],
-                "3day_view": item['flowpool_3days_distribute_view_times'],
-                "3day_share": item['flowpool_3days_share_times'],
-                "3day_return": item['flowpool_3days_return_users'],
-                "3day_up_level": item['up_level_3_days'],
-                "dt": item['dt']
-
-            } for item in data
+                "video_id": item["videoid"],
+                "total_view": item["flowpool_distribute_view_times"],
+                "total_share": item["flowpool_share_times"],
+                "total_return": item["flowpool_return_users"],
+                "3day_view": item["flowpool_3days_distribute_view_times"],
+                "3day_share": item["flowpool_3days_share_times"],
+                "3day_return": item["flowpool_3days_return_users"],
+                "3day_up_level": item["up_level_3_days"],
+                "dt": item["dt"],
+            }
+            for item in data
         ]
         return result
 
@@ -109,7 +110,7 @@ def save_daily_data(start_date, end_date, save_path):
         L[date_str] = {}
         data_list = V.get_daily_data(date_str)
         for obj in tqdm(data_list):
-            video_id = obj['video_id']
+            video_id = obj["video_id"]
             L[date_str][video_id] = obj
     with open(save_path, "w") as f:
         f.write(json.dumps(L, ensure_ascii=False))
@@ -151,7 +152,7 @@ def save_hourly_data(start_date, end_date, save_path):
     #     f.write(json.dumps(L, ensure_ascii=False))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     flag = int(input("请输入标识符,输入 1 获取小时级数据, 输入 2 获取天级数据: \n"))
     if flag == 1:
         start = str(input("请输入开始字符串, 格式为 yyyymmddhh:\n"))
@@ -171,4 +172,3 @@ if __name__ == '__main__':
             print("Time format is not ok")
     else:
         print("Input Error ! Make sure your input is 1 or 2!!")
-