罗俊辉 1 year ago
parent
commit
ac2bf5c91a
4 changed files with 240 additions and 15 deletions
  1. 150 0
      functions/generate_data.py
  2. 10 0
      functions/mysql.py
  3. 16 15
      main.py
  4. 64 0
      test.py

+ 150 - 0
functions/generate_data.py

@@ -2,4 +2,154 @@
 Created on Mon Mar 18, 2024
 @author: luojunhui
 """
+import os
+import sys
+import json
+from tqdm import tqdm
+from datetime import datetime, timedelta
+sys.path.append(os.getcwd())
 
+from functions.odps_function import PyODPS
+
+
+def generate_hourly_strings(start_date, end_date):
+    """
+    Build the hourly timestamp strings between two bounds, both inclusive.
+    :param start_date: first hour, formatted as '%Y%m%d%H'
+    :param end_date: last hour, formatted as '%Y%m%d%H'
+    :return: list of '%Y%m%d%H' strings in ascending order; empty when
+             start_date is later than end_date
+    """
+    hour_format = '%Y%m%d%H'
+    one_hour = timedelta(hours=1)
+    first_hour = datetime.strptime(start_date, hour_format)
+    last_hour = datetime.strptime(end_date, hour_format)
+    # Number of whole hours between the bounds; +1 keeps the end inclusive.
+    # A negative span yields an empty range, hence an empty list.
+    hour_count = (last_hour - first_hour) // one_hour + 1
+    return [
+        (first_hour + offset * one_hour).strftime(hour_format)
+        for offset in range(hour_count)
+    ]
+
+
+def generate_daily_strings(start_date, end_date):
+    """
+    Build the daily date strings between two bounds, both inclusive.
+    :param start_date: first day, formatted as '%Y%m%d'
+    :param end_date: last day, formatted as '%Y%m%d'
+    :return: list of '%Y%m%d' strings in ascending order; empty when
+             start_date is later than end_date
+    """
+    day_format = '%Y%m%d'
+    first_day = datetime.strptime(start_date, day_format)
+    last_day = datetime.strptime(end_date, day_format)
+    # Both bounds parse to midnight, so the difference is a whole day count.
+    span_days = (last_day - first_day).days
+    return [
+        (first_day + timedelta(days=offset)).strftime(day_format)
+        for offset in range(span_days + 1)
+    ]
+
+
+def generate_label_date(now_dt):
+    """
+    Compute the label date for an hourly timestamp.
+    The result is the calendar date 4 days after the input hour; the hour
+    part is dropped from the output.
+    NOTE(review): this function was originally described as a 3-day offset
+    while the code adds 4 days - confirm which one is intended.
+    :param now_dt: hour string formatted as '%Y%m%d%H'
+    :return: date string formatted as '%Y%m%d'
+    """
+    label_offset = timedelta(days=4)
+    label_day = datetime.strptime(now_dt, "%Y%m%d%H") + label_offset
+    return label_day.strftime("%Y%m%d")
+
+
+class VideoDataGenerator(object):
+    """
+    Generate training data and test data from the ODPS content-quality tables.
+    """
+
+    def __init__(self):
+        # ODPS client used for every query issued by this class.
+        self.oo = PyODPS()
+
+    def get_hour_data(self, dt):
+        """
+        Fetch the hourly-level new-video rows of one hour partition.
+        :param dt: hour partition value; interpolated verbatim into the SQL
+        :return: list of dicts, one per row, with the source columns renamed
+                 to the short keys listed in ``renames``
+        """
+        # (output key, source column) pairs; their order is the key order of
+        # every returned dict.
+        renames = (
+            ("uid", 'uid'),
+            ("video_id", 'videoid'),
+            ("type", 'type'),
+            ("channel", 'channel'),
+            ("fst", 'flowpool_start_type'),
+            ("fsl", 'flowpool_start_level'),
+            ("fet", 'flowpool_end_type'),
+            ("fel", 'flowpool_end_level'),
+            ("f_view", 'flowpool_distribute_view_times'),
+            ("f_share", 'flowpool_share_times'),
+            ("f_return", 'flowpool_return_users'),
+            ("f3_view", 'flowpool_3days_distribute_view_times'),
+            ("f3_share", 'flowpool_3days_share_times'),
+            ("f3_return", 'flowpool_3days_return_users'),
+            ("ros_dms", 'ros_dms'),
+            ("rov_dms", 'rov_dms'),
+            ("ros_sls", 'ros_sls'),
+            ("rov_sls", 'rov_sls'),
+            ("fans", 'fans'),
+            ("view_count_user_30days", 'view_cnt_user_30days'),
+            ("share_count_user_30days", 'share_cnt_user_30days'),
+            ("return_count_user_30days", 'return_cnt_user_30days'),
+            ("rov_user", 'rov_user'),
+            ("str_user", 'str_user'),  # share / view
+            ("out_user_id", 'out_user_id'),
+            ("mode", 'strategy'),
+            ("out_play_cnt", 'out_play_cnt'),
+            ("out_like_cnt", 'out_like_cnt'),
+            ("out_share_cnt", 'out_share_cnt'),
+            ("out_collection_cnt", 'out_collection_cnt'),
+            ("up_level_time_hour", 'up_level_time_hour'),
+            ("dt", 'dt'),
+        )
+        query = f"""select * from loghubods.conten_quality_base_hour where dt = '{dt}';"""
+        return [
+            {out_key: row[column] for out_key, column in renames}
+            for row in self.oo.select(query)
+        ]
+
+    def get_daily_data(self, dt):
+        """
+        Fetch the daily-level rows of one day partition.
+        The daily table stores the performance labels of the videos; the
+        performance of an hourly-level video is looked up by its video_id.
+        :param dt: day partition value such as 20240101; interpolated
+                   verbatim into the SQL
+        :return: list of dicts, one per row, holding the video id, the total
+                 and 3-day view/share/return values and the partition date
+        """
+        query = f"""select * from loghubods.conten_quality_base where dt = '{dt}';"""
+        records = []
+        for row in self.oo.select(query):
+            records.append({
+                "video_id": row['videoid'],
+                "total_view": row['flowpool_distribute_view_times'],
+                "total_share": row['flowpool_share_times'],
+                "total_return": row['flowpool_return_users'],
+                "3day_view": row['flowpool_3days_distribute_view_times'],
+                "3day_share": row['flowpool_3days_share_times'],
+                "3day_return": row['flowpool_3days_return_users'],
+                "dt": row['dt'],
+            })
+        return records
+
+
+if __name__ == '__main__':
+    # Export the daily label rows of 20240312..20240319 as a nested mapping
+    # {date_str: {video_id: record}} and dump it to a JSON file.
+    label_dates = generate_daily_strings("20240312", "20240319")
+    print(label_dates)
+    generator = VideoDataGenerator()
+    labels_by_date = {}
+    for day in tqdm(label_dates):
+        labels_by_date[day] = {}
+        daily_records = generator.get_daily_data(day)
+        # Rows sharing a video_id overwrite each other; the last one wins.
+        labels_by_date[day] = {
+            record['video_id']: record for record in tqdm(daily_records)
+        }
+
+    # NOTE(review): the path is relative to the current working directory
+    # and points one level above it - confirm where this script is run from.
+    with open('../data/12-17-train_label.json', 'w') as f:
+        json.dump(labels_by_date, f, ensure_ascii=False, indent=4)

+ 10 - 0
functions/mysql.py

@@ -0,0 +1,10 @@
+"""
+Mysql Functions
+"""
+
+
+class MySQLClient(object):
+    """
+    MySQL Client
+    Placeholder: no attributes or methods are defined yet.
+    """
+

+ 16 - 15
main.py

@@ -1,33 +1,34 @@
+import json
+
 import numpy as np
 import lightgbm as lgb
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import make_classification
 from sklearn.metrics import accuracy_score
 
-# 生成模拟数据
-X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)
+with open("whole_data/x_data.json") as f1:
+    x_list = json.loads(f1.read())
+    X_train = np.array(x_list[:10000])
+    X_test = np.array(x_list[10000:])
 
-# 分割数据集
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+with open("whole_data/y_data.json") as f2:
+    y_list = json.loads(f2.read())
+    y_train = np.array(y_list[:10000])
+    y_test = np.array(y_list[10000:])
 
 # 创建LightGBM数据集
 train_data = lgb.Dataset(X_train, label=y_train)
 test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
 
-print(X_train.shape)
-for line in X_train:
-    print(line)
-# print(X_train)
-print(y_test)
 # 设置模型的参数
 params = {
-    'objective': 'binary',       # 指定二分类任务
+    'objective': 'binary',  # 指定二分类任务
     'metric': 'binary_logloss',  # 评估指标为二分类的log损失
-    'num_leaves': 31,            # 叶子节点数
-    'learning_rate': 0.05,       # 学习率
-    'bagging_fraction': 0.9,     # 建树的样本采样比例
-    'feature_fraction': 0.8,     # 建树的特征选择比例
-    'bagging_freq': 5,           # k 意味着每 k 次迭代执行bagging
+    'num_leaves': 31,  # 叶子节点数
+    'learning_rate': 0.05,  # 学习率
+    'bagging_fraction': 0.9,  # 建树的样本采样比例
+    'feature_fraction': 0.8,  # 建树的特征选择比例
+    'bagging_freq': 5,  # k 意味着每 k 次迭代执行bagging
 }
 
 # 训练模型

+ 64 - 0
test.py

@@ -0,0 +1,64 @@
+import sys
+import os
+import json
+from tqdm import tqdm
+sys.path.append(os.getcwd())
+from functions.generate_data import generate_label_date
+
+
+def generate_train_label(item, y_ori_data):
+    """
+    Build one training sample (label plus feature list) from an hourly record.
+    :param item: hourly video record; must contain 'video_id', 'dt'
+                 (formatted as '%Y%m%d%H') and every key in ``feature_keys``
+    :param y_ori_data: nested mapping {label_date: {video_id: daily_record}}
+    :return: (label, item_features); label is int(total_return) taken from
+             the daily record of the label date, or None when that record is
+             missing or its 'total_return' is falsy; item_features holds the
+             feature values in the order of ``feature_keys``
+    """
+    sample_video_id = item['video_id']
+    sample_dt = item['dt']
+    # Fixed feature order; it defines the column order of the feature list.
+    feature_keys = (
+        "uid",
+        "type",
+        "channel",
+        "fans",
+        "view_count_user_30days",
+        "share_count_user_30days",
+        "return_count_user_30days",
+        "rov_user",
+        "str_user",
+        "out_user_id",
+        "mode",
+        "out_play_cnt",
+        "out_like_cnt",
+        "out_share_cnt",
+        "out_collection_cnt",
+    )
+    item_features = [item[key] for key in feature_keys]
+    label_dt = generate_label_date(sample_dt)
+    # NOTE(review): if y_ori_data was loaded from JSON its keys are strings,
+    # so video_id must be a string too for this lookup to match - confirm.
+    label_obj = y_ori_data.get(label_dt, {}).get(sample_video_id)
+    if not label_obj:
+        return None, item_features
+    total_return = label_obj['total_return']
+    label = int(total_return) if total_return else None
+    return label, item_features
+
+
+if __name__ == '__main__':
+    # Join the hourly feature records with their daily labels and write the
+    # resulting feature rows and labels to two parallel JSON lists.
+    x_path = 'data/0312-0317_hour_train.json'
+    y_path = 'data/12-17-train_label.json'
+
+    with open(x_path) as f:
+        x_data = json.load(f)
+
+    with open(y_path) as f:
+        y_data = json.load(f)
+
+    x_list, y_list = [], []
+    for video_obj in tqdm(x_data):
+        our_label, features = generate_train_label(video_obj, y_data)
+        # Falsy labels are skipped: this drops samples without a label
+        # (None) as well as samples whose label is 0.
+        if not our_label:
+            continue
+        x_list.append(features)
+        y_list.append(our_label)
+    print(len(y_list))
+
+    with open("whole_data/x_data.json", "w") as f1:
+        json.dump(x_list, f1, ensure_ascii=False)
+
+    with open("whole_data/y_data.json", "w") as f2:
+        json.dump(y_list, f2, ensure_ascii=False)