Browse Source

品类优化

luojunhui 3 tháng trước
mục cha
commit
6abf86ad3a

+ 142 - 51
applications/tasks/algorithm_tasks/account_category_analysis.py

@@ -1,4 +1,7 @@
-import time, json
+import time
+import json
+import numpy as np
+import pandas as pd
 from datetime import datetime, timedelta
 from pandas import DataFrame
 from tqdm.asyncio import tqdm
@@ -11,7 +14,7 @@ class AccountCategoryConst:
     SAMPLE_MIN_SIZE = 5
     POSITIVE_STATUS = 1
     NEGATIVE_STATUS = 0
-    VERSION = 2
+    CURRENT_VERSION = 3
 
     # SOME THRESHOLDS
     SIMILARITY_THRESHOLD = 0
@@ -22,7 +25,13 @@ class AccountCategoryConst:
     INDEX_MAX = 3
 
     # MAX VALUE
-    MAX_VALUE = 0.5
+    MAX_VALUE = 0.25
+
+    # === NEW: 策略相关常量 ===
+    # 达到这个样本数后,账号权重占比接近 1
+    MERGE_SAMPLE_COUNT = 90
+    # 时间权重半衰期(天):晚 90 天,权重衰减一半
+    TIME_HALF_LIFE_DAYS = 90
 
 
 class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
@@ -38,6 +47,7 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
         self.view_only = data.get("view_only")
 
     def reverse_category_map(self):
+        # param_name -> category_name
         return {v: k for k, v in self.category_map.items()}
 
     def init_execute_date(self):
@@ -47,11 +57,10 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
             run_date = datetime.today()
 
         end_dt = (run_date - timedelta(1)).strftime("%Y%m%d")
-        begin_dt = (run_date - timedelta(61)).strftime("%Y%m%d")
+        begin_dt = (run_date - timedelta(181)).strftime("%Y%m%d")
         return begin_dt, end_dt
 
     async def prepare_raw_data(self, end_dt, begin_dt: str = "20250401"):
-        begin_dt = "20250401"
         query = """
             select dt, gh_id, account_name, title, similarity, view_count_rate, category,
                     read_avg, read_avg_rate, first_pub_interval, `index`
@@ -63,7 +72,6 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
                 and read_avg_rate between %s and %s
                 and view_count_rate > %s
                 and `index` < %s
-                and account_name in ('生活慢时光', '美好时光阅读汇', '史趣探秘', '趣味生活漫谈', '趣味生活方式', '趣味生活漫时光')
             ;
         """
         fetch_response = await self.pool.async_fetch(
@@ -98,11 +106,53 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
         dataframe = dataframe.drop_duplicates(["dt", "gh_id", "title"])
         return dataframe
 
+    # === NEW: 构建 WLS 权重(时间 + 曝光) ===
+    def build_sample_weights(self, df: DataFrame) -> pd.Series:
+        """
+        时间越近 + read_avg 越大,权重越高。
+        最后做一个均值归一化,避免数值过大。
+        """
+        # dt 形如 "20251124"
+        dt_series = pd.to_datetime(df["dt"], format="%Y%m%d", errors="coerce")
+        latest_dt = dt_series.max()
+        days_ago = (latest_dt - dt_series).dt.days.fillna(0)
+        time_weight = np.power(0.5, days_ago / self.TIME_HALF_LIFE_DAYS)
+
+        exposure_weight = np.log1p(df["read_avg"]).fillna(1.0)
+        weights = time_weight * exposure_weight
+
+        mean_w = weights.mean()
+        if mean_w > 0:
+            weights = weights / mean_w
+        return weights
+
+    # === NEW: 把回归参数转成 {category: score} ===
+    def _extract_category_scores(
+        self,
+        param_names,
+        params,
+        p_values,
+        param_to_category_map,
+    ):
+        scores = {}
+        for name, param, p_value in zip(param_names, params, p_values):
+            category_name = param_to_category_map.get(name)
+            if not category_name:
+                continue
+            if abs(param) <= 0.1 or p_value >= self.P_VALUE_THRESHOLD:
+                continue
+            scale_factor = min(0.1 / p_value, 1.0)
+            truncate_param = max(min(param, self.MAX_VALUE), -self.MAX_VALUE)
+            truncate_param *= scale_factor
+            scores[category_name] = round(truncate_param, 6)
+        return scores
+
     async def update_each_account(self, record):
         now_timestamp = int(time.time())
         query = """
-            insert ignore into account_category (dt, gh_id, category_map, status, version, create_timestamp, update_timestamp)
-                values (%s, %s, %s, %s, %s, %s, %s)
+            insert ignore into account_category 
+                (dt, gh_id, category_map, status, version, create_timestamp, update_timestamp)
+            values (%s, %s, %s, %s, %s, %s, %s)
         """
         insert_rows = await self.pool.async_save(
             query=query,
@@ -111,15 +161,16 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
                 record["gh_id"],
                 record["category_map"],
                 self.POSITIVE_STATUS,
-                self.VERSION,
+                self.CURRENT_VERSION,
                 now_timestamp,
                 now_timestamp,
             ),
         )
         if insert_rows:
             update_query = """
-                update account_category set status = %s, update_timestamp = %s 
-                where gh_id = %s and dt < %s and status = %s and version = %s;
+                update account_category 
+                set status = %s, update_timestamp = %s 
+                where gh_id = %s and dt < %s and status = %s;
             """
             await self.pool.async_save(
                 query=update_query,
@@ -129,60 +180,75 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
                     record["gh_id"],
                     record["dt"],
                     self.POSITIVE_STATUS,
-                    self.VERSION,
                 ),
             )
 
     async def predict_each_account(
-        self, df, account_id, account_id_map, end_dt, param_to_category_map
+        self,
+        sub_df,
+        account_id,
+        account_id_map,
+        end_dt,
+        param_to_category_map,
+        global_scores,
+        global_weights,
     ):
-        sub_df = df[df["gh_id"] == account_id]
         account_name = account_id_map[account_id]
         sample_count = sub_df.shape[0]
         if sample_count < self.SAMPLE_MIN_SIZE:
             return
 
-        params, t_stats, p_values = self.run_ols_linear_regression(
-            sub_df, self.view_only, self.P_VALUE_THRESHOLD
+        # 对应子集的权重
+        if global_weights is not None:
+            weights = global_weights[sub_df.index.to_numpy()]
+        else:
+            weights = None
+
+        param_names, params, t_stats, p_values = self.run_ols_linear_regression(
+            sub_df,
+            weights=weights,
+            print_residual=self.view_only,
+            print_p_value_threshold=self.P_VALUE_THRESHOLD,
         )
-        current_record = {
-            "dt": end_dt,
-            "gh_id": account_id,
-            "category_map": {},
-            "name": account_name,
-        }
-        params_names = self.get_param_names()
-        for name, param, p_value in zip(params_names, params, p_values):
-            category_name = param_to_category_map.get(name, None)
-            if (
-                abs(param) > 0.1
-                and p_value < self.P_VALUE_THRESHOLD
-                and category_name is not None
-            ):
-                scale_factor = min(0.1 / p_value, 1)
-                print(
-                    f"{account_id} {account_name} {category_name} {param:.3f} {p_value:.3f}"
-                )
-                truncate_param = round(
-                    max(min(param, self.MAX_VALUE), -self.MAX_VALUE) * scale_factor, 6
-                )
-                current_record["category_map"][category_name] = truncate_param
+        if not len(params):
+            return
 
+        # 账号层品类得分
+        account_scores = self._extract_category_scores(
+            param_names, params, p_values, param_to_category_map
+        )
+
+        # 记录负向品类(账号自身)
+        for name, param, p_value in zip(param_names, params, p_values):
+            category_name = param_to_category_map.get(name)
             if (
-                param < -0.1
+                category_name is not None
+                and param < -0.1
                 and p_value < self.P_VALUE_THRESHOLD
-                and category_name is not None
             ):
                 self.account_negative_categories[account_id].append(category_name)
 
-        if not current_record["category_map"]:
-            return
+        # 样本少时多依赖全局,样本多时更信账号自身
+        alpha = min(sample_count / self.MERGE_SAMPLE_COUNT, 1.0)
+        merged_scores = {}
+        all_categories = set(global_scores.keys()) | set(account_scores.keys())
+        for cat in all_categories:
+            g = global_scores.get(cat, 0.0)
+            a = account_scores.get(cat, 0.0)
+            final = (1 - alpha) * g + alpha * a
+            if abs(final) > 1e-6:
+                merged_scores[cat] = round(final, 6)
 
-        current_record["category_map"] = json.dumps(
-            current_record["category_map"], ensure_ascii=False
-        )
+        if not merged_scores:
+            return
 
-        await self.update_each_account(current_record)
+        record = {
+            "dt": end_dt,
+            "gh_id": account_id,
+            "category_map": json.dumps(merged_scores, ensure_ascii=False),
+            "name": account_name,
+        }
+        await self.update_each_account(record)
 
     async def deal(self):
         begin_dt, end_dt = self.init_execute_date()
@@ -191,12 +257,32 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
         # prepare data for model
         pre_processed_dataframe = self.preprocess_data(raw_dataframe)
 
+        if pre_processed_dataframe.empty:
+            print(f"[INFO] no valid data between {begin_dt} and {end_dt}")
+            return
+
         if self.view_only:
             self.build_and_print_matrix(pre_processed_dataframe)
             return
 
         param_to_category_map = self.reverse_category_map()
 
+        # === 1) 全局模型:带权重的 WLS,得到 global_scores ===
+        global_weights = self.build_sample_weights(pre_processed_dataframe)
+        g_param_names, g_params, g_t_stats, g_p_values = self.run_ols_linear_regression(
+            pre_processed_dataframe,
+            weights=global_weights,
+            print_residual=False,
+            print_p_value_threshold=self.P_VALUE_THRESHOLD,
+        )
+        if len(g_params):
+            global_scores = self._extract_category_scores(
+                g_param_names, g_params, g_p_values, param_to_category_map
+            )
+        else:
+            global_scores = {}
+
+        # 账号信息准备
         account_ids = pre_processed_dataframe["gh_id"].unique()
         account_id_map = (
             pre_processed_dataframe[["account_name", "gh_id"]]
@@ -204,15 +290,20 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
             .set_index("gh_id")["account_name"]
             .to_dict()
         )
+        self.account_negative_categories = {key: [] for key in account_ids}
 
-        account_negative_categories = {key: [] for key in account_ids}
-        self.account_negative_categories = account_negative_categories
-
-        for account_id in tqdm(account_ids, desc="analysis each account"):
+        # === 2) per-account 模型:在全局的基础上做微调 ===
+        for account_id, sub_df in tqdm(
+            pre_processed_dataframe.groupby("gh_id"),
+            desc="analysis each account",
+        ):
             await self.predict_each_account(
-                pre_processed_dataframe,
+                sub_df,
                 account_id,
                 account_id_map,
                 end_dt,
                 param_to_category_map,
+                global_scores,
+                global_weights,
             )
+

+ 82 - 36
applications/tasks/algorithm_tasks/models.py

@@ -25,16 +25,16 @@ class CategoryRegression:
 
     def preprocess_data(self, raw_dataframe: DataFrame) -> DataFrame:
         """预处理数据"""
+        df = raw_dataframe.copy()
+
+        # 按品类打 one-hot
         for category in self.category_map:
             colname = self.category_map[category]
-            raw_dataframe[colname] = raw_dataframe["category"] == category
-            raw_dataframe[colname] = raw_dataframe[colname].astype(int)
+            df[colname] = (df["category"] == category).astype(int)
 
         # 次条阅读量校正
-        df_idx1 = raw_dataframe[raw_dataframe["index"] == 1][
-            ["dt", "gh_id", "read_avg_rate"]
-        ]
-        merged_dataframe = raw_dataframe.merge(
+        df_idx1 = df[df["index"] == 1][["dt", "gh_id", "read_avg_rate"]]
+        merged_dataframe = df.merge(
             df_idx1, how="left", on=["dt", "gh_id"], suffixes=("", "1")
         )
         debias_selection = merged_dataframe.query(
@@ -42,15 +42,26 @@ class CategoryRegression:
         )
         output_dataframe = merged_dataframe.drop(debias_selection.index)
 
-        output_dataframe["read_avg_rate"] = output_dataframe["read_avg_rate"].apply(
-            self.clip_func
+        # clip 成分位+log 形态
+        def _clip_series(s):
+            mask = s < 1.4
+            res = s.copy()
+            res[~mask] = 0.7 * np.log(res[~mask]) + 1.165
+            return res
+
+        output_dataframe["read_avg_rate"] = _clip_series(
+            output_dataframe["read_avg_rate"]
         )
-        output_dataframe["view_count_rate"] = output_dataframe["view_count_rate"].apply(
-            self.clip_func
+        output_dataframe["view_count_rate"] = _clip_series(
+            output_dataframe["view_count_rate"]
         )
-        output_dataframe["days_decrease"] = output_dataframe["first_pub_interval"] * (
-            -0.2 / 120
+
+        # 首发间隔带一点线性衰减
+        output_dataframe["days_decrease"] = (
+            output_dataframe["first_pub_interval"] * (-0.2 / 120)
         )
+
+        # 回归目标
         output_dataframe["RegressionY"] = output_dataframe["read_avg_rate"]
         return output_dataframe
 
@@ -65,7 +76,9 @@ class CategoryRegression:
         if len(sub_df) < 5:
             return
         sample_count = len(sub_df)
-        params, t_stats, p_values = self.run_ols_linear_regression(sub_df)
+        param_names, params, t_stats, p_values = self.run_ols_linear_regression(
+            sub_df
+        )
         row = f"{account_name}\t{sample_count}"
         for param, p_value in zip(params, p_values):
             row += f"\t{param:.3f}\t{p_value:.3f}"
@@ -76,53 +89,86 @@ class CategoryRegression:
             [name + "\tp-" + name for name in ["bias"] + self.features]
         )
         print("account\tsamples\t{}".format(p_value_column_names))
-        # self._build_and_print_by_account(raw_dataframe, None)
         for account_name in raw_dataframe["account_name"].unique():
             self._build_and_print_by_account(raw_dataframe, account_name)
 
     def get_param_names(self):
+        # 仅用于打印 header,真实参数名以后直接从 model 里取
         return ["bias"] + self.features
 
     def run_ols_linear_regression(
         self,
-        raw_dataframe: DataFrame.series,
+        raw_dataframe: DataFrame,
+        weights=None,
         print_residual: bool = False,
         print_p_value_threshold: float = 0.1,
     ):
-        X = raw_dataframe[self.features]  # 特征列
-        y = raw_dataframe["RegressionY"]  # 目标变量
-        X = sm.add_constant(X, has_constant="add")
+        """
+        统一的回归入口:
+        - 支持可选的加权回归 (WLS)
+        - 返回: param_names, params, t_stats, p_values
+        """
+        if raw_dataframe.empty:
+            return [], np.array([]), np.array([]), np.array([])
+
+        X = raw_dataframe[self.features]
+        y = raw_dataframe["RegressionY"]
+
+        # 丢掉 NaN 行,避免回归报错
+        mask = X.notna().all(axis=1) & y.notna()
+        X = X[mask]
+        y = y[mask]
 
-        model = sm.OLS(y, X).fit()
+        if len(X) < 2:
+            return [], np.array([]), np.array([]), np.array([])
 
-        params = model.params
-        t_stats = model.tvalues
-        p_values = model.pvalues
-        conf_int = model.conf_int()
+        X = sm.add_constant(X, has_constant="add")
+
+        try:
+            if weights is not None:
+                w = np.asarray(weights)[mask.to_numpy()]
+                model = sm.WLS(y, X, weights=w).fit()
+            else:
+                model = sm.OLS(y, X).fit()
+        except Exception as exc:
+            # 某个账号数据有问题时,打个日志/print,跳过即可
+            print(f"[WARN] OLS/WLS failed: {exc}")
+            return [], np.array([]), np.array([]), np.array([])
+
+        param_names = list(model.params.index)
+        params = model.params.to_numpy()
+        t_stats = model.tvalues.to_numpy()
+        p_values = model.pvalues.to_numpy()
 
         if print_residual:
             predict_y = model.predict(X)
             residuals = y - predict_y
-            new_x = raw_dataframe[["title", "category"]].copy()
+            new_x = raw_dataframe.loc[mask, ["title", "category"]].copy()
             new_x["residual"] = residuals
             new_x["y"] = y
             select_idx = []
             for index, row in new_x.iterrows():
                 param_name = self.category_map.get(row["category"], None)
-                if not param_name:
+                if not param_name or param_name not in param_names:
                     continue
-                param_index = self.features.index(param_name) + 1
-                param = params.iloc[param_index]
-                p_value = p_values.iloc[param_index]
+                param_index = param_names.index(param_name)
+                param = params[param_index]
+                p_value = p_values[param_index]
                 if p_value < print_p_value_threshold:
                     print(
-                        f"{row['y']:.3f}\t{row['residual']:.3f}\t{row['category']}\t{param:.2f}\t{row['title'][0:30]}"
+                        f"{row['y']:.3f}\t{row['residual']:.3f}\t"
+                        f"{row['category']}\t{param:.2f}\t"
+                        f"{row['title'][0:30]}"
                     )
                     select_idx.append(index)
-            has_category_residuals = residuals.loc[select_idx]
-            r_min = has_category_residuals.min()
-            r_max = has_category_residuals.max()
-            r_avg = has_category_residuals.mean()
-            print(f"residuals min: {r_min:.3f}, max: {r_max:.3f}, mean: {r_avg:.3f}")
-
-        return params, t_stats, p_values
+            if select_idx:
+                has_category_residuals = residuals.loc[select_idx]
+                r_min = has_category_residuals.min()
+                r_max = has_category_residuals.max()
+                r_avg = has_category_residuals.mean()
+                print(
+                    f"residuals min: {r_min:.3f}, "
+                    f"max: {r_max:.3f}, mean: {r_avg:.3f}"
+                )
+
+        return param_names, params, t_stats, p_values