|
@@ -1,4 +1,7 @@
|
|
|
-import time, json
|
|
|
|
|
|
|
+import time
|
|
|
|
|
+import json
|
|
|
|
|
+import numpy as np
|
|
|
|
|
+import pandas as pd
|
|
|
from datetime import datetime, timedelta
|
|
from datetime import datetime, timedelta
|
|
|
from pandas import DataFrame
|
|
from pandas import DataFrame
|
|
|
from tqdm.asyncio import tqdm
|
|
from tqdm.asyncio import tqdm
|
|
@@ -11,7 +14,7 @@ class AccountCategoryConst:
|
|
|
SAMPLE_MIN_SIZE = 5
|
|
SAMPLE_MIN_SIZE = 5
|
|
|
POSITIVE_STATUS = 1
|
|
POSITIVE_STATUS = 1
|
|
|
NEGATIVE_STATUS = 0
|
|
NEGATIVE_STATUS = 0
|
|
|
- VERSION = 2
|
|
|
|
|
|
|
+ CURRENT_VERSION = 3
|
|
|
|
|
|
|
|
# SOME THRESHOLDS
|
|
# SOME THRESHOLDS
|
|
|
SIMILARITY_THRESHOLD = 0
|
|
SIMILARITY_THRESHOLD = 0
|
|
@@ -22,7 +25,13 @@ class AccountCategoryConst:
|
|
|
INDEX_MAX = 3
|
|
INDEX_MAX = 3
|
|
|
|
|
|
|
|
# MAX VALUE
|
|
# MAX VALUE
|
|
|
- MAX_VALUE = 0.5
|
|
|
|
|
|
|
+ MAX_VALUE = 0.25
|
|
|
|
|
+
|
|
|
|
|
+ # === NEW: 策略相关常量 ===
|
|
|
|
|
+ # 样本数达到该值后,账号自身得分的权重占比为 1(低于该值时按样本数线性增长)
|
|
|
|
|
+ MERGE_SAMPLE_COUNT = 90
|
|
|
|
|
+ # 时间权重半衰期(天):晚 90 天,权重衰减一半
|
|
|
|
|
+ TIME_HALF_LIFE_DAYS = 90
|
|
|
|
|
|
|
|
|
|
|
|
|
class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
|
|
class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
|
|
@@ -38,6 +47,7 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
|
|
|
self.view_only = data.get("view_only")
|
|
self.view_only = data.get("view_only")
|
|
|
|
|
|
|
|
def reverse_category_map(self):
|
|
def reverse_category_map(self):
|
|
|
|
|
+ # param_name -> category_name
|
|
|
return {v: k for k, v in self.category_map.items()}
|
|
return {v: k for k, v in self.category_map.items()}
|
|
|
|
|
|
|
|
def init_execute_date(self):
|
|
def init_execute_date(self):
|
|
@@ -47,11 +57,10 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
|
|
|
run_date = datetime.today()
|
|
run_date = datetime.today()
|
|
|
|
|
|
|
|
end_dt = (run_date - timedelta(1)).strftime("%Y%m%d")
|
|
end_dt = (run_date - timedelta(1)).strftime("%Y%m%d")
|
|
|
- begin_dt = (run_date - timedelta(61)).strftime("%Y%m%d")
|
|
|
|
|
|
|
+ begin_dt = (run_date - timedelta(181)).strftime("%Y%m%d")
|
|
|
return begin_dt, end_dt
|
|
return begin_dt, end_dt
|
|
|
|
|
|
|
|
async def prepare_raw_data(self, end_dt, begin_dt: str = "20250401"):
|
|
async def prepare_raw_data(self, end_dt, begin_dt: str = "20250401"):
|
|
|
- begin_dt = "20250401"
|
|
|
|
|
query = """
|
|
query = """
|
|
|
select dt, gh_id, account_name, title, similarity, view_count_rate, category,
|
|
select dt, gh_id, account_name, title, similarity, view_count_rate, category,
|
|
|
read_avg, read_avg_rate, first_pub_interval, `index`
|
|
read_avg, read_avg_rate, first_pub_interval, `index`
|
|
@@ -63,7 +72,6 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
|
|
|
and read_avg_rate between %s and %s
|
|
and read_avg_rate between %s and %s
|
|
|
and view_count_rate > %s
|
|
and view_count_rate > %s
|
|
|
and `index` < %s
|
|
and `index` < %s
|
|
|
- and account_name in ('生活慢时光', '美好时光阅读汇', '史趣探秘', '趣味生活漫谈', '趣味生活方式', '趣味生活漫时光')
|
|
|
|
|
;
|
|
;
|
|
|
"""
|
|
"""
|
|
|
fetch_response = await self.pool.async_fetch(
|
|
fetch_response = await self.pool.async_fetch(
|
|
@@ -98,11 +106,53 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
|
|
|
dataframe = dataframe.drop_duplicates(["dt", "gh_id", "title"])
|
|
dataframe = dataframe.drop_duplicates(["dt", "gh_id", "title"])
|
|
|
return dataframe
|
|
return dataframe
|
|
|
|
|
|
|
|
|
|
+ # === NEW: 构建 WLS 权重(时间 + 曝光) ===
|
|
|
|
|
+ def build_sample_weights(self, df: DataFrame) -> pd.Series:
|
|
|
|
|
+ """
|
|
|
|
|
+ 时间越近 + read_avg 越大,权重越高。
|
|
|
|
|
+ 最后做一个均值归一化,避免数值过大。
|
|
|
|
|
+ """
|
|
|
|
|
+ # dt 形如 "20251124"
|
|
|
|
|
+ dt_series = pd.to_datetime(df["dt"], format="%Y%m%d", errors="coerce")
|
|
|
|
|
+ latest_dt = dt_series.max()
|
|
|
|
|
+ days_ago = (latest_dt - dt_series).dt.days.fillna(0)
|
|
|
|
|
+ time_weight = np.power(0.5, days_ago / self.TIME_HALF_LIFE_DAYS)
|
|
|
|
|
+
|
|
|
|
|
+ exposure_weight = np.log1p(df["read_avg"]).fillna(1.0)
|
|
|
|
|
+ weights = time_weight * exposure_weight
|
|
|
|
|
+
|
|
|
|
|
+ mean_w = weights.mean()
|
|
|
|
|
+ if mean_w > 0:
|
|
|
|
|
+ weights = weights / mean_w
|
|
|
|
|
+ return weights
|
|
|
|
|
+
|
|
|
|
|
+ # === NEW: 把回归参数转成 {category: score} ===
|
|
|
|
|
+ def _extract_category_scores(
|
|
|
|
|
+ self,
|
|
|
|
|
+ param_names,
|
|
|
|
|
+ params,
|
|
|
|
|
+ p_values,
|
|
|
|
|
+ param_to_category_map,
|
|
|
|
|
+ ):
|
|
|
|
|
+ scores = {}
|
|
|
|
|
+ for name, param, p_value in zip(param_names, params, p_values):
|
|
|
|
|
+ category_name = param_to_category_map.get(name)
|
|
|
|
|
+ if not category_name:
|
|
|
|
|
+ continue
|
|
|
|
|
+ if abs(param) <= 0.1 or p_value >= self.P_VALUE_THRESHOLD:
|
|
|
|
|
+ continue
|
|
|
|
|
+ scale_factor = min(0.1 / p_value, 1.0)
|
|
|
|
|
+ truncate_param = max(min(param, self.MAX_VALUE), -self.MAX_VALUE)
|
|
|
|
|
+ truncate_param *= scale_factor
|
|
|
|
|
+ scores[category_name] = round(truncate_param, 6)
|
|
|
|
|
+ return scores
|
|
|
|
|
+
|
|
|
async def update_each_account(self, record):
|
|
async def update_each_account(self, record):
|
|
|
now_timestamp = int(time.time())
|
|
now_timestamp = int(time.time())
|
|
|
query = """
|
|
query = """
|
|
|
- insert ignore into account_category (dt, gh_id, category_map, status, version, create_timestamp, update_timestamp)
|
|
|
|
|
- values (%s, %s, %s, %s, %s, %s, %s)
|
|
|
|
|
|
|
+ insert ignore into account_category
|
|
|
|
|
+ (dt, gh_id, category_map, status, version, create_timestamp, update_timestamp)
|
|
|
|
|
+ values (%s, %s, %s, %s, %s, %s, %s)
|
|
|
"""
|
|
"""
|
|
|
insert_rows = await self.pool.async_save(
|
|
insert_rows = await self.pool.async_save(
|
|
|
query=query,
|
|
query=query,
|
|
@@ -111,15 +161,16 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
|
|
|
record["gh_id"],
|
|
record["gh_id"],
|
|
|
record["category_map"],
|
|
record["category_map"],
|
|
|
self.POSITIVE_STATUS,
|
|
self.POSITIVE_STATUS,
|
|
|
- self.VERSION,
|
|
|
|
|
|
|
+ self.CURRENT_VERSION,
|
|
|
now_timestamp,
|
|
now_timestamp,
|
|
|
now_timestamp,
|
|
now_timestamp,
|
|
|
),
|
|
),
|
|
|
)
|
|
)
|
|
|
if insert_rows:
|
|
if insert_rows:
|
|
|
update_query = """
|
|
update_query = """
|
|
|
- update account_category set status = %s, update_timestamp = %s
|
|
|
|
|
- where gh_id = %s and dt < %s and status = %s and version = %s;
|
|
|
|
|
|
|
+ update account_category
|
|
|
|
|
+ set status = %s, update_timestamp = %s
|
|
|
|
|
+ where gh_id = %s and dt < %s and status = %s;
|
|
|
"""
|
|
"""
|
|
|
await self.pool.async_save(
|
|
await self.pool.async_save(
|
|
|
query=update_query,
|
|
query=update_query,
|
|
@@ -129,60 +180,75 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
|
|
|
record["gh_id"],
|
|
record["gh_id"],
|
|
|
record["dt"],
|
|
record["dt"],
|
|
|
self.POSITIVE_STATUS,
|
|
self.POSITIVE_STATUS,
|
|
|
- self.VERSION,
|
|
|
|
|
),
|
|
),
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
async def predict_each_account(
|
|
async def predict_each_account(
|
|
|
- self, df, account_id, account_id_map, end_dt, param_to_category_map
|
|
|
|
|
|
|
+ self,
|
|
|
|
|
+ sub_df,
|
|
|
|
|
+ account_id,
|
|
|
|
|
+ account_id_map,
|
|
|
|
|
+ end_dt,
|
|
|
|
|
+ param_to_category_map,
|
|
|
|
|
+ global_scores,
|
|
|
|
|
+ global_weights,
|
|
|
):
|
|
):
|
|
|
- sub_df = df[df["gh_id"] == account_id]
|
|
|
|
|
account_name = account_id_map[account_id]
|
|
account_name = account_id_map[account_id]
|
|
|
sample_count = sub_df.shape[0]
|
|
sample_count = sub_df.shape[0]
|
|
|
if sample_count < self.SAMPLE_MIN_SIZE:
|
|
if sample_count < self.SAMPLE_MIN_SIZE:
|
|
|
return
|
|
return
|
|
|
|
|
|
|
|
- params, t_stats, p_values = self.run_ols_linear_regression(
|
|
|
|
|
- sub_df, self.view_only, self.P_VALUE_THRESHOLD
|
|
|
|
|
|
|
+ # 对应子集的权重
|
|
|
|
|
+ if global_weights is not None:
|
|
|
|
|
+ weights = global_weights[sub_df.index.to_numpy()]
|
|
|
|
|
+ else:
|
|
|
|
|
+ weights = None
|
|
|
|
|
+
|
|
|
|
|
+ param_names, params, t_stats, p_values = self.run_ols_linear_regression(
|
|
|
|
|
+ sub_df,
|
|
|
|
|
+ weights=weights,
|
|
|
|
|
+ print_residual=self.view_only,
|
|
|
|
|
+ print_p_value_threshold=self.P_VALUE_THRESHOLD,
|
|
|
)
|
|
)
|
|
|
- current_record = {
|
|
|
|
|
- "dt": end_dt,
|
|
|
|
|
- "gh_id": account_id,
|
|
|
|
|
- "category_map": {},
|
|
|
|
|
- "name": account_name,
|
|
|
|
|
- }
|
|
|
|
|
- params_names = self.get_param_names()
|
|
|
|
|
- for name, param, p_value in zip(params_names, params, p_values):
|
|
|
|
|
- category_name = param_to_category_map.get(name, None)
|
|
|
|
|
- if (
|
|
|
|
|
- abs(param) > 0.1
|
|
|
|
|
- and p_value < self.P_VALUE_THRESHOLD
|
|
|
|
|
- and category_name is not None
|
|
|
|
|
- ):
|
|
|
|
|
- scale_factor = min(0.1 / p_value, 1)
|
|
|
|
|
- print(
|
|
|
|
|
- f"{account_id} {account_name} {category_name} {param:.3f} {p_value:.3f}"
|
|
|
|
|
- )
|
|
|
|
|
- truncate_param = round(
|
|
|
|
|
- max(min(param, self.MAX_VALUE), -self.MAX_VALUE) * scale_factor, 6
|
|
|
|
|
- )
|
|
|
|
|
- current_record["category_map"][category_name] = truncate_param
|
|
|
|
|
|
|
+ if not len(params):
|
|
|
|
|
+ return
|
|
|
|
|
|
|
|
|
|
+ # 账号层品类得分
|
|
|
|
|
+ account_scores = self._extract_category_scores(
|
|
|
|
|
+ param_names, params, p_values, param_to_category_map
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ # 记录负向品类(账号自身)
|
|
|
|
|
+ for name, param, p_value in zip(param_names, params, p_values):
|
|
|
|
|
+ category_name = param_to_category_map.get(name)
|
|
|
if (
|
|
if (
|
|
|
- param < -0.1
|
|
|
|
|
|
|
+ category_name is not None
|
|
|
|
|
+ and param < -0.1
|
|
|
and p_value < self.P_VALUE_THRESHOLD
|
|
and p_value < self.P_VALUE_THRESHOLD
|
|
|
- and category_name is not None
|
|
|
|
|
):
|
|
):
|
|
|
self.account_negative_categories[account_id].append(category_name)
|
|
self.account_negative_categories[account_id].append(category_name)
|
|
|
|
|
|
|
|
- if not current_record["category_map"]:
|
|
|
|
|
- return
|
|
|
|
|
|
|
+ # 样本少时多依赖全局,样本多时更信账号自身
|
|
|
|
|
+ alpha = min(sample_count / self.MERGE_SAMPLE_COUNT, 1.0)
|
|
|
|
|
+ merged_scores = {}
|
|
|
|
|
+ all_categories = set(global_scores.keys()) | set(account_scores.keys())
|
|
|
|
|
+ for cat in all_categories:
|
|
|
|
|
+ g = global_scores.get(cat, 0.0)
|
|
|
|
|
+ a = account_scores.get(cat, 0.0)
|
|
|
|
|
+ final = (1 - alpha) * g + alpha * a
|
|
|
|
|
+ if abs(final) > 1e-6:
|
|
|
|
|
+ merged_scores[cat] = round(final, 6)
|
|
|
|
|
|
|
|
- current_record["category_map"] = json.dumps(
|
|
|
|
|
- current_record["category_map"], ensure_ascii=False
|
|
|
|
|
- )
|
|
|
|
|
|
|
+ if not merged_scores:
|
|
|
|
|
+ return
|
|
|
|
|
|
|
|
- await self.update_each_account(current_record)
|
|
|
|
|
|
|
+ record = {
|
|
|
|
|
+ "dt": end_dt,
|
|
|
|
|
+ "gh_id": account_id,
|
|
|
|
|
+ "category_map": json.dumps(merged_scores, ensure_ascii=False),
|
|
|
|
|
+ "name": account_name,
|
|
|
|
|
+ }
|
|
|
|
|
+ await self.update_each_account(record)
|
|
|
|
|
|
|
|
async def deal(self):
|
|
async def deal(self):
|
|
|
begin_dt, end_dt = self.init_execute_date()
|
|
begin_dt, end_dt = self.init_execute_date()
|
|
@@ -191,12 +257,32 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
|
|
|
# prepare data for model
|
|
# prepare data for model
|
|
|
pre_processed_dataframe = self.preprocess_data(raw_dataframe)
|
|
pre_processed_dataframe = self.preprocess_data(raw_dataframe)
|
|
|
|
|
|
|
|
|
|
+ if pre_processed_dataframe.empty:
|
|
|
|
|
+ print(f"[INFO] no valid data between {begin_dt} and {end_dt}")
|
|
|
|
|
+ return
|
|
|
|
|
+
|
|
|
if self.view_only:
|
|
if self.view_only:
|
|
|
self.build_and_print_matrix(pre_processed_dataframe)
|
|
self.build_and_print_matrix(pre_processed_dataframe)
|
|
|
return
|
|
return
|
|
|
|
|
|
|
|
param_to_category_map = self.reverse_category_map()
|
|
param_to_category_map = self.reverse_category_map()
|
|
|
|
|
|
|
|
|
|
+ # === 1) 全局模型:带权重的 WLS,得到 global_scores ===
|
|
|
|
|
+ global_weights = self.build_sample_weights(pre_processed_dataframe)
|
|
|
|
|
+ g_param_names, g_params, g_t_stats, g_p_values = self.run_ols_linear_regression(
|
|
|
|
|
+ pre_processed_dataframe,
|
|
|
|
|
+ weights=global_weights,
|
|
|
|
|
+ print_residual=False,
|
|
|
|
|
+ print_p_value_threshold=self.P_VALUE_THRESHOLD,
|
|
|
|
|
+ )
|
|
|
|
|
+ if len(g_params):
|
|
|
|
|
+ global_scores = self._extract_category_scores(
|
|
|
|
|
+ g_param_names, g_params, g_p_values, param_to_category_map
|
|
|
|
|
+ )
|
|
|
|
|
+ else:
|
|
|
|
|
+ global_scores = {}
|
|
|
|
|
+
|
|
|
|
|
+ # 账号信息准备
|
|
|
account_ids = pre_processed_dataframe["gh_id"].unique()
|
|
account_ids = pre_processed_dataframe["gh_id"].unique()
|
|
|
account_id_map = (
|
|
account_id_map = (
|
|
|
pre_processed_dataframe[["account_name", "gh_id"]]
|
|
pre_processed_dataframe[["account_name", "gh_id"]]
|
|
@@ -204,15 +290,20 @@ class AccountCategoryAnalysis(CategoryRegression, AccountCategoryConst):
|
|
|
.set_index("gh_id")["account_name"]
|
|
.set_index("gh_id")["account_name"]
|
|
|
.to_dict()
|
|
.to_dict()
|
|
|
)
|
|
)
|
|
|
|
|
+ self.account_negative_categories = {key: [] for key in account_ids}
|
|
|
|
|
|
|
|
- account_negative_categories = {key: [] for key in account_ids}
|
|
|
|
|
- self.account_negative_categories = account_negative_categories
|
|
|
|
|
-
|
|
|
|
|
- for account_id in tqdm(account_ids, desc="analysis each account"):
|
|
|
|
|
|
|
+ # === 2) per-account 模型:在全局的基础上做微调 ===
|
|
|
|
|
+ for account_id, sub_df in tqdm(
|
|
|
|
|
+ pre_processed_dataframe.groupby("gh_id"),
|
|
|
|
|
+ desc="analysis each account",
|
|
|
|
|
+ ):
|
|
|
await self.predict_each_account(
|
|
await self.predict_each_account(
|
|
|
- pre_processed_dataframe,
|
|
|
|
|
|
|
+ sub_df,
|
|
|
account_id,
|
|
account_id,
|
|
|
account_id_map,
|
|
account_id_map,
|
|
|
end_dt,
|
|
end_dt,
|
|
|
param_to_category_map,
|
|
param_to_category_map,
|
|
|
|
|
+ global_scores,
|
|
|
|
|
+ global_weights,
|
|
|
)
|
|
)
|
|
|
|
|
+
|