|
@@ -0,0 +1,181 @@
|
|
|
+#! /usr/bin/env python
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# vim:fenc=utf-8
|
|
|
+#
|
|
|
+# Copyright © 2024 StrayWarrior <i@straywarrior.com>
|
|
|
+
|
|
|
+
|
|
|
+import sys
|
|
|
+import os
|
|
|
+sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
|
|
|
+
|
|
|
+import time
|
|
|
+import json
|
|
|
+from datetime import datetime, timedelta
|
|
|
+import pandas as pd
|
|
|
+from argparse import ArgumentParser
|
|
|
+from long_articles.category_models import CategoryRegressionV1
|
|
|
+from common.database import MySQLManager
|
|
|
+from common import db_operation
|
|
|
+from common.logging import LOG
|
|
|
+from config.dev import Config
|
|
|
+
|
|
|
+
|
|
|
def prepare_raw_data(dt_begin, dt_end):
    """Load de-duplicated article score samples from `datastat_score`.

    Rows are filtered to first-position articles (`index` = 1) with a
    positive similarity, a known category, read_avg > 500 and a
    read_avg_rate in [0, 3].

    Args:
        dt_begin: inclusive start dt (numeric YYYYMMDD, int or digit string).
        dt_end: inclusive end dt (numeric YYYYMMDD, int or digit string).

    Returns:
        pandas.DataFrame with one row per (dt, gh_id, title).
    """
    data_fields = ['dt', 'gh_id', 'account_name', 'title', 'similarity',
                   'view_count_rate', 'category', 'read_avg',
                   'read_avg_rate']
    fields_str = ','.join(data_fields)
    # Coerce the boundaries to int before interpolating them into the SQL
    # string so a malformed value cannot inject SQL (dt is a numeric column).
    dt_begin = int(dt_begin)
    dt_end = int(dt_end)
    db_manager = MySQLManager(Config().MYSQL_LONG_ARTICLES)
    sql = f"""
        SELECT {fields_str} FROM datastat_score WHERE dt BETWEEN {dt_begin} AND {dt_end}
        AND similarity > 0 AND category IS NOT NULL AND read_avg > 500
        AND read_avg_rate BETWEEN 0 AND 3
        AND `index` = 1
    """
    rows = db_manager.select(sql)
    df = pd.DataFrame(rows, columns=data_fields)
    # Keep a single sample per article per account per day.
    df = df.drop_duplicates(['dt', 'gh_id', 'title'])
    return df
|
|
|
+
|
|
|
def run_once(dt):
    """One-off/debug variant of the pipeline driven by a local Excel export.

    Reads a fixed sample spreadsheet, fits a per-account OLS regression of
    read performance on category indicators, and prints the accounts'
    negative categories. Unlike main(), nothing is written to the
    database here (the batch_insert call is commented out), so the
    assembled records are effectively discarded.

    Args:
        dt: dt/version string stamped into each assembled record.
    """
    df = pd.read_excel('src/long_articles/20241101_read_rate_samples.xlsx')
    # Normalize the Chinese/legacy spreadsheet column names to the names
    # expected by the model ('阅读均值' = read average, '阅读倍数' = read
    # multiple, '日期' = date).
    df['read_avg'] = df['阅读均值']
    df['read_avg_rate'] = df['阅读倍数']
    df['dt'] = df['日期']
    df['similarity'] = df['Similarity']
    # Same sample filter as the SQL in prepare_raw_data, minus the upper
    # dt bound.
    filter_condition = 'read_avg > 500 ' \
        'and read_avg_rate > 0 and read_avg_rate < 3 ' \
        'and dt > 20240914 and similarity > 0'
    df = df.query(filter_condition).copy()
    #df = pd.read_excel('20241112-new-account-samples.xlsx')

    cate_model = CategoryRegressionV1()

    create_timestamp = int(time.time())
    update_timestamp = create_timestamp

    records_to_save = []
    df = cate_model.preprocess_data(df)

    param_to_category_map = cate_model.reverse_category_name_map
    # NOTE: this spreadsheet uses 'ghID'/'账号名称' ('account name') where
    # the DB path uses 'gh_id'/'account_name'.
    account_ids = df['ghID'].unique()
    account_id_map = df[['账号名称', 'ghID']].drop_duplicates().set_index('ghID')['账号名称'].to_dict()

    account_negative_cates = {k: [] for k in account_ids}
    for account_id in account_ids:
        sub_df = df[df['ghID'] == account_id]
        account_name = account_id_map[account_id]
        sample_count = len(sub_df)
        # Too few samples make the per-account regression unreliable.
        if sample_count < 5:
            continue
        params, t_stats, p_values = cate_model.run_ols_linear_regression(sub_df)
        current_record = {}
        current_record['dt'] = dt
        current_record['gh_id'] = account_id
        current_record['category_map'] = {}
        param_names = cate_model.get_param_names()
        for name, param, p_value in zip(param_names, params, p_values):
            cate_name = param_to_category_map.get(name, None)
            # Strongly significant coefficients become the account's
            # category weights.
            if abs(param) > 0.1 and p_value < 0.1 and cate_name is not None:
                #print(f"{account_id} {cate_name} {param:.3f} {p_value:.3f}")
                current_record['category_map'][cate_name] = round(param, 6)
            # Looser threshold for collecting negative categories.
            if param < -0.1 and cate_name is not None and p_value < 0.3:
                account_negative_cates[account_id].append(cate_name)
                print((account_name, cate_name, param, p_value))
        current_record['category_map'] = json.dumps(current_record['category_map'], ensure_ascii=False)
        current_record['status'] = 1
        current_record['create_timestamp'] = create_timestamp
        current_record['update_timestamp'] = update_timestamp
        records_to_save.append(current_record)
    # Connection is opened but the insert stays disabled in this debug path.
    db_manager = MySQLManager(Config().MYSQL_LONG_ARTICLES)
    #db_manager.batch_insert('account_category', records_to_save)

    # Drop accounts that collected no negative categories (iterate over a
    # copied key list so popping during the loop is safe).
    for account_id in [*account_negative_cates.keys()]:
        if not account_negative_cates[account_id]:
            account_negative_cates.pop(account_id)

    print(json.dumps(account_negative_cates, ensure_ascii=False, indent=2))
    for k, v in account_negative_cates.items():
        print('{}\t{}'.format(k, ','.join(v)))
|
|
|
+
|
|
|
+
|
|
|
def main():
    """CLI entry point: fit per-account category regressions and store them.

    Loads article score samples from MySQL for a date window, runs a
    per-account OLS regression of read performance on category indicator
    variables, and batch-inserts the significant category weights into
    `account_category`. With --dry-run the records are printed instead
    of inserted.
    """
    parser = ArgumentParser()
    parser.add_argument('-n', '--dry-run', action='store_true', help='do not update database')
    parser.add_argument('--run-at', help='dt, also for version')
    # Previously hard-coded to 20240914; exposed with the same default so
    # existing invocations behave identically.
    parser.add_argument('--begin-dt', type=int, default=20240914,
                        help='inclusive begin dt of the data window (YYYYMMDD)')
    args = parser.parse_args()

    run_date = datetime.today()
    if args.run_at:
        run_date = datetime.strptime(args.run_at, "%Y%m%d")
    begin_dt = args.begin_dt
    # The window ends the day before the run date; end_dt doubles as the
    # record version (dt column).
    end_dt = (run_date - timedelta(1)).strftime("%Y%m%d")
    dt_version = end_dt
    LOG.info(f"data range: {begin_dt} - {end_dt}")

    raw_df = prepare_raw_data(begin_dt, end_dt)

    cate_model = CategoryRegressionV1()
    df = cate_model.preprocess_data(raw_df)

    # In dry-run mode also fit/inspect the global model for diagnostics.
    if args.dry_run:
        cate_model.build(df)

    create_timestamp = int(time.time())
    update_timestamp = create_timestamp

    records_to_save = []

    param_to_category_map = cate_model.reverse_category_name_map
    account_ids = df['gh_id'].unique()
    account_id_map = df[['account_name', 'gh_id']].drop_duplicates() \
        .set_index('gh_id')['account_name'].to_dict()

    account_negative_cates = {k: [] for k in account_ids}
    for account_id in account_ids:
        sub_df = df[df['gh_id'] == account_id]
        account_name = account_id_map[account_id]
        # Too few samples make the per-account regression unreliable.
        if len(sub_df) < 5:
            continue
        params, _t_stats, p_values = cate_model.run_ols_linear_regression(sub_df)
        current_record = {
            'dt': dt_version,
            'gh_id': account_id,
            'category_map': {},
        }
        param_names = cate_model.get_param_names()
        for name, param, p_value in zip(param_names, params, p_values):
            cate_name = param_to_category_map.get(name, None)
            # Parameters without a mapped category (e.g. intercept) are
            # irrelevant to both branches below.
            if cate_name is None:
                continue
            # Category relevance used for ranking.
            if abs(param) > 0.1 and p_value < 0.1:
                print(f"{account_id} {account_name} {cate_name} {param:.3f} {p_value:.3f}")
                current_record['category_map'][cate_name] = round(param, 6)
            # Negative categories used for cold-start article assignment
            # (looser significance threshold).
            if param < -0.1 and p_value < 0.3:
                account_negative_cates[account_id].append(cate_name)
        # Skip accounts with no significant categories at all.
        if not current_record['category_map']:
            continue
        current_record['category_map'] = json.dumps(current_record['category_map'], ensure_ascii=False)
        current_record['status'] = 1
        current_record['create_timestamp'] = create_timestamp
        current_record['update_timestamp'] = update_timestamp
        records_to_save.append(current_record)
    if args.dry_run:
        for record in records_to_save:
            print(record)
        return

    db_manager = MySQLManager(Config().MYSQL_LONG_ARTICLES)
    db_manager.batch_insert('account_category', records_to_save)

    # Drop accounts that collected no negative categories (iterate over a
    # copied key list so popping during the loop is safe).
    for account_id in [*account_negative_cates.keys()]:
        if not account_negative_cates[account_id]:
            account_negative_cates.pop(account_id)
|
|
|
+
|
|
|
# Script entry point: run the regression job when executed directly.
if __name__ == '__main__':
    main()
|