|
@@ -38,66 +38,16 @@ def prepare_raw_data(dt_begin, dt_end):
|
|
|
df = df.drop_duplicates(['dt', 'gh_id', 'title'])
|
|
|
return df
|
|
|
|
|
|
-def run_once(dt):
|
|
|
- df = pd.read_excel('src/long_articles/20241101_read_rate_samples.xlsx')
|
|
|
- df['read_avg'] = df['阅读均值']
|
|
|
- df['read_avg_rate'] = df['阅读倍数']
|
|
|
- df['dt'] = df['日期']
|
|
|
- df['similarity'] = df['Similarity']
|
|
|
- filter_condition = 'read_avg > 500 ' \
|
|
|
- 'and read_avg_rate > 0 and read_avg_rate < 3 ' \
|
|
|
- 'and dt > 20240914 and similarity > 0'
|
|
|
- df = df.query(filter_condition).copy()
|
|
|
- #df = pd.read_excel('20241112-new-account-samples.xlsx')
|
|
|
|
|
|
- cate_model = CategoryRegressionV1()
|
|
|
-
|
|
|
- create_timestamp = int(time.time())
|
|
|
- update_timestamp = create_timestamp
|
|
|
-
|
|
|
- records_to_save = []
|
|
|
- df = cate_model.preprocess_data(df)
|
|
|
-
|
|
|
- param_to_category_map = cate_model.reverse_category_name_map
|
|
|
- account_ids = df['ghID'].unique()
|
|
|
- account_id_map = df[['账号名称', 'ghID']].drop_duplicates().set_index('ghID')['账号名称'].to_dict()
|
|
|
-
|
|
|
- account_negative_cates = {k: [] for k in account_ids}
|
|
|
- for account_id in account_ids:
|
|
|
- sub_df = df[df['ghID'] == account_id]
|
|
|
- account_name = account_id_map[account_id]
|
|
|
- sample_count = len(sub_df)
|
|
|
- if sample_count < 5:
|
|
|
- continue
|
|
|
- params, t_stats, p_values = cate_model.run_ols_linear_regression(sub_df)
|
|
|
- current_record = {}
|
|
|
- current_record['dt'] = dt
|
|
|
- current_record['gh_id'] = account_id
|
|
|
- current_record['category_map'] = {}
|
|
|
- param_names = cate_model.get_param_names()
|
|
|
- for name, param, p_value in zip(param_names, params, p_values):
|
|
|
- cate_name = param_to_category_map.get(name, None)
|
|
|
- if abs(param) > 0.1 and p_value < 0.1 and cate_name is not None:
|
|
|
- #print(f"{account_id} {cate_name} {param:.3f} {p_value:.3f}")
|
|
|
- current_record['category_map'][cate_name] = round(param, 6)
|
|
|
- if param < -0.1 and cate_name is not None and p_value < 0.3:
|
|
|
- account_negative_cates[account_id].append(cate_name)
|
|
|
- print((account_name, cate_name, param, p_value))
|
|
|
- current_record['category_map'] = json.dumps(current_record['category_map'], ensure_ascii=False)
|
|
|
- current_record['status'] = 1
|
|
|
- current_record['create_timestamp'] = create_timestamp
|
|
|
- current_record['update_timestamp'] = update_timestamp
|
|
|
- records_to_save.append(current_record)
|
|
|
- db_manager = MySQLManager(Config().MYSQL_LONG_ARTICLES)
|
|
|
- #db_manager.batch_insert('account_category', records_to_save)
|
|
|
-
|
|
|
- for account_id in [*account_negative_cates.keys()]:
|
|
|
- if not account_negative_cates[account_id]:
|
|
|
- account_negative_cates.pop(account_id)
|
|
|
-
|
|
|
- print(json.dumps(account_negative_cates, ensure_ascii=False, indent=2))
|
|
|
- for k, v in account_negative_cates.items():
|
|
|
- print('{}\t{}'.format(k, ','.join(v)))
|
|
|
+def clear_old_version(db_manager, dt):
|
|
|
+ update_timestamp = int(time.time())
|
|
|
+ sql = f"""
|
|
|
+ UPDATE account_category
|
|
|
+ SET status = 0, update_timestamp = {update_timestamp}
|
|
|
+ WHERE dt < {dt} and status = 1
|
|
|
+ """
|
|
|
+ rows = db_manager.execute(sql)
|
|
|
+ print(f"updated rows: {rows}")
|
|
|
|
|
|
|
|
|
def main():
|
|
@@ -119,8 +69,9 @@ def main():
|
|
|
cate_model = CategoryRegressionV1()
|
|
|
df = cate_model.preprocess_data(raw_df)
|
|
|
|
|
|
- if args.dry_run:
|
|
|
+ if args.dry_run and False:
|
|
|
cate_model.build(df)
|
|
|
+ return
|
|
|
|
|
|
create_timestamp = int(time.time())
|
|
|
update_timestamp = create_timestamp
|
|
@@ -150,7 +101,8 @@ def main():
|
|
|
# 用于排序的品类相关性
|
|
|
if abs(param) > 0.1 and p_value < 0.1 and cate_name is not None:
|
|
|
print(f"{account_id} {account_name} {cate_name} {param:.3f} {p_value:.3f}")
|
|
|
- current_record['category_map'][cate_name] = round(param, 6)
|
|
|
+ truncate_param = round(max(min(param, 0.25), -0.3), 6)
|
|
|
+ current_record['category_map'][cate_name] = truncate_param
|
|
|
# 用于冷启文章分配的负向品类
|
|
|
if param < -0.1 and cate_name is not None and p_value < 0.3:
|
|
|
account_negative_cates[account_id].append(cate_name)
|
|
@@ -169,6 +121,7 @@ def main():
|
|
|
|
|
|
db_manager = MySQLManager(Config().MYSQL_LONG_ARTICLES)
|
|
|
db_manager.batch_insert('account_category', records_to_save)
|
|
|
+ clear_old_version(db_manager, dt_version)
|
|
|
|
|
|
# 过滤空账号
|
|
|
for account_id in [*account_negative_cates.keys()]:
|