|
@@ -66,7 +66,7 @@ def compare_version(db_manager, dt_version, new_version, account_id_map):
|
|
|
# new record
|
|
|
all_gh_ids = set(list(new_version.keys()) + list(last_version.keys()))
|
|
|
for gh_id in all_gh_ids:
|
|
|
- account_name = account_id_map[gh_id]
|
|
|
+ account_name = account_id_map.get(gh_id, None)
|
|
|
if gh_id not in last_version:
|
|
|
print(f"new account {account_name}: {new_version[gh_id]}")
|
|
|
elif gh_id not in new_version:
|
|
@@ -86,6 +86,8 @@ def main():
|
|
|
parser = ArgumentParser()
|
|
|
parser.add_argument('-n', '--dry-run', action='store_true', help='do not update database')
|
|
|
parser.add_argument('--run-at', help='dt, also for version')
|
|
|
+ parser.add_argument('--print-matrix', action='store_true')
|
|
|
+ parser.add_argument('--print-residual', action='store_true')
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
run_date = datetime.today()
|
|
@@ -101,8 +103,8 @@ def main():
|
|
|
cate_model = CategoryRegressionV1()
|
|
|
df = cate_model.preprocess_data(raw_df)
|
|
|
|
|
|
- if args.dry_run and False:
|
|
|
- cate_model.build(df)
|
|
|
+ if args.dry_run and args.print_matrix:
|
|
|
+ cate_model.build_and_print_matrix(df)
|
|
|
return
|
|
|
|
|
|
create_timestamp = int(time.time())
|
|
@@ -116,14 +118,15 @@ def main():
|
|
|
.set_index('gh_id')['account_name'].to_dict()
|
|
|
|
|
|
account_negative_cates = {k: [] for k in account_ids}
|
|
|
+ P_VALUE_THRESHOLD = 0.15
|
|
|
for account_id in account_ids:
|
|
|
sub_df = df[df['gh_id'] == account_id]
|
|
|
account_name = account_id_map[account_id]
|
|
|
sample_count = len(sub_df)
|
|
|
if sample_count < 5:
|
|
|
continue
|
|
|
- print_error = False
|
|
|
- params, t_stats, p_values = cate_model.run_ols_linear_regression(sub_df, print_error)
|
|
|
+ params, t_stats, p_values = cate_model.run_ols_linear_regression(
|
|
|
+ sub_df, args.print_residual, P_VALUE_THRESHOLD)
|
|
|
current_record = {}
|
|
|
current_record['dt'] = dt_version
|
|
|
current_record['gh_id'] = account_id
|
|
@@ -132,12 +135,13 @@ def main():
|
|
|
for name, param, p_value in zip(param_names, params, p_values):
|
|
|
cate_name = param_to_category_map.get(name, None)
|
|
|
# 用于排序的品类相关性
|
|
|
- if abs(param) > 0.1 and p_value < 0.1 and cate_name is not None:
|
|
|
+ if abs(param) > 0.1 and p_value < P_VALUE_THRESHOLD and cate_name is not None:
|
|
|
+ scale_factor = min(0.1 / p_value, 1)
|
|
|
print(f"{account_id} {account_name} {cate_name} {param:.3f} {p_value:.3f}")
|
|
|
- truncate_param = round(max(min(param, 0.25), -0.3), 6)
|
|
|
+ truncate_param = round(max(min(param, 0.25), -0.3) * scale_factor, 6)
|
|
|
current_record['category_map'][cate_name] = truncate_param
|
|
|
# 用于冷启文章分配的负向品类
|
|
|
- if param < -0.1 and cate_name is not None and p_value < 0.3:
|
|
|
+ if param < -0.1 and cate_name is not None and p_value < P_VALUE_THRESHOLD:
|
|
|
account_negative_cates[account_id].append(cate_name)
|
|
|
# print((account_name, cate_name, param, p_value))
|
|
|
if not current_record['category_map']:
|