| 
														
															@@ -38,66 +38,16 @@ def prepare_raw_data(dt_begin, dt_end): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     df = df.drop_duplicates(['dt', 'gh_id', 'title']) 
														 | 
														
														 | 
														
															     df = df.drop_duplicates(['dt', 'gh_id', 'title']) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     return df 
														 | 
														
														 | 
														
															     return df 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															-def run_once(dt): 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    df = pd.read_excel('src/long_articles/20241101_read_rate_samples.xlsx') 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    df['read_avg'] = df['阅读均值'] 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    df['read_avg_rate'] = df['阅读倍数'] 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    df['dt'] = df['日期'] 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    df['similarity'] = df['Similarity'] 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    filter_condition = 'read_avg > 500 ' \ 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        'and read_avg_rate > 0 and read_avg_rate < 3 ' \ 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        'and dt > 20240914 and similarity > 0'  
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    df = df.query(filter_condition).copy() 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    #df = pd.read_excel('20241112-new-account-samples.xlsx') 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    cate_model = CategoryRegressionV1() 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    create_timestamp = int(time.time()) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    update_timestamp = create_timestamp 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    records_to_save = [] 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    df = cate_model.preprocess_data(df) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    param_to_category_map = cate_model.reverse_category_name_map 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    account_ids = df['ghID'].unique() 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    account_id_map = df[['账号名称', 'ghID']].drop_duplicates().set_index('ghID')['账号名称'].to_dict() 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    account_negative_cates = {k: [] for k in account_ids} 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    for account_id in account_ids: 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        sub_df = df[df['ghID'] == account_id]   
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        account_name = account_id_map[account_id] 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        sample_count = len(sub_df) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        if sample_count < 5: 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            continue 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        params, t_stats, p_values = cate_model.run_ols_linear_regression(sub_df) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        current_record = {} 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        current_record['dt'] = dt 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        current_record['gh_id'] = account_id 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        current_record['category_map'] = {} 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        param_names = cate_model.get_param_names() 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        for name, param, p_value in zip(param_names, params, p_values): 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            cate_name = param_to_category_map.get(name, None) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            if abs(param) > 0.1 and p_value < 0.1 and cate_name is not None: 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-                #print(f"{account_id} {cate_name} {param:.3f} {p_value:.3f}") 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-                current_record['category_map'][cate_name] = round(param, 6) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            if param < -0.1 and cate_name is not None and p_value < 0.3: 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-                account_negative_cates[account_id].append(cate_name) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-                print((account_name, cate_name, param, p_value)) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        current_record['category_map'] = json.dumps(current_record['category_map'], ensure_ascii=False) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        current_record['status'] = 1 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        current_record['create_timestamp'] = create_timestamp 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        current_record['update_timestamp'] = update_timestamp 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        records_to_save.append(current_record)  
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    db_manager = MySQLManager(Config().MYSQL_LONG_ARTICLES) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    #db_manager.batch_insert('account_category', records_to_save) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    for account_id in [*account_negative_cates.keys()]: 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        if not account_negative_cates[account_id]: 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-            account_negative_cates.pop(account_id) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															- 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    print(json.dumps(account_negative_cates, ensure_ascii=False, indent=2)) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    for k, v in account_negative_cates.items(): 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-        print('{}\t{}'.format(k, ','.join(v))) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+def clear_old_version(db_manager, dt): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    update_timestamp = int(time.time()) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    sql = f""" 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        UPDATE account_category 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        SET status = 0, update_timestamp = {update_timestamp} 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        WHERE dt < {dt} and status = 1 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    """ 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    rows = db_manager.execute(sql) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    print(f"updated rows: {rows}") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															 def main(): 
														 | 
														
														 | 
														
															 def main(): 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -119,8 +69,9 @@ def main(): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     cate_model = CategoryRegressionV1() 
														 | 
														
														 | 
														
															     cate_model = CategoryRegressionV1() 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     df = cate_model.preprocess_data(raw_df) 
														 | 
														
														 | 
														
															     df = cate_model.preprocess_data(raw_df) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															-    if args.dry_run: 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    if args.dry_run and False: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															         cate_model.build(df) 
														 | 
														
														 | 
														
															         cate_model.build(df) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+        return 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     create_timestamp = int(time.time()) 
														 | 
														
														 | 
														
															     create_timestamp = int(time.time()) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     update_timestamp = create_timestamp 
														 | 
														
														 | 
														
															     update_timestamp = create_timestamp 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -150,7 +101,8 @@ def main(): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             # 用于排序的品类相关性 
														 | 
														
														 | 
														
															             # 用于排序的品类相关性 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             if abs(param) > 0.1 and p_value < 0.1 and cate_name is not None: 
														 | 
														
														 | 
														
															             if abs(param) > 0.1 and p_value < 0.1 and cate_name is not None: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                 print(f"{account_id} {account_name} {cate_name} {param:.3f} {p_value:.3f}") 
														 | 
														
														 | 
														
															                 print(f"{account_id} {account_name} {cate_name} {param:.3f} {p_value:.3f}") 
														 | 
													
												
											
												
													
														| 
														 | 
														
															-                current_record['category_map'][cate_name] = round(param, 6) 
														 | 
														
														 | 
														
															 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                truncate_param = round(max(min(param, 0.25), -0.3), 6) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+                current_record['category_map'][cate_name] = truncate_param 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             # 用于冷启文章分配的负向品类 
														 | 
														
														 | 
														
															             # 用于冷启文章分配的负向品类 
														 | 
													
												
											
												
													
														| 
														 | 
														
															             if param < -0.1 and cate_name is not None and p_value < 0.3: 
														 | 
														
														 | 
														
															             if param < -0.1 and cate_name is not None and p_value < 0.3: 
														 | 
													
												
											
												
													
														| 
														 | 
														
															                 account_negative_cates[account_id].append(cate_name) 
														 | 
														
														 | 
														
															                 account_negative_cates[account_id].append(cate_name) 
														 | 
													
												
											
										
											
												
													
														 | 
														
															@@ -169,6 +121,7 @@ def main(): 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     db_manager = MySQLManager(Config().MYSQL_LONG_ARTICLES) 
														 | 
														
														 | 
														
															     db_manager = MySQLManager(Config().MYSQL_LONG_ARTICLES) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     db_manager.batch_insert('account_category', records_to_save) 
														 | 
														
														 | 
														
															     db_manager.batch_insert('account_category', records_to_save) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															 
														 | 
														
														 | 
														
															+    clear_old_version(db_manager, dt_version) 
														 | 
													
												
											
												
													
														| 
														 | 
														
															  
														 | 
														
														 | 
														
															  
														 | 
													
												
											
												
													
														| 
														 | 
														
															     # 过滤空账号 
														 | 
														
														 | 
														
															     # 过滤空账号 
														 | 
													
												
											
												
													
														| 
														 | 
														
															     for account_id in [*account_negative_cates.keys()]: 
														 | 
														
														 | 
														
															     for account_id in [*account_negative_cates.keys()]: 
														 |