|
@@ -15,7 +15,7 @@ from datetime import datetime, timedelta
|
|
|
import pandas as pd
|
|
|
from argparse import ArgumentParser
|
|
|
from long_articles.category_models import CategoryRegressionV1
|
|
|
-from long_articles.consts import reverse_category_name_map
|
|
|
+from long_articles.consts import category_feature_v2, category_name_map_v2, reverse_category_name_map_v2
|
|
|
from common.database import MySQLManager
|
|
|
from common import db_operation
|
|
|
from common.logging import LOG
|
|
@@ -38,6 +38,7 @@ def prepare_raw_data(dt_begin, dt_end):
|
|
|
AND `index` in (1, 2)
|
|
|
AND (FROM_UNIXTIME(coalesce(publish_timestamp, 0), '%H') < '15'
|
|
|
OR gh_id in {night_accounts_condition})
|
|
|
+ AND dt NOT BETWEEN 20250105 AND 20250215
|
|
|
"""
|
|
|
rows = db_manager.select(sql)
|
|
|
df = pd.DataFrame(rows, columns=data_fields)
|
|
@@ -51,7 +52,7 @@ def clear_old_version(db_manager, dt):
|
|
|
sql = f"""
|
|
|
UPDATE account_category
|
|
|
SET status = 0, update_timestamp = {update_timestamp}
|
|
|
- WHERE dt < {dt} and status = 1
|
|
|
+ WHERE dt < {dt} and status = 1 and version = 2
|
|
|
"""
|
|
|
rows = db_manager.execute(sql)
|
|
|
print(f"updated rows for clear: {rows}")
|
|
@@ -60,7 +61,8 @@ def get_last_version(db_manager, dt):
|
|
|
sql = f"""
|
|
|
SELECT gh_id, category_map
|
|
|
FROM account_category
|
|
|
- WHERE dt = (SELECT max(dt) FROM account_category WHERE dt < {dt})
|
|
|
+ WHERE dt = (SELECT max(dt) FROM account_category WHERE dt < {dt} AND
|
|
|
+ status = 1)
|
|
|
"""
|
|
|
data = db_manager.select(sql)
|
|
|
return data
|
|
@@ -106,7 +108,7 @@ def main():
|
|
|
|
|
|
raw_df = prepare_raw_data(begin_dt, end_dt)
|
|
|
|
|
|
- cate_model = CategoryRegressionV1()
|
|
|
+ cate_model = CategoryRegressionV1(category_feature_v2, category_name_map_v2)
|
|
|
df = cate_model.preprocess_data(raw_df)
|
|
|
|
|
|
if args.dry_run and args.print_matrix:
|
|
@@ -118,7 +120,7 @@ def main():
|
|
|
|
|
|
records_to_save = []
|
|
|
|
|
|
- param_to_category_map = reverse_category_name_map
|
|
|
+ param_to_category_map = reverse_category_name_map_v2
|
|
|
account_ids = df['gh_id'].unique()
|
|
|
account_id_map = df[['account_name', 'gh_id']].drop_duplicates() \
|
|
|
.set_index('gh_id')['account_name'].to_dict()
|
|
@@ -154,6 +156,7 @@ def main():
|
|
|
continue
|
|
|
current_record['category_map'] = json.dumps(current_record['category_map'], ensure_ascii=False)
|
|
|
current_record['status'] = 1
|
|
|
+ current_record['version'] = 2
|
|
|
current_record['create_timestamp'] = create_timestamp
|
|
|
current_record['update_timestamp'] = update_timestamp
|
|
|
records_to_save.append(current_record)
|
|
@@ -172,6 +175,7 @@ def main():
|
|
|
update_timestamp = {record['update_timestamp']}
|
|
|
WHERE dt = {record['dt']} AND gh_id = '{record['gh_id']}'
|
|
|
AND category_map != '{record['category_map']}'
|
|
|
+ AND version = 2
|
|
|
"""
|
|
|
update_rows = db_manager.execute(sql)
|
|
|
print(f"updated rows: {update_rows}, {record['gh_id']}")
|