Explorar el Código

Update run_category_model_v1: change to category version 2

StrayWarrior hace 1 mes
padre
commit
eff784db60
Se han modificado 3 ficheros con 57 adiciones y 22 borrados
  1. 9 5
      run_category_model_v1.py
  2. 7 14
      src/long_articles/category_models.py
  3. 41 3
      src/long_articles/consts.py

+ 9 - 5
run_category_model_v1.py

@@ -15,7 +15,7 @@ from datetime import datetime, timedelta
 import pandas as pd
 from argparse import ArgumentParser
 from long_articles.category_models import CategoryRegressionV1
-from long_articles.consts import reverse_category_name_map
+from long_articles.consts import category_feature_v2, category_name_map_v2, reverse_category_name_map_v2
 from common.database import MySQLManager
 from common import db_operation
 from common.logging import LOG
@@ -38,6 +38,7 @@ def prepare_raw_data(dt_begin, dt_end):
             AND `index` in (1, 2)
             AND (FROM_UNIXTIME(coalesce(publish_timestamp, 0), '%H') < '15'
                 OR gh_id in {night_accounts_condition})
+            AND dt NOT BETWEEN 20250105 AND 20250215
         """
     rows = db_manager.select(sql)
     df = pd.DataFrame(rows, columns=data_fields)
@@ -51,7 +52,7 @@ def clear_old_version(db_manager, dt):
     sql = f"""
         UPDATE account_category
         SET status = 0, update_timestamp = {update_timestamp}
-        WHERE dt < {dt} and status = 1
+        WHERE dt < {dt} and status = 1 and version = 2
     """
     rows = db_manager.execute(sql)
     print(f"updated rows for clear: {rows}")
@@ -60,7 +61,8 @@ def get_last_version(db_manager, dt):
     sql = f"""
         SELECT gh_id, category_map
         FROM account_category
-        WHERE dt = (SELECT max(dt) FROM account_category WHERE dt < {dt})
+        WHERE dt = (SELECT max(dt) FROM account_category WHERE dt < {dt} AND
+        status = 1)
     """
     data = db_manager.select(sql)
     return data
@@ -106,7 +108,7 @@ def main():
 
     raw_df = prepare_raw_data(begin_dt, end_dt)
 
-    cate_model = CategoryRegressionV1()
+    cate_model = CategoryRegressionV1(category_feature_v2, category_name_map_v2)
     df = cate_model.preprocess_data(raw_df)
 
     if args.dry_run and args.print_matrix:
@@ -118,7 +120,7 @@ def main():
 
     records_to_save = []
 
-    param_to_category_map = reverse_category_name_map
+    param_to_category_map = reverse_category_name_map_v2
     account_ids = df['gh_id'].unique()
     account_id_map = df[['account_name', 'gh_id']].drop_duplicates() \
         .set_index('gh_id')['account_name'].to_dict()
@@ -154,6 +156,7 @@ def main():
             continue
         current_record['category_map'] = json.dumps(current_record['category_map'], ensure_ascii=False)
         current_record['status'] = 1
+        current_record['version'] = 2
         current_record['create_timestamp'] = create_timestamp
         current_record['update_timestamp'] = update_timestamp
         records_to_save.append(current_record)
@@ -172,6 +175,7 @@ def main():
                     update_timestamp = {record['update_timestamp']}
                 WHERE dt = {record['dt']} AND gh_id = '{record['gh_id']}'
                     AND category_map != '{record['category_map']}'
+                    AND version = 2
             """
             update_rows = db_manager.execute(sql)
             print(f"updated rows: {update_rows}, {record['gh_id']}")

+ 7 - 14
src/long_articles/category_models.py

@@ -16,22 +16,15 @@ from sklearn.metrics import accuracy_score, confusion_matrix, classification_rep
 from sklearn.metrics import mean_squared_error, r2_score
 import statsmodels.api as sm
 import numpy as np
-from .consts import category_name_map, reverse_category_name_map
 
 class CategoryRegressionV1:
-    def __init__(self):
-        self.features = [
-            'CateOddities', 'CateFamily', 'CateHeartwarm',
-            'CateHistory', 'CateHealth', 'CateLifeKnowledge', 'CateGossip',
-            'CatePolitics', 'CateMilitary', 'CateMovie', 'CateSociety',
-            # 'CateFinTech', 'CateSocialCustom', 'CateWorkExperience',
-            # 'CateEducation',
-            'view_count_rate' #, 'days_decrease'
-        ]
+    def __init__(self, features, category_name_map):
+        self.features = features
+        self.category_name_map = category_name_map
 
     def preprocess_data(self, df):
-        for cate in category_name_map:
-            colname = category_name_map[cate]
+        for cate in self.category_name_map:
+            colname = self.category_name_map[cate]
             df[colname] = df['category'] == cate
             df[colname] = df[colname].astype(int)
 
@@ -97,7 +90,7 @@ class CategoryRegressionV1:
             new_x['y'] = y
             select_idx = []
             for index, row in new_x.iterrows():
-                param_name = category_name_map.get(row['category'], None)
+                param_name = self.category_name_map.get(row['category'], None)
                 if not param_name:
                     continue
                 param_index = self.features.index(param_name) + 1
@@ -124,7 +117,7 @@ class CategoryLR:
         ]
 
     def preprocess_data(self, df):
-        for cate in category_name_map:
+        for cate in self.category_name_map:
             colname = category_name_map[cate]
             df[colname] = df['category'] == cate
             df[colname] = df[colname].astype(int)

+ 41 - 3
src/long_articles/consts.py

@@ -8,7 +8,14 @@
 Constants for long articles.
 """
 
-category_name_map = {
+category_feature_v1 = [
+    'CateOddities', 'CateFamily', 'CateHeartwarm',
+    'CateHistory', 'CateHealth', 'CateLifeKnowledge', 'CateGossip',
+    'CatePolitics', 'CateMilitary', 'CateMovie', 'CateSociety',
+    'view_count_rate'
+]
+
+category_name_map_v1 = {
     '奇闻趣事': 'CateOddities',
     '历史人物': 'CateHistory',
     '家长里短': 'CateFamily',
@@ -26,6 +33,37 @@ category_name_map = {
     '子女教育': 'CateEducation'
 }
 
-reverse_category_name_map = {
-    v: k for k, v in category_name_map.items()
+reverse_category_name_map_v1 = {
+    v: k for k, v in category_name_map_v1.items()
 }
+
+category_feature_v2 = [
+    'CateSciencePop', 'CateMilitaryHistory', 'CateFamily', 'CateSocialRule',
+    'CateOddities', 'CateGossip', 'CateHealth', 'CateEmotional',
+    'CateNational', 'CateModernFigure', 'CateNostalgic', 'CatePolitics',
+    'CateHistoryFigure', 'CateSocialPhenomena', 'CateFinTech',
+    'view_count_rate'
+]
+
+category_name_map_v2 = {
+    '知识科普': 'CateSciencePop',
+    '军事历史': 'CateMilitaryHistory',
+    '家长里短': 'CateFamily',
+    '社会法治': 'CateSocialRule',
+    '奇闻趣事': 'CateOddities',
+    '名人八卦': 'CateGossip',
+    '健康养生': 'CateHealth',
+    '情感故事': 'CateEmotional',
+    '国家大事': 'CateNational',
+    '现代人物': 'CateModernFigure',
+    '怀旧时光': 'CateNostalgic',
+    '政治新闻': 'CatePolitics',
+    '历史人物': 'CateHistoryFigure',
+    '社会现象': 'CateSocialPhenomena',
+    '财经科技': 'CateFinTech',
+}
+
+reverse_category_name_map_v2 = {
+    v: k for k, v in category_name_map_v2.items()
+}
+