浏览代码

Update run_category_model_v1: remove late samples and add index = 2

StrayWarrior 4 月之前
父节点
当前提交
e9502de530
共有 2 个文件被更改,包括 4 次插入 和 3 次删除
  1. 3 2
      run_category_model_v1.py
  2. 1 1
      src/long_articles/category_models.py

+ 3 - 2
run_category_model_v1.py

@@ -25,14 +25,15 @@ from config.dev import Config
 def prepare_raw_data(dt_begin, dt_end):
 def prepare_raw_data(dt_begin, dt_end):
     data_fields = ['dt', 'gh_id', 'account_name', 'title', 'similarity',
     data_fields = ['dt', 'gh_id', 'account_name', 'title', 'similarity',
                    'view_count_rate', 'category', 'read_avg',
                    'view_count_rate', 'category', 'read_avg',
-                   'read_avg_rate']
+                   'read_avg_rate', 'first_pub_interval']
     fields_str = ','.join(data_fields)
     fields_str = ','.join(data_fields)
     db_manager = MySQLManager(Config().MYSQL_LONG_ARTICLES)
     db_manager = MySQLManager(Config().MYSQL_LONG_ARTICLES)
     sql = f"""
     sql = f"""
         SELECT {fields_str} FROM datastat_score WHERE dt BETWEEN {dt_begin} AND {dt_end}
         SELECT {fields_str} FROM datastat_score WHERE dt BETWEEN {dt_begin} AND {dt_end}
             AND similarity > 0 AND category IS NOT NULL AND read_avg > 500
             AND similarity > 0 AND category IS NOT NULL AND read_avg > 500
             AND read_avg_rate BETWEEN 0 AND 3
             AND read_avg_rate BETWEEN 0 AND 3
-            AND `index` = 1
+            AND `index` in (1, 2)
+            AND FROM_UNIXTIME(coalesce(publish_timestamp, 0), '%H') < '15'
         """
         """
     rows = db_manager.select(sql)
     rows = db_manager.select(sql)
     df = pd.DataFrame(rows, columns=data_fields)
     df = pd.DataFrame(rows, columns=data_fields)

+ 1 - 1
src/long_articles/category_models.py

@@ -23,7 +23,7 @@ class CategoryRegressionV1:
             'CateOddities', 'CateFamily', 'CateHeartwarm',
             'CateOddities', 'CateFamily', 'CateHeartwarm',
             'CateHistory', 'CateHealth', 'CateLifeKnowledge', 'CateGossip',
             'CateHistory', 'CateHealth', 'CateLifeKnowledge', 'CateGossip',
             'CatePolitics', 'CateMilitary', 'CateMovie', 'CateSociety',
             'CatePolitics', 'CateMilitary', 'CateMovie', 'CateSociety',
-            'view_count_rate'
+            'view_count_rate' #, 'days_decrease'
         ]
         ]
 
 
     def preprocess_data(self, df):
     def preprocess_data(self, df):