Browse source code

Xiaoniangao account historical data processing

zhangliang 3 days ago
parent
commit
5f8220a0bf

+ 39 - 0
core/models/activity_config_models.py

@@ -0,0 +1,39 @@
+from typing import List
+from pydantic import BaseModel, Field
+from pydantic import ConfigDict
+
+
+class ActivityThresholds(BaseModel):
+    """Activity threshold configuration"""
+    extreme_active_recent_7d: int = Field(default=15, description="Extremely active: posts in the last 7 days")
+    high_active_recent_7d: int = Field(default=8, description="Highly active: posts in the last 7 days")
+    medium_active_recent_7d: int = Field(default=3, description="Moderately active: posts in the last 7 days")
+    low_active_recent_7d: int = Field(default=1, description="Low activity: posts in the last 7 days")
+    dormant_recent_30d: int = Field(default=1, description="Dormant: posts in the last 30 days")
+    new_user_total: int = Field(default=0, description="New user: total video count threshold")
+
+
+class ActivityLevel(BaseModel):
+    """Activity level configuration"""
+    label: str = Field(description="Activity level label")
+    priority: int = Field(description="Priority (lower value = higher priority)")
+
+
+class ActivityLevels(BaseModel):
+    """Mapping of activity levels"""
+    new_user: ActivityLevel = Field(default=ActivityLevel(label="新用户", priority=0))
+    extreme_active: ActivityLevel = Field(default=ActivityLevel(label="极高活跃", priority=1))
+    high_active: ActivityLevel = Field(default=ActivityLevel(label="高活跃", priority=2))
+    medium_active: ActivityLevel = Field(default=ActivityLevel(label="中活跃", priority=3))
+    low_active: ActivityLevel = Field(default=ActivityLevel(label="低活跃", priority=4))
+    dormant: ActivityLevel = Field(default=ActivityLevel(label="休眠", priority=5))
+    zombie: ActivityLevel = Field(default=ActivityLevel(label="僵尸", priority=6))
+
+
+class ActivityCalculatorConfig(BaseModel):
+    """Activity calculator configuration"""
+    platforms: List[str] = Field(default=["xiaoniangao", "zhongqingkandian"], description="Platforms to analyze")
+    activity_thresholds: ActivityThresholds = Field(default=ActivityThresholds(), description="Activity threshold configuration")
+    activity_levels: ActivityLevels = Field(default=ActivityLevels(), description="Activity level configuration")
+
+    model_config = ConfigDict(extra='forbid')  # Pydantic v2-style config: reject unknown fields

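Review note: a minimal usage sketch of the config model above (an editor's addition, not part of the commit). It overrides one threshold while the other fields keep their defaults:

```python
from core.models.activity_config_models import (
    ActivityCalculatorConfig,
    ActivityThresholds,
)

# Override a single threshold; everything else falls back to the model defaults.
config = ActivityCalculatorConfig(
    platforms=["xiaoniangao"],
    activity_thresholds=ActivityThresholds(extreme_active_recent_7d=20),
)
print(config.activity_thresholds.extreme_active_recent_7d)  # 20
print(config.activity_levels.new_user.label)                # 新用户 (default)
```

With `extra='forbid'`, a misspelled key raises a ValidationError instead of being silently ignored.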
+ 19 - 0
core/models/crawler_account_info.py

@@ -0,0 +1,19 @@
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from typing import Optional
+from datetime import datetime
+
+class CrawlerAccountInfo(BaseModel):
+    """
+    Crawler account info model
+    Maps to database table: crawler_account_info
+    """
+    id: Optional[int] = None
+    platform: str
+    platform_mode: str
+    priority: int = 0
+    created_at: Optional[datetime] = None
+    updated_at: Optional[datetime] = None
+    last_crawled_at: Optional[datetime] = None
+
+    model_config = ConfigDict(from_attributes=True)  # Pydantic v2-style: allow validation from object attributes

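Review note: `from_attributes=True` lets the model validate from any object that exposes matching attributes (e.g. an ORM row). A small sketch using a stand-in object:

```python
from types import SimpleNamespace
from core.models.crawler_account_info import CrawlerAccountInfo

# Stand-in for a database row object with matching attribute names.
row = SimpleNamespace(id=1, platform="xiaoniangao", platform_mode="author",
                      priority=0, created_at=None, updated_at=None,
                      last_crawled_at=None)
info = CrawlerAccountInfo.model_validate(row)
print(info.platform, info.priority)  # xiaoniangao 0
```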
+ 5 - 4
core/models/rule_models.py

@@ -1,4 +1,6 @@
-from pydantic import BaseModel, validator
+from pydantic import BaseModel
+from pydantic import field_validator
+from pydantic import ConfigDict
 from typing import Dict, Any, Optional, Union
 
 
@@ -30,11 +32,10 @@ class RuleModel(BaseModel):
     width: Optional[RuleField] = None
     height: Optional[RuleField] = None
 
-    @validator('*')
+    @field_validator('*')
     def validate_rule_fields(cls, v):
         if v is not None and not isinstance(v, RuleField):
             raise ValueError('Rule fields must be of type RuleField')
         return v
 
-    class Config:
-        extra = "allow"  # allow extra fields
+    model_config = ConfigDict(extra='allow')  # Pydantic v2-style: allow extra fields

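Review note: a sketch of the two behaviors this hunk configures (RuleField itself is defined earlier in rule_models.py and is not shown in the diff):

```python
from core.models.rule_models import RuleModel

# extra='allow' keeps unknown fields instead of rejecting them.
m = RuleModel(some_extra_rule="kept")
print(m.model_extra)  # {'some_extra_rule': 'kept'}

# Declared fields must be RuleField (or None); anything else fails validation.
try:
    RuleModel(width="not-a-RuleField")
except ValueError as exc:  # pydantic.ValidationError subclasses ValueError
    print(type(exc).__name__)
```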
+ 84 - 11
main.py

@@ -2,10 +2,18 @@ import time
 import signal  # new: import the signal module
 from multiprocessing import Process, cpu_count
 from typing import Dict
+from datetime import datetime
+import asyncio
 
 from core.utils.log.logger_manager import LoggerManager
 from scheduler.process_manager import split_topics, start_worker_process
 from spiders.spider_registry import SPIDER_CLASS_MAP
+from scripts.activity_calculator import ActivityCalculator
+
+# Import the async scheduler from APScheduler
+from apscheduler.schedulers.asyncio import AsyncIOScheduler
+from apscheduler.triggers.cron import CronTrigger
+
 
 # Global shutdown flag
 shutdown_flag = False
@@ -19,18 +27,70 @@ def handle_signal(sig, frame):
     shutdown_flag = True
 
 
-def main():
-    global shutdown_flag
+async def run_activity_calculator_with_scheduler(shutdown_event):
+    """Run the activity calculator with APScheduler - daily incremental updates only"""
+    logger = LoggerManager.get_logger()
+    logger.info("[ActivityCalculator] AsyncIOScheduler starting (incremental updates only)")
+
+    scheduler = AsyncIOScheduler()
+
+    # Add the daily incremental-update job, run at midnight
+    async def incremental_update_job():
+        logger.info("[ActivityCalculator] Starting the daily incremental update")
+        try:
+            calc = ActivityCalculator(update_mode="incremental")
+            try:
+                await calc.initialize()
+                result = await calc.calculate_and_update()
+                logger.info(f"[ActivityCalculator] Incremental update finished, processed {result} users")
+            finally:
+                await calc.close()
+        except Exception as e:
+            logger.error(f"[ActivityCalculator] Incremental update failed: {e}")
+
+    scheduler.add_job(
+        func=incremental_update_job,
+        trigger=CronTrigger(hour=0, minute=0),  # run every day at 00:00
+        id='incremental_update_daily',
+        name='Daily incremental update',
+        misfire_grace_time=3600
+    )
+
+    # Start the scheduler
+    scheduler.start()
+    logger.info("[ActivityCalculator] AsyncIOScheduler started (incremental updates only)")
+
+    # Wait for the shutdown event
+    await shutdown_event.wait()
+
+    # Shut the scheduler down
+    scheduler.shutdown()
+    logger.info("[ActivityCalculator] AsyncIOScheduler shut down")
+
+
+async def main():
     logger = LoggerManager.get_logger()
     aliyun_log = LoggerManager.get_aliyun_logger()
 
+    # Create the shutdown event
+    shutdown_event = asyncio.Event()
+
+    def handle_signal_async(sig, frame):
+        """Async signal handler"""
+        logger.warning(f"[Main process] received signal {sig}, starting graceful shutdown...")
+        shutdown_event.set()
+
     # Register signal handlers - key addition
-    signal.signal(signal.SIGTERM, handle_signal)  # handle kill
-    signal.signal(signal.SIGINT, handle_signal)  # handle Ctrl+C
-    signal.signal(signal.SIGHUP, handle_signal)  # handle terminal hangup
+    signal.signal(signal.SIGTERM, handle_signal_async)  # handle kill
+    signal.signal(signal.SIGINT, handle_signal_async)  # handle Ctrl+C
+    signal.signal(signal.SIGHUP, handle_signal_async)  # handle terminal hangup
 
     logger.info(f"[Main process] started, PID={os.getpid()}")  # log the PID for easier management
 
+    # Start the activity-calculator scheduler (incremental updates only)
+    activity_scheduler_task = asyncio.create_task(run_activity_calculator_with_scheduler(shutdown_event))
+    logger.info("[Main process] activity-calculator scheduler started (incremental updates only)")
+
     topic_list = list(SPIDER_CLASS_MAP.keys())
     logger.info(f"[Main process] listening on topics: {topic_list}")
 
@@ -51,10 +111,10 @@ def main():
 
     # Main process keeps monitoring child-process state
     try:
-        while not shutdown_flag:  # check the shutdown flag
-            time.sleep(5)
+        while not shutdown_event.is_set():  # check the shutdown event
+            await asyncio.sleep(5)  # use async sleep
             for group_id, p in list(process_map.items()):
-                if shutdown_flag:  # if shutting down, skip the check
+                if shutdown_event.is_set():  # if shutting down, skip the check
                     break
 
                 if not p.is_alive():
@@ -92,12 +152,12 @@ def main():
                     restart_count[group_id] += 1
                     last_restart_time[group_id] = current_time
 
-                    time.sleep(2)
+                    await asyncio.sleep(2)  # use async sleep
                     start_worker_process(group_id, topic_groups[group_id], process_map)
 
     except KeyboardInterrupt:
         logger.warning("[Main process] received exit signal, terminating all child processes...")
-        shutdown_flag = True
+        shutdown_event.set()
 
     # Gracefully terminate all child processes - unified shutdown handling
     logger.info("[Main process] terminating all child processes...")
@@ -113,10 +173,23 @@ def main():
             p.kill()  # force kill
             p.join()
 
+    # Set the shutdown event and wait for the activity scheduler to finish
+    shutdown_event.set()
+
+    # Cancel the activity-scheduler task
+    if not activity_scheduler_task.done():
+        activity_scheduler_task.cancel()
+        try:
+            await asyncio.wait_for(activity_scheduler_task, timeout=10.0)
+        except asyncio.TimeoutError:
+            logger.warning("[ActivityCalculator] scheduler shutdown timed out")
+        except asyncio.CancelledError:
+            logger.info("[ActivityCalculator] scheduler task was cancelled")
+
     logger.info("[Main process] all child processes terminated, exiting")
 
 
 if __name__ == '__main__':
     import os
 
-    main()
+    asyncio.run(main())

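Review note: the shutdown pattern in main.py above, reduced to a self-contained sketch (assumes APScheduler 3.x, where AsyncIOScheduler accepts coroutine job functions). A coroutine job runs on the loop, and an asyncio.Event stands in for the signal handler:

```python
import asyncio
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.interval import IntervalTrigger

async def demo():
    shutdown_event = asyncio.Event()
    scheduler = AsyncIOScheduler()

    async def job():
        print("tick")

    scheduler.add_job(job, trigger=IntervalTrigger(seconds=1), id="tick")
    scheduler.start()  # requires a running event loop

    # Trigger shutdown after ~3 seconds instead of waiting for SIGTERM/SIGINT.
    asyncio.get_running_loop().call_later(3, shutdown_event.set)
    await shutdown_event.wait()
    scheduler.shutdown()

if __name__ == "__main__":
    asyncio.run(demo())
```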
+ 310 - 0
scripts/activity_calculator.py

@@ -0,0 +1,310 @@
+# activity_calculator.py
+import asyncio
+from typing import Optional
+from datetime import datetime, date
+
+from config import settings
+from core.utils.log.logger_manager import LoggerManager
+from services.async_mysql_service import AsyncMysqlService
+from core.models.activity_config_models import ActivityCalculatorConfig
+
+
+class ActivityCalculator:
+    """Optimized activity calculator"""
+
+    def __init__(self, platform: str = "activity_calculator", mode: str = "activity", config: Optional[ActivityCalculatorConfig] = None, update_mode: str = "incremental"):
+        self.platform = platform
+        self.mode = mode
+        self.update_mode = update_mode  # "full" or "incremental"
+        self.logger = LoggerManager.get_logger(platform=platform, mode=mode)
+        self.aliyun_logger = LoggerManager.get_aliyun_logger(platform=platform, mode=mode)
+        self.db_service = None
+
+
+        # Use the provided config or fall back to the defaults
+        if config:
+            self.config = config
+        else:
+            self.config = ActivityCalculatorConfig()
+
+    async def initialize(self):
+        """Asynchronously initialize the database connection"""
+        self.db_service = AsyncMysqlService(platform=self.platform, mode=self.mode)
+        await self.db_service.__aenter__()  # initialize the connection pool
+
+    async def calculate_and_update(self):
+        """Calculate and update activity for all users"""
+        try:
+            # 1. Fetch the crawl statistics
+            crawl_stats = await self._get_crawl_statistics()
+            self.logger.info(f"Fetched crawl statistics: {len(crawl_stats)} rows, update mode: {self.update_mode}")
+
+            # 2. Concurrently batch-update user activity
+            updated_count = await self._concurrent_batch_update_user_activities(crawl_stats)
+            self.logger.info(f"Batch-updated user activity: {updated_count} rows")
+
+            success_msg = f"Activity calculation finished: updated {updated_count} users"
+            self.logger.info(success_msg)
+            return updated_count
+
+        except Exception as e:
+            error_msg = f"Activity calculation failed: {e}"
+            self.logger.error(error_msg)
+            self.aliyun_logger.logging(code="9022", message=error_msg, data={"error": str(e)})
+            raise
+
+    async def _get_crawl_statistics(self):
+        """Fetch crawl statistics"""
+        # Build the SQL fragment for the platform list, including merged platforms
+        platforms = self.config.platforms.copy()
+        # If the config includes xiaoniangao, also query xiaoniangaotuijianliu
+        if 'xiaoniangao' in platforms:
+            platforms.extend(['xiaoniangaotuijianliu'])
+        # Deduplicate
+        platforms = list(set(platforms))
+
+        platforms_str = '"' + '","'.join(platforms) + '"'
+
+        if self.update_mode == "full":
+            # Full update: scan everything and compute complete statistics, merging the two platforms into one
+            sql = f"""
+                 SELECT 
+                    out_user_id,
+                    CASE 
+                        WHEN platform = 'xiaoniangaotuijianliu' THEN 'xiaoniangao'
+                        ELSE platform 
+                    END as platform,
+                    COUNT(*) as total_videos,
+                    SUM(CASE WHEN create_time >= DATE_SUB(NOW(), INTERVAL 30 DAY) THEN 1 ELSE 0 END) as recent_30d_videos,
+                    SUM(CASE WHEN create_time >= DATE_SUB(NOW(), INTERVAL 7 DAY) THEN 1 ELSE 0 END)  as recent_7d_videos
+                    FROM crawler_video
+                    WHERE out_user_id IS NOT NULL
+                    AND out_user_id != ''
+                    AND out_user_id != 0
+                    AND `platform` in ({platforms_str})
+                    GROUP BY out_user_id, 
+                        CASE 
+                            WHEN platform = 'xiaoniangaotuijianliu' THEN 'xiaoniangao'
+                            ELSE platform 
+                        END;
+                  """
+            results = await self.db_service.fetch_all(sql)
+        else:  # incremental
+            # Incremental update: re-aggregate every user who needs it (full window statistics, not just one day)
+            # First, find users with activity on the previous day - the job runs just after midnight (both platforms)
+            today_users_sql = f"""
+                SELECT DISTINCT 
+                    out_user_id,
+                    CASE 
+                        WHEN platform = 'xiaoniangaotuijianliu' THEN 'xiaoniangao'
+                        ELSE platform 
+                    END as platform
+                FROM crawler_video
+                WHERE out_user_id IS NOT NULL
+                AND out_user_id != ''
+                AND out_user_id != 0
+                AND `platform` in ({platforms_str})
+                AND DATE(create_time) = DATE_SUB(CURDATE(), INTERVAL 1 DAY)
+            """
+            today_users = await self.db_service.fetch_all(today_users_sql)
+            
+            if not today_users:
+                return []  # no new data from the previous day; return an empty list
+
+            # Build per-user conditions to query full statistics over the recent window
+            user_conditions = []
+            params = []
+            for user in today_users:
+                user_conditions.append("(out_user_id=%s AND (platform=%s OR platform='xiaoniangaotuijianliu'))")
+                params.extend([user['out_user_id'], user['platform']])
+            
+            user_where_clause = " OR ".join(user_conditions)
+            sql = f"""
+                 SELECT 
+                    out_user_id,
+                    CASE 
+                        WHEN platform = 'xiaoniangaotuijianliu' THEN 'xiaoniangao'
+                        ELSE platform 
+                    END as platform,
+                    COUNT(*) as total_videos,
+                    SUM(CASE WHEN create_time >= DATE_SUB(NOW(), INTERVAL 30 DAY) THEN 1 ELSE 0 END) as recent_30d_videos,
+                    SUM(CASE WHEN create_time >= DATE_SUB(NOW(), INTERVAL 7 DAY) THEN 1 ELSE 0 END)  as recent_7d_videos
+                    FROM crawler_video
+                    WHERE out_user_id IS NOT NULL
+                    AND out_user_id != ''
+                    AND `platform` in ({platforms_str})
+                    AND ({user_where_clause})
+                    GROUP BY out_user_id, 
+                        CASE 
+                            WHEN platform = 'xiaoniangaotuijianliu' THEN 'xiaoniangao'
+                            ELSE platform 
+                        END;
+                  """
+            
+            results = await self.db_service.fetch_all(sql, params)
+
+        stats_list = []
+        for row in results:
+            stats = {
+                'out_user_id': row['out_user_id'],
+                'platform': row['platform'],
+                'total_videos': row['total_videos'] or 0,
+                'recent_30d_videos': row['recent_30d_videos'] or 0,
+                'recent_7d_videos': row['recent_7d_videos'] or 0
+            }
+            stats_list.append(stats)
+
+        return stats_list
+
+    async def _process_single_batch(self, batch):
+        """Process a single batch of rows"""
+        # Prepare the data for this batch
+        batch_data = []
+        for stats in batch:
+            # Determine the activity level and priority
+            activity_level, priority_level = self._determine_level(stats)
+
+            # Active flag (zombie users are inactive)
+            is_active = 0 if activity_level == self.config.activity_levels.zombie.label else 1
+
+            # New-user flag: 1 if the computed level is the new-user label, otherwise 0
+            is_new_user = 1 if activity_level == self.config.activity_levels.new_user.label else 0
+
+            # Append to the batch
+            batch_data.append([
+                stats['out_user_id'],
+                stats['platform'],
+                stats['total_videos'],
+                stats['recent_30d_videos'],
+                stats['recent_7d_videos'],
+                activity_level,
+                priority_level,
+                is_active,
+                is_new_user
+            ])
+        
+        # Execute the SQL for the whole batch
+        update_sql = """
+                     INSERT INTO external_user_activity
+                     (out_user_id, platform,
+                      total_videos, recent_30d_videos, recent_7d_videos,
+                      activity_level, priority_level, is_active, is_new_user,
+                      update_time)
+                     VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, NOW()) ON DUPLICATE KEY 
+                     UPDATE 
+                         total_videos = VALUES(total_videos), 
+                         recent_30d_videos = VALUES(recent_30d_videos), 
+                         recent_7d_videos = VALUES(recent_7d_videos), 
+                         activity_level = VALUES(activity_level), 
+                         priority_level = VALUES(priority_level), 
+                         is_active = VALUES(is_active), 
+                         is_new_user = VALUES(is_new_user), 
+                         update_time = NOW() 
+                     """
+
+        try:
+            affected_rows = await self.db_service.executemany(update_sql, batch_data)
+            return affected_rows
+        except Exception as e:
+            error_msg = f"Batch update of user activity failed: {e}"
+            self.logger.error(error_msg)
+            self.aliyun_logger.logging(code="9023", message=error_msg, data={"error": str(e)})
+            raise
+
+    async def _concurrent_batch_update_user_activities(self, stats_list, batch_size=10000, concurrency=4):
+        """Concurrently batch-update user activity, processing in chunks for performance"""
+        if not stats_list:
+            return 0
+
+        # Split the data into batches
+        batches = [stats_list[i:i + batch_size] for i in range(0, len(stats_list), batch_size)]
+        total_batches = len(batches)
+
+        self.logger.info(f"Processing {len(stats_list)} rows in {total_batches} batches of up to {batch_size} rows each")
+
+        # Use a semaphore to cap concurrency
+        semaphore = asyncio.Semaphore(concurrency)
+        
+        async def process_batch_with_semaphore(batch, index):
+            async with semaphore:
+                try:
+                    affected_rows = await self._process_single_batch(batch)
+                    
+                    # Log batch progress (approximate under concurrency)
+                    processed_count = min((index + 1) * batch_size, len(stats_list))
+                    self.logger.info(f"Concurrent batch progress: [{index + 1}/{total_batches}] processed: {processed_count}/{len(stats_list)}, updated this batch: {affected_rows} rows")
+                    
+                    return affected_rows
+                except Exception as e:
+                    error_msg = f"Concurrent batch processing failed (batch {index}): {e}"
+                    self.logger.error(error_msg)
+                    raise
+
+        # Process all batches concurrently
+        tasks = [process_batch_with_semaphore(batch, i) for i, batch in enumerate(batches)]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+        
+        # Check for exceptions
+        total_updated = 0
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                error_msg = f"Batch {i} failed: {result}"
+                self.logger.error(error_msg)
+                raise result
+            total_updated += result
+        
+        return total_updated
+
+    def _determine_level(self, stats):
+        """Determine the activity level and priority from the three metrics, using the configured thresholds"""
+        recent_7d = stats['recent_7d_videos']
+        recent_30d = stats['recent_30d_videos']
+        total = stats['total_videos']
+
+        thresholds = self.config.activity_thresholds
+        levels = self.config.activity_levels
+
+        # 1. New user: no historical data
+        if total <= thresholds.new_user_total:
+            return levels.new_user.label, levels.new_user.priority
+
+        # 2. Classify by the last 7 days, reading thresholds from the config so config and behavior stay in sync
+        if recent_7d >= thresholds.extreme_active_recent_7d:
+            return levels.extreme_active.label, levels.extreme_active.priority
+        elif recent_7d >= thresholds.high_active_recent_7d:
+            return levels.high_active.label, levels.high_active.priority
+        elif recent_7d >= thresholds.medium_active_recent_7d:
+            return levels.medium_active.label, levels.medium_active.priority
+        elif recent_7d >= thresholds.low_active_recent_7d:
+            return levels.low_active.label, levels.low_active.priority
+        else:
+            # Nothing in the last 7 days; fall back to the last 30 days
+            if recent_30d >= thresholds.dormant_recent_30d:
+                return levels.dormant.label, levels.dormant.priority
+            else:
+                # No recent 30-day activity either, but there is history: zombie
+                return levels.zombie.label, levels.zombie.priority
+
+
+    async def close(self):
+        """Close the database connection"""
+        if self.db_service:
+            await self.db_service.__aexit__(None, None, None)
+        self.logger.info("Activity calculator closed")
+
+async def main_full():
+    """Run a one-off full update"""
+    calculator = ActivityCalculator(update_mode="full")
+    await calculator.initialize()  # initialize the database connection
+    await calculator.calculate_and_update()
+    await calculator.close()  # close the connection
+
+
+async def main_incremental():
+    """Run a one-off incremental update"""
+    calculator = ActivityCalculator(update_mode="incremental")
+    await calculator.initialize()  # initialize the database connection
+    await calculator.calculate_and_update()
+    await calculator.close()  # close the connection

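Review note: the module defines `main_full()`/`main_incremental()` but has no `__main__` entrypoint, so nothing here runs directly. A hypothetical caller (editor's sketch), with the try/finally guard the helpers above omit:

```python
import asyncio
from scripts.activity_calculator import ActivityCalculator

async def run(mode: str) -> None:
    calc = ActivityCalculator(update_mode=mode)
    await calc.initialize()
    try:
        updated = await calc.calculate_and_update()
        print(f"updated {updated} users")
    finally:
        await calc.close()  # release the pool even if the update fails

if __name__ == "__main__":
    asyncio.run(run("incremental"))
```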
+ 121 - 0
scripts/activity_scheduler.py

@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+"""
+Activity-calculation task scheduler
+Implements scheduled jobs with APScheduler
+"""
+import asyncio
+import argparse
+import logging
+import sys
+from datetime import datetime
+from pathlib import Path
+import os
+
+# Add the project root to sys.path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from apscheduler.schedulers.blocking import BlockingScheduler
+from apscheduler.triggers.date import DateTrigger
+from apscheduler.triggers.cron import CronTrigger
+from scripts.activity_calculator import ActivityCalculator
+
+
+async def run_full_update():
+    """Run a full update"""
+    print(f"[{datetime.now()}] Starting full update...")
+    calculator = ActivityCalculator(update_mode="full")
+    try:
+        await calculator.initialize()
+        result = await calculator.calculate_and_update()
+        print(f"[{datetime.now()}] Full update finished, processed {result} users")
+        return result
+    except Exception as e:
+        print(f"[{datetime.now()}] Full update failed: {e}")
+        raise
+    finally:
+        await calculator.close()
+
+
+async def run_incremental_update():
+    """Run an incremental update"""
+    print(f"[{datetime.now()}] Starting incremental update...")
+    calculator = ActivityCalculator(update_mode="incremental")
+    try:
+        await calculator.initialize()
+        result = await calculator.calculate_and_update()
+        print(f"[{datetime.now()}] Incremental update finished, processed {result} users")
+        return result
+    except Exception as e:
+        print(f"[{datetime.now()}] Incremental update failed: {e}")
+        raise
+    finally:
+        await calculator.close()
+
+
+def setup_logging():
+    """Configure logging"""
+    logging.basicConfig(
+        level=logging.INFO,
+        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    )
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Activity-calculation task scheduler')
+    parser.add_argument('task', choices=['full', 'incremental', 'scheduler'],
+                       help='Task type: full (full update), incremental (incremental update), scheduler (start the scheduler)')
+    parser.add_argument('--run-once', action='store_true',
+                       help='Run only once (for testing); note: currently unused below')
+    
+    args = parser.parse_args()
+    setup_logging()
+    
+    if args.task == 'full':
+        print("=" * 50)
+        print(f"Running full-update task - {datetime.now()}")
+        print("=" * 50)
+        result = asyncio.run(run_full_update())
+        print(f"Full update result: {result} users processed")
+
+    elif args.task == 'incremental':
+        print("=" * 50)
+        print(f"Running incremental-update task - {datetime.now()}")
+        print("=" * 50)
+        result = asyncio.run(run_incremental_update())
+        print(f"Incremental update result: {result} users processed")
+
+    elif args.task == 'scheduler':
+        print("=" * 50)
+        print(f"Starting scheduler - {datetime.now()}")
+        print("=" * 50)
+        
+        scheduler = BlockingScheduler()
+        
+        # Optional one-shot full-update job (enable as needed)
+        # scheduler.add_job(
+        #     func=lambda: asyncio.run(run_full_update()),
+        #     trigger=DateTrigger(run_date=datetime.now()),  # run immediately or at a given time
+        #     id='full_update_once',
+        #     name='One-shot full update',
+        #     misfire_grace_time=3600  # 1-hour grace period
+        # )
+
+        # Daily incremental-update job at midnight
+        scheduler.add_job(
+            func=lambda: asyncio.run(run_incremental_update()),
+            trigger=CronTrigger(hour=0, minute=0),  # run every day at 00:00
+            id='incremental_update_daily',
+            name='Daily incremental update',
+            misfire_grace_time=3600  # 1-hour grace period
+        )
+
+        print("Scheduler started, press Ctrl+C to stop...")
+        try:
+            scheduler.start()
+        except KeyboardInterrupt:
+            print("\nScheduler stopped")
+            scheduler.shutdown()
+
+
+if __name__ == '__main__':
+    main()

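Review note on usage, per the argparse choices above: `python scripts/activity_scheduler.py full` or `python scripts/activity_scheduler.py incremental` runs a single pass, while `python scripts/activity_scheduler.py scheduler` starts the blocking scheduler with the daily midnight job.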
+ 32 - 0
scripts/create_crawler_account_info_table.sql

@@ -0,0 +1,32 @@
+-- Account activity table
+CREATE TABLE external_user_activity (
+    -- Basic info
+    id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT COMMENT 'Primary key, auto-increment',
+    out_user_id VARCHAR(100) NOT NULL COMMENT 'External user ID, uniquely identifies a user on the external platform',
+    platform VARCHAR(50) NOT NULL COMMENT 'Platform name, e.g. youtube, wechat, xiaoniangao',
+    strategy VARCHAR(100) DEFAULT '' COMMENT 'Crawler strategy, e.g. author, recommend',
+
+    -- Crawl statistics (core metrics)
+    total_videos INT UNSIGNED DEFAULT 0 COMMENT 'Total videos crawled historically for this user on the platform',
+    recent_30d_videos INT UNSIGNED DEFAULT 0 COMMENT 'Videos crawled in the last 30 days; mid-term activity',
+    recent_7d_videos INT UNSIGNED DEFAULT 0 COMMENT 'Videos crawled in the last 7 days; short-term activity',
+
+    -- Activity info
+    activity_level VARCHAR(20) DEFAULT '未知' COMMENT 'Activity level: 新用户/极高活跃/高活跃/中活跃/低活跃/休眠/僵尸 (new user/extremely active/highly active/moderately active/low activity/dormant/zombie)',
+    priority_level TINYINT DEFAULT 5 COMMENT 'Crawl priority: 0 new user (highest), 1 extremely active, 2 highly active, 3 moderately active, 4 low activity, 5 dormant, 6 zombie (lowest)',
+    is_active TINYINT(1) DEFAULT 1 COMMENT 'Active flag: 0 inactive, 1 active',
+    is_new_user TINYINT(1) DEFAULT 0 COMMENT 'New-user flag: 0 not new, 1 new',
+
+    -- Timestamps
+    create_time DATETIME DEFAULT CURRENT_TIMESTAMP COMMENT 'Row creation time, set automatically on insert',
+    update_time DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'Row update time, refreshed automatically on update',
+
+    -- Primary key and indexes
+    PRIMARY KEY (id) COMMENT 'Primary key index',
+    UNIQUE KEY uk_out_user_platform (out_user_id, platform, strategy) COMMENT 'Unique index: one row per user per platform and strategy',
+    KEY idx_priority_level (priority_level) COMMENT 'Priority index for priority-ordered queries',
+    KEY idx_activity_level (activity_level) COMMENT 'Activity-level index for per-level aggregation',
+    KEY idx_is_new_user (is_new_user) COMMENT 'New-user flag index for quick filtering',
+    KEY idx_recent_7d_videos (recent_7d_videos DESC) COMMENT 'Core-metric index, descending',
+    KEY idx_activity_composite (priority_level, recent_7d_videos DESC) COMMENT 'Composite index to optimize scheduling queries'
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='Account activity table: stores activity metrics and crawl-scheduling priority for external accounts';

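Review note: a sketch of the scheduling query the composite index `idx_activity_composite (priority_level, recent_7d_videos DESC)` is built for, reusing the `AsyncMysqlService.fetch_all(sql, params)` call shape seen in activity_calculator.py (the helper name `next_accounts` is hypothetical):

```python
from services.async_mysql_service import AsyncMysqlService

async def next_accounts(db: AsyncMysqlService, platform: str, limit: int = 100):
    # Highest-priority, most recently active accounts first.
    sql = """
        SELECT out_user_id, priority_level, recent_7d_videos
        FROM external_user_activity
        WHERE platform = %s AND is_active = 1
        ORDER BY priority_level ASC, recent_7d_videos DESC
        LIMIT %s
    """
    return await db.fetch_all(sql, [platform, limit])
```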
The diff for this file is not shown because of its large size.
+ 213 - 0
scripts/dy_cookie_manager.py


+ 49 - 0
scripts/run_full_update.py

@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+"""
+执行一次性全量更新的脚本
+用于初始化数据或定期校准
+"""
+import asyncio
+import sys
+from pathlib import Path
+
+# Add the project root to sys.path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from scripts.activity_calculator import ActivityCalculator
+from core.utils.log.logger_manager import LoggerManager
+
+
+async def run_full_update():
+    """Run a full update"""
+    logger = LoggerManager.get_logger()
+    logger.info("Starting full update...")
+
+    calculator = ActivityCalculator(update_mode="full")
+    try:
+        await calculator.initialize()
+        result = await calculator.calculate_and_update()
+        logger.info(f"Full update finished, processed {result} users")
+        return result
+    except Exception as e:
+        logger.error(f"Full update failed: {e}")
+        raise
+    finally:
+        await calculator.close()
+
+
+def main():
+    import logging
+    logging.basicConfig(level=logging.INFO)
+    
+    print("=" * 50)
+    print("Running one-off full update")
+    print("=" * 50)
+
+    result = asyncio.run(run_full_update())
+    print(f"Full update result: {result} users processed")
+    print("Full update task complete!")
+
+
+if __name__ == '__main__':
+    main()

Some files were not shown because too many files changed in this diff.