瀏覽代碼

add crawler_detail.py

luojunhui 1 周之前
父節點
當前提交
04d6025c23
共有 2 個文件被更改,包括 74 次插入和 0 次刪除
  1. 3 0
      applications/tasks/analysis_task/__init__.py
  2. 71 0
      applications/tasks/analysis_task/crawler_detail.py

+ 3 - 0
applications/tasks/analysis_task/__init__.py

@@ -0,0 +1,3 @@
+from .crawler_detail import CrawlerVideoDetailAnalysis
+
+__all__ = ["CrawlerVideoDetailAnalysis"]

+ 71 - 0
applications/tasks/analysis_task/crawler_detail.py

@@ -0,0 +1,71 @@
class CrawlerDetailAnalysisConst:
    """Shared constants for the crawler-detail analysis tasks."""

    # Content categories included in the per-category aggregation queries.
    CATEGORY_LIST = (
        "知识科普 国家大事 历史人物 奇闻趣事 名人八卦 怀旧时光 情感故事 "
        "社会法治 现代人物 社会现象 健康养生 家长里短 军事历史 财经科技 政治新闻"
    ).split()

    # `status` value in single_video_transform_queue marking a video
    # as already transformed.
    TRANSFORMED_STATUS = 1
+
+
class CrawlerDetail(CrawlerDetailAnalysisConst):
    """Intermediate base for crawler-detail analysis classes.

    Currently adds no behavior of its own; it only exposes the shared
    constants from CrawlerDetailAnalysisConst to its subclasses.
    """
+
+
class CrawlerVideoDetailAnalysis(CrawlerDetail):
    """Daily aggregate statistics over crawled / transformed short videos.

    All queries run through an async connection pool exposing
    ``async_fetch(query=..., params=...)``. Date arguments are
    'YYYY-MM-DD' strings and every range is inclusive (SQL BETWEEN).

    NOTE(review): the literal '%Y-%m-%d' format strings inside the SQL
    will break under pymysql/aiomysql-style parameter interpolation
    (where a literal '%' must be doubled to '%%' when params are passed)
    — confirm which driver backs ``pool`` before shipping.
    """

    def __init__(self, pool, trace_id):
        # pool: async DB pool with `async_fetch(query=..., params=...)`.
        # trace_id: correlation id carried along for tracing/logging.
        self.pool = pool
        self.trace_id = trace_id

    async def get_crawler_videos_by_platform(self, start_date, end_data):
        """Return per-day, per-platform counts of crawled videos.

        :param start_date: inclusive lower bound, 'YYYY-MM-DD'.
        :param end_data: inclusive upper bound, 'YYYY-MM-DD'
            (name kept as-is for caller compatibility; read as `end_date`).
        """
        query = """
            SELECT FROM_UNIXTIME(crawler_timestamp, '%Y-%m-%d') AS crawler_date, platform, count(1) AS video_count
            FROM publish_single_video_source
            WHERE FROM_UNIXTIME(crawler_timestamp, '%Y-%m-%d') BETWEEN %s AND %s
            GROUP BY FROM_UNIXTIME(crawler_timestamp, '%Y-%m-%d'), platform;
        """
        return await self.pool.async_fetch(query=query, params=(start_date, end_data))

    async def get_crawler_videos_by_category(self, start_date, end_data):
        """Return per-day, per-category counts of crawled videos,
        restricted to the categories in CATEGORY_LIST.

        :param start_date: inclusive lower bound, 'YYYY-MM-DD'.
        :param end_data: inclusive upper bound, 'YYYY-MM-DD'.
        """
        # FIX: the WHERE clause previously used FROM_UNIXTIME without a
        # format, comparing a full 'YYYY-MM-DD HH:MM:SS' datetime against
        # date-string bounds, which silently excluded everything on the
        # end date after midnight. It now formats to 'YYYY-MM-DD' exactly
        # like this query's SELECT/GROUP BY and the sibling platform query.
        query = """
            SELECT FROM_UNIXTIME(crawler_timestamp, '%Y-%m-%d') AS crawler_date, category, count(1) AS video_count
            FROM publish_single_video_source
            WHERE FROM_UNIXTIME(crawler_timestamp, '%Y-%m-%d') BETWEEN %s AND %s AND category in %s
            GROUP BY FROM_UNIXTIME(crawler_timestamp, '%Y-%m-%d'), category;
        """
        return await self.pool.async_fetch(
            query=query, params=(start_date, end_data, tuple(self.CATEGORY_LIST))
        )

    async def get_transform_videos_by_platform(self, start_date, end_data):
        """Return per-day, per-platform counts and mean similarity score of
        queue entries whose status equals TRANSFORMED_STATUS.

        NOTE(review): create_timestamp appears to be a DATETIME (it is fed
        to DATE_FORMAT); BETWEEN with bare date strings would exclude most
        of the end day — confirm callers pass full datetimes here.
        """
        query = """
            SELECT DATE_FORMAT(create_timestamp, '%Y-%m-%d') AS dt, platform, 
                   count(*) AS video_count, avg(score) AS average_similarity_score
            FROM single_video_transform_queue
            WHERE create_timestamp BETWEEN %s AND %s AND status = %s
            GROUP BY DATE_FORMAT(create_timestamp, '%Y-%m-%d'), platform;
        """
        return await self.pool.async_fetch(
            query=query, params=(start_date, end_data, self.TRANSFORMED_STATUS)
        )

    async def get_transform_videos_by_category(self, start_date, end_data):
        # TODO: not implemented yet — placeholder kept so the interface
        # mirrors get_crawler_videos_by_category; currently returns None.
        pass