# crawler_detail.py

from applications.api import feishu_robot


class CrawlerDetailAnalysisConst:
    """Constants shared by the crawler-detail analysis tasks."""

    # Article categories included in the per-category breakdowns
    CATEGORY_LIST = [
        "知识科普",
        "国家大事",
        "历史人物",
        "奇闻趣事",
        "名人八卦",
        "怀旧时光",
        "情感故事",
        "社会法治",
        "现代人物",
        "社会现象",
        "健康养生",
        "家长里短",
        "军事历史",
        "财经科技",
        "政治新闻",
    ]

    # `status` values in single_video_transform_queue
    TRANSFORMED_STATUS = 1
    NOT_TRANSFORMED_STATUS = 0

    # Sub-task names dispatched by CrawlerDetailDeal.deal
    CRAWLER_DETAIL_TASK_PLATFORM = "crawler_detail_by_platform"
    CRAWLER_DETAIL_TASK_CATEGORY = "crawler_detail_by_category"
    TRANSFORM_DETAIL_TASK_PLATFORM = "transform_detail_by_platform"
    TRANSFORM_DETAIL_TASK_CATEGORY = "transform_detail_by_category"

class CrawlerDetailBase(CrawlerDetailAnalysisConst):
    def __init__(self):
        super().__init__()

    @staticmethod
    def create_feishu_column_map() -> dict:
        """Build the Feishu table column descriptors, keyed by result column name."""
        date_column = feishu_robot.create_feishu_columns_sheet(
            sheet_type="plain_text", sheet_name="dt", display_name="日期"
        )
        category_column = feishu_robot.create_feishu_columns_sheet(
            sheet_type="plain_text", sheet_name="category", display_name="文章品类"
        )
        platform_column = feishu_robot.create_feishu_columns_sheet(
            sheet_type="plain_text", sheet_name="platform", display_name="抓取渠道"
        )
        video_cnt_column = feishu_robot.create_feishu_columns_sheet(
            sheet_type="number", sheet_name="video_count", display_name="视频数量"
        )
        avg_score_column = feishu_robot.create_feishu_columns_sheet(
            sheet_type="number",
            sheet_name="average_similarity_score",
            display_name="相关性分均值",
        )
        return {
            "dt": date_column,
            "category": category_column,
            "platform": platform_column,
            "video_count": video_cnt_column,
            "average_similarity_score": avg_score_column,
        }
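
# Note (illustrative, not from the original source): deal() below selects
# columns from this map by the keys of the first result row, so a platform
# breakdown with keys ("dt", "platform", "video_count") renders exactly
# those three Feishu columns.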


class CrawlerArticleDetailAnalysis(CrawlerDetailBase):
    """Article-pool analysis; not implemented yet."""

    pass


class CrawlerVideoDetailAnalysis(CrawlerDetailBase):
    def __init__(self, pool, trace_id):
        super().__init__()
        self.pool = pool
        self.trace_id = trace_id

    async def get_crawler_videos_by_platform(self, start_date, end_date):
        """Count videos crawled per platform per day between start_date and end_date."""
        query = """
            SELECT CAST(DATE(FROM_UNIXTIME(crawler_timestamp)) AS CHAR) AS dt, platform, count(1) AS video_count
            FROM publish_single_video_source
            WHERE crawler_timestamp BETWEEN UNIX_TIMESTAMP(%s) AND UNIX_TIMESTAMP(%s)
            GROUP BY dt, platform;
        """
        return await self.pool.async_fetch(query=query, params=(start_date, end_date))
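
    # Example row shape (assumed; the exact typing depends on the pool's
    # async_fetch implementation):
    #   {"dt": "2024-01-01", "platform": "<platform>", "video_count": 123}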

    async def get_crawler_videos_by_category(self, start_date, end_date):
        """Count videos crawled per category per day between start_date and end_date."""
        category_place_holders = ", ".join(["%s"] * len(self.CATEGORY_LIST))
        query = f"""
            SELECT CAST(DATE(FROM_UNIXTIME(crawler_timestamp)) AS CHAR) AS dt, category, count(1) AS video_count
            FROM publish_single_video_source
            WHERE crawler_timestamp BETWEEN UNIX_TIMESTAMP(%s) AND UNIX_TIMESTAMP(%s)
            AND category IN ({category_place_holders})
            GROUP BY dt, category;
        """
        return await self.pool.async_fetch(
            query=query, params=tuple([start_date, end_date] + self.CATEGORY_LIST)
        )

    async def get_transform_videos_by_platform(self, start_date, end_date):
        """Count transformed videos and average similarity score per platform per day."""
        query = """
            SELECT CAST(DATE(create_timestamp) AS CHAR) AS dt, platform,
                   count(*) AS video_count, avg(score) AS average_similarity_score
            FROM single_video_transform_queue
            WHERE create_timestamp BETWEEN %s AND %s AND status = %s
            GROUP BY dt, platform;
        """
        return await self.pool.async_fetch(
            query=query, params=(start_date, end_date, self.TRANSFORMED_STATUS)
        )
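
    # Example row shape (assumed): {"dt": "2024-01-01", "platform": "<platform>",
    # "video_count": 42, "average_similarity_score": 0.83}; the extra
    # average_similarity_score key is what pulls the 相关性分均值 column into
    # the Feishu table.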

    async def get_transform_videos_by_category(self, start_date, end_date):
        raise NotImplementedError()


class CrawlerDetailDeal(CrawlerVideoDetailAnalysis, CrawlerArticleDetailAnalysis):
    def __init__(self, pool, trace_id):
        super().__init__(pool, trace_id)

    async def analysis_video_pool(self, task, start_date, end_date):
        # Dispatch on sub-task name; unknown tasks fall through to None
        match task:
            case self.CRAWLER_DETAIL_TASK_PLATFORM:
                return await self.get_crawler_videos_by_platform(start_date, end_date)
            case self.CRAWLER_DETAIL_TASK_CATEGORY:
                return await self.get_crawler_videos_by_category(start_date, end_date)
            case self.TRANSFORM_DETAIL_TASK_PLATFORM:
                return await self.get_transform_videos_by_platform(start_date, end_date)
            case self.TRANSFORM_DETAIL_TASK_CATEGORY:
                return await self.get_transform_videos_by_category(start_date, end_date)
            case _:
                return None

    async def analysis_article_pool(self, task, start_date, end_date):
        raise NotImplementedError()

    async def deal(self, params):
        start_date = params.get("start_date")
        end_date = params.get("end_date")
        media_type = params.get("media_type", "video")
        sub_task = params.get("sub_task_name", self.CRAWLER_DETAIL_TASK_PLATFORM)
        column_dict = self.create_feishu_column_map()
        match media_type:
            case "video":
                crawler_detail = await self.analysis_video_pool(
                    sub_task, start_date, end_date
                )
            case "article":
                crawler_detail = await self.analysis_article_pool(
                    sub_task, start_date, end_date
                )
            case _:
                return None
        # Guard against an unknown sub-task (None) or an empty result set,
        # which would otherwise raise on crawler_detail[0] below
        if not crawler_detail:
            return None
        column_list = list(crawler_detail[0].keys())
        columns = [column_dict[key] for key in column_list]
        await feishu_robot.bot(
            title=f"[{start_date}, {end_date}) 抓取 {media_type} 统计",
            detail={
                "columns": columns,
                "rows": crawler_detail,
            },
            table=True,
            mention=False,
        )
        return None
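

# Usage sketch (illustrative only, not part of the original module): the pool
# argument is anything exposing an `async_fetch(query, params)` coroutine, as
# used by the queries above. `_StubPool` below fakes that interface with one
# canned row so the dispatch and Feishu table wiring can be exercised without
# a database; note that running this really posts to the Feishu bot.
if __name__ == "__main__":
    import asyncio

    class _StubPool:
        async def async_fetch(self, query, params):
            # Hypothetical canned row matching the platform-breakdown shape
            return [{"dt": "2024-01-01", "platform": "demo", "video_count": 1}]

    async def _demo():
        deal = CrawlerDetailDeal(pool=_StubPool(), trace_id="local-debug")
        await deal.deal(
            params={
                "start_date": "2024-01-01",
                "end_date": "2024-01-08",
                "media_type": "video",
                "sub_task_name": CrawlerDetailDeal.CRAWLER_DETAIL_TASK_PLATFORM,
            }
        )

    asyncio.run(_demo())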