|
@@ -93,7 +93,6 @@ class ArticlePoolFilterStrategy(ArticlePoolColdStartConst):
|
|
|
}
|
|
|
)
|
|
|
|
|
|
-
|
|
|
case _:
|
|
|
raise ValueError("Invalid strategy")
|
|
|
|
|
@@ -105,16 +104,31 @@ class ArticlePoolFilterStrategy(ArticlePoolColdStartConst):
|
|
|
total_length = dataframe.shape[0]
|
|
|
filter_df = dataframe[dataframe["score"] > self.SIMILARITY_SCORE_THRESHOLD]
|
|
|
|
|
|
- await feishu_robot.bot(
|
|
|
- title="冷启动创建抓取计划",
|
|
|
- detail={
|
|
|
- "渠道": crawl_method,
|
|
|
- "总文章数量": total_length,
|
|
|
- "相关性分数过滤剩余": filter_df.shape[0],
|
|
|
- },
|
|
|
- mention=False,
|
|
|
- )
|
|
|
- return filter_df[: self.DAILY_ARTICLE_NUM]
|
|
|
+ match strategy:
|
|
|
+ case "strategy_v1":
|
|
|
+ daily_article_num = self.DAILY_ARTICLE_NUM
|
|
|
+ await feishu_robot.bot(
|
|
|
+ title="冷启动创建抓取计划",
|
|
|
+ detail={
|
|
|
+ "渠道": crawl_method,
|
|
|
+ "总文章数量": total_length,
|
|
|
+ "相关性分数过滤剩余": filter_df.shape[0],
|
|
|
+ },
|
|
|
+ mention=False,
|
|
|
+ )
|
|
|
+ case "strategy_v2":
|
|
|
+ daily_article_num = self.DAILY_CATEGORY_ARTICLE_NUM
|
|
|
+ self.cold_start_records.append(
|
|
|
+ {
|
|
|
+ "category": category,
|
|
|
+ "cold_start_num": min(daily_article_num, len(filter_df))
|
|
|
+ }
|
|
|
+ )
|
|
|
+
|
|
|
+ case _:
|
|
|
+ raise ValueError("Invalid strategy")
|
|
|
+
|
|
|
+ return filter_df.head(daily_article_num)
|
|
|
|
|
|
async def article_pool_filter(
|
|
|
self, strategy, platform, dataframe, crawl_method, category
|