@@ -25,7 +25,8 @@ class ArticlePoolColdStartConst:
     INIT_STATUS = 1
     BAD_STATUS = 0
     READ_TIMES_THRESHOLD = 1.3
-    READ_THRESHOLD = 5000
+    # READ_THRESHOLD = 5000
+    READ_THRESHOLD = 1000

     TITLE_LENGTH_LIMIT = 15
     TITLE_LENGTH_MAX = 50
@@ -33,43 +34,12 @@ class ArticlePoolColdStartConst:
     DEFAULT_CRAWLER_METHODS = ["1030-手动挑号", "account_association"]


-class ArticlePoolColdStart(ArticlePoolColdStartConst):
+class ArticlePoolColdStartStrategy(ArticlePoolColdStartConst):
     def __init__(self, pool, log_client, trace_id):
         self.pool = pool
         self.log_client = log_client
         self.trace_id = trace_id

-    async def get_article_from_meta_table(
-        self, platform: str, crawl_method: str, strategy: str
-    ) -> DataFrame:
-        """
-        @param platform: article crawl platform
-        @param crawl_method: article crawl method
-        @param strategy: supply strategy
-        """
-        match platform:
-            case "weixin":
-                article_list = await self.get_weixin_cold_start_articles(
-                    crawl_method, strategy
-                )
-            case "toutiao":
-                article_list = await self.get_toutiao_cold_start_articles(
-                    crawl_method, strategy
-                )
-            case _:
-                raise ValueError("Invalid platform")
-        return DataFrame(
-            article_list,
-            columns=[
-                "article_id",
-                "title",
-                "link",
-                "llm_sensitivity",
-                "score",
-                "category_by_ai",
-            ],
-        )
-
     async def get_weixin_cold_start_articles(
         self, crawl_method: str, strategy: str
     ) -> List[Dict]:
@@ -124,20 +94,8 @@ class ArticlePoolColdStart(ArticlePoolColdStartConst):
             case _:
                 raise ValueError("Invalid strategy")

-    async def filter_published_titles(self, plan_id):
-        """
-        Filter out titles that have already been added to the aigc system
-        """
-        published_title_tuple = await get_titles_from_produce_plan(self.pool, plan_id)
-        update_query = f"""
-            update crawler_meta_article set status = %s where title in %s and status = %s;
-        """
-        changed_rows = await self.pool.async_save(
-            query=update_query,
-            params=(self.PUBLISHED_STATUS, published_title_tuple, self.INIT_STATUS),
-        )
-        return changed_rows

+class ArticlePoolFilterStrategy(ArticlePoolColdStartConst):
     async def filter_weixin_articles(self, dataframe, crawl_method):
         """WeChat filtering funnel"""
         total_length: int = dataframe.shape[0]
@@ -215,6 +173,56 @@ class ArticlePoolColdStart(ArticlePoolColdStartConst):
         )
         return filter_df[: self.DAILY_ARTICLE_NUM]

+
+class ArticlePoolColdStart(ArticlePoolColdStartStrategy, ArticlePoolFilterStrategy):
+    def __init__(self, pool, log_client, trace_id):
+        super().__init__(pool, log_client, trace_id)
+
+    async def get_article_from_meta_table(
+        self, platform: str, crawl_method: str, strategy: str
+    ) -> DataFrame:
+        """
+        @param platform: article crawl platform
+        @param crawl_method: article crawl method
+        @param strategy: supply strategy
+        """
+        match platform:
+            case "weixin":
+                article_list = await self.get_weixin_cold_start_articles(
+                    crawl_method, strategy
+                )
+            case "toutiao":
+                article_list = await self.get_toutiao_cold_start_articles(
+                    crawl_method, strategy
+                )
+            case _:
+                raise ValueError("Invalid platform")
+        return DataFrame(
+            article_list,
+            columns=[
+                "article_id",
+                "title",
+                "link",
+                "llm_sensitivity",
+                "score",
+                "category_by_ai",
+            ],
+        )
+
+    async def filter_published_titles(self, plan_id):
+        """
+        Filter out titles that have already been added to the aigc system
+        """
+        published_title_tuple = await get_titles_from_produce_plan(self.pool, plan_id)
+        update_query = f"""
+            update crawler_meta_article set status = %s where title in %s and status = %s;
+        """
+        changed_rows = await self.pool.async_save(
+            query=update_query,
+            params=(self.PUBLISHED_STATUS, published_title_tuple, self.INIT_STATUS),
+        )
+        return changed_rows
+
     async def insert_crawler_plan_into_database(
         self, crawler_plan_id, crawler_plan_name, create_timestamp
     ):
@@ -297,7 +305,7 @@ class ArticlePoolColdStart(ArticlePoolColdStartConst):
             # create_crawler_plan
             crawler_plan_response = await auto_create_crawler_task(
                 plan_id=None,
-                plan_name="自动绑定-{}-{}-{}--{}".format(
+                plan_name="自动绑定strategyV2-{}-{}-{}--{}".format(
                     crawl_method,
                     ai_category,
                     datetime.date.today().__str__(),