@@ -1,7 +1,7 @@
import time
import traceback

-from tqdm import tqdm
+from typing import Optional, List, Dict, Tuple

from applications.api import fetch_deepseek_completion
from applications.utils import yield_batch
@@ -48,7 +48,7 @@ class TitleProcess(Const):
        self.trace_id = trace_id

    @staticmethod
-    def generate_title_rewrite_prompt(ori_title):
+    def generate_title_rewrite_prompt(ori_title: str) -> str:
        """
        生成prompt
        """
@@ -131,7 +131,7 @@ class TitleProcess(Const):
        return prompt

    @staticmethod
-    def category_generation_from_title(title_list):
+    def category_generation_from_title(title_list: List[Tuple[str, str]]) -> str:
        """
        generate prompt category for given title
        """
@@ -252,22 +252,24 @@ class TitleProcess(Const):
        """
        return prompt

-    async def _roll_back_lock_tasks(self, table_name):
+    async def _roll_back_lock_tasks(self, table_name: str) -> int:
        query = f"""
            update {table_name}
            set category_status = %s
            where category_status = %s and category_status_update_ts <= %s;
        """
-        await self.pool.async_save(
+        return await self.pool.async_save(
            query=query,
            params=(
                self.INIT_STATUS,
                self.PROCESSING_STATUS,
                int(time.time()) - self.MAX_PROCESSING_TIME,
-            )
+            ),
        )

-    async def process_single_article(self, content_type, article):
+    async def process_single_article(
+        self, content_type: str, article: Dict
+    ) -> Optional[Dict]:
        match content_type:
            case "video":
                article_id = article["id"]
@@ -291,158 +293,13 @@ class TitleProcess(Const):
                contents={
                    "trace_id": self.trace_id,
                    "data": {
-                        "article_id": article_id,
-                        "error": str(e),
-                        "traceback": traceback.format_exc(),
-                    }
-                }
-            )
-            return None
-
-
-class TitleRewrite(TitleProcess):
-
-    async def roll_back_blocked_tasks(self):
-        """
-        rollback blocked tasks
-        """
-        query = f"""
-            select id, title_rewrite_status_update_timestamp
-            from publish_single_video_source
-            where title_rewrite_status = {self.TITLE_REWRITE_LOCK_STATUS};
-        """
-        article_list = await self.pool.async_fetch(
-            query=query,
-            db_name="long_articles",
-        )
-        if article_list:
-            blocked_id_list = [
-                i["id"]
-                for i in article_list
-                if (int(time.time()) - i["title_rewrite_status_update_timestamp"])
-                > self.TITLE_REWRITE_LOCK_TIME
-            ]
-            if blocked_id_list:
-                update_query = f"""
-                    update publish_single_video_source
-                    set title_rewrite_status = %s
-                    where id in %s and title_rewrite_status = %s;
-                """
-                await self.pool.async_save(
-                    query=update_query,
-                    params=(
-                        self.TITLE_REWRITE_INIT_STATUS,
-                        tuple(blocked_id_list),
-                        self.TITLE_REWRITE_LOCK_STATUS,
-                    ),
-                )
-
-    async def get_articles_batch(self, batch_size=1000):
-        query = f"""
-            select content_trace_id, article_title
-            from publish_single_video_source
-            where bad_status = {self.ARTICLE_POSITIVE_STATUS}
-            and audit_status = {self.ARTICLE_AUDIT_PASSED_STATUS}
-            and title_rewrite_status = {self.TITLE_REWRITE_INIT_STATUS}
-            and platform in ('hksp', 'sph')
-            limit {batch_size};
-        """
-        return await self.pool.async_fetch(query=query, db_name="long_articles")
-
-    async def update_title_rewrite_status(
-        self, content_trace_id, ori_status, new_status
-    ):
-        query = f"""
-            update publish_single_video_source
-            set title_rewrite_status = %s, title_rewrite_status_update_timestamp = %s
-            where content_trace_id = %s and title_rewrite_status= %s;
-        """
-        affected_rows = await self.pool.async_save(
-            query=query,
-            params=(new_status, int(time.time()), content_trace_id, ori_status),
-        )
-        return affected_rows
-
-    async def insert_into_rewrite_table(self, content_trace_id, new_title):
-        """
-        insert into rewrite_table
-        """
-        insert_sql = f"""
-            insert into video_title_rewrite
-            (content_trace_id, new_title, status, prompt_version)
-            values (%s, %s, %s, %s);
-        """
-        await self.pool.async_save(
-            query=insert_sql,
-            params=(
-                content_trace_id,
-                new_title,
-                self.TITLE_USEFUL_STATUS,
-                self.PROMPT_VERSION,
-            ),
-        )
-
-    async def rewrite_each_article(self, article):
-        """
-        rewrite each article
-        """
-        content_trace_id = article["content_trace_id"]
-        article_title = article["article_title"]
-
-        # lock each task
-        affected_rows = await self.update_title_rewrite_status(
-            content_trace_id=content_trace_id,
-            ori_status=self.TITLE_REWRITE_INIT_STATUS,
-            new_status=self.TITLE_REWRITE_LOCK_STATUS,
-        )
-        if not affected_rows:
-            return
-
-        try:
-            prompt = self.generate_title_rewrite_prompt(article_title)
-            new_title = fetch_deepseek_completion(model="default", prompt=prompt)
-
-            # insert into rewrite table
-            await self.insert_into_rewrite_table(
-                content_trace_id=content_trace_id, new_title=new_title
-            )
-
-            # unlock
-            await self.update_title_rewrite_status(
-                content_trace_id=content_trace_id,
-                ori_status=self.TITLE_REWRITE_LOCK_STATUS,
-                new_status=self.TITLE_REWRITE_SUCCESS_STATUS,
-            )
-        except Exception as e:
-            await self.aliyun_log.log(
-                contents={
-                    "task": "title rewrite task",
-                    "function": "rewrite_each_article",
-                    "message": content_trace_id,
-                    "status": "fail",
-                    "data": {
-                        "error_message": str(e),
-                        "error_type": type(e).__name__,
+                        "article_id": article_id,
+                        "error": str(e),
                        "traceback": traceback.format_exc(),
                    },
                }
            )
-            await self.update_title_rewrite_status(
-                content_trace_id=content_trace_id,
-                ori_status=self.TITLE_REWRITE_LOCK_STATUS,
-                new_status=self.TITLE_REWRITE_FAIL_STATUS,
-            )
-
-    async def deal(self):
-        """title rewrite task deal"""
-        await self.roll_back_blocked_tasks()
-
-        task_list = await self.get_articles_batch()
-
-        bar = tqdm(task_list, desc="title rewrite task")
-        for article in bar:
-            await self.rewrite_each_article(article)
-            bar.set_description("title rewrite task")
+            return None


class VideoPoolCategoryGeneration:
@@ -473,16 +330,23 @@ class ArticlePoolCategoryGeneration(TitleProcess):

    async def get_task_list(self, limit):
        query = f"""
-            select article_id, title from long_articles.crawler_meta_article
+            select article_id, title from crawler_meta_article
            where category_status = %s and status = %s and score > %s
            order by score desc limit %s;
        """
        return await self.pool.async_fetch(
            query=query,
-            params=(self.INIT_STATUS, self.ARTICLE_INIT_STATUS, self.LIMIT_SCORE, limit),
+            params=(
+                self.INIT_STATUS,
+                self.ARTICLE_INIT_STATUS,
+                self.LIMIT_SCORE,
+                limit,
+            ),
        )

-    async def set_category_status_as_success(self,article_id: int, category: str) -> int:
+    async def set_category_status_as_success(
+        self, article_id: int, category: str
+    ) -> int:
        update_query = f"""
            update long_articles.crawler_meta_article
            set category_by_ai = %s, category_status = %s, category_status_update_ts = %s
@@ -546,17 +410,21 @@ class ArticlePoolCategoryGeneration(TitleProcess):
                        "article_id": id_tuple,
                        "error": str(e),
                        "traceback": traceback.format_exc(),
-                    }
+                    },
                }
            )
-        for article in tqdm(title_batch):
-            single_completion = await self.process_single_article(content_type="article", article=article)
-            article_id = article[0]
+        for article in task_batch:
+            single_completion = await self.process_single_article(
+                content_type="article", article=article
+            )
+            article_id = article["article_id"]
            if single_completion:
                category = single_completion.get(str(article_id))
                if category:
                    # set as success
-                    await self.set_category_status_as_success(article_id, category)
+                    await self.set_category_status_as_success(
+                        article_id, category
+                    )
                else:
                    await self.set_category_status_as_fail(article_id)
            else:
@@ -567,19 +435,28 @@ class ArticlePoolCategoryGeneration(TitleProcess):
                return

    async def deal(self, limit):
-        await self._roll_back_lock_tasks(table_name="crawler_meta_article")
+        # await self._roll_back_lock_tasks(table_name="crawler_meta_article")

        if not limit:
            limit = self.PROCESS_NUM

        task_list = await self.get_task_list(limit=limit)
+        print(task_list)
+        await self.aliyun_log.log(
+            contents={
+                "task": "ArticlePoolCategoryGeneration",
+                "function": "deal",
+                "trace_id": self.trace_id,
+                "message": f"总共获取{len(task_list)}条文章",
+            }
+        )
        task_batch_list = yield_batch(data=task_list, batch_size=self.BATCH_SIZE)
        batch_index = 0
        for task_batch in task_batch_list:
            batch_index += 1
            try:
                await self.process_each_batch(task_batch)
-
+                print(f"batch :{batch_index} 处理成功")
            except Exception as e:
                await self.aliyun_log.log(
                    contents={
@@ -591,18 +468,149 @@ class ArticlePoolCategoryGeneration(TitleProcess):
                        "data": {
                            "error": str(e),
                            "traceback": traceback.format_exc(),
-                        }
+                        },
                    }
                )


+class TitleRewrite(TitleProcess):
+
+    async def roll_back_blocked_tasks(self):
+        """
+        rollback blocked tasks
+        """
+        query = f"""
+            select id, title_rewrite_status_update_timestamp
+            from publish_single_video_source
+            where title_rewrite_status = {self.TITLE_REWRITE_LOCK_STATUS};
+        """
+        article_list = await self.pool.async_fetch(
+            query=query,
+            db_name="long_articles",
+        )
+        if article_list:
+            blocked_id_list = [
+                i["id"]
+                for i in article_list
+                if (int(time.time()) - i["title_rewrite_status_update_timestamp"])
+                > self.TITLE_REWRITE_LOCK_TIME
+            ]
+            if blocked_id_list:
+                update_query = f"""
+                    update publish_single_video_source
+                    set title_rewrite_status = %s
+                    where id in %s and title_rewrite_status = %s;
+                """
+                await self.pool.async_save(
+                    query=update_query,
+                    params=(
+                        self.TITLE_REWRITE_INIT_STATUS,
+                        tuple(blocked_id_list),
+                        self.TITLE_REWRITE_LOCK_STATUS,
+                    ),
+                )
+
+    async def get_articles_batch(self, batch_size=1000):
+        query = f"""
+            select content_trace_id, article_title
+            from publish_single_video_source
+            where bad_status = {self.ARTICLE_POSITIVE_STATUS}
+            and audit_status = {self.ARTICLE_AUDIT_PASSED_STATUS}
+            and title_rewrite_status = {self.TITLE_REWRITE_INIT_STATUS}
+            and platform in ('hksp', 'sph')
+            limit {batch_size};
+        """
+        return await self.pool.async_fetch(query=query, db_name="long_articles")
+
+    async def update_title_rewrite_status(
+        self, content_trace_id, ori_status, new_status
+    ):
+        query = f"""
+            update publish_single_video_source
+            set title_rewrite_status = %s, title_rewrite_status_update_timestamp = %s
+            where content_trace_id = %s and title_rewrite_status= %s;
+        """
+        affected_rows = await self.pool.async_save(
+            query=query,
+            params=(new_status, int(time.time()), content_trace_id, ori_status),
+        )
+        return affected_rows
+
+    async def insert_into_rewrite_table(self, content_trace_id, new_title):
+        """
+        insert into rewrite_table
+        """
+        insert_sql = f"""
+            insert into video_title_rewrite
+            (content_trace_id, new_title, status, prompt_version)
+            values (%s, %s, %s, %s);
+        """
+        await self.pool.async_save(
+            query=insert_sql,
+            params=(
+                content_trace_id,
+                new_title,
+                self.TITLE_USEFUL_STATUS,
+                self.PROMPT_VERSION,
+            ),
+        )
+
+    async def rewrite_each_article(self, article):
+        """
+        rewrite each article
+        """
+        content_trace_id = article["content_trace_id"]
+        article_title = article["article_title"]
+
+        # lock each task
+        affected_rows = await self.update_title_rewrite_status(
+            content_trace_id=content_trace_id,
+            ori_status=self.TITLE_REWRITE_INIT_STATUS,
+            new_status=self.TITLE_REWRITE_LOCK_STATUS,
+        )
+        if not affected_rows:
+            return
+
+        try:
+            prompt = self.generate_title_rewrite_prompt(article_title)
+            new_title = fetch_deepseek_completion(model="default", prompt=prompt)
+
+            # insert into rewrite table
+            await self.insert_into_rewrite_table(
+                content_trace_id=content_trace_id, new_title=new_title
+            )
+
+            # unlock
+            await self.update_title_rewrite_status(
+                content_trace_id=content_trace_id,
+                ori_status=self.TITLE_REWRITE_LOCK_STATUS,
+                new_status=self.TITLE_REWRITE_SUCCESS_STATUS,
+            )
+        except Exception as e:
+            await self.aliyun_log.log(
+                contents={
+                    "task": "title rewrite task",
+                    "function": "rewrite_each_article",
+                    "message": content_trace_id,
+                    "status": "fail",
+                    "data": {
+                        "error_message": str(e),
+                        "error_type": type(e).__name__,
+                        "traceback": traceback.format_exc(),
+                    },
+                }
+            )
+            await self.update_title_rewrite_status(
+                content_trace_id=content_trace_id,
+                ori_status=self.TITLE_REWRITE_LOCK_STATUS,
+                new_status=self.TITLE_REWRITE_FAIL_STATUS,
+            )
+
+    async def deal(self):
+        """title rewrite task deal"""
+        await self.roll_back_blocked_tasks()
+
+        task_list = await self.get_articles_batch()
+
+        for article in task_list:
+            await self.rewrite_each_article(article)