|
|
@@ -32,7 +32,7 @@ class CreateAdPlatformArticlesDecodeTask(DecodeTaskConst):
|
|
|
await self.log_service.log(
|
|
|
contents={
|
|
|
"article_id": article_id,
|
|
|
- "task": "create_decode_task",
|
|
|
+ "task": self.LogTaskKey.CREATE_SINGLE,
|
|
|
"status": "skip",
|
|
|
"message": "acquire lock failed",
|
|
|
}
|
|
|
@@ -50,7 +50,7 @@ class CreateAdPlatformArticlesDecodeTask(DecodeTaskConst):
|
|
|
await self.log_service.log(
|
|
|
contents={
|
|
|
"article_id": article_id,
|
|
|
- "task": "create_decode_task",
|
|
|
+ "task": self.LogTaskKey.CREATE_SINGLE,
|
|
|
"status": "fail",
|
|
|
"data": response,
|
|
|
}
|
|
|
@@ -68,7 +68,7 @@ class CreateAdPlatformArticlesDecodeTask(DecodeTaskConst):
|
|
|
await self.log_service.log(
|
|
|
contents={
|
|
|
"article_id": article_id,
|
|
|
- "task": "create_decode_task",
|
|
|
+ "task": self.LogTaskKey.CREATE_SINGLE,
|
|
|
"status": "fail",
|
|
|
"data": response,
|
|
|
}
|
|
|
@@ -79,7 +79,7 @@ class CreateAdPlatformArticlesDecodeTask(DecodeTaskConst):
|
|
|
await self.log_service.log(
|
|
|
contents={
|
|
|
"article_id": article_id,
|
|
|
- "task": "create_decode_task",
|
|
|
+ "task": self.LogTaskKey.CREATE_SINGLE,
|
|
|
"status": "success",
|
|
|
"data": response,
|
|
|
}
|
|
|
@@ -96,7 +96,7 @@ class CreateAdPlatformArticlesDecodeTask(DecodeTaskConst):
|
|
|
await self.log_service.log(
|
|
|
contents={
|
|
|
"article_id": article_id,
|
|
|
- "task": "record_decode_task",
|
|
|
+ "task": self.LogTaskKey.RECORD_QUEUE,
|
|
|
"status": "fail",
|
|
|
"message": "创建 decode 记录失败",
|
|
|
"data": response,
|
|
|
@@ -114,13 +114,15 @@ class CreateAdPlatformArticlesDecodeTask(DecodeTaskConst):
|
|
|
if not article_list:
|
|
|
await self.log_service.log(
|
|
|
contents={
|
|
|
- "task": "create_tasks",
|
|
|
+ "task": self.LogTaskKey.CREATE_BATCH,
|
|
|
"message": "No more articles to decode",
|
|
|
}
|
|
|
)
|
|
|
return
|
|
|
|
|
|
- for article in tqdm(article_list, desc="Creating decode tasks"):
|
|
|
+ for article in tqdm(
|
|
|
+ article_list, desc=self.AdPlatformDecodeBatch.TQDM_DESCRIPTION
|
|
|
+ ):
|
|
|
await self.create_single_decode_task(article)
|
|
|
|
|
|
async def deal(self):
|
|
|
@@ -128,11 +130,6 @@ class CreateAdPlatformArticlesDecodeTask(DecodeTaskConst):
|
|
|
|
|
|
|
|
|
class CreateInnerArticlesDecodeTask(DecodeTaskConst):
|
|
|
- CREATE_TASK_NAME = "create_inner_articles_decode_task"
|
|
|
- MAX_CREATE_RETRY_TIMES = 3
|
|
|
- LOCK_TIMEOUT_SECONDS = 30 * 60
|
|
|
- CREATE_MAX_CONCURRENCY = 5
|
|
|
-
|
|
|
def __init__(self, pool: DatabaseManager, log_service: LogService):
|
|
|
self.pool = pool
|
|
|
self.log_service = log_service
|
|
|
@@ -141,11 +138,13 @@ class CreateInnerArticlesDecodeTask(DecodeTaskConst):
|
|
|
|
|
|
async def _log_create_event(self, **contents):
|
|
|
await self.log_service.log(
|
|
|
- contents={"task": self.CREATE_TASK_NAME, **contents}
|
|
|
+ contents={"task": self.InnerDecodeCreate.SCHEDULER_TASK_NAME, **contents}
|
|
|
)
|
|
|
|
|
|
@staticmethod
|
|
|
- def _trim_error_message(message: str, limit: int = 500):
|
|
|
+ def _trim_error_message(message: str, limit: int | None = None):
|
|
|
+ if limit is None:
|
|
|
+ limit = DecodeTaskConst.InnerDecodeCreate.ERROR_MESSAGE_MAX_CHARS
|
|
|
if not message:
|
|
|
return ""
|
|
|
return message[:limit]
|
|
|
@@ -160,7 +159,9 @@ class CreateInnerArticlesDecodeTask(DecodeTaskConst):
|
|
|
):
|
|
|
now_ts = int(time.time())
|
|
|
retry_count = (state or {}).get("retry_count", 0)
|
|
|
- should_retry = retryable and retry_count < self.MAX_CREATE_RETRY_TIMES
|
|
|
+ should_retry = (
|
|
|
+ retryable and retry_count < self.InnerDecodeCreate.MAX_RETRY_TIMES
|
|
|
+ )
|
|
|
error_message = self._trim_error_message(error_message)
|
|
|
|
|
|
if should_retry:
|
|
|
@@ -202,27 +203,31 @@ class CreateInnerArticlesDecodeTask(DecodeTaskConst):
|
|
|
"title": article["title"],
|
|
|
"cover_img": article["cover_img_url"],
|
|
|
"channel_content_id": article.get("wx_sn", source_id),
|
|
|
- "content_type": self.ContentType.TITLE_COVER
|
|
|
+ "content_type": self.ContentType.TITLE_COVER,
|
|
|
}
|
|
|
|
|
|
case self.TaskType.SOURCE_IMAGES_TEXT:
|
|
|
- crawl_source_info = await self.mapper.fetch_article_crawler_source_info(source_id)
|
|
|
+ crawl_source_info = await self.mapper.fetch_article_crawler_source_info(
|
|
|
+ source_id
|
|
|
+ )
|
|
|
if not crawl_source_info:
|
|
|
raise ValueError("未找到文章抓取源信息")
|
|
|
|
|
|
crawl_info = crawl_source_info[0]
|
|
|
channel_content_id = crawl_info["channel_content_id"]
|
|
|
raw_body_text = crawl_info["body_text"]
|
|
|
- body_text, images = self.tool.extract_body_text_and_images(raw_body_text)
|
|
|
+ body_text, images = self.tool.extract_body_text_and_images(
|
|
|
+ raw_body_text
|
|
|
+ )
|
|
|
if not body_text and not images:
|
|
|
raise ValueError("文章正文和图片均为空,无法创建解构任务")
|
|
|
|
|
|
return {
|
|
|
"source_id": source_id,
|
|
|
"images": images,
|
|
|
- "body_text": body_text,
|
|
|
+ "body_text": raw_body_text,
|
|
|
"channel_content_id": channel_content_id or source_id,
|
|
|
- "content_type": self.ContentType.LONG_ARTICLE
|
|
|
+ "content_type": self.ContentType.LONG_ARTICLE,
|
|
|
}
|
|
|
|
|
|
case self.TaskType.MINI_TITLE_CARD:
|
|
|
@@ -235,15 +240,32 @@ class CreateInnerArticlesDecodeTask(DecodeTaskConst):
|
|
|
article = task["article"]
|
|
|
task_type = task["task_type"]
|
|
|
source_id = article["source_id"]
|
|
|
+ exist_task = await self.mapper.fetch_exist_source_id(source_id, task_type)
|
|
|
+ if exist_task:
|
|
|
+ await self.mapper.mark_create_success(
|
|
|
+ source_id=source_id,
|
|
|
+ task_type=task_type,
|
|
|
+ remote_task_id=self.InnerDecodeCreate.DUPLICATE_SKIP_REMOTE_TASK_ID,
|
|
|
+ now_ts=int(time.time()),
|
|
|
+ remark="任务已存在,跳过重复创建",
|
|
|
+ )
|
|
|
+ await self._log_create_event(
|
|
|
+ source_id=source_id,
|
|
|
+ task_type=task_type,
|
|
|
+ status="skip",
|
|
|
+ message="decode task already exists",
|
|
|
+ )
|
|
|
+ return
|
|
|
+
|
|
|
now_ts = int(time.time())
|
|
|
- lock_expire_before = now_ts - self.LOCK_TIMEOUT_SECONDS
|
|
|
+ lock_expire_before = now_ts - self.INNER_DECODE_LOCK_TIMEOUT_SECONDS
|
|
|
|
|
|
await self.mapper.init_create_state(source_id, task_type, now_ts)
|
|
|
acquire_lock = await self.mapper.acquire_create_lock(
|
|
|
source_id=source_id,
|
|
|
task_type=task_type,
|
|
|
now_ts=now_ts,
|
|
|
- max_retry_times=self.MAX_CREATE_RETRY_TIMES,
|
|
|
+ max_retry_times=self.InnerDecodeCreate.MAX_RETRY_TIMES,
|
|
|
lock_expire_before=lock_expire_before,
|
|
|
)
|
|
|
if not acquire_lock:
|
|
|
@@ -258,28 +280,11 @@ class CreateInnerArticlesDecodeTask(DecodeTaskConst):
|
|
|
state = await self.mapper.fetch_create_state(source_id, task_type)
|
|
|
|
|
|
try:
|
|
|
- # exist_task = await self.mapper.fetch_exist_source_id(source_id, task_type)
|
|
|
- # if exist_task:
|
|
|
- # await self.mapper.mark_create_success(
|
|
|
- # source_id=source_id,
|
|
|
- # task_type=task_type,
|
|
|
- # remote_task_id="existing_task",
|
|
|
- # now_ts=int(time.time()),
|
|
|
- # remark="任务已存在,跳过重复创建",
|
|
|
- # )
|
|
|
- # await self._log_create_event(
|
|
|
- # source_id=source_id,
|
|
|
- # task_type=task_type,
|
|
|
- # status="skip",
|
|
|
- # message="decode task already exists",
|
|
|
- # )
|
|
|
- # return
|
|
|
-
|
|
|
payload = await self._build_payload(article, task_type)
|
|
|
|
|
|
response = await self.tool.create_decode_task_with_retry(
|
|
|
payload=payload,
|
|
|
- retry_times=self.MAX_CREATE_RETRY_TIMES,
|
|
|
+ retry_times=self.InnerDecodeCreate.MAX_RETRY_TIMES,
|
|
|
)
|
|
|
response_code = response.get("code")
|
|
|
if response_code != self.RequestDecode.SUCCESS:
|
|
|
@@ -313,7 +318,10 @@ class CreateInnerArticlesDecodeTask(DecodeTaskConst):
|
|
|
payload=json.dumps(payload, ensure_ascii=False),
|
|
|
remark=remark,
|
|
|
)
|
|
|
- if record_row not in (0, 1):
|
|
|
+ if record_row not in (
|
|
|
+ self.InnerDecodeCreate.INSERT_IGNORE_AFFECTED_NOOP,
|
|
|
+ self.InnerDecodeCreate.INSERT_IGNORE_AFFECTED_INSERTED,
|
|
|
+ ):
|
|
|
await self._mark_retry_or_failed(
|
|
|
source_id=source_id,
|
|
|
task_type=task_type,
|
|
|
@@ -363,7 +371,7 @@ class CreateInnerArticlesDecodeTask(DecodeTaskConst):
|
|
|
)
|
|
|
|
|
|
async def create_tasks(self, date_string: str = None, max_concurrency: int = None):
|
|
|
- article_list = await self.mapper.fetch_inner_articles(date_string or "20260401")
|
|
|
+ article_list = await self.mapper.fetch_inner_articles(date_string)
|
|
|
if not article_list:
|
|
|
await self._log_create_event(
|
|
|
status="empty",
|
|
|
@@ -384,10 +392,11 @@ class CreateInnerArticlesDecodeTask(DecodeTaskConst):
|
|
|
result = await run_tasks_with_asyncio_task_group(
|
|
|
task_list=task_list,
|
|
|
handler=self.create_single_decode_task,
|
|
|
- description="Creating inner decode tasks",
|
|
|
- unit="task",
|
|
|
- max_concurrency=max_concurrency or self.CREATE_MAX_CONCURRENCY,
|
|
|
- fail_fast=False,
|
|
|
+ description=self.InnerDecodeCreate.ASYNC_BATCH_DESCRIPTION,
|
|
|
+ unit=self.InnerDecodeCreate.ASYNC_BATCH_UNIT,
|
|
|
+ max_concurrency=max_concurrency
|
|
|
+ or self.InnerDecodeCreate.DEFAULT_MAX_CONCURRENCY,
|
|
|
+ fail_fast=self.InnerDecodeCreate.ASYNC_BATCH_FAIL_FAST,
|
|
|
)
|
|
|
if result["errors"]:
|
|
|
await self._log_create_event(
|