@@ -1,11 +1,23 @@
 import time
+from typing import Any, Dict, Tuple, Callable
+
+from pydantic import BaseModel
+
 from applications.api import AsyncApolloApi
 from applications.utils import CrawlerMetaArticle
+from applications.utils import CrawlerMetaAccount


 class CrawlerPipeline(AsyncApolloApi):

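+    # Maps each media type to its pydantic model and destination table.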
+    MODEL_TABLE_MAP: Dict[str, Tuple[type[BaseModel], str]] = {
+        "article": (CrawlerMetaArticle, "crawler_meta_article"),
+        "account": (CrawlerMetaAccount, "crawler_candidate_account_pool"),
+        # Add new media types here as needed.
+    }
+

     def __init__(self, pool, log_client):
         super().__init__()
         self.pool = pool
@@ -23,64 +35,40 @@ class CrawlerPipeline(AsyncApolloApi):
         duplicated_id = await self.pool.async_fetch(query=query, params=(title,))
         return True if duplicated_id else False

-    async def save_article(self, article_item: dict) -> None:
-        """save articles into database"""
-        query = f"""
-            insert into crawler_meta_article
-            (platform, mode, category, out_account_id, article_index, title, link,
-            read_cnt, like_cnt, description, publish_time, crawler_time, score, status,
-            unique_index, source_article_title, source_account, title_sensitivity)
-            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s);
-        """
-        await self.pool.async_save(
-            query=query,
-            params=(
-                article_item.get("platform", "weixin"),
-                article_item.get("mode"),
-                article_item.get("category"),
-                article_item.get("out_account_id"),
-                article_item.get("article_index"),
-                article_item.get("title"),
-                article_item.get("link"),
-                article_item.get("read_cnt", 0),
-                article_item.get("like_cnt", 0),
-                article_item.get("description"),
-                article_item.get("publish_time"),
-                article_item.get("crawler_time", int(time.time())),
-                article_item.get("score"),
-                article_item.get("status", 1),
-                article_item.get("unique_index"),
-                article_item.get("source_article_title", None),
-                article_item.get("source_account", None),
-                article_item.get("title_sensitivity", 0),
-            ),
+    async def whether_account_exist(self, account_id: str, media_type: str) -> bool:
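+        """Return True when (account_id, media_type) already exists in the candidate pool."""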
+        query = "select id from crawler_candidate_account_pool where account_id = %s and media_type = %s;"
+        duplicated_id = await self.pool.async_fetch(
+            query=query, params=(account_id, media_type)
         )
+        return True if duplicated_id else False

-    async def save_article_v2(self, article_item: dict) -> None:
-        """save articles into database"""
-        new_article = CrawlerMetaArticle(**article_item)
-        new_article_dict = new_article.model_dump()
-        insert_template = (
-            """insert into crawler_meta_article ({columns}) values ({values});"""
-        )
-        insert_data = {k: v for k, v in new_article_dict.items() if v is not None}
-        columns = ", ".join(insert_data.keys())
-        values = ", ".join([f"%s" for i in range(len(insert_data))])
-        query = insert_template.format(columns=columns, values=values)
-        await self.pool.async_save(
-            query=query,
-            params=tuple(list(insert_data.values())),
-        )
+    async def save_single_record(self, media_type: str, item: dict) -> None:
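+        """Validate *item* with the model registered for *media_type*, then
+        insert only its non-None fields into the mapped table so that
+        database defaults apply to everything omitted."""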
+        try:
+            model_cls, table_name = self.MODEL_TABLE_MAP[media_type]
+        except KeyError:
+            raise ValueError(f"Unknown media type: {media_type!r}")

-    async def save_video(self, video_item: dict) -> str:
-        pass
+        record = model_cls(**item).model_dump(mode="python")
+        insert_data = {k: v for k, v in record.items() if v is not None}
+        if not insert_data:
+            raise ValueError("All fields are None, nothing to insert")
+
+        columns = ", ".join(f"`{col}`" for col in insert_data)
+        placeholders = ", ".join(["%s"] * len(insert_data))
+        sql = f"INSERT INTO `{table_name}` ({columns}) VALUES ({placeholders})"
+        await self.pool.async_save(sql, tuple(insert_data.values()))

     async def save_item_to_database(self, media_type: str, item: dict):
         """deal function"""
         match media_type:
             case "video":
-                await self.save_video(item)
-
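+                # NOTE: MODEL_TABLE_MAP has no "video" entry yet, so this call
+                # raises ValueError until a (model, table) pair is registered.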
+                await self.save_single_record(media_type, item)
             case "article":
                 log_data = {
                     "title": item["title"],
@@ -96,7 +84,6 @@ class CrawlerPipeline(AsyncApolloApi):
                         "code": 1001,
                     }
                 )
-                # Check whether the article title already exists
                 if await self.whether_article_title_duplicate(log_data["title"]):
                     await self.log_client.log(
                         contents={
@@ -107,7 +94,7 @@ class CrawlerPipeline(AsyncApolloApi):
                         }
                     )
                     return
-                # Check whether the title is sensitive
+
                 if await self.whether_title_sensitive(item["title"]):
                     await self.log_client.log(
                         contents={
@@ -118,8 +105,16 @@ class CrawlerPipeline(AsyncApolloApi):
                         }
                     )
                     item["title_sensitive"] = 1
-                # save article
-                await self.save_article_v2(item)
+
+                await self.save_single_record(media_type, item)
+
+            case "account":
+                if await self.whether_account_exist(
+                    item["account_id"], item["media_type"]
+                ):
+                    return
+
+                await self.save_single_record(media_type, item)

             case _:
                 raise Exception("Unknown media type")
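
Usage sketch for the unified save path (a minimal illustration, not part of the
diff: `pool` and `log_client` stand in for whatever connection pool and logger
the application already constructs, and any account fields beyond `account_id`
and `media_type` are hypothetical):

    import asyncio

    async def main() -> None:
        pipeline = CrawlerPipeline(pool, log_client)
        # Account items are deduplicated on (account_id, media_type) by
        # whether_account_exist before save_single_record inserts them.
        await pipeline.save_item_to_database(
            media_type="account",
            item={"account_id": "abc123", "media_type": "toutiao"},
        )

    asyncio.run(main())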