@@ -9,10 +9,12 @@ from typing import List, Dict
 from pymysql.cursors import DictCursor

 from applications import aiditApi
+from applications.api import ElasticSearchClient
 from applications.api import fetch_deepseek_completion
 from applications.api import similarity_between_title_list
 from applications.db import DatabaseConnector
 from config import long_articles_config, denet_config
+from config.es_mappings import index_name

 extract_keywords_prompt = """
 你是一名优秀的中文专家
@@ -36,6 +38,9 @@ class TopArticleGeneralize:
         self.denet_client = DatabaseConnector(denet_config)
         self.denet_client.connect()

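+        # in-house ES wrapper, assumed to bind this client to the article index from config.es_mappings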
+        self.elastic_search = ElasticSearchClient(index_=index_name)
+
     def fetch_distinct_top_titles(self) -> List[Dict]:
         """
         Fetch the article titles used in the top-100 generation plans
@@ -94,19 +99,53 @@ class TopArticleGeneralize:
         )
         return response["keys"]

+    def migrate_article_to_es(self, max_article_id: int = 0):
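+        """
+        Incrementally sync crawled articles (status = 1) into Elasticsearch:
+        fetch up to 10,000 rows with article_id > max_article_id and bulk-index
+        them, so repeated runs only pick up new rows.
+        """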
+        fetch_query = """
+            select article_id, platform, out_account_id, title
+            from crawler_meta_article
+            where status = 1 and article_id > %s
+            order by article_id limit 10000;
+        """
+        # run the query
+        results = self.long_articles_client.fetch(
+            fetch_query, cursor_type=DictCursor, params=(max_article_id,)
+        )
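+        # shape rows as bulk actions; assumes bulk_insert forwards these dicts
+        # in the standard elasticsearch helpers.bulk() _index/_id/_source format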
+        docs = [
+            {
+                "_index": index_name,
+                "_id": item["article_id"],
+                "_source": {
+                    "article_id": item["article_id"],
+                    "platform": item["platform"],
+                    "out_account_id": item["out_account_id"],
+                    "title": item["title"],
+                },
+            }
+            for item in results
+        ]
+        self.elastic_search.bulk_insert(docs)
+

 class TopArticleGeneralizeFromArticlePool(TopArticleGeneralize):

-    def get_candidate_articles(self, key):
+    def get_candidate_articles(self, article_id_tuple):
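+        # fetch candidates by id set; the parameterized IN clause replaces the
+        # old injection-prone LIKE '%{key}%' filter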
         fetch_query = f"""
             select article_id, title, link, llm_sensitivity, score, category_by_ai
             from crawler_meta_article
-            where status = 1
-            and title_sensitivity = 0
-            and title like '%{key}%';
+            where status = %s
+            and title_sensitivity = %s
+            and article_id in %s;
         """
         fetch_response = self.long_articles_client.fetch(
-            fetch_query, cursor_type=DictCursor
+            fetch_query, cursor_type=DictCursor, params=(1, 0, article_id_tuple)
         )
         return fetch_response

@@ -117,34 +156,45 @@ class TopArticleGeneralizeFromArticlePool(TopArticleGeneralize):
         :return:
         """
         update_sql = f"""
-                update crawler_meta_article
-                set status = %s
-                where article_id in %s and status = %s;
+            update crawler_meta_article
+            set status = %s
+            where article_id in %s and status = %s;
         """
         affect_rows = self.long_articles_client.save(
             query=update_sql, params=(2, tuple(article_id_list), 1)
         )

     def deal(self):
+        # migrate articles
+        max_id = self.elastic_search.get_max_article_id()
+        self.migrate_article_to_es(max_id)
+
+        # fetch titles
         title_obj_list = self.fetch_distinct_top_titles()
         publishing_article_list = []
         for title_obj in tqdm(title_obj_list):
             if self.get_title_read_info_detail(title_obj["title"]):
                 try:
-                    temp = []
                     keys = self.get_keys_by_ai(title_obj)
-                    for key in keys:
-                        candidate_articles = self.get_candidate_articles(key)
-                        temp += candidate_articles
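+                    # keyword recall via ES; assumes the wrapper treats the
+                    # comma-joined keys as one query string over titles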
+                    related_articles = self.elastic_search.search(
+                        search_keys=",".join(keys), size=50
+                    )
+                    if related_articles:
+                        article_id_list = [i["article_id"] for i in related_articles]
+                        article_list = self.get_candidate_articles(
+                            tuple(article_id_list)
+                        )

-                    if temp:
-                        title_list = [i["title"] for i in temp]
+                        title_list = [i["title"] for i in article_list]
                         # sort by relevance
                         similarity_array = similarity_between_title_list(
                             title_list, [title_obj["title"]]
                         )
+
                         response_with_similarity_list = []
-                        for index, item in enumerate(temp):
+                        for index, item in enumerate(article_list):
                             item["similarity"] = similarity_array[index][0]
                             response_with_similarity_list.append(item)

@@ -199,6 +249,7 @@ class TopArticleGeneralizeFromArticlePool(TopArticleGeneralize):
         article_id_list = [i["article_id"] for i in publishing_article_list]
         self.change_article_status_while_publishing(article_id_list=article_id_list)

+
 class TopArticleGeneralizeFromVideoPool(TopArticleGeneralize):
     def get_candidate_videos(self, key):
         fetch_query = f"""
@@ -223,8 +274,3 @@ class TopArticleGeneralizeFromVideoPool(TopArticleGeneralize):
             candidate_articles = self.get_candidate_videos(key)
             temp += candidate_articles
         print(json.dumps(temp, ensure_ascii=False, indent=4))
-
-#
-# TopArticleGeneralizeFromVideoPool().deal()
-
-