Merge branch 'master-GPU' of Server/LongArticleAlgServer into master

luojunhui 4 months ago
Parent
Current commit
b062b0e110

+ 11 - 0
Dockerfile

@@ -0,0 +1,11 @@
+FROM registry.cn-hangzhou.aliyuncs.com/stuuudy/cyber-crawler-base:latest
+
+WORKDIR /LongArticleAlgServer
+
+COPY . .
+
+ENV TZ=Asia/Shanghai
+
+RUN pip install -r requirements.txt -i https://mirrors.163.com/pypi/simple/ --no-cache-dir
+
+CMD ["hypercorn", "alg_app:app", "--config", "alg.toml"]

+ 1 - 1
alg.toml

@@ -1,6 +1,6 @@
 reload = true
 bind = "0.0.0.0:6060"
-workers = 4
+workers = 3
 keep_alive_timeout = 120  # max seconds to keep an idle connection open; adjust as needed
 graceful_timeout = 30    # time to wait for in-flight work before restart or stop
 loglevel = "debug"  # log level

+ 4 - 3
alg_app.py

@@ -5,11 +5,11 @@ from quart import Quart
 from similarities import BertSimilarity
 from routes import AlgRoutes
 from applications import AsyncMySQLClient
+from applications.embedding_manager import EmbeddingManager
 
 app = Quart(__name__)
 AsyncMySQL = AsyncMySQLClient(app)
 
-
 @app.before_serving
 async def init():
     """
@@ -17,8 +17,9 @@ async def init():
     """
     await AsyncMySQL.init_pool()
     model = BertSimilarity(model_name_or_path="BAAI/bge-large-zh-v1.5")
+    embedding_manager = EmbeddingManager(model)
     print("模型加载成功")
-    app_routes = AlgRoutes(AsyncMySQL, model)
+    app_routes = AlgRoutes(AsyncMySQL, model, embedding_manager)
     app.register_blueprint(app_routes)
 
 
@@ -32,4 +33,4 @@ async def close_db():
 
 
 if __name__ == '__main__':
-    app.run(debug=True, host="0.0.0.0", port=6060)
+    app.run()

+ 107 - 10
applications/articleTools.py

@@ -5,6 +5,7 @@
 import asyncio
 import aiomysql
 from pandas import DataFrame
+from datetime import datetime
 
 
 class TaskMySQLClient(object):
@@ -64,6 +65,27 @@ class TaskMySQLClient(object):
                 await coon.commit()
 
 
+class AccountAvgInfo:
+    def __init__(self, gh_id, position, update_time, account_name, fans, read_avg, like_avg, status,
+                 account_type, account_mode, account_source, account_status, business_type, read_rate_avg):
+        self.gh_id = gh_id
+        self.position = position
+        self.update_time = update_time
+        self.account_name = account_name
+        self.fans = fans
+        self.read_avg = read_avg
+        self.like_avg = like_avg
+        self.status = status
+        self.account_type = account_type
+        self.account_mode = account_mode
+        self.account_source = account_source
+        self.account_status = account_status
+        self.business_type = business_type
+        self.read_rate_avg = read_rate_avg
+
+    def __repr__(self):
+        return f"<AccountAvgInfo {self.account_name}>"
+
 class ArticleDBTools(object):
     """
     Long-article database utilities
@@ -76,7 +98,58 @@ class ArticleDBTools(object):
         """
         self.mysql_client = mysql_client
 
-    async def getSingleAccountArticles(self, account_name):
+    async def getAccountAvgInfo(self, gh_id):
+        """
+        Fetch historical average stats for a single account (account_avg_info_v3)
+        """
+        keys = [
+            "gh_id",
+            "position",
+            "update_time",
+            "account_name",
+            "fans",
+            "read_avg",
+            "like_avg",
+            "status",
+            "account_type",
+            "account_mode",
+            "account_source",
+            "account_status",
+            "business_type",
+            "read_rate_avg"
+        ]
+        sql = f"""
+            SELECT {", ".join(keys)}
+            FROM account_avg_info_v3
+            WHERE gh_id = '{gh_id}'
+            and position = 1;"""
+        result = await self.mysql_client.async_select(sql=sql)
+        account_avg_info_list = [AccountAvgInfo(*row) for row in result] if result else []
+
+        return account_avg_info_list
+
+    async def get_account_avg_info(self, account_avg_info_map, available_dates, timestamp):
+        target_date = datetime.fromtimestamp(timestamp).date()
+        # Try the exact date first
+        info = account_avg_info_map.get(target_date.isoformat())
+        if info is not None:
+            return info
+        # No exact match: fall back to the closest available date
+        closest_date = None
+        for date in available_dates:
+            if closest_date is None:
+                closest_date = date
+                continue
+            days = abs((datetime.fromisoformat(date).date() - target_date).days)
+            closest_days = abs((datetime.fromisoformat(closest_date).date() - target_date).days)
+            if days < closest_days:
+                closest_date = date
+            elif days > closest_days:
+                break
+
+        return account_avg_info_map.get(closest_date) if closest_date else None
+
+    async def getSingleAccountArticles(self, gh_id):
         """
         Fetch all historical articles of a single account
         :param gh_id:
@@ -95,21 +168,22 @@ class ArticleDBTools(object):
         ]
         sql = f"""
             SELECT {", ".join(keys)}
-            FROM official_articles
-            WHERE accountName = '{account_name}';"""
+            FROM official_articles_v2
+            WHERE ghId = '{gh_id}';"""
         result = await self.mysql_client.async_select(sql=sql)
         return DataFrame(result, columns=keys)
 
     async def getArticleByFilter(
             self,
-            account_name,
+            gh_id,
+            view_count_filter=None,
             index_list=None,
             min_time=None,
             max_time=None,
             msg_type=None,
     ):
         """
-        :param account_name:
+        :param gh_id:
         :param index_list: index ranges from 1 to 8
         :param min_time: earliest time
         :param max_time: latest time
@@ -125,18 +199,21 @@ class ArticleDBTools(object):
         if not max_time:
             # year 2099, i.e. effectively no upper bound
             max_time = 4088051123
-        articleDataFrame = await self.getSingleAccountArticles(account_name=account_name)
+        articleDataFrame = await self.getSingleAccountArticles(gh_id=gh_id)
         filterDataFrame = articleDataFrame[
             (articleDataFrame["Type"] == msg_type)
             & (min_time < articleDataFrame["updateTime"])
             & (articleDataFrame["updateTime"] < max_time)
             & (articleDataFrame["ItemIndex"].isin(index_list))
             ]
+        if view_count_filter:
+            filterDataFrame = filterDataFrame[(filterDataFrame["show_view_count"] > view_count_filter)]
         return filterDataFrame
 
     async def get_good_bad_articles(self,
-                                    account_name,
-                                    method,
+                                    gh_id,
+                                    interest_type,
+                                    view_count_filter,
                                     rate=0.1,
                                     index_list=None,
                                     min_time=None,
@@ -148,7 +225,8 @@ class ArticleDBTools(object):
         :return:
         """
         article_data_frame = await self.getArticleByFilter(
-            account_name=account_name,
+            gh_id=gh_id,
+            view_count_filter=view_count_filter,
             index_list=index_list,
             min_time=min_time,
             max_time=max_time,
@@ -156,7 +234,7 @@ class ArticleDBTools(object):
         )
         df_rows = len(article_data_frame)
         if df_rows > 0:
-            match method:
+            match interest_type:
                 case "top":
                     sorted_df = article_data_frame.sort_values(by='show_view_count', ascending=False)
                     topn = max(int(df_rows * rate), 1)
@@ -168,6 +246,25 @@ class ArticleDBTools(object):
                     good_df = article_data_frame[(article_data_frame['show_view_count']) > avg_view * (1.0 + rate)]
                     bad_df = article_data_frame[(article_data_frame['show_view_count']) < avg_view * (1.0 - rate)]
                     return good_df, bad_df
+                case "account_avg":
+                    account_read_avg_list = await self.getAccountAvgInfo(
+                        gh_id=gh_id
+                    )
+                    account_avg_info_map = {info.update_time: info for info in account_read_avg_list}
+                    # Collect every available date and sort them
+                    available_dates = sorted(account_avg_info_map.keys())
+                    view_count_avg_list = []
+                    for index, row in article_data_frame.iterrows():
+                        update_time = row['updateTime']
+                        info = await self.get_account_avg_info(account_avg_info_map, available_dates, update_time)
+                        view_count_avg_list.append(info.read_avg)
+
+                    article_data_frame['view_count_avg'] = view_count_avg_list
+                    good_df = article_data_frame[(article_data_frame['show_view_count']) >
+                                                 (article_data_frame['view_count_avg']) * (1.0 + rate)]
+                    bad_df = article_data_frame[(article_data_frame['show_view_count']) <
+                                                (article_data_frame['view_count_avg']) * (1.0 - rate)]
+                    return good_df, bad_df
         else:
             return None, None
 
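A quick way to sanity-check the nearest-date fallback in get_account_avg_info is with toy data; a minimal standalone sketch of the same logic (the dates and read_avg values below are invented):

from datetime import datetime

avg_info_map = {
    "2024-06-01": {"read_avg": 1200},
    "2024-06-15": {"read_avg": 1500},
}
available_dates = sorted(avg_info_map.keys())

def closest_avg_info(timestamp):
    target = datetime.fromtimestamp(timestamp).date()
    info = avg_info_map.get(target.isoformat())
    if info is not None:
        return info  # exact hit
    # Otherwise pick the date with the smallest absolute distance in days
    closest = min(available_dates,
                  key=lambda d: abs((datetime.fromisoformat(d).date() - target).days))
    return avg_info_map[closest]

# 2024-06-10 has no entry, so 2024-06-15 (5 days away) wins over 2024-06-01 (9 days)
print(closest_avg_info(datetime(2024, 6, 10).timestamp()))  # {'read_avg': 1500}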

+ 1 - 0
applications/asyncMySQL.py

@@ -24,6 +24,7 @@ class AsyncMySQLClient(object):
             password='crawler123456@',
             db='piaoquan-crawler',
             charset='utf8mb4',
+            maxsize=100,
             connect_timeout=120,
         )
         print("mysql init successfully")

+ 5 - 1
applications/config.py

@@ -2,4 +2,8 @@
 @author: luojunhui
 """
 # default database table
-db_config = ""
+db_config = ""
+
+port = "6060"
+
+ip = "localhost"

+ 131 - 0
applications/embedding_manager.py

@@ -0,0 +1,131 @@
+import os
+import threading
+from filelock import FileLock
+from time import sleep
+import numpy as np
+import random
+
+
+class EmbeddingManager:
+    def __init__(self, model, emb_size=1024, cache_file="cache/embedding_cache", save_interval=600):
+        self.model = model
+        self.emb_size = emb_size
+        self.cache_file = cache_file
+        self.cache_file_real = self.cache_file + ".npy"
+        self.cache_key_file = f'{self.cache_file}.keys'
+        # Randomize the save interval so concurrent processes don't contend for the file lock at once
+        self.save_interval = save_interval + random.randint(0, save_interval)
+        self.cache = {}
+        self.lock = threading.Lock()  # Thread-safe lock
+        self.filelock = FileLock(self.cache_file + ".lock")
+
+        self.load_cache()
+
+        # Start the periodic saving thread
+        self.saving_thread = threading.Thread(target=self._periodic_save, daemon=True)
+        self.saving_thread.start()
+
+    def _load_cache_unsafe(self):
+        """inter-thread and inter-process safety must be guaranteed by caller"""
+        embedding_data = np.load(self.cache_file_real)
+        with open(self.cache_key_file, "r") as fp:
+            embedding_keys = [key.strip("\n") for key in fp.readlines()]
+        for idx, key in enumerate(embedding_keys):
+            self.cache[key] = embedding_data[idx]
+
+    def load_cache(self):
+        with self.lock:
+            if os.path.exists(self.cache_file_real):
+                with self.filelock:
+                    self._load_cache_unsafe()
+            print("[EmbeddingManager]cache loaded")
+
+    def dump_cache(self):
+        if os.path.dirname(self.cache_file):
+            os.makedirs(os.path.dirname(self.cache_file), mode=0o755, exist_ok=True)
+        tmp_cache_file = self.cache_file + ".tmp"
+        tmp_cache_key_file = self.cache_key_file + ".tmp"
+        with self.lock:  # Ensure thread-safe access firstly
+            with self.filelock: # Ensure inter-process safety secondly
+                if os.path.exists(self.cache_file_real):
+                    self._load_cache_unsafe()
+                keys = self.cache.keys()
+                cache_to_save = np.zeros((len(keys), self.emb_size), np.float32)
+                for idx, key in enumerate(keys):
+                    cache_to_save[idx] = self.cache[key]
+                np.save(tmp_cache_file, cache_to_save)
+                with open(tmp_cache_key_file, 'w') as fp:
+                    fp.write('\n'.join(keys))
+                if os.path.exists(self.cache_file_real):
+                    os.rename(self.cache_file_real, self.cache_file_real + ".bak")
+                if os.path.exists(self.cache_key_file):
+                    os.rename(self.cache_key_file, self.cache_key_file + ".bak")
+                os.rename(tmp_cache_file + ".npy", self.cache_file_real)
+                os.rename(tmp_cache_key_file, self.cache_key_file)
+        print("[EmbeddingManager]cache dumped")
+
+    def get_embeddings(self, text_list):
+        """
+        Search embedding for a given text. If not found, generate using the model, save to cache, and return it.
+        """
+        if not isinstance(text_list, list):
+            raise Exception(f"Invalid parameter type: text_list {type(text_list)}")
+        embedding_list = np.zeros((len(text_list), self.emb_size), np.float32)
+        if not text_list:
+            return embedding_list
+        new_texts = []
+        new_texts_ori_idx = []
+        with self.lock:
+            for idx, text in enumerate(text_list):
+                if text in self.cache:
+                    # print(f"find {text} in cache")
+                    embedding_list[idx] = self.cache[text]
+                else:
+                    new_texts.append(text)
+                    new_texts_ori_idx.append(idx)
+
+        new_embeddings = self.model.get_embeddings(new_texts)
+        if new_embeddings.shape[0] > 0 and new_embeddings.shape[1] != self.emb_size:
+            raise Exception("Embedding size mismatch")
+
+        # Generate embedding if not found in cache
+        with self.lock:  # Ensure thread-safe access
+            for idx, text in enumerate(new_texts):
+                if text not in self.cache:  # Re-check in case another thread added it
+                    self.cache[text] = new_embeddings[idx]
+                embedding_list[new_texts_ori_idx[idx]] = new_embeddings[idx]
+        return embedding_list
+
+    def _periodic_save(self):
+        """Periodically save the cache to disk."""
+        while True:
+            sleep(self.save_interval)
+            self.dump_cache()
+
+
+# Only for testing
+class DummyModel:
+    def padding_text(self, text):
+        # Repeat then truncate so the toy embedding always sees exactly 1024 chars
+        return (text * (1024 // len(text) + 1))[:1024]
+
+    def get_embeddings(self, text_list):
+        embeddings = np.zeros((len(text_list), 1024), np.float32)
+        for idx, text in enumerate(text_list):
+            text = self.padding_text(text)
+            embedding = np.array([ord(c) for c in text], np.float32)
+            embeddings[idx] = embedding
+        return embeddings
+
+if __name__ == "__main__":
+    model = DummyModel()
+    manager = EmbeddingManager(model)
+    print(manager.get_embeddings(["hello"]))
+    print(manager.get_embeddings(["world"]))
+    print(manager.get_embeddings(["hello world"]))
+    manager.dump_cache()
+    print(manager.get_embeddings(["new", "word"]))
+    manager.dump_cache()
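
Since the module ships DummyModel and a __main__ test, a persistence round-trip is easy to check; a sketch assuming the filelock package is installed and cache/ is writable (the cache_file path is invented):

from applications.embedding_manager import EmbeddingManager, DummyModel

# First instance computes embeddings through the model and persists them
m1 = EmbeddingManager(DummyModel(), cache_file="cache/demo_cache")
m1.get_embeddings(["hello", "world"])
m1.dump_cache()  # atomic rename of the .npy + .keys files under the FileLock

# A fresh instance reloads the files in load_cache() and serves cached texts
# without touching the model
m2 = EmbeddingManager(DummyModel(), cache_file="cache/demo_cache")
print(m2.get_embeddings(["hello"]))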

+ 1 - 1
applications/functions/__init__.py

@@ -2,7 +2,7 @@
 @author: luojunhui
 """
 from .article_account import ArticleRank
-from .article_tools import title_sim_v2_by_list
+from .article_tools import title_sim_v2_by_list, is_bad
 from .server_article_account import get_article_title_url_list, get_article_titles
 
 

+ 4 - 2
applications/functions/article_account.py

@@ -5,12 +5,14 @@ import json
 
 import requests
 
+from applications.config import port
+
 
 class ArticleRank(object):
     """
     Account ranking
     """
-    url = "http://192.168.100.31:8179/score_list"
+    url = "http://localhost:{}/score_list".format(port)
 
     @classmethod
     def rank(cls, account_list, text_list):
@@ -25,7 +27,7 @@ class ArticleRank(object):
             "text_list": text_list,
             "max_time": None,
             "min_time": None,
-            "interest_type": "by_avg",
+            "interest_type": "avg",
             "sim_type": "mean",
             "rate": 0.1
         }
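
The wrapper above now derives its URL from applications.config. A minimal invocation (the nickname is a placeholder); note the body still posts account_nickname_list while AccountServer now iterates gh_id_list, so results may be empty until the client is migrated:

from applications.functions.article_account import ArticleRank

resp = ArticleRank.rank(
    account_list=["生活良读"],  # placeholder account nickname
    text_list=["候选标题一", "候选标题二"],
)
print(resp)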

Diff content suppressed because it is too large to display
+ 3 - 0
applications/functions/article_tools.py


+ 45 - 0
applications/functions/bad.txt

@@ -0,0 +1,45 @@
+男人跟婚外女人很难断,不是因为爱,而是这三个原因,别傻傻不知
+瑞士和平会,乌有2个好消息,有得就有失,捧场的基本是西方国家
+华春莹代表中国,用《琉球地位未定论》,好好给日本上了一课
+再下一城,27年的等待一朝绽放,中国开始放大招
+一把大火,文明倒退1000年
+善恶终有报?晚年有了儿子的王刚本以为可以安度晚年,但如今的情况却让人感叹
+赵丽蓉:中国第一位小品女皇,因一句台词识破潜伏间谍,72岁自备寿衣,临终前为巩汉林留一句忠告
+致命一击!美国的惊天谎言被拆穿,暴露其真实嘴脸!
+男生遭醉汉掐脖反击后被刑拘,警方最新回应
+“夫妻不送葬,送葬必遭殃”,为什么老伴不能去送呢?亲人去世后,最忌讳的八件事,别不当回事!
+102岁杨振宁豪宅内,宴请小他27岁的岳母听沪剧,翁帆一个举动真情流露
+中国垫资38亿美元帮修铁路,6年过去没还,还想期限延长至50年!
+结婚前新娘突然要增加50万彩礼,否则不嫁,新郎无奈放弃,新娘收50万彩礼另嫁他人,1年后,报应来了!
+张学良到死都不知道,当年救他一命的不是宋美龄,而是一个痴心女
+美大选反转再反转!笑到最后的竟是她?拜登、特朗普都始料未及
+大妈揣80万去儿子家住一周,7天吃咸菜,孙子:等你走了吃煎牛排
+山西农民房檐上石狮子7万没有卖,3天后发生的事情让他始料未及
+当不成总统了?美大选有变,中方需早做准备
+女人最佳“绝经期”已公布,不是45岁,而是这个数,越接近越健康
+村头光棍在下雪天捡一女婴,终身未娶拉扯张长大28年,女孩回报方式看哭众人
+杜聿明的女儿,嫁给自己的老师杨振宁,生三个孩子事业有成
+他享有国葬殊荣,临终捐光2000亿,大儿子开出租谋生
+男人跟婚外女人很难断,不是因为爱,而是这三个原因,别傻傻不知
+瑞士和平会,乌有2个好消息,有得就有失,捧场的基本是西方国家
+华春莹代表中国,用《琉球地位未定论》,好好给日本上了一课
+再下一城,27年的等待一朝绽放,中国开始放大招
+一把大火,文明倒退1000年
+善恶终有报?晚年有了儿子的王刚本以为可以安度晚年,但如今的情况却让人感叹
+赵丽蓉:中国第一位小品女皇,因一句台词识破潜伏间谍,72岁自备寿衣,临终前为巩汉林留一句忠告
+致命一击!美国的惊天谎言被拆穿,暴露其真实嘴脸!
+男生遭醉汉掐脖反击后被刑拘,警方最新回应
+“夫妻不送葬,送葬必遭殃”,为什么老伴不能去送呢?亲人去世后,最忌讳的八件事,别不当回事!
+102岁杨振宁豪宅内,宴请小他27岁的岳母听沪剧,翁帆一个举动真情流露
+中国垫资38亿美元帮修铁路,6年过去没还,还想期限延长至50年!
+结婚前新娘突然要增加50万彩礼,否则不嫁,新郎无奈放弃,新娘收50万彩礼另嫁他人,1年后,报应来了!
+张学良到死都不知道,当年救他一命的不是宋美龄,而是一个痴心女
+美大选反转再反转!笑到最后的竟是她?拜登、特朗普都始料未及
+大妈揣80万去儿子家住一周,7天吃咸菜,孙子:等你走了吃煎牛排
+山西农民房檐上石狮子7万没有卖,3天后发生的事情让他始料未及
+当不成总统了?美大选有变,中方需早做准备
+女人最佳“绝经期”已公布,不是45岁,而是这个数,越接近越健康
+村头光棍在下雪天捡一女婴,终身未娶拉扯张长大28年,女孩回报方式看哭众人
+杜聿明的女儿,嫁给自己的老师杨振宁,生三个孩子事业有成
+他享有国葬殊荣,临终捐光2000亿,大儿子开出租谋生
+到底是奶奶亲,还是姥姥亲?科学的排序现实又扎心,对照看看和你想得一样吗?

+ 35 - 21
applications/pipeline.py

@@ -5,14 +5,41 @@ import time
 
 import requests
 
-from applications.functions import title_sim_v2_by_list
-from applications.functions import get_article_titles
+from applications.functions import title_sim_v2_by_list, is_bad
+from applications.config import port
 
 
 class LongArticlesPipeline(object):
     """
     Long articles Pipeline
     """
+    @classmethod
+    def get_titles(cls, account_name, index_list):
+        """
+        :param account_name:
+        :param index_list:
+        :return:
+        """
+        print("requesting title_list")
+        print(account_name)
+        print(index_list)
+        url = "http://localhost:{}/title_list".format(port)
+        response = requests.request(
+            "POST",
+            url=url,
+            headers={},
+            json={
+                "account_name": account_name,
+                "index_list": index_list,
+                "min_time": None,
+                "max_time": None,
+                "msg_type": "9"
+            }
+        )
+        print(response.status_code)
+        print(response.text)
+        print(response.json())
+        return response.json()['title_list']
 
     @classmethod
     def history_title(cls, account_nickname):
@@ -24,11 +51,13 @@ class LongArticlesPipeline(object):
         # if "【1】" in plan_name or "【2】" in plan_name:
         index_list_1 = [1, 2]
         index_list_2 = [1, 2, 3, 4, 5, 6, 7, 8]
-        account_title_list_1 = get_article_titles(
+        print("requesting history titles")
+        account_title_list_1 = cls.get_titles(
             account_nickname,
             index_list=index_list_1
         )
-        account_title_list_2 = get_article_titles(
+        print(account_title_list_1)
+        account_title_list_2 = cls.get_titles(
             account_nickname,
             index_list=index_list_2
         )
@@ -57,7 +86,7 @@ class LongArticlesPipeline(object):
         """
         Check whether a title is safe (sensitive-content filter)
         """
-        url = "http://192.168.100.31:8177/sensitive/is_sensitive"
+        url = "http://61.48.133.26:8177/sensitive/is_sensitive"
         body = {
             "text": title
         }
@@ -76,22 +105,7 @@ class LongArticlesPipeline(object):
         :param account_nickname:
         :return:
         """
-        url = "http://192.168.100.31:8176/bad/is_bad"
-        headers = {
-            "accept": "application/json",
-            "Content-Type": "application/json"
-        }
-        body = {
-            "account_nickname": account_nickname,
-            "title": title
-        }
-        response = requests.request(
-            "POST",
-            url=url,
-            headers=headers,
-            json=body
-        )
-        return response.json()['is_bad']
+        return is_bad(title)
 
     @classmethod
     def deal(cls, article_obj, account_name, history_title_dict):

+ 32 - 11
applications/textSimilarity.py

@@ -17,39 +17,60 @@ def score_to_attention(score, symbol=1):
     score_attn = torch.nn.functional.softmax(score_norm, dim=1)
     return score_attn, score_norm, score_pred
 
+def compare_tensor(tensor1, tensor2):
+    if tensor1.shape != tensor2.shape:
+        print(f"[compare_tensor]shape error: {tensor1.shape} vs {tensor2.shape}")
+        return
+    if not torch.allclose(tensor1, tensor2):
+        print("[compare_tensor]value error: tensor1 not close to tensor2")
 
 class NLPFunction(object):
     """
     NLP Task
     """
 
-    def __init__(self, model):
+    def __init__(self, model, embedding_manager):
         self.model = model
+        self.embedding_manager = embedding_manager
 
-    def base_string_similarity(self, text_dict):
+    def direct_similarity(self, a, b):
+        return self.model.similarity(a, b)
+
+    def cached_similarity(self, a, b):
+        text_emb1 = self.embedding_manager.get_embeddings(a)
+        text_emb2 = self.embedding_manager.get_embeddings(b)
+        score_function = self.model.score_functions['cos_sim']
+        score_tensor = score_function(text_emb1, text_emb2)
+        return score_tensor
+
+    def base_string_similarity(self, text_dict, use_cache=True):
         """
         Base utility: compute the similarity of two strings
         :param text_dict:
         :return:
         """
-        score_tensor = self.model.similarity(
-            text_dict['text_a'],
-            text_dict['text_b']
-        )
+        text_a = text_dict['text_a']
+        text_b = text_dict['text_b']
+        if use_cache:
+            score_tensor = self.cached_similarity(text_a, text_b)
+        else:
+            score_tensor = self.direct_similarity(text_a, text_b)
         response = {
             "score": score_tensor.squeeze().tolist()
         }
         return response
 
-    def base_list_similarity(self, pair_list_dict):
+    def base_list_similarity(self, pair_list_dict, use_cache=True):
         """
         Compute pairwise similarities between two lists
         :return:
         """
-        score_tensor = self.model.similarity(
-            pair_list_dict['text_list_a'],
-            pair_list_dict['text_list_b']
-        )
+        text_a = pair_list_dict['text_list_a']
+        text_b = pair_list_dict['text_list_b']
+        if use_cache:
+            score_tensor = self.cached_similarity(text_a, text_b)
+        else:
+            score_tensor = self.direct_similarity(text_a, text_b)
         response = {
             "score_list_list": score_tensor.tolist()
         }
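
cached_similarity above scores raw embeddings with the model's cos_sim function; a rough torch equivalent, useful when reasoning about shapes (the random inputs are placeholders):

import numpy as np
import torch

def cos_sim_matrix(emb_a: np.ndarray, emb_b: np.ndarray) -> torch.Tensor:
    # Normalize rows, then a matmul yields all pairwise cosine scores
    a = torch.nn.functional.normalize(torch.from_numpy(emb_a), dim=1)
    b = torch.nn.functional.normalize(torch.from_numpy(emb_b), dim=1)
    return a @ b.T  # shape (len(a), len(b)), like base_list_similarity's score tensor

emb_a = np.random.rand(2, 1024).astype(np.float32)
emb_b = np.random.rand(3, 1024).astype(np.float32)
print(cos_sim_matrix(emb_a, emb_b).shape)  # torch.Size([2, 3])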

+ 5 - 4
applications/wxSpider.py

@@ -86,7 +86,7 @@ class ArticleManager(object):
         search articles in wx
         :return:
         """
-        url = "http://8.217.190.241:8888/crawler/wei_xin/keyword"
+        url = "http://47.98.154.124:8888/crawler/wei_xin/keyword"
         payload = json.dumps({
             "keyword": title,
             "cursor": "1"
@@ -106,7 +106,7 @@ class ArticleManager(object):
         :param content_link:
         :return:
         """
-        url = "http://8.217.190.241:8888/crawler/wei_xin/detail"
+        url = "http://47.98.154.124:8888/crawler/wei_xin/detail"
         payload = json.dumps({
             "content_link": content_link,
             "is_count": False,
@@ -123,8 +123,9 @@ class ArticleManager(object):
     def update_msg_list(cls, ghId, index):
         """
         :return:
         """
-        url = 'http://8.217.190.241:8888/crawler/wei_xin/blogger'
+        url = 'http://47.98.154.124:8888/crawler/wei_xin/blogger'
         payload = {
             'account_id': ghId,
             'cursor': index,
@@ -145,7 +146,7 @@ class ArticleManager(object):
         """
         async with aiohttp.ClientSession() as session:
             async with session.post(
-                    url='http://8.217.190.241:8888/crawler/wei_xin/account_info',
+                    url='http://47.98.154.124:8888/crawler/wei_xin/account_info',
                     headers={'Content-Type': 'application/json'},
                     json={"content_link": content_url}
             ) as response:

+ 19 - 0
docker-compose.yaml

@@ -0,0 +1,19 @@
+version: '3.8'
+
+services:
+  server-prod:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    image: long_article_alg_server
+    container_name: long_article_alg_server
+    restart: unless-stopped
+    ports:
+      - '6060:6060'
+    networks:
+      - crawler
+    entrypoint: ["hypercorn", "alg_app:app", "--config", "alg.toml"]
+
+networks:
+  crawler:

+ 2 - 1
requirements.txt

@@ -26,4 +26,5 @@ torch~=2.3.1
 tqdm~=4.66.4
 transformers
 pydantic~=2.6.4
-similarities~=1.1.7
+similarities~=1.1.7
+filelock

+ 25 - 4
routes/__init__.py

@@ -8,9 +8,9 @@ from .accountArticleRank import AccountArticleRank
 from .nlpServer import NLPServer
 from .articleDBServer import ArticleSpider
 from .accountServer import AccountServer
+from applications.articleTools import ArticleDBTools
 
-
-def AlgRoutes(mysql_client, model):
+def AlgRoutes(mysql_client, model, embedding_manager):
     """
     ALG ROUTES
     :return:
@@ -45,12 +45,12 @@ def AlgRoutes(mysql_client, model):
         :return:
         """
         params = await request.get_json()
-        nlpS = NLPServer(params=params, model=model)
+        nlpS = NLPServer(params=params, model=model, embedding_manager=embedding_manager)
         response = nlpS.deal()
         return jsonify(response)
 
     @blueprint.route("/score_list", methods=["POST"])
-    async def articleAccount():
+    async def article_account():
         """
         Official-account article scoring endpoints
         :return:
@@ -60,6 +60,27 @@ def AlgRoutes(mysql_client, model):
         response = await AS.deal()
         return jsonify(response)
 
+    @blueprint.route("/title_list", methods=["POST"])
+    async def accountTitle():
+        """
+        Return the title list of an account
+        :return:
+        """
+        params = await request.get_json()
+        print(params)
+        ADBT = ArticleDBTools(mysql_client=mysql_client)
+        responseDF = await ADBT.getArticleByFilter(
+            gh_id=params['account_name'],  # getArticleByFilter now takes gh_id; clients still post the "account_name" key
+            index_list=params['index_list'],
+            min_time=params['min_time'],
+            max_time=params['max_time'],
+            msg_type=params['msg_type'],
+        )
+        title_list = responseDF['title']
+        response = {"title_list": title_list.values.tolist()}
+        return jsonify(response)
+
     @blueprint.route("/article_crawler", methods=["POST"])
     async def articleMysql():
         """

+ 3 - 0
routes/accountArticleRank.py

@@ -64,9 +64,11 @@ class AccountArticleRank(object):
         """
         self.publishArticleList = []
         self.filter_list = []
+        print("fetching history titles")
         history_title_dict = self.pipeline.history_title(
             account_nickname=self.accountName
         )
+        print(history_title_dict)
         for item in tqdm(self.params["publishArticleList"]):
             flag = self.pipeline.deal(item, self.accountName, history_title_dict)
             if flag:
@@ -89,6 +91,7 @@ class AccountArticleRank(object):
             self.publishNum = self.params["publishNum"]
             print("开始校验参数")
             self.filter()
+            print("params validated")
             self.logger.log(code="1001", msg="params validated", data=self.params)
             return None
         except Exception as e:

+ 63 - 47
routes/accountServer.py

@@ -5,6 +5,7 @@ import json
 
 import aiohttp
 from applications.articleTools import ArticleDBTools
+from applications.config import port
 
 
 class AccountServer(object):
@@ -14,26 +15,28 @@ class AccountServer(object):
 
     def __init__(self, mysql_client, params):
         self.account_name_list = None
+        self.gh_id_list = None
         self.sim_type = None
         self.interest_type = None
         self.min_time = None
         self.max_time = None
         self.rate = None
         self.title_list = None
+        self.view_count_filter = None
         self.params = params
         self.AT = ArticleDBTools(mysql_client)
 
-    async def request_for_nlp(self, title_list, account_interest, account_weight):
+    async def request_for_nlp(self, title_list, account_interest, interest_weight):
         """
         nlp process
         """
         headers = {"Content-Type": "application/json"}
-        url = "http://localhost:6060/nlp"
+        url = "http://localhost:{}/nlp".format(port)
         body = {
             "data": {
                 "text_list_a": [i.replace("'", "") for i in title_list],
                 "text_list_b": [i.replace("'", "") for i in account_interest],
-                "score_list_b": account_weight,
+                "score_list_b": interest_weight,
                 "symbol": 1,
             },
             "function": "similarities_cross_mean" if self.sim_type == "mean" else "similarities_cross_avg"
@@ -41,14 +44,14 @@ class AccountServer(object):
         async with aiohttp.ClientSession() as session:
             async with session.post(url, headers=headers, json=body) as response:
                 response_text = await response.text()
-                print("结果:\t", response_text)
+                # print("结果:\t", response_text)
                 if response_text:
                     return await response.json()
                 else:
                     print("Received empty response")
                     return {}
 
-    def checkParams(self):
+    def check_params(self):
         """
         Validate request params
         :return:
@@ -56,20 +59,23 @@ class AccountServer(object):
         try:
             self.title_list = self.params["text_list"]
             self.account_name_list = self.params.get("account_nickname_list", [])
+            self.gh_id_list = self.params.get("gh_id_list", [])
             self.rate = self.params.get("rate", 0.1)
             self.max_time = self.params.get("max_time")
             self.min_time = self.params.get("min_time")
             self.interest_type = self.params.get("interest_type", "top")
             self.sim_type = self.params.get("sim_type", "mean")
+            self.view_count_filter = self.params.get("view_count_filter", None)
             return None
         except Exception as e:
             response = {"error": "Params error", "detail": str(e)}
             return response
 
-    async def getAccountInterest(
+    async def get_account_interest(
         self,
-        account_name,
-        method,
+        gh_id,
+        interest_type,
+        view_count_filter,
         rate=None,
         msg_type=None,
         index_list=None,
@@ -78,71 +84,81 @@ class AccountServer(object):
     ):
         """
         Build the interest profile (good titles plus weights) for an account
-        :param account_name:
+        :param gh_id:
         :param max_time:
         :param min_time:
         :param index_list:
         :param msg_type:
-        :param keys_dict:
         :param rate:
-        :param gh_id:
-        :param method:
+        :param interest_type:
+        :param view_count_filter:
         :return:
         """
         good_df, bad_df = await self.AT.get_good_bad_articles(
-            account_name=account_name,
-            method=method,
+            gh_id=gh_id,
+            interest_type=interest_type,
             msg_type=msg_type,
             index_list=index_list,
             min_time=min_time,
             max_time=max_time,
             rate=rate,
+            view_count_filter=view_count_filter,
         )
-        view_count_list = good_df["show_view_count"].values.tolist()
-        title_list = good_df["title"].values.tolist()
-        print(view_count_list)
-        print(title_list)
-        return title_list, view_count_list
+        extend_dicts = {
+            'view_count': good_df["show_view_count"].values.tolist(),
+        }
+        if 'view_count_avg' in good_df.columns:
+            extend_dicts['view_count_rate'] = \
+                (good_df["show_view_count"] / good_df["view_count_avg"]).values.tolist()
 
-    async def getEachAccountScoreList(self, account_name):
+        account_interest = good_df["title"].values.tolist()
+        return account_interest, extend_dicts
+
+    async def get_each_account_score_list(self, gh_id):
         """
         Score the candidate titles against a single account
         :return:
         """
-        # try:
-        account_interest, account_weight = await self.getAccountInterest(
-            account_name=account_name,
-            method=self.interest_type,
-            rate=self.rate
-        )
-        sim_key = "score_list_mean" if self.sim_type == "mean" else "score_list_avg"
-        response = await self.request_for_nlp(
-            title_list=self.title_list,
-            account_interest=account_interest,
-            account_weight=account_weight
-        )
-        return {
-            "score_list": response[sim_key],
-            "text_list_max": response["text_list_max"],
-        }
-        # except Exception as e:
-        #     print(e)
-        #     return {
-        #         "score_list": [0] * len(self.title_list),
-        #         "text_list_max": self.title_list,
-        #     }
+        try:
+            account_interest, extend_dicts = await self.get_account_interest(
+                gh_id=gh_id,
+                interest_type=self.interest_type,
+                rate=self.rate,
+                view_count_filter=self.view_count_filter,
+                min_time=self.min_time,
+                max_time=self.max_time,
+            )
+            interest_weight = extend_dicts['view_count']
+            if self.sim_type == "weighted_by_view_count_rate":
+                interest_weight = extend_dicts['view_count_rate']
+            response = await self.request_for_nlp(
+                title_list=self.title_list,
+                account_interest=account_interest,
+                interest_weight=interest_weight
+            )
+            score_list_key = "score_list_mean" if self.sim_type == "mean" else "score_list_avg"
+            return {
+                "score_list": response[score_list_key],
+                "text_list_max": response["text_list_max"],
+            }
+        except Exception as e:
+            print(e)
+            return {
+                "score_list": [0] * len(self.title_list),
+                "text_list_max": self.title_list,
+            }
 
-    async def getAccountListScoreList(self):
+    async def get_account_list_score_list(self):
         """
         Score candidates against every account in gh_id_list
         :return:
         """
         response = {}
-        for accountName in self.account_name_list:
-            if response.get(accountName):
+        for gh_id in self.gh_id_list:
+            if response.get(gh_id):
                 continue
             else:
-                response[accountName] = await self.getEachAccountScoreList(account_name=accountName)
+                response[gh_id] = await self.get_each_account_score_list(gh_id=gh_id)
         return response
 
     async def deal(self):
@@ -151,5 +167,5 @@ class AccountServer(object):
         :return:
         """
         return (
-            self.checkParams() if self.checkParams() else await self.getAccountListScoreList()
+            self.check_params() if self.check_params() else await self.get_account_list_score_list()
         )
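
For reference, a request body exercising the parameters check_params now reads; the host, gh_id, and thresholds are placeholders, assuming the server runs locally on the alg.toml port:

import requests

body = {
    "gh_id_list": ["gh_xxxxxxxxxxxx"],          # placeholder account id
    "text_list": ["候选标题一", "候选标题二"],
    "interest_type": "account_avg",             # baseline from account_avg_info_v3
    "sim_type": "weighted_by_view_count_rate",  # weight by show_view_count / view_count_avg
    "view_count_filter": 500,                   # drop articles with 500 views or fewer
    "rate": 0.1,
    "min_time": None,
    "max_time": None,
}
print(requests.post("http://localhost:6060/score_list", json=body).json())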

+ 64 - 57
routes/articleDBServer.py

@@ -28,8 +28,6 @@ class ArticleSpider(object):
         """
         try:
             self.ghId = self.params['ghId']
-            # self.startTime = self.params['startTime']
-            # self.endTime = self.params['endTime']
             return None
         except Exception as e:
             return {
@@ -44,10 +42,11 @@ class ArticleSpider(object):
         :return:
         """
         sql = f"""
-        select accountName, updateTime 
-        from official_articles 
-        where ghId = '{self.ghId}' 
-        order by updateTime DESC;"""
+            select accountName, updateTime 
+            from official_articles_v2 
+            where ghId = '{self.ghId}' 
+            order by updateTime DESC;
+            """
         result = await self.mysql_client.async_select(sql)
         if result:
             account_name, update_time = result[0]
@@ -76,69 +75,77 @@ class ArticleSpider(object):
             appMsgId = info.get("AppMsg", {}).get("BaseInfo", {}).get("AppMsgId", None)
             createTime = info.get("AppMsg", {}).get("BaseInfo", {}).get("CreateTime", None)
             updateTime = info.get("AppMsg", {}).get("BaseInfo", {}).get("UpdateTime", None)
-            if int(time.time()) - int(updateTime) <= 20 * 60 * 60:
-                continue
+            # if int(time.time()) - int(updateTime) <= 20 * 60 * 60:
+            #     continue
             Type = info.get("AppMsg", {}).get("BaseInfo", {}).get("Type", None)
             detail_article_list = info.get("AppMsg", {}).get("DetailInfo", [])
             if detail_article_list:
                 for article in detail_article_list:
+                    title = article.get("Title", None)
+                    Digest = article.get("Digest", None)
+                    ItemIndex = article.get("ItemIndex", None)
+                    ContentUrl = article.get("ContentUrl", None)
+                    SourceUrl = article.get("SourceUrl", None)
+                    CoverImgUrl = article.get("CoverImgUrl", None)
+                    CoverImgUrl_1_1 = article.get("CoverImgUrl_1_1", None)
+                    CoverImgUrl_235_1 = article.get("CoverImgUrl_235_1", None)
+                    ItemShowType = article.get("ItemShowType", None)
+                    IsOriginal = article.get("IsOriginal", None)
+                    ShowDesc = article.get("ShowDesc", None)
+                    show_stat = show_desc_to_sta(ShowDesc)
+                    ori_content = article.get("ori_content", None)
+                    show_view_count = show_stat.get("show_view_count", 0)
+                    show_like_count = show_stat.get("show_like_count", 0)
+                    show_zs_count = show_stat.get("show_zs_count", 0)
+                    show_pay_count = show_stat.get("show_pay_count", 0)
+                    wx_sn = ContentUrl.split("&sn=")[1].split("&")[0] if ContentUrl else None
+                    info_tuple = (
+                        gh_id,
+                        account_name,
+                        appMsgId,
+                        title,
+                        Type,
+                        createTime,
+                        updateTime,
+                        Digest,
+                        ItemIndex,
+                        ContentUrl,
+                        SourceUrl,
+                        CoverImgUrl,
+                        CoverImgUrl_1_1,
+                        CoverImgUrl_235_1,
+                        ItemShowType,
+                        IsOriginal,
+                        ShowDesc,
+                        ori_content,
+                        show_view_count,
+                        show_like_count,
+                        show_zs_count,
+                        show_pay_count,
+                        wx_sn,
+                        json.dumps(baseInfo, ensure_ascii=False)
+                    )
                     try:
-                        title = article.get("Title", None)
-                        Digest = article.get("Digest", None)
-                        ItemIndex = article.get("ItemIndex", None)
-                        ContentUrl = article.get("ContentUrl", None)
-                        SourceUrl = article.get("SourceUrl", None)
-                        CoverImgUrl = article.get("CoverImgUrl", None)
-                        CoverImgUrl_1_1 = article.get("CoverImgUrl_1_1", None)
-                        CoverImgUrl_235_1 = article.get("CoverImgUrl_235_1", None)
-                        ItemShowType = article.get("ItemShowType", None)
-                        IsOriginal = article.get("IsOriginal", None)
-                        ShowDesc = article.get("ShowDesc", None)
-                        show_stat = show_desc_to_sta(ShowDesc)
-                        ori_content = article.get("ori_content", None)
-                        show_view_count = show_stat.get("show_view_count", 0)
-                        show_like_count = show_stat.get("show_like_count", 0)
-                        show_zs_count = show_stat.get("show_zs_count", 0)
-                        show_pay_count = show_stat.get("show_pay_count", 0)
-                        wx_sn = ContentUrl.split("&sn=")[1].split("&")[0] if ContentUrl else None
-                        info_tuple = (
-                            gh_id,
-                            account_name,
-                            appMsgId,
-                            title,
-                            Type,
-                            createTime,
-                            updateTime,
-                            Digest,
-                            ItemIndex,
-                            ContentUrl,
-                            SourceUrl,
-                            CoverImgUrl,
-                            CoverImgUrl_1_1,
-                            CoverImgUrl_235_1,
-                            ItemShowType,
-                            IsOriginal,
-                            ShowDesc,
-                            ori_content,
-                            show_view_count,
-                            show_like_count,
-                            show_zs_count,
-                            show_pay_count,
-                            wx_sn,
-                            json.dumps(baseInfo, ensure_ascii=False)
-                        )
                         insert_sql = f"""
-                            INSERT INTO official_articles
+                            INSERT INTO official_articles_v2
                             (ghId, accountName, appMsgId, title, Type, createTime, updateTime, Digest, ItemIndex, ContentUrl, SourceUrl, CoverImgUrl, CoverImgUrl_1_1, CoverImgUrl_255_1, ItemShowType, IsOriginal, ShowDesc, ori_content, show_view_count, show_like_count, show_zs_count, show_pay_count, wx_sn, baseInfo)
                             values
                             (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                             """
                         await self.mysql_client.async_insert(sql=insert_sql, params=info_tuple)
-                        print("更新成功")
+                        print("insert ok")
                     except Exception as e:
-                        print("error")
-                        print(e)
-                        continue
+                        try:
+                            update_sql = f"""
+                            UPDATE official_articles_v2
+                            SET show_view_count = %s, show_like_count=%s
+                            WHERE wx_sn = %s;
+                            """
+                            await self.mysql_client.async_insert(sql=update_sql, params=(show_view_count, show_like_count, wx_sn))
+                            print("update ok")
+                        except Exception as e:
+                            print("update failed: {}".format(e))
+                            continue
 
     async def getAccountArticleList(self, gh_id, account_name, last_update_time, cursor=None):
         """

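The insert-then-update fallback above costs two round trips on a duplicate; if official_articles_v2 has a UNIQUE index on wx_sn (an assumption about the schema), MySQL can collapse it into a single upsert, sketched here with a trimmed column list:

# Hypothetical single-statement alternative; column list abbreviated
upsert_sql = """
    INSERT INTO official_articles_v2
        (ghId, accountName, title, updateTime, show_view_count, show_like_count, wx_sn)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
        show_view_count = VALUES(show_view_count),
        show_like_count = VALUES(show_like_count);
"""
# await self.mysql_client.async_insert(sql=upsert_sql, params=short_tuple)
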
+ 5 - 5
routes/nlpServer.py

@@ -3,19 +3,18 @@
 """
 from applications.textSimilarity import NLPFunction
 
-
 class NLPServer(object):
     """
     nlp_server
     """
-    def __init__(self, params, model):
+    def __init__(self, params, model, embedding_manager):
         """
         :param params:
         """
         self.data = None
         self.function = None
         self.params = params
-        self.nlp = NLPFunction(model=model)
+        self.nlp = NLPFunction(model=model, embedding_manager=embedding_manager)
 
     def check_params(self):
         """
@@ -25,6 +24,7 @@ class NLPServer(object):
         try:
             self.data = self.params['data']
             self.function = self.params['function']
+            self.use_cache = self.params.get('use_cache', True)
             print("参数校验成功")
             return None
         except Exception as e:
@@ -41,9 +41,9 @@ class NLPServer(object):
         """
         match self.function:
             case "similarities":
-                return self.nlp.base_string_similarity(text_dict=self.data)
+                return self.nlp.base_string_similarity(text_dict=self.data, use_cache=self.use_cache)
             case "similarities_cross":
-                return self.nlp.base_list_similarity(pair_list_dict=self.data)
+                return self.nlp.base_list_similarity(pair_list_dict=self.data, use_cache=self.use_cache)
             case "similarities_cross_max":
                 return self.nlp.max_cross_similarity(data=self.data)
             case "similarities_cross_avg":

+ 23 - 0
test/article_list.py

@@ -0,0 +1,23 @@
+"""
+@author: luojunhui
+"""
+import requests
+
+from applications.config import port
+
+url = "http://localhost:{}/title_list".format(port)
+
+response = requests.request(
+    "POST",
+    url=url,
+    headers={},
+    json={
+        "account_name": "趣味晚年",
+        "index_list": [1, 2],
+        "min_time": None,
+        "max_time": None,
+        "msg_type": "9"
+    }
+)
+print(response.json())
+print(len(response.json()['title_list']))

+ 1 - 1
test/nlp_dev.py

@@ -192,7 +192,7 @@ def test_request(url):
     print(b - a)
 
 
-url_list = ["http://192.168.100.31:6061/nlp"]
+url_list = ["http://192.168.100.31:6060/nlp"]
 test_request(url_list[0])
 # with ThreadPoolExecutor(max_workers=3) as Pool:
 #     Pool.map(test_request, url_list)

Diff content suppressed because it is too large to display
+ 5 - 7
test/rank_dev.py


+ 34 - 31
test/score_list_dev.py

@@ -2,45 +2,48 @@
 @author: luojunhui
 """
 import json
+import time
 
 import requests
+from concurrent.futures.thread import ThreadPoolExecutor
 
 
-class ArticleRank(object):
-    """
-    Account ranking
+def score_list(account):
     """
     url = "http://192.168.100.31:8179/score_list"
     url1 = "http://47.98.154.124:6060/score_list"
     # url1 = "http://localhost:6060/score_list"
-    url2 = "http://192.168.100.31:8179/score_list"
-
-    @classmethod
-    def rank(cls, account_list, text_list):
-        """
-        Rank
-        :param account_list:
-        :param text_list:
-        :return:
-        """
-        body = {
-            "account_nickname_list": account_list,
-            "text_list": text_list,
-            "max_time": None,
-            "min_time": None,
-            "interest_type": "avg",
-            "sim_type": "mean",
-            "rate": 0.1
-        }
-        response = requests.post(url=cls.url, headers={}, json=body).json()
-        return response
+    url2 = "http://192.168.100.31:6062/score_list"
+    :param account:
+    :return:
+    """
+    url2 = "http://47.98.136.48:6060/score_list"
+    body = {
+        "account_nickname_list": [account],
+        "text_list": [
+            "在俄罗斯买好地了,却发现没有公路、码头、仓储、燃气管道……”",
+            "被霸占15年后成功收回,岛礁资源超100万吨,曾遭到美菲联手抢夺",
+            "感人!河南姐弟被父母遗弃,7岁弟弟带着姐姐看病:别怕,以后我养",
+            "山东26岁女子产下罕见“4胞胎”,丈夫却突然消失,婆婆:养不起"
+        ],
+        "max_time": None,
+        "min_time": None,
+        "interest_type": "avg",
+        "sim_type": "mean",
+        "rate": 0.1
+    }
+    response = requests.post(url=url2, headers={}, json=body).json()
+    print(json.dumps(response, ensure_ascii=False, indent=4))
+    return response
 
 
 if __name__ == '__main__':
-    AR = ArticleRank()
-    response = AR.rank(
-        account_list=['生活良读'],
-        text_list=['保姆为300万拆迁款,嫁给大24岁老头,丈夫去世后,她发现房产证没有丈夫名字'] * 10,
-
-    )
-    print(json.dumps(response, ensure_ascii=False, indent=4))
+    # a = time.time()
+    # with ThreadPoolExecutor(max_workers=100) as pool:
+    #     pool.map(score_list, ["生活良读"] * 1)
+    # b = time.time()
+    # print(b - a)
+    a = time.time()
+    res = score_list("生活情感叁读")
+    b = time.time()
+    print(b - a)

+ 67 - 0
tools/request_cache.py

@@ -0,0 +1,67 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:fenc=utf-8
+
+"""
+A simple tool to request remote to make cache
+"""
+
+import json
+import time
+
+import requests
+from concurrent.futures.thread import ThreadPoolExecutor
+
+PLAN_ID = "20240813095121995118754"
+ACCOUNT_MAP = {
+    "gh_6d205db62f04": "20231214075906052349516",
+    "gh_0c89e11f8bf3": "20231214075715819462085"
+}
+SERVER_URL = "http://47.98.136.48:6060/score_list"
+
+def get_articles(plan_id, account_id):
+    URL = 'http://aigc-api.cybertogether.net/aigc/publish/content/gzhWaitingPublishContent'
+    headers = {
+        "Content-Type": "application/json;charset=UTF-8"
+    }
+    payload = {
+        "params": {
+            "accountId": account_id,
+            "planId": plan_id
+        }
+    }
+    resp = requests.post(URL, headers=headers, json=payload)
+    json_data = resp.json()
+    content_titles = [x['title'].replace("'", "") for x in json_data['data']]
+    return content_titles
+
+def score_list(server_url, titles, account_gh_id):
+    # Sample titles kept for ad-hoc testing; not used when titles are passed in
+    predefined_titles = [
+        "在俄罗斯买好地了,却发现没有公路、码头、仓储、燃气管道……”",
+        "被霸占15年后成功收回,岛礁资源超100万吨,曾遭到美菲联手抢夺",
+        "感人!河南姐弟被父母遗弃,7岁弟弟带着姐姐看病:别怕,以后我养",
+        "山东26岁女子产下罕见“4胞胎”,丈夫却突然消失,婆婆:养不起",
+        "突然,中国资产大爆发!A50指数期货直线拉升超4.5%,港股大涨!人民币也涨了"
+    ]
+
+    t1 = time.time()
+    body = {
+        "gh_id_list": [account_gh_id],
+        "text_list": titles,
+        "max_time": None,
+        "min_time": None,
+        "interest_type": "avg",
+        "sim_type": "avg",
+        "rate": 0.1
+    }
+    response = requests.post(url=server_url, headers={}, json=body).json()
+    t2 = time.time()
+    print(json.dumps(response, ensure_ascii=False, indent=4))
+    print(f"time: {t2 - t1:.4f}")
+    return response
+
+
+if __name__ == '__main__':
+    titles = get_articles(PLAN_ID, ACCOUNT_MAP['gh_6d205db62f04'])
+    score_list(SERVER_URL, titles, 'gh_6d205db62f04')

Some files were not shown because too many files changed in this diff