Merge branch 'master-GPU' of Server/LongArticleAlgServer into master

luojunhui 4 months ago
Parent
Current commit
b062b0e110

+ 11 - 0
Dockerfile

@@ -0,0 +1,11 @@
+FROM registry.cn-hangzhou.aliyuncs.com/stuuudy/cyber-crawler-base:latest
+
+WORKDIR /LongArticleAlgServer
+
+COPY . .
+
+ENV TZ=Asia/Shanghai
+
+RUN pip install -r requirements.txt -i https://mirrors.163.com/pypi/simple/ --no-cache-dir
+
+CMD ["hypercorn", "alg_app:app", "--config", "alg.toml"]

+ 1 - 1
alg.toml

@@ -1,6 +1,6 @@
 reload = true
 bind = "0.0.0.0:6060"
-workers = 4
+workers = 3
 keep_alive_timeout = 120  # max seconds to keep an idle connection open; adjust as needed
 graceful_timeout = 30    # time to wait for in-flight work before restart or stop
 loglevel = "debug"  # log level

+ 4 - 3
alg_app.py

@@ -5,11 +5,11 @@ from quart import Quart
 from similarities import BertSimilarity
 from routes import AlgRoutes
 from applications import AsyncMySQLClient
+from applications.embedding_manager import EmbeddingManager
 
 app = Quart(__name__)
 AsyncMySQL = AsyncMySQLClient(app)
 
-
 @app.before_serving
 async def init():
     """
@@ -17,8 +17,9 @@ async def init():
     """
     await AsyncMySQL.init_pool()
     model = BertSimilarity(model_name_or_path="BAAI/bge-large-zh-v1.5")
+    embedding_manager = EmbeddingManager(model)
     print("模型加载成功")
-    app_routes = AlgRoutes(AsyncMySQL, model)
+    app_routes = AlgRoutes(AsyncMySQL, model, embedding_manager)
     app.register_blueprint(app_routes)
 
 
@@ -32,4 +33,4 @@ async def close_db():
 
 
 if __name__ == '__main__':
-    app.run(debug=True, host="0.0.0.0", port=6060)
+    app.run()

+ 107 - 10
applications/articleTools.py

@@ -5,6 +5,7 @@
 import asyncio
 import aiomysql
 from pandas import DataFrame
+from datetime import datetime
 
 
 class TaskMySQLClient(object):
@@ -64,6 +65,27 @@ class TaskMySQLClient(object):
                 await coon.commit()
 
 
+class AccountAvgInfo:
+    def __init__(self, gh_id, position, update_time, account_name, fans, read_avg, like_avg, status,
+                 account_type, account_mode, account_source, account_status, business_type, read_rate_avg):
+        self.gh_id = gh_id
+        self.position = position
+        self.update_time = update_time
+        self.account_name = account_name
+        self.fans = fans
+        self.read_avg = read_avg
+        self.like_avg = like_avg
+        self.status = status
+        self.account_type = account_type
+        self.account_mode = account_mode
+        self.account_source = account_source
+        self.account_status = account_status
+        self.business_type = business_type
+        self.read_rate_avg = read_rate_avg
+
+    def __repr__(self):
+        return f"<AccountAvgInfo {self.account_name}>"
+
 class ArticleDBTools(object):
     """
     Long-article database utilities
@@ -76,7 +98,58 @@ class ArticleDBTools(object):
         """
         self.mysql_client = mysql_client
 
-    async def getSingleAccountArticles(self, account_name):
+    async def getAccountAvgInfo(self, gh_id):
+        """
+        Fetch historical average stats for a single account (account_avg_info_v3)
+        """
+        keys = [
+            "gh_id",
+            "position",
+            "update_time",
+            "account_name",
+            "fans",
+            "read_avg",
+            "like_avg",
+            "status",
+            "account_type",
+            "account_mode",
+            "account_source",
+            "account_status",
+            "business_type",
+            "read_rate_avg"
+        ]
+        sql = f"""
+            SELECT {", ".join(keys)}
+            FROM account_avg_info_v3
+            WHERE gh_id = '{gh_id}'
+            and position = 1;"""
+        result = await self.mysql_client.async_select(sql=sql)
+        account_avg_info_list = [AccountAvgInfo(*row) for row in result] if result else []
+
+        return account_avg_info_list
+
+    async def get_account_avg_info(self, account_avg_info_map, available_dates, timestamp):
+        target_date = datetime.fromtimestamp(timestamp).date()
+        # Try the exact date first
+        info = account_avg_info_map.get(target_date.isoformat())
+        if info is not None:
+            return info
+        # No exact match: fall back to the closest available date
+        closest_date = None
+        for date in available_dates:
+            if closest_date is None:
+                closest_date = date
+                continue
+            days = abs((datetime.fromisoformat(date).date() - target_date).days)
+            closest_days = abs((datetime.fromisoformat(closest_date).date() - target_date).days)
+            if days < closest_days:
+                closest_date = date
+            elif days > closest_days:
+                break
+
+        return account_avg_info_map.get(closest_date) if closest_date else None
+
+    async def getSingleAccountArticles(self, gh_id):
         """
         Fetch all historical articles of a single account
         :param gh_id:
@@ -95,21 +168,22 @@ class ArticleDBTools(object):
         ]
         sql = f"""
             SELECT {", ".join(keys)}
-            FROM official_articles
-            WHERE accountName = '{account_name}';"""
+            FROM official_articles_v2
+            WHERE ghId = '{gh_id}';"""
         result = await self.mysql_client.async_select(sql=sql)
         return DataFrame(result, columns=keys)
 
     async def getArticleByFilter(
             self,
-            account_name,
+            gh_id,
+            view_count_filter=None,
             index_list=None,
             min_time=None,
             max_time=None,
             msg_type=None,
     ):
         """
-        :param account_name:
+        :param gh_id:
         :param index_list: index ranges from 1 to 8
         :param min_time: earliest time
         :param max_time: latest time
@@ -125,18 +199,21 @@ class ArticleDBTools(object):
         if not max_time:
             # year 2099, i.e. effectively no upper bound
             max_time = 4088051123
-        articleDataFrame = await self.getSingleAccountArticles(account_name=account_name)
+        articleDataFrame = await self.getSingleAccountArticles(gh_id=gh_id)
         filterDataFrame = articleDataFrame[
             (articleDataFrame["Type"] == msg_type)
             & (min_time < articleDataFrame["updateTime"])
             & (articleDataFrame["updateTime"] < max_time)
             & (articleDataFrame["ItemIndex"].isin(index_list))
             ]
+        if view_count_filter:
+            filterDataFrame = filterDataFrame[(filterDataFrame["show_view_count"] > view_count_filter)]
         return filterDataFrame
 
     async def get_good_bad_articles(self,
-                                    account_name,
-                                    method,
+                                    gh_id,
+                                    interest_type,
+                                    view_count_filter,
                                     rate=0.1,
                                     index_list=None,
                                     min_time=None,
@@ -148,7 +225,8 @@ class ArticleDBTools(object):
         :return:
         """
         article_data_frame = await self.getArticleByFilter(
-            account_name=account_name,
+            gh_id=gh_id,
+            view_count_filter=view_count_filter,
             index_list=index_list,
             min_time=min_time,
             max_time=max_time,
@@ -156,7 +234,7 @@ class ArticleDBTools(object):
         )
         df_rows = len(article_data_frame)
         if df_rows > 0:
-            match method:
+            match interest_type:
                 case "top":
                     sorted_df = article_data_frame.sort_values(by='show_view_count', ascending=False)
                     topn = max(int(df_rows * rate), 1)
@@ -168,6 +246,25 @@ class ArticleDBTools(object):
                     good_df = article_data_frame[(article_data_frame['show_view_count']) > avg_view * (1.0 + rate)]
                     bad_df = article_data_frame[(article_data_frame['show_view_count']) < avg_view * (1.0 - rate)]
                     return good_df, bad_df
+                case "account_avg":
+                    account_read_avg_list = await self.getAccountAvgInfo(
+                        gh_id=gh_id
+                    )
+                    account_avg_info_map = {info.update_time: info for info in account_read_avg_list}
+                    # Collect every available date and sort them
+                    available_dates = sorted(account_avg_info_map.keys())
+                    view_count_avg_list = []
+                    for index, row in article_data_frame.iterrows():
+                        update_time = row['updateTime']
+                        info = await self.get_account_avg_info(account_avg_info_map, available_dates, update_time)
+                        view_count_avg_list.append(info.read_avg)
+
+                    article_data_frame['view_count_avg'] = view_count_avg_list
+                    good_df = article_data_frame[(article_data_frame['show_view_count']) >
+                                                 (article_data_frame['view_count_avg']) * (1.0 + rate)]
+                    bad_df = article_data_frame[(article_data_frame['show_view_count']) <
+                                                (article_data_frame['view_count_avg']) * (1.0 - rate)]
+                    return good_df, bad_df
         else:
             return None, None
 
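A quick way to sanity-check the nearest-date fallback in get_account_avg_info is with toy data; a minimal standalone sketch of the same logic (the dates and read_avg values below are invented):

from datetime import datetime

avg_info_map = {
    "2024-06-01": {"read_avg": 1200},
    "2024-06-15": {"read_avg": 1500},
}
available_dates = sorted(avg_info_map.keys())

def closest_avg_info(timestamp):
    target = datetime.fromtimestamp(timestamp).date()
    info = avg_info_map.get(target.isoformat())
    if info is not None:
        return info  # exact hit
    # Otherwise pick the date with the smallest absolute distance in days
    closest = min(available_dates,
                  key=lambda d: abs((datetime.fromisoformat(d).date() - target).days))
    return avg_info_map[closest]

# 2024-06-10 has no entry, so 2024-06-15 (5 days away) wins over 2024-06-01 (9 days)
print(closest_avg_info(datetime(2024, 6, 10).timestamp()))  # {'read_avg': 1500}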

+ 1 - 0
applications/asyncMySQL.py

@@ -24,6 +24,7 @@ class AsyncMySQLClient(object):
             password='crawler123456@',
             db='piaoquan-crawler',
             charset='utf8mb4',
+            maxsize=100,
             connect_timeout=120,
         )
         print("mysql init successfully")

+ 5 - 1
applications/config.py

@@ -2,4 +2,8 @@
 @author: luojunhui
 """
 # default database table
-db_config = ""
+db_config = ""
+
+port = "6060"
+
+ip = "localhost"

+ 131 - 0
applications/embedding_manager.py

@@ -0,0 +1,131 @@
+import os
+import threading
+from filelock import FileLock
+from time import sleep
+import numpy as np
+import random
+
+
+class EmbeddingManager:
+    def __init__(self, model, emb_size=1024, cache_file="cache/embedding_cache", save_interval=600):
+        self.model = model
+        self.emb_size = emb_size
+        self.cache_file = cache_file
+        self.cache_file_real = self.cache_file + ".npy"
+        self.cache_key_file = f'{self.cache_file}.keys'
+        # Randomize the save interval so concurrent processes don't contend for the file lock at once
+        self.save_interval = save_interval + random.randint(0, save_interval)
+        self.cache = {}
+        self.lock = threading.Lock()  # Thread-safe lock
+        self.filelock = FileLock(self.cache_file + ".lock")
+
+        self.load_cache()
+
+        # Start the periodic saving thread
+        self.saving_thread = threading.Thread(target=self._periodic_save, daemon=True)
+        self.saving_thread.start()
+
+    def _load_cache_unsafe(self):
+        """inter-thread and inter-process safety must be guaranteed by caller"""
+        embedding_data = np.load(self.cache_file_real)
+        with open(self.cache_key_file, "r") as fp:
+            embedding_keys = [key.strip("\n") for key in fp.readlines()]
+        for idx, key in enumerate(embedding_keys):
+            self.cache[key] = embedding_data[idx]
+
+    def load_cache(self):
+        with self.lock:
+            if os.path.exists(self.cache_file_real):
+                with self.filelock:
+                    self._load_cache_unsafe()
+            print("[EmbeddingManager]cache loaded")
+
+    def dump_cache(self):
+        if os.path.dirname(self.cache_file):
+            os.makedirs(os.path.dirname(self.cache_file), mode=0o755, exist_ok=True)
+        tmp_cache_file = self.cache_file + ".tmp"
+        tmp_cache_key_file = self.cache_key_file + ".tmp"
+        with self.lock:  # Ensure thread-safe access firstly
+            with self.filelock: # Ensure inter-process safety secondly
+                if os.path.exists(self.cache_file_real):
+                    self._load_cache_unsafe()
+                keys = self.cache.keys()
+                cache_to_save = np.zeros((len(keys), self.emb_size), np.float32)
+                for idx, key in enumerate(keys):
+                    cache_to_save[idx] = self.cache[key]
+                np.save(tmp_cache_file, cache_to_save)
+                with open(tmp_cache_key_file, 'w') as fp:
+                    fp.write('\n'.join(keys))
+                if os.path.exists(self.cache_file_real):
+                    os.rename(self.cache_file_real, self.cache_file_real + ".bak")
+                if os.path.exists(self.cache_key_file):
+                    os.rename(self.cache_key_file, self.cache_key_file + ".bak")
+                os.rename(tmp_cache_file + ".npy", self.cache_file_real)
+                os.rename(tmp_cache_key_file, self.cache_key_file)
+        print("[EmbeddingManager]cache dumped")
+
+    def get_embeddings(self, text_list):
+        """
+        Search embedding for a given text. If not found, generate using the model, save to cache, and return it.
+        """
+        if not isinstance(text_list, list):
+            raise Exception(f"Invalid parameter type: text_list {type(text_list)}")
+        embedding_list = np.zeros((len(text_list), self.emb_size), np.float32)
+        if not text_list:
+            return embedding_list
+        new_texts = []
+        new_texts_ori_idx = []
+        with self.lock:
+            for idx, text in enumerate(text_list):
+                if text in self.cache:
+                    # print(f"find {text} in cache")
+                    embedding_list[idx] = self.cache[text]
+                else:
+                    new_texts.append(text)
+                    new_texts_ori_idx.append(idx)
+
+        new_embeddings = self.model.get_embeddings(new_texts)
+        if new_embeddings.shape[0] > 0 and new_embeddings.shape[1] != self.emb_size:
+            raise Exception("Embedding size mismatch")
+
+        # Generate embedding if not found in cache
+        with self.lock:  # Ensure thread-safe access
+            for idx, text in enumerate(new_texts):
+                if text not in self.cache:  # Re-check in case another thread added it
+                    self.cache[text] = new_embeddings[idx]
+                embedding_list[new_texts_ori_idx[idx]] = new_embeddings[idx]
+        return embedding_list
+
+    def _periodic_save(self):
+        """Periodically save the cache to disk."""
+        while True:
+            sleep(self.save_interval)
+            self.dump_cache()
+
+
+# Only for testing
+class DummyModel:
+    def padding_text(self, text):
+        # Repeat then truncate so the toy embedding always sees exactly 1024 chars
+        return (text * (1024 // len(text) + 1))[:1024]
+
+    def get_embeddings(self, text_list):
+        embeddings = np.zeros((len(text_list), 1024), np.float32)
+        for idx, text in enumerate(text_list):
+            text = self.padding_text(text)
+            embedding = np.array([ord(c) for c in text], np.float32)
+            embeddings[idx] = embedding
+        return embeddings
+
+if __name__ == "__main__":
+    model = DummyModel()
+    manager = EmbeddingManager(model)
+    print(manager.get_embeddings(["hello"]))
+    print(manager.get_embeddings(["world"]))
+    print(manager.get_embeddings(["hello world"]))
+    manager.dump_cache()
+    print(manager.get_embeddings(["new", "word"]))
+    manager.dump_cache()
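
Since the module ships DummyModel and a __main__ test, a persistence round-trip is easy to check; a sketch assuming the filelock package is installed and cache/ is writable (the cache_file path is invented):

from applications.embedding_manager import EmbeddingManager, DummyModel

# First instance computes embeddings through the model and persists them
m1 = EmbeddingManager(DummyModel(), cache_file="cache/demo_cache")
m1.get_embeddings(["hello", "world"])
m1.dump_cache()  # atomic rename of the .npy + .keys files under the FileLock

# A fresh instance reloads the files in load_cache() and serves cached texts
# without touching the model
m2 = EmbeddingManager(DummyModel(), cache_file="cache/demo_cache")
print(m2.get_embeddings(["hello"]))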

+ 1 - 1
applications/functions/__init__.py

@@ -2,7 +2,7 @@
 @author: luojunhui
 """
 from .article_account import ArticleRank
-from .article_tools import title_sim_v2_by_list
+from .article_tools import title_sim_v2_by_list, is_bad
 from .server_article_account import get_article_title_url_list, get_article_titles
 
 

+ 4 - 2
applications/functions/article_account.py

@@ -5,12 +5,14 @@ import json
 
 import requests
 
+from applications.config import port
+
 
 class ArticleRank(object):
     """
     Account ranking
     """
-    url = "http://192.168.100.31:8179/score_list"
+    url = "http://localhost:{}/score_list".format(port)
 
     @classmethod
     def rank(cls, account_list, text_list):
@@ -25,7 +27,7 @@ class ArticleRank(object):
             "text_list": text_list,
             "max_time": None,
             "min_time": None,
-            "interest_type": "by_avg",
+            "interest_type": "avg",
             "sim_type": "mean",
             "rate": 0.1
         }
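
The wrapper above now derives its URL from applications.config. A minimal invocation (the nickname is a placeholder); note the body still posts account_nickname_list while AccountServer now iterates gh_id_list, so results may be empty until the client is migrated:

from applications.functions.article_account import ArticleRank

resp = ArticleRank.rank(
    account_list=["生活良读"],  # placeholder account nickname
    text_list=["候选标题一", "候选标题二"],
)
print(resp)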

Diff content suppressed because it is too large to display
+ 3 - 0
applications/functions/article_tools.py


+ 45 - 0
applications/functions/bad.txt

@@ -0,0 +1,45 @@
+男人跟婚外女人很难断,不是因为爱,而是这三个原因,别傻傻不知
+瑞士和平会,乌有2个好消息,有得就有失,捧场的基本是西方国家
+华春莹代表中国,用《琉球地位未定论》,好好给日本上了一课
+再下一城,27年的等待一朝绽放,中国开始放大招
+一把大火,文明倒退1000年
+善恶终有报?晚年有了儿子的王刚本以为可以安度晚年,但如今的情况却让人感叹
+赵丽蓉:中国第一位小品女皇,因一句台词识破潜伏间谍,72岁自备寿衣,临终前为巩汉林留一句忠告
+致命一击!美国的惊天谎言被拆穿,暴露其真实嘴脸!
+男生遭醉汉掐脖反击后被刑拘,警方最新回应
+“夫妻不送葬,送葬必遭殃”,为什么老伴不能去送呢?亲人去世后,最忌讳的八件事,别不当回事!
+102岁杨振宁豪宅内,宴请小他27岁的岳母听沪剧,翁帆一个举动真情流露
+中国垫资38亿美元帮修铁路,6年过去没还,还想期限延长至50年!
+结婚前新娘突然要增加50万彩礼,否则不嫁,新郎无奈放弃,新娘收50万彩礼另嫁他人,1年后,报应来了!
+张学良到死都不知道,当年救他一命的不是宋美龄,而是一个痴心女
+美大选反转再反转!笑到最后的竟是她?拜登、特朗普都始料未及
+大妈揣80万去儿子家住一周,7天吃咸菜,孙子:等你走了吃煎牛排
+山西农民房檐上石狮子7万没有卖,3天后发生的事情让他始料未及
+当不成总统了?美大选有变,中方需早做准备
+女人最佳“绝经期”已公布,不是45岁,而是这个数,越接近越健康
+村头光棍在下雪天捡一女婴,终身未娶拉扯张长大28年,女孩回报方式看哭众人
+杜聿明的女儿,嫁给自己的老师杨振宁,生三个孩子事业有成
+他享有国葬殊荣,临终捐光2000亿,大儿子开出租谋生
+男人跟婚外女人很难断,不是因为爱,而是这三个原因,别傻傻不知
+瑞士和平会,乌有2个好消息,有得就有失,捧场的基本是西方国家
+华春莹代表中国,用《琉球地位未定论》,好好给日本上了一课
+再下一城,27年的等待一朝绽放,中国开始放大招
+一把大火,文明倒退1000年
+善恶终有报?晚年有了儿子的王刚本以为可以安度晚年,但如今的情况却让人感叹
+赵丽蓉:中国第一位小品女皇,因一句台词识破潜伏间谍,72岁自备寿衣,临终前为巩汉林留一句忠告
+致命一击!美国的惊天谎言被拆穿,暴露其真实嘴脸!
+男生遭醉汉掐脖反击后被刑拘,警方最新回应
+“夫妻不送葬,送葬必遭殃”,为什么老伴不能去送呢?亲人去世后,最忌讳的八件事,别不当回事!
+102岁杨振宁豪宅内,宴请小他27岁的岳母听沪剧,翁帆一个举动真情流露
+中国垫资38亿美元帮修铁路,6年过去没还,还想期限延长至50年!
+结婚前新娘突然要增加50万彩礼,否则不嫁,新郎无奈放弃,新娘收50万彩礼另嫁他人,1年后,报应来了!
+张学良到死都不知道,当年救他一命的不是宋美龄,而是一个痴心女
+美大选反转再反转!笑到最后的竟是她?拜登、特朗普都始料未及
+大妈揣80万去儿子家住一周,7天吃咸菜,孙子:等你走了吃煎牛排
+山西农民房檐上石狮子7万没有卖,3天后发生的事情让他始料未及
+当不成总统了?美大选有变,中方需早做准备
+女人最佳“绝经期”已公布,不是45岁,而是这个数,越接近越健康
+村头光棍在下雪天捡一女婴,终身未娶拉扯张长大28年,女孩回报方式看哭众人
+杜聿明的女儿,嫁给自己的老师杨振宁,生三个孩子事业有成
+他享有国葬殊荣,临终捐光2000亿,大儿子开出租谋生
+到底是奶奶亲,还是姥姥亲?科学的排序现实又扎心,对照看看和你想得一样吗?

+ 35 - 21
applications/pipeline.py

@@ -5,14 +5,41 @@ import time
 
 import requests
 
-from applications.functions import title_sim_v2_by_list
-from applications.functions import get_article_titles
+from applications.functions import title_sim_v2_by_list, is_bad
+from applications.config import port
 
 
 class LongArticlesPipeline(object):
     """
     Long articles Pipeline
     """
+    @classmethod
+    def get_titles(cls, account_name, index_list):
+        """
+        :param account_name:
+        :param index_list:
+        :return:
+        """
+        print("requesting title_list")
+        print(account_name)
+        print(index_list)
+        url = "http://localhost:{}/title_list".format(port)
+        response = requests.request(
+            "POST",
+            url=url,
+            headers={},
+            json={
+                "account_name": account_name,
+                "index_list": index_list,
+                "min_time": None,
+                "max_time": None,
+                "msg_type": "9"
+            }
+        )
+        print(response.status_code)
+        print(response.text)
+        print(response.json())
+        return response.json()['title_list']
 
     @classmethod
     def history_title(cls, account_nickname):
@@ -24,11 +51,13 @@ class LongArticlesPipeline(object):
         # if "【1】" in plan_name or "【2】" in plan_name:
         index_list_1 = [1, 2]
         index_list_2 = [1, 2, 3, 4, 5, 6, 7, 8]
-        account_title_list_1 = get_article_titles(
+        print("requesting history titles")
+        account_title_list_1 = cls.get_titles(
             account_nickname,
             index_list=index_list_1
         )
-        account_title_list_2 = get_article_titles(
+        print(account_title_list_1)
+        account_title_list_2 = cls.get_titles(
             account_nickname,
             index_list=index_list_2
         )
@@ -57,7 +86,7 @@ class LongArticlesPipeline(object):
         """
         Check whether a title is safe (sensitive-content filter)
         """
-        url = "http://192.168.100.31:8177/sensitive/is_sensitive"
+        url = "http://61.48.133.26:8177/sensitive/is_sensitive"
         body = {
             "text": title
         }
@@ -76,22 +105,7 @@ class LongArticlesPipeline(object):
         :param account_nickname:
         :return:
         """
-        url = "http://192.168.100.31:8176/bad/is_bad"
-        headers = {
-            "accept": "application/json",
-            "Content-Type": "application/json"
-        }
-        body = {
-            "account_nickname": account_nickname,
-            "title": title
-        }
-        response = requests.request(
-            "POST",
-            url=url,
-            headers=headers,
-            json=body
-        )
-        return response.json()['is_bad']
+        return is_bad(title)
 
     @classmethod
     def deal(cls, article_obj, account_name, history_title_dict):

+ 32 - 11
applications/textSimilarity.py

@@ -17,39 +17,60 @@ def score_to_attention(score, symbol=1):
     score_attn = torch.nn.functional.softmax(score_norm, dim=1)
     return score_attn, score_norm, score_pred
 
+def compare_tensor(tensor1, tensor2):
+    if tensor1.shape != tensor2.shape:
+        print(f"[compare_tensor]shape error: {tensor1.shape} vs {tensor2.shape}")
+        return
+    if not torch.allclose(tensor1, tensor2):
+        print("[compare_tensor]value error: tensor1 not close to tensor2")
 
 class NLPFunction(object):
     """
     NLP Task
     """
 
-    def __init__(self, model):
+    def __init__(self, model, embedding_manager):
         self.model = model
+        self.embedding_manager = embedding_manager
 
-    def base_string_similarity(self, text_dict):
+    def direct_similarity(self, a, b):
+        return self.model.similarity(a, b)
+
+    def cached_similarity(self, a, b):
+        text_emb1 = self.embedding_manager.get_embeddings(a)
+        text_emb2 = self.embedding_manager.get_embeddings(b)
+        score_function = self.model.score_functions['cos_sim']
+        score_tensor = score_function(text_emb1, text_emb2)
+        return score_tensor
+
+    def base_string_similarity(self, text_dict, use_cache=True):
         """
         Base utility: compute the similarity of two strings
         :param text_dict:
         :return:
         """
-        score_tensor = self.model.similarity(
-            text_dict['text_a'],
-            text_dict['text_b']
-        )
+        text_a = text_dict['text_a']
+        text_b = text_dict['text_b']
+        if use_cache:
+            score_tensor = self.cached_similarity(text_a, text_b)
+        else:
+            score_tensor = self.direct_similarity(text_a, text_b)
         response = {
             "score": score_tensor.squeeze().tolist()
         }
         return response
 
-    def base_list_similarity(self, pair_list_dict):
+    def base_list_similarity(self, pair_list_dict, use_cache=True):
         """
         Compute pairwise similarities between two lists
         :return:
         """
-        score_tensor = self.model.similarity(
-            pair_list_dict['text_list_a'],
-            pair_list_dict['text_list_b']
-        )
+        text_a = pair_list_dict['text_list_a']
+        text_b = pair_list_dict['text_list_b']
+        if use_cache:
+            score_tensor = self.cached_similarity(text_a, text_b)
+        else:
+            score_tensor = self.direct_similarity(text_a, text_b)
         response = {
             "score_list_list": score_tensor.tolist()
         }
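
cached_similarity above scores raw embeddings with the model's cos_sim function; a rough torch equivalent, useful when reasoning about shapes (the random inputs are placeholders):

import numpy as np
import torch

def cos_sim_matrix(emb_a: np.ndarray, emb_b: np.ndarray) -> torch.Tensor:
    # Normalize rows, then a matmul yields all pairwise cosine scores
    a = torch.nn.functional.normalize(torch.from_numpy(emb_a), dim=1)
    b = torch.nn.functional.normalize(torch.from_numpy(emb_b), dim=1)
    return a @ b.T  # shape (len(a), len(b)), like base_list_similarity's score tensor

emb_a = np.random.rand(2, 1024).astype(np.float32)
emb_b = np.random.rand(3, 1024).astype(np.float32)
print(cos_sim_matrix(emb_a, emb_b).shape)  # torch.Size([2, 3])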

+ 5 - 4
applications/wxSpider.py

@@ -86,7 +86,7 @@ class ArticleManager(object):
         search articles in wx
         :return:
         """
-        url = "http://8.217.190.241:8888/crawler/wei_xin/keyword"
+        url = "http://47.98.154.124:8888/crawler/wei_xin/keyword"
         payload = json.dumps({
             "keyword": title,
             "cursor": "1"
@@ -106,7 +106,7 @@ class ArticleManager(object):
         :param content_link:
         :return:
         """
-        url = "http://8.217.190.241:8888/crawler/wei_xin/detail"
+        url = "http://47.98.154.124:8888/crawler/wei_xin/detail"
         payload = json.dumps({
             "content_link": content_link,
             "is_count": False,
@@ -123,8 +123,9 @@ class ArticleManager(object):
     def update_msg_list(cls, ghId, index):
         """
         :return:
         """
-        url = 'http://8.217.190.241:8888/crawler/wei_xin/blogger'
+        url = 'http://47.98.154.124:8888/crawler/wei_xin/blogger'
         payload = {
             'account_id': ghId,
             'cursor': index,
@@ -145,7 +146,7 @@ class ArticleManager(object):
         """
         async with aiohttp.ClientSession() as session:
             async with session.post(
-                    url='http://8.217.190.241:8888/crawler/wei_xin/account_info',
+                    url='http://47.98.154.124:8888/crawler/wei_xin/account_info',
                     headers={'Content-Type': 'application/json'},
                     json={"content_link": content_url}
             ) as response:

+ 19 - 0
docker-compose.yaml

@@ -0,0 +1,19 @@
+version: '3.8'
+
+services:
+  server-prod:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    image: long_article_alg_server
+    container_name: long_article_alg_server
+    restart: unless-stopped
+    ports:
+      - '6060:6060'
+    networks:
+      - crawler
+    entrypoint: ["hypercorn", "alg_app:app", "--config", "alg.toml"]
+
+networks:
+  crawler:

+ 2 - 1
requirements.txt

@@ -26,4 +26,5 @@ torch~=2.3.1
 tqdm~=4.66.4
 transformers
 pydantic~=2.6.4
-similarities~=1.1.7
+similarities~=1.1.7
+filelock

+ 25 - 4
routes/__init__.py

@@ -8,9 +8,9 @@ from .accountArticleRank import AccountArticleRank
 from .nlpServer import NLPServer
 from .articleDBServer import ArticleSpider
 from .accountServer import AccountServer
+from applications.articleTools import ArticleDBTools
 
-
-def AlgRoutes(mysql_client, model):
+def AlgRoutes(mysql_client, model, embedding_manager):
     """
     ALG ROUTES
     :return:
@@ -45,12 +45,12 @@ def AlgRoutes(mysql_client, model):
         :return:
         """
         params = await request.get_json()
-        nlpS = NLPServer(params=params, model=model)
+        nlpS = NLPServer(params=params, model=model, embedding_manager=embedding_manager)
         response = nlpS.deal()
         return jsonify(response)
 
     @blueprint.route("/score_list", methods=["POST"])
-    async def articleAccount():
+    async def article_account():
         """
         Official-account article scoring endpoints
         :return:
@@ -60,6 +60,27 @@ def AlgRoutes(mysql_client, model):
         response = await AS.deal()
         return jsonify(response)
 
+    @blueprint.route("/title_list", methods=["POST"])
+    async def accountTitle():
+        """
+        Return the title list of an account
+        :return:
+        """
+        params = await request.get_json()
+        print(params)
+        ADBT = ArticleDBTools(mysql_client=mysql_client)
+        responseDF = await ADBT.getArticleByFilter(
+            gh_id=params['account_name'],  # getArticleByFilter now takes gh_id; clients still post the "account_name" key
+            index_list=params['index_list'],
+            min_time=params['min_time'],
+            max_time=params['max_time'],
+            msg_type=params['msg_type'],
+        )
+        title_list = responseDF['title']
+        response = {"title_list": title_list.values.tolist()}
+        return jsonify(response)
+
     @blueprint.route("/article_crawler", methods=["POST"])
     async def articleMysql():
         """

+ 3 - 0
routes/accountArticleRank.py

@@ -64,9 +64,11 @@ class AccountArticleRank(object):
         """
         self.publishArticleList = []
         self.filter_list = []
+        print("fetching history titles")
         history_title_dict = self.pipeline.history_title(
             account_nickname=self.accountName
         )
+        print(history_title_dict)
         for item in tqdm(self.params["publishArticleList"]):
             flag = self.pipeline.deal(item, self.accountName, history_title_dict)
             if flag:
@@ -89,6 +91,7 @@ class AccountArticleRank(object):
             self.publishNum = self.params["publishNum"]
             print("开始校验参数")
             self.filter()
+            print("params validated")
             self.logger.log(code="1001", msg="params validated", data=self.params)
             return None
         except Exception as e:

+ 63 - 47
routes/accountServer.py

@@ -5,6 +5,7 @@ import json
 
 import aiohttp
 from applications.articleTools import ArticleDBTools
+from applications.config import port
 
 
 class AccountServer(object):
@@ -14,26 +15,28 @@ class AccountServer(object):
 
     def __init__(self, mysql_client, params):
         self.account_name_list = None
+        self.gh_id_list = None
         self.sim_type = None
         self.interest_type = None
         self.min_time = None
         self.max_time = None
         self.rate = None
         self.title_list = None
+        self.view_count_filter = None
         self.params = params
         self.AT = ArticleDBTools(mysql_client)
 
-    async def request_for_nlp(self, title_list, account_interest, account_weight):
+    async def request_for_nlp(self, title_list, account_interest, interest_weight):
         """
         nlp process
         """
         headers = {"Content-Type": "application/json"}
-        url = "http://localhost:6060/nlp"
+        url = "http://localhost:{}/nlp".format(port)
         body = {
             "data": {
                 "text_list_a": [i.replace("'", "") for i in title_list],
                 "text_list_b": [i.replace("'", "") for i in account_interest],
-                "score_list_b": account_weight,
+                "score_list_b": interest_weight,
                 "symbol": 1,
             },
             "function": "similarities_cross_mean" if self.sim_type == "mean" else "similarities_cross_avg"
@@ -41,14 +44,14 @@ class AccountServer(object):
         async with aiohttp.ClientSession() as session:
             async with session.post(url, headers=headers, json=body) as response:
                 response_text = await response.text()
-                print("结果:\t", response_text)
+                # print("结果:\t", response_text)
                 if response_text:
                     return await response.json()
                 else:
                     print("Received empty response")
                     return {}
 
-    def checkParams(self):
+    def check_params(self):
         """
         Validate request params
         :return:
@@ -56,20 +59,23 @@ class AccountServer(object):
         try:
             self.title_list = self.params["text_list"]
             self.account_name_list = self.params.get("account_nickname_list", [])
+            self.gh_id_list = self.params.get("gh_id_list", [])
             self.rate = self.params.get("rate", 0.1)
             self.max_time = self.params.get("max_time")
             self.min_time = self.params.get("min_time")
             self.interest_type = self.params.get("interest_type", "top")
             self.sim_type = self.params.get("sim_type", "mean")
+            self.view_count_filter = self.params.get("view_count_filter", None)
             return None
         except Exception as e:
             response = {"error": "Params error", "detail": str(e)}
             return response
 
-    async def getAccountInterest(
+    async def get_account_interest(
         self,
-        account_name,
-        method,
+        gh_id,
+        interest_type,
+        view_count_filter,
         rate=None,
         msg_type=None,
         index_list=None,
@@ -78,71 +84,81 @@ class AccountServer(object):
     ):
         """
         Build the interest profile (good titles plus weights) for an account
-        :param account_name:
+        :param gh_id:
         :param max_time:
         :param min_time:
         :param index_list:
         :param msg_type:
-        :param keys_dict:
         :param rate:
-        :param gh_id:
-        :param method:
+        :param interest_type:
+        :param view_count_filter:
         :return:
         """
         good_df, bad_df = await self.AT.get_good_bad_articles(
-            account_name=account_name,
-            method=method,
+            gh_id=gh_id,
+            interest_type=interest_type,
             msg_type=msg_type,
             index_list=index_list,
             min_time=min_time,
             max_time=max_time,
             rate=rate,
+            view_count_filter=view_count_filter,
         )
-        view_count_list = good_df["show_view_count"].values.tolist()
-        title_list = good_df["title"].values.tolist()
-        print(view_count_list)
-        print(title_list)
-        return title_list, view_count_list
+        extend_dicts = {
+            'view_count': good_df["show_view_count"].values.tolist(),
+        }
+        if 'view_count_avg' in good_df.columns:
+            extend_dicts['view_count_rate'] = \
+                (good_df["show_view_count"] / good_df["view_count_avg"]).values.tolist()
 
-    async def getEachAccountScoreList(self, account_name):
+        account_interest = good_df["title"].values.tolist()
+        return account_interest, extend_dicts
+
+    async def get_each_account_score_list(self, gh_id):
         """
         Score the candidate titles against a single account
         :return:
         """
-        # try:
-        account_interest, account_weight = await self.getAccountInterest(
-            account_name=account_name,
-            method=self.interest_type,
-            rate=self.rate
-        )
-        sim_key = "score_list_mean" if self.sim_type == "mean" else "score_list_avg"
-        response = await self.request_for_nlp(
-            title_list=self.title_list,
-            account_interest=account_interest,
-            account_weight=account_weight
-        )
-        return {
-            "score_list": response[sim_key],
-            "text_list_max": response["text_list_max"],
-        }
-        # except Exception as e:
-        #     print(e)
-        #     return {
-        #         "score_list": [0] * len(self.title_list),
-        #         "text_list_max": self.title_list,
-        #     }
+        try:
+            account_interest, extend_dicts = await self.get_account_interest(
+                gh_id=gh_id,
+                interest_type=self.interest_type,
+                rate=self.rate,
+                view_count_filter=self.view_count_filter,
+                min_time=self.min_time,
+                max_time=self.max_time,
+            )
+            interest_weight = extend_dicts['view_count']
+            if self.sim_type == "weighted_by_view_count_rate":
+                interest_weight = extend_dicts['view_count_rate']
+            response = await self.request_for_nlp(
+                title_list=self.title_list,
+                account_interest=account_interest,
+                interest_weight=interest_weight
+            )
+            score_list_key = "score_list_mean" if self.sim_type == "mean" else "score_list_avg"
+            return {
+                "score_list": response[score_list_key],
+                "text_list_max": response["text_list_max"],
+            }
+        except Exception as e:
+            print(e)
+            return {
+                "score_list": [0] * len(self.title_list),
+                "text_list_max": self.title_list,
+            }
 
-    async def getAccountListScoreList(self):
+    async def get_account_list_score_list(self):
         """
         Score candidates against every account in gh_id_list
         :return:
         """
         response = {}
-        for accountName in self.account_name_list:
-            if response.get(accountName):
+        for gh_id in self.gh_id_list:
+            if response.get(gh_id):
                 continue
             else:
-                response[accountName] = await self.getEachAccountScoreList(account_name=accountName)
+                response[gh_id] = await self.get_each_account_score_list(gh_id=gh_id)
         return response
 
     async def deal(self):
@@ -151,5 +167,5 @@ class AccountServer(object):
         :return:
         """
         return (
-            self.checkParams() if self.checkParams() else await self.getAccountListScoreList()
+            self.check_params() if self.check_params() else await self.get_account_list_score_list()
         )
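
For reference, a request body exercising the parameters check_params now reads; the host, gh_id, and thresholds are placeholders, assuming the server runs locally on the alg.toml port:

import requests

body = {
    "gh_id_list": ["gh_xxxxxxxxxxxx"],          # placeholder account id
    "text_list": ["候选标题一", "候选标题二"],
    "interest_type": "account_avg",             # baseline from account_avg_info_v3
    "sim_type": "weighted_by_view_count_rate",  # weight by show_view_count / view_count_avg
    "view_count_filter": 500,                   # drop articles with 500 views or fewer
    "rate": 0.1,
    "min_time": None,
    "max_time": None,
}
print(requests.post("http://localhost:6060/score_list", json=body).json())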

+ 64 - 57
routes/articleDBServer.py

@@ -28,8 +28,6 @@ class ArticleSpider(object):
         """
         try:
             self.ghId = self.params['ghId']
-            # self.startTime = self.params['startTime']
-            # self.endTime = self.params['endTime']
             return None
         except Exception as e:
             return {
@@ -44,10 +42,11 @@ class ArticleSpider(object):
         :return:
         """
         sql = f"""
-        select accountName, updateTime 
-        from official_articles 
-        where ghId = '{self.ghId}' 
-        order by updateTime DESC;"""
+            select accountName, updateTime 
+            from official_articles_v2 
+            where ghId = '{self.ghId}' 
+            order by updateTime DESC;
+            """
         result = await self.mysql_client.async_select(sql)
         if result:
             account_name, update_time = result[0]
@@ -76,69 +75,77 @@ class ArticleSpider(object):
             appMsgId = info.get("AppMsg", {}).get("BaseInfo", {}).get("AppMsgId", None)
             createTime = info.get("AppMsg", {}).get("BaseInfo", {}).get("CreateTime", None)
             updateTime = info.get("AppMsg", {}).get("BaseInfo", {}).get("UpdateTime", None)
-            if int(time.time()) - int(updateTime) <= 20 * 60 * 60:
-                continue
+            # if int(time.time()) - int(updateTime) <= 20 * 60 * 60:
+            #     continue
             Type = info.get("AppMsg", {}).get("BaseInfo", {}).get("Type", None)
             detail_article_list = info.get("AppMsg", {}).get("DetailInfo", [])
             if detail_article_list:
                 for article in detail_article_list:
+                    title = article.get("Title", None)
+                    Digest = article.get("Digest", None)
+                    ItemIndex = article.get("ItemIndex", None)
+                    ContentUrl = article.get("ContentUrl", None)
+                    SourceUrl = article.get("SourceUrl", None)
+                    CoverImgUrl = article.get("CoverImgUrl", None)
+                    CoverImgUrl_1_1 = article.get("CoverImgUrl_1_1", None)
+                    CoverImgUrl_235_1 = article.get("CoverImgUrl_235_1", None)
+                    ItemShowType = article.get("ItemShowType", None)
+                    IsOriginal = article.get("IsOriginal", None)
+                    ShowDesc = article.get("ShowDesc", None)
+                    show_stat = show_desc_to_sta(ShowDesc)
+                    ori_content = article.get("ori_content", None)
+                    show_view_count = show_stat.get("show_view_count", 0)
+                    show_like_count = show_stat.get("show_like_count", 0)
+                    show_zs_count = show_stat.get("show_zs_count", 0)
+                    show_pay_count = show_stat.get("show_pay_count", 0)
+                    wx_sn = ContentUrl.split("&sn=")[1].split("&")[0] if ContentUrl else None
+                    info_tuple = (
+                        gh_id,
+                        account_name,
+                        appMsgId,
+                        title,
+                        Type,
+                        createTime,
+                        updateTime,
+                        Digest,
+                        ItemIndex,
+                        ContentUrl,
+                        SourceUrl,
+                        CoverImgUrl,
+                        CoverImgUrl_1_1,
+                        CoverImgUrl_235_1,
+                        ItemShowType,
+                        IsOriginal,
+                        ShowDesc,
+                        ori_content,
+                        show_view_count,
+                        show_like_count,
+                        show_zs_count,
+                        show_pay_count,
+                        wx_sn,
+                        json.dumps(baseInfo, ensure_ascii=False)
+                    )
                     try:
-                        title = article.get("Title", None)
-                        Digest = article.get("Digest", None)
-                        ItemIndex = article.get("ItemIndex", None)
-                        ContentUrl = article.get("ContentUrl", None)
-                        SourceUrl = article.get("SourceUrl", None)
-                        CoverImgUrl = article.get("CoverImgUrl", None)
-                        CoverImgUrl_1_1 = article.get("CoverImgUrl_1_1", None)
-                        CoverImgUrl_235_1 = article.get("CoverImgUrl_235_1", None)
-                        ItemShowType = article.get("ItemShowType", None)
-                        IsOriginal = article.get("IsOriginal", None)
-                        ShowDesc = article.get("ShowDesc", None)
-                        show_stat = show_desc_to_sta(ShowDesc)
-                        ori_content = article.get("ori_content", None)
-                        show_view_count = show_stat.get("show_view_count", 0)
-                        show_like_count = show_stat.get("show_like_count", 0)
-                        show_zs_count = show_stat.get("show_zs_count", 0)
-                        show_pay_count = show_stat.get("show_pay_count", 0)
-                        wx_sn = ContentUrl.split("&sn=")[1].split("&")[0] if ContentUrl else None
-                        info_tuple = (
-                            gh_id,
-                            account_name,
-                            appMsgId,
-                            title,
-                            Type,
-                            createTime,
-                            updateTime,
-                            Digest,
-                            ItemIndex,
-                            ContentUrl,
-                            SourceUrl,
-                            CoverImgUrl,
-                            CoverImgUrl_1_1,
-                            CoverImgUrl_235_1,
-                            ItemShowType,
-                            IsOriginal,
-                            ShowDesc,
-                            ori_content,
-                            show_view_count,
-                            show_like_count,
-                            show_zs_count,
-                            show_pay_count,
-                            wx_sn,
-                            json.dumps(baseInfo, ensure_ascii=False)
-                        )
                         insert_sql = f"""
-                            INSERT INTO official_articles
+                            INSERT INTO official_articles_v2
                             (ghId, accountName, appMsgId, title, Type, createTime, updateTime, Digest, ItemIndex, ContentUrl, SourceUrl, CoverImgUrl, CoverImgUrl_1_1, CoverImgUrl_255_1, ItemShowType, IsOriginal, ShowDesc, ori_content, show_view_count, show_like_count, show_zs_count, show_pay_count, wx_sn, baseInfo)
                             values
                             (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                             """
                         await self.mysql_client.async_insert(sql=insert_sql, params=info_tuple)
-                        print("更新成功")
+                        print("insert ok")
                     except Exception as e:
-                        print("error")
-                        print(e)
-                        continue
+                        try:
+                            update_sql = f"""
+                            UPDATE official_articles_v2
+                            SET show_view_count = %s, show_like_count=%s
+                            WHERE wx_sn = %s;
+                            """
+                            await self.mysql_client.async_insert(sql=update_sql, params=(show_view_count, show_like_count, wx_sn))
+                            print("update ok")
+                        except Exception as e:
+                            print("update failed: {}".format(e))
+                            continue
 
     async def getAccountArticleList(self, gh_id, account_name, last_update_time, cursor=None):
         """

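The insert-then-update fallback above costs two round trips on a duplicate; if official_articles_v2 has a UNIQUE index on wx_sn (an assumption about the schema), MySQL can collapse it into a single upsert, sketched here with a trimmed column list:

# Hypothetical single-statement alternative; column list abbreviated
upsert_sql = """
    INSERT INTO official_articles_v2
        (ghId, accountName, title, updateTime, show_view_count, show_like_count, wx_sn)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
        show_view_count = VALUES(show_view_count),
        show_like_count = VALUES(show_like_count);
"""
# await self.mysql_client.async_insert(sql=upsert_sql, params=short_tuple)
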
+ 5 - 5
routes/nlpServer.py

@@ -3,19 +3,18 @@
 """
 from applications.textSimilarity import NLPFunction
 
-
 class NLPServer(object):
     """
     nlp_server
     """
-    def __init__(self, params, model):
+    def __init__(self, params, model, embedding_manager):
         """
         :param params:
         """
         self.data = None
         self.function = None
         self.params = params
-        self.nlp = NLPFunction(model=model)
+        self.nlp = NLPFunction(model=model, embedding_manager=embedding_manager)
 
     def check_params(self):
         """
@@ -25,6 +24,7 @@ class NLPServer(object):
         try:
             self.data = self.params['data']
             self.function = self.params['function']
+            self.use_cache = self.params.get('use_cache', True)
             print("参数校验成功")
             return None
         except Exception as e:
@@ -41,9 +41,9 @@ class NLPServer(object):
         """
         match self.function:
             case "similarities":
-                return self.nlp.base_string_similarity(text_dict=self.data)
+                return self.nlp.base_string_similarity(text_dict=self.data, use_cache=self.use_cache)
             case "similarities_cross":
-                return self.nlp.base_list_similarity(pair_list_dict=self.data)
+                return self.nlp.base_list_similarity(pair_list_dict=self.data, use_cache=self.use_cache)
             case "similarities_cross_max":
                 return self.nlp.max_cross_similarity(data=self.data)
             case "similarities_cross_avg":

+ 23 - 0
test/article_list.py

@@ -0,0 +1,23 @@
+"""
+@author: luojunhui
+"""
+import requests
+
+from applications.config import port
+
+url = "http://localhost:{}/title_list".format(port)
+
+response = requests.request(
+    "POST",
+    url=url,
+    headers={},
+    json={
+        "account_name": "趣味晚年",
+        "index_list": [1, 2],
+        "min_time": None,
+        "max_time": None,
+        "msg_type": "9"
+    }
+)
+print(response.json())
+print(len(response.json()['title_list']))

+ 1 - 1
test/nlp_dev.py

@@ -192,7 +192,7 @@ def test_request(url):
     print(b - a)
 
 
-url_list = ["http://192.168.100.31:6061/nlp"]
+url_list = ["http://192.168.100.31:6060/nlp"]
 test_request(url_list[0])
 # with ThreadPoolExecutor(max_workers=3) as Pool:
 #     Pool.map(test_request, url_list)

Diff content suppressed because it is too large to display
+ 5 - 7
test/rank_dev.py


+ 34 - 31
test/score_list_dev.py

@@ -2,45 +2,48 @@
 @author: luojunhui
 """
 import json
+import time
 
 import requests
+from concurrent.futures.thread import ThreadPoolExecutor
 
 
-class ArticleRank(object):
-    """
-    Account ranking
+def score_list(account):
     """
     url = "http://192.168.100.31:8179/score_list"
     url1 = "http://47.98.154.124:6060/score_list"
     # url1 = "http://localhost:6060/score_list"
-    url2 = "http://192.168.100.31:8179/score_list"
-
-    @classmethod
-    def rank(cls, account_list, text_list):
-        """
-        Rank
-        :param account_list:
-        :param text_list:
-        :return:
-        """
-        body = {
-            "account_nickname_list": account_list,
-            "text_list": text_list,
-            "max_time": None,
-            "min_time": None,
-            "interest_type": "avg",
-            "sim_type": "mean",
-            "rate": 0.1
-        }
-        response = requests.post(url=cls.url, headers={}, json=body).json()
-        return response
+    url2 = "http://192.168.100.31:6062/score_list"
+    :param account:
+    :return:
+    """
+    url2 = "http://47.98.136.48:6060/score_list"
+    body = {
+        "account_nickname_list": [account],
+        "text_list": [
+            "在俄罗斯买好地了,却发现没有公路、码头、仓储、燃气管道……”",
+            "被霸占15年后成功收回,岛礁资源超100万吨,曾遭到美菲联手抢夺",
+            "感人!河南姐弟被父母遗弃,7岁弟弟带着姐姐看病:别怕,以后我养",
+            "山东26岁女子产下罕见“4胞胎”,丈夫却突然消失,婆婆:养不起"
+        ],
+        "max_time": None,
+        "min_time": None,
+        "interest_type": "avg",
+        "sim_type": "mean",
+        "rate": 0.1
+    }
+    response = requests.post(url=url2, headers={}, json=body).json()
+    print(json.dumps(response, ensure_ascii=False, indent=4))
+    return response
 
 
 if __name__ == '__main__':
-    AR = ArticleRank()
-    response = AR.rank(
-        account_list=['生活良读'],
-        text_list=['保姆为300万拆迁款,嫁给大24岁老头,丈夫去世后,她发现房产证没有丈夫名字'] * 10,
-
-    )
-    print(json.dumps(response, ensure_ascii=False, indent=4))
+    # a = time.time()
+    # with ThreadPoolExecutor(max_workers=100) as pool:
+    #     pool.map(score_list, ["生活良读"] * 1)
+    # b = time.time()
+    # print(b - a)
+    a = time.time()
+    res = score_list("生活情感叁读")
+    b = time.time()
+    print(b - a)

+ 67 - 0
tools/request_cache.py

@@ -0,0 +1,67 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:fenc=utf-8
+
+"""
+A simple tool to request remote to make cache
+"""
+
+import json
+import time
+
+import requests
+from concurrent.futures.thread import ThreadPoolExecutor
+
+PLAN_ID = "20240813095121995118754"
+ACCOUNT_MAP = {
+    "gh_6d205db62f04": "20231214075906052349516",
+    "gh_0c89e11f8bf3": "20231214075715819462085"
+}
+SERVER_URL = "http://47.98.136.48:6060/score_list"
+
+def get_articles(plan_id, account_id):
+    URL = 'http://aigc-api.cybertogether.net/aigc/publish/content/gzhWaitingPublishContent'
+    headers = {
+        "Content-Type": "application/json;charset=UTF-8"
+    }
+    payload = {
+        "params": {
+            "accountId": account_id,
+            "planId": plan_id
+        }
+    }
+    resp = requests.post(URL, headers=headers, json=payload)
+    json_data = resp.json()
+    content_titles = [x['title'].replace("'", "") for x in json_data['data']]
+    return content_titles
+
+def score_list(server_url, titles, account_gh_id):
+    # Sample titles kept for ad-hoc testing; not used when titles are passed in
+    predefined_titles = [
+        "在俄罗斯买好地了,却发现没有公路、码头、仓储、燃气管道……”",
+        "被霸占15年后成功收回,岛礁资源超100万吨,曾遭到美菲联手抢夺",
+        "感人!河南姐弟被父母遗弃,7岁弟弟带着姐姐看病:别怕,以后我养",
+        "山东26岁女子产下罕见“4胞胎”,丈夫却突然消失,婆婆:养不起",
+        "突然,中国资产大爆发!A50指数期货直线拉升超4.5%,港股大涨!人民币也涨了"
+    ]
+
+    t1 = time.time()
+    body = {
+        "gh_id_list": [account_gh_id],
+        "text_list": titles,
+        "max_time": None,
+        "min_time": None,
+        "interest_type": "avg",
+        "sim_type": "avg",
+        "rate": 0.1
+    }
+    response = requests.post(url=server_url, headers={}, json=body).json()
+    t2 = time.time()
+    print(json.dumps(response, ensure_ascii=False, indent=4))
+    print(f"time: {t2 - t1:.4f}")
+    return response
+
+
+if __name__ == '__main__':
+    titles = get_articles(PLAN_ID, ACCOUNT_MAP['gh_6d205db62f04'])
+    score_list(SERVER_URL, titles, 'gh_6d205db62f04')

Some files were not shown because too many files changed in this diff