Prechádzať zdrojové kódy

Merge branch 'feature/luojunhui/add-token-to-blogger' of luojunhui/LongArticlesJob into master

luojunhui 2 dní pred
rodič
commit
20ea372e27

+ 1 - 0
.gitignore

@@ -28,6 +28,7 @@ test/
 test/
 .vscode
 .DS_Store
+.idea
 
 # PyInstaller
 #  Usually these files are written by a python script from a template

+ 1 - 1
applications/api/nlp_api.py

@@ -16,7 +16,7 @@ def similarity_between_title_list(target_title_list: list[str], base_title_list:
     :return: list of similarity
     """
 
-    url = 'http://61.48.133.26:6060/nlp'
+    url = 'http://192.168.203.4:6060/nlp'
     url_backup = 'http://192.168.203.4:6060/nlp'
     body = {
         "data": {

+ 2 - 0
applications/const/__init__.py

@@ -113,6 +113,8 @@ class UpdateAccountReadRateTaskConst:
     # 最低粉丝量
     MIN_FANS = 1000
 
+    GROUP_ACCOUNT_SET  = {'gh_9cf3b7ff486b', 'gh_ecb21c0453af', 'gh_45beb952dc74', 'gh_84e744b16b3a', 'gh_b3ffc1ca3a04', 'gh_b8baac4296cb', 'gh_efaf7da157f5', 'gh_5855bed97938', 'gh_b32125c73861', 'gh_761976bb98a6', 'gh_5e543853d8f0', 'gh_61a72b720de3'}
+
 
 class UpdateAccountReadAvgTaskConst:
     """

+ 28 - 4
cal_account_read_rate_avg_daily.py

@@ -51,17 +51,37 @@ def get_account_articles_detail(db_client, gh_id_tuple, min_publish_timestamp) -
     """
     sql = f"""
             SELECT 
-                ghId, accountName, ItemIndex, show_view_count, publish_timestamp
+                ghId, accountName, ItemIndex, 
+                avg(show_view_count),
+                FROM_UNIXTIME(publish_timestamp, '%Y-%m-%d') AS pub_dt,
+                publish_timestamp
             FROM 
                 official_articles_v2
             WHERE 
-                ghId IN {gh_id_tuple} and Type = '{const.BULK_PUBLISH_TYPE}' and publish_timestamp >= {min_publish_timestamp};
+                ghId IN {gh_id_tuple} and Type = '{const.BULK_PUBLISH_TYPE}' and publish_timestamp >= {min_publish_timestamp}
+            GROUP BY ghId, accountName, ItemIndex, pub_dt;
             """
     response_list = db_client.fetch(query=sql, cursor_type=DictCursor)
     return response_list
 
 
-def cal_account_read_rate(article_list, fans_dict) -> DataFrame:
+def get_fans_from_group_send_accounts(db_client,gh_id, dt):
+    """
+    获取指定日期发送的文章的账号粉丝数
+    """
+    query = """
+        SELECT CAST(SUM(sent_count) / 8 as signed) as fans
+        FROM long_articles_group_send_result
+        WHERE publish_date = %s AND gh_id = %s; 
+    """
+    response = db_client.fetch(query=query, cursor_type=DictCursor, params=(dt, gh_id))
+    if response:
+        return response[0]['fans']
+
+    return 0
+
+
+def cal_account_read_rate(db_client, article_list, fans_dict) -> DataFrame:
     """
     计算账号位置的阅读率
     :return:
@@ -71,8 +91,12 @@ def cal_account_read_rate(article_list, fans_dict) -> DataFrame:
         gh_id = line['ghId']
         dt = functions.timestamp_to_str(timestamp=line['publish_timestamp'], string_format='%Y-%m-%d')
         fans = fans_dict.get(gh_id, {}).get(dt, const.DEFAULT_FANS)
+        if gh_id in const.GROUP_ACCOUNT_SET:
+            fans = get_fans_from_group_send_accounts(db_client, gh_id, dt)
+
         if not fans:
             fans = int(unauthorized_account.get(gh_id, const.DEFAULT_FANS))
+
         if not fans:
             fans = int(backup_account_fans.get(gh_id, const.DEFAULT_FANS))
             log(
@@ -313,7 +337,7 @@ def main() -> None:
     article_list = get_account_articles_detail(db_client=piaoquan_crawler_db_client, gh_id_tuple=gh_id_tuple, min_publish_timestamp=min_time)
 
     # cal account read rate and make a dataframe
-    read_rate_dataframe = cal_account_read_rate(article_list, fans_dict)
+    read_rate_dataframe = cal_account_read_rate(db_client=long_articles_db_client, article_list=article_list, fans_dict=fans_dict)
 
     # update each day's data
     update_single_day(dt, account_list, read_rate_dataframe, long_articles_db_client)

+ 2 - 1
cold_start/crawler/wechat/official_accounts_api.py

@@ -63,7 +63,8 @@ def get_article_list_from_account(
     payload = json.dumps(
         {
             "account_id": account_id,
-            "cursor": index
+            "cursor": index,
+            "token": "1fa4c0ad5c66e43ebd525611f3869f53"
         }
     )
     try: