Jelajahi Sumber

优化服务号分组,阅读率 && 阅读均值

luojunhui 1 Minggu lalu
induk
melakukan
9e44a79b7b

+ 1 - 1
applications/api/nlp_api.py

@@ -16,7 +16,7 @@ def similarity_between_title_list(target_title_list: list[str], base_title_list:
     :return: list of similarity
     """
 
-    url = 'http://61.48.133.26:6060/nlp'
+    url = 'http://61.48.133.26:6066/nlp'
     url_backup = 'http://192.168.203.4:6060/nlp'
     body = {
         "data": {

+ 2 - 0
applications/const/__init__.py

@@ -113,6 +113,8 @@ class UpdateAccountReadRateTaskConst:
     # 最低粉丝量
     MIN_FANS = 1000
 
+    GROUP_ACCOUNT_SET  = {'gh_9cf3b7ff486b', 'gh_ecb21c0453af', 'gh_45beb952dc74', 'gh_84e744b16b3a', 'gh_b3ffc1ca3a04', 'gh_b8baac4296cb', 'gh_efaf7da157f5', 'gh_5855bed97938', 'gh_b32125c73861', 'gh_761976bb98a6', 'gh_5e543853d8f0', 'gh_61a72b720de3'}
+
 
 class UpdateAccountReadAvgTaskConst:
     """

+ 28 - 4
cal_account_read_rate_avg_daily.py

@@ -51,17 +51,37 @@ def get_account_articles_detail(db_client, gh_id_tuple, min_publish_timestamp) -
     """
     sql = f"""
             SELECT 
-                ghId, accountName, ItemIndex, show_view_count, publish_timestamp
+                ghId, accountName, ItemIndex, 
+                avg(show_view_count),
+                FROM_UNIXTIME(publish_timestamp, '%Y-%m-%d') AS pub_dt,
+                publish_timestamp
             FROM 
                 official_articles_v2
             WHERE 
-                ghId IN {gh_id_tuple} and Type = '{const.BULK_PUBLISH_TYPE}' and publish_timestamp >= {min_publish_timestamp};
+                ghId IN {gh_id_tuple} and Type = '{const.BULK_PUBLISH_TYPE}' and publish_timestamp >= {min_publish_timestamp}
+            GROUP BY ghId, accountName, ItemIndex, pub_dt;
             """
     response_list = db_client.fetch(query=sql, cursor_type=DictCursor)
     return response_list
 
 
-def cal_account_read_rate(article_list, fans_dict) -> DataFrame:
def get_fans_from_group_send_accounts(db_client, gh_id, dt):
    """
    Estimate the fan count of a group-send account for a given publish date.

    The estimate is the day's total ``sent_count`` across group-send records
    divided by 8 (presumably the number of send groups — TODO confirm).

    :param db_client: database client exposing fetch(query, cursor_type, params)
    :param gh_id: WeChat official-account gh_id
    :param dt: publish date string, e.g. '2025-01-01'
    :return: estimated fan count as an int; 0 when no data exists for that day
    """
    query = """
        SELECT CAST(SUM(sent_count) / 8 as signed) as fans
        FROM long_articles_group_send_result
        WHERE publish_date = %s AND gh_id = %s; 
    """
    response = db_client.fetch(query=query, cursor_type=DictCursor, params=(dt, gh_id))
    # Bug fix: an aggregate query with no matching rows still returns ONE row
    # whose SUM() is NULL, so `if response:` was truthy and the function
    # returned None instead of the intended 0. Coalesce NULL explicitly.
    if response and response[0]['fans'] is not None:
        return int(response[0]['fans'])

    return 0
+
+
+def cal_account_read_rate(db_client, article_list, fans_dict) -> DataFrame:
     """
     计算账号位置的阅读率
     :return:
@@ -71,8 +91,12 @@ def cal_account_read_rate(article_list, fans_dict) -> DataFrame:
         gh_id = line['ghId']
         dt = functions.timestamp_to_str(timestamp=line['publish_timestamp'], string_format='%Y-%m-%d')
         fans = fans_dict.get(gh_id, {}).get(dt, const.DEFAULT_FANS)
+        if gh_id in const.GROUP_ACCOUNT_SET:
+            fans = get_fans_from_group_send_accounts(db_client, gh_id, dt)
+
         if not fans:
             fans = int(unauthorized_account.get(gh_id, const.DEFAULT_FANS))
+
         if not fans:
             fans = int(backup_account_fans.get(gh_id, const.DEFAULT_FANS))
             log(
@@ -313,7 +337,7 @@ def main() -> None:
     article_list = get_account_articles_detail(db_client=piaoquan_crawler_db_client, gh_id_tuple=gh_id_tuple, min_publish_timestamp=min_time)
 
     # cal account read rate and make a dataframe
-    read_rate_dataframe = cal_account_read_rate(article_list, fans_dict)
+    read_rate_dataframe = cal_account_read_rate(db_client=long_articles_db_client, article_list=article_list, fans_dict=fans_dict)
 
     # update each day's data
     update_single_day(dt, account_list, read_rate_dataframe, long_articles_db_client)