1 ay önce · 20ea372e27
--- a/.gitignore
+++ b/.gitignore
@@ -28,6 +28,7 @@ test/
 
				 test/
			
 
				 .vscode
			
 
				 .DS_Store
			
 
				+.idea
			
 
				 
			
 
				 # PyInstaller
			
 
				 #  Usually these files are written by a python script from a template
			
--- a/applications/api/nlp_api.py
+++ b/applications/api/nlp_api.py
@@ -16,7 +16,7 @@ def similarity_between_title_list(target_title_list: list[str], base_title_list:
 
				     :return: list of similarity
			
 
				     """
			
 
				 
			
 
				-    url = 'http://61.48.133.26:6060/nlp'
			
 
				+    url = 'http://192.168.203.4:6060/nlp'
			
 
				     url_backup = 'http://192.168.203.4:6060/nlp'
			
 
				     body = {
			
 
				         "data": {
			
--- a/applications/const/__init__.py
+++ b/applications/const/__init__.py
@@ -113,6 +113,8 @@ class UpdateAccountReadRateTaskConst:
 
				     # 最低粉丝量
			
 
				     MIN_FANS = 1000
			
 
				 
			
 
				+    GROUP_ACCOUNT_SET  = {'gh_9cf3b7ff486b', 'gh_ecb21c0453af', 'gh_45beb952dc74', 'gh_84e744b16b3a', 'gh_b3ffc1ca3a04', 'gh_b8baac4296cb', 'gh_efaf7da157f5', 'gh_5855bed97938', 'gh_b32125c73861', 'gh_761976bb98a6', 'gh_5e543853d8f0', 'gh_61a72b720de3'}
			
 
				+
			
 
				 
			
 
				 class UpdateAccountReadAvgTaskConst:
			
 
				     """
			
--- a/cal_account_read_rate_avg_daily.py
+++ b/cal_account_read_rate_avg_daily.py
@@ -51,17 +51,37 @@ def get_account_articles_detail(db_client, gh_id_tuple, min_publish_timestamp) -
 
				     """
			
 
				     sql = f"""
			
 
				             SELECT 
			
 
				-                ghId, accountName, ItemIndex, show_view_count, publish_timestamp
			
 
				+                ghId, accountName, ItemIndex, 
			
 
				+                avg(show_view_count),
			
 
				+                FROM_UNIXTIME(publish_timestamp, '%Y-%m-%d') AS pub_dt,
			
 
				+                publish_timestamp
			
 
				             FROM 
			
 
				                 official_articles_v2
			
 
				             WHERE 
			
 
				-                ghId IN {gh_id_tuple} and Type = '{const.BULK_PUBLISH_TYPE}' and publish_timestamp >= {min_publish_timestamp};
			
 
				+                ghId IN {gh_id_tuple} and Type = '{const.BULK_PUBLISH_TYPE}' and publish_timestamp >= {min_publish_timestamp}
			
 
				+            GROUP BY ghId, accountName, ItemIndex, pub_dt;
			
 
				             """
			
 
				     response_list = db_client.fetch(query=sql, cursor_type=DictCursor)
			
 
				     return response_list
			
 
				 
			
 
				 
			
 
				-def cal_account_read_rate(article_list, fans_dict) -> DataFrame:
			
 
				+def get_fans_from_group_send_accounts(db_client, gh_id, dt):
			
 
				+    """
			
 
				+    Return the fan count estimated from group-send results for one account and day.
			
 
				+
			
 
				+    The estimate is SUM(sent_count) / 8 over the rows of
			
 
				+    long_articles_group_send_result that match the account and publish date.
			
 
				+    NOTE(review): the divisor 8 is a magic number - presumably the number of
			
 
				+    sends per account per day; confirm and consider a named constant.
			
 
				+
			
 
				+    :param db_client: database client exposing fetch(query, cursor_type, params)
			
 
				+    :param gh_id: gh_id of the account to look up
			
 
				+    :param dt: publish date, compared against the publish_date column
			
 
				+    :return: estimated fan count; 0 when no row matches or nothing is fetched
			
 
				+    """
			
 
				+    query = """
			
 
				+        SELECT CAST(SUM(sent_count) / 8 as signed) as fans
			
 
				+        FROM long_articles_group_send_result
			
 
				+        WHERE publish_date = %s AND gh_id = %s; 
			
 
				+    """
			
 
				+    response = db_client.fetch(query=query, cursor_type=DictCursor, params=(dt, gh_id))
			
 
				+    if not response:
			
 
				+        return 0
			
 
				+
			
 
				+    # An aggregate query without GROUP BY still yields one row when nothing
			
 
				+    # matches, and SUM() is then NULL (None); coerce it so callers get an int.
			
 
				+    return response[0]['fans'] or 0
			
 
				+
			
 
				+
			
 
				+def cal_account_read_rate(db_client, article_list, fans_dict) -> DataFrame:
			
 
				     """
			
 
				     计算账号位置的阅读率
			
 
				     :return:
			
@@ -71,8 +91,12 @@ def cal_account_read_rate(article_list, fans_dict) -> DataFrame:
 
				         gh_id = line['ghId']
			
 
				         dt = functions.timestamp_to_str(timestamp=line['publish_timestamp'], string_format='%Y-%m-%d')
			
 
				         fans = fans_dict.get(gh_id, {}).get(dt, const.DEFAULT_FANS)
			
 
				+        if gh_id in const.GROUP_ACCOUNT_SET:
			
 
				+            fans = get_fans_from_group_send_accounts(db_client, gh_id, dt)
			
 
				+
			
 
				         if not fans:
			
 
				             fans = int(unauthorized_account.get(gh_id, const.DEFAULT_FANS))
			
 
				+
			
 
				         if not fans:
			
 
				             fans = int(backup_account_fans.get(gh_id, const.DEFAULT_FANS))
			
 
				             log(
			
@@ -313,7 +337,7 @@ def main() -> None:
 
				     article_list = get_account_articles_detail(db_client=piaoquan_crawler_db_client, gh_id_tuple=gh_id_tuple, min_publish_timestamp=min_time)
			
 
				 
			
 
				     # cal account read rate and make a dataframe
			
 
				-    read_rate_dataframe = cal_account_read_rate(article_list, fans_dict)
			
 
				+    read_rate_dataframe = cal_account_read_rate(db_client=long_articles_db_client, article_list=article_list, fans_dict=fans_dict)
			
 
				 
			
 
				     # update each day's data
			
 
				     update_single_day(dt, account_list, read_rate_dataframe, long_articles_db_client)
			
--- a/cold_start/crawler/wechat/official_accounts_api.py
+++ b/cold_start/crawler/wechat/official_accounts_api.py
@@ -63,7 +63,8 @@ def get_article_list_from_account(
 
				     payload = json.dumps(
			
 
				         {
			
 
				             "account_id": account_id,
			
 
				-            "cursor": index
			
 
				+            "cursor": index,
			
 
				+            "token": "1fa4c0ad5c66e43ebd525611f3869f53"
			
 
				         }
			
 
				     )
			
 
				     try: