Browse Source

公众号账号抓取分析

luojunhui 2 tháng trước
mục cha
commit
b0b20157cb

+ 6 - 5
applications/tasks/crawler_tasks/crawler_account_manager.py

@@ -3,6 +3,7 @@ import math
 
 from datetime import datetime
 from typing import Optional, List, Dict
+import pandas as pd
 from pandas import DataFrame
 from scipy import stats
 from tqdm.asyncio import tqdm
@@ -113,12 +114,12 @@ class WeixinAccountManager(CrawlerAccountManager):
             ci_lower, ci_upper = mean - margin, mean + margin
 
         # 计算发文频率
-        publish_times = dataframe["publish_time"].dropna()
+        publish_times = pd.to_numeric(dataframe["publish_time"], errors="coerce")
+        publish_times = publish_times.replace([float("inf"), float("-inf")], pd.NA).dropna()
         if len(publish_times) >= 2:
-            delta = publish_times.max() - publish_times.min()
-            publish_frequency = (
-                (len(publish_times) / delta * self.ONE_DAY_TIMESTAMP) if delta else 0.0
-            )
+            dates = pd.to_datetime(publish_times, unit="s").dt.normalize()
+            days_delta = max(int((dates.max() - dates.min()).days) + 1, 1)
+            publish_frequency = len(publish_times) / days_delta
         else:
             publish_frequency = 0.0