فهرست منبع

账号联想优化

luojunhui 3 ماه پیش
والد
کامیت
d09c7cf77a
2 فایلهای تغییر یافته به همراه 32 افزوده شده و 1 حذف شده
  1. 1 0
      .gitignore
  2. 31 1
      coldStartTasks/crawler/weixin_account_association_crawler.py

+ 1 - 0
.gitignore

@@ -21,6 +21,7 @@ lib64/
 parts/
 sdist/
 var/
+test/
 *.egg-info/
 .installed.cfg
 *.egg

+ 31 - 1
coldStartTasks/crawler/weixin_account_association_crawler.py

@@ -22,6 +22,7 @@ from config import apolloConfig
 const = AccountAssociationTaskConst()
 function = Functions()
 config = apolloConfig()
+empty_dict = {}
 
 
 def get_inner_account_gh_id() -> Set[str]:
@@ -48,6 +49,7 @@ class AccountAssociationCrawler(object):
         self.inner_account_count = 0
         self.account_name_filter_count = 0
         self.already_crawler_account_count = 0
+        self.official_accounts = 0
 
     def is_bad_account(self, account_name: str) -> bool:
         """
@@ -62,6 +64,28 @@ class AccountAssociationCrawler(object):
                 return True
         return False
 
+    def is_account_official(self, gh_id: str) -> bool:
+        """
+        判断账号是否为官方账号
+        :param gh_id:
+        :return: True or False
+        """
+        response = self.spider.update_msg_list(ghId=gh_id, index=None)
+        article_list = response['data']['data']
+        published_articles_send_date = []
+        for item in article_list:
+            if item.get("AppMsg", empty_dict).get("BaseInfo", empty_dict).get("Type") == 9:
+                # 获取群发头条的send_time
+                send_time = item['AppMsg']['DetailInfo'][0]['send_time']
+                send_date = datetime.datetime.fromtimestamp(send_time).strftime('%Y-%m-%d')
+                published_articles_send_date.append(send_date)
+
+        published_articles_send_date_set = set(published_articles_send_date)
+        if len(published_articles_send_date_set) == len(published_articles_send_date):
+            return False
+        else:
+            return True
+ 
     def get_seed_titles(self, run_date: datetime) -> List[Dict]:
         """
         :return:
@@ -148,6 +172,12 @@ class AccountAssociationCrawler(object):
                         if self.is_bad_account(account_name):
                             self.account_name_filter_count += 1
                             continue
+
+                        # 判断账号是否为官方账号
+                        if self.is_account_official(gh_id):
+                            self.official_accounts += 1
+                            continue
+
                         try:
                             self.insert_account_into_database(
                                 account_name=account_name,
@@ -211,8 +241,8 @@ class AccountAssociationCrawler(object):
                 "总共联想到的账号数": self.total_crawler_count,
                 "内部账号过滤": self.inner_account_count,
                 "账号名称过滤": self.account_name_filter_count,
+                "官方账号过滤": self.official_accounts,
                 "已经抓取账号": self.already_crawler_account_count,
                 "新增账号": self.crawler_account_count
             }
         )
-