|
@@ -22,6 +22,7 @@ from config import apolloConfig
|
|
|
# Module-level objects instantiated when this module is imported.
# NOTE(review): judging only by the class names these are presumably the task
# constants, a bundle of shared helper functions and the Apollo configuration
# client -- confirm against their definitions.
const = AccountAssociationTaskConst()
|
|
|
function = Functions()
|
|
|
config = apolloConfig()
|
|
|
# Shared empty dict intended as a read-only fallback for chained ``.get()``
# lookups. It is one mutable object shared by every user, so it must never be
# modified in place.
+empty_dict = {}
|
|
|
|
|
|
|
|
|
def get_inner_account_gh_id() -> Set[str]:
|
|
@@ -48,6 +49,7 @@ class AccountAssociationCrawler(object):
|
|
|
self.inner_account_count = 0
|
|
|
self.account_name_filter_count = 0
|
|
|
self.already_crawler_account_count = 0
|
|
|
+ self.official_accounts = 0
|
|
|
|
|
|
def is_bad_account(self, account_name: str) -> bool:
|
|
|
"""
|
|
@@ -62,6 +64,28 @@ class AccountAssociationCrawler(object):
|
|
|
return True
|
|
|
return False
|
|
|
|
|
|
def is_account_official(self, gh_id: str) -> bool:
    """
    Decide whether an account looks like an official account.

    The account's message list is fetched through
    ``self.spider.update_msg_list`` and every entry whose
    ``AppMsg.BaseInfo.Type`` equals 9 (the group-sent messages this check
    is based on) contributes the calendar date of its first
    ``DetailInfo`` item (the headline). The account is reported as
    official as soon as two such messages share the same date, i.e. it
    published more than once on a single day.

    Entries that are malformed (missing ``AppMsg``, empty ``DetailInfo``,
    no ``send_time``) are ignored instead of raising, and a response
    without a usable ``data.data`` list yields ``False``. Errors raised by
    the spider call itself are not caught here.

    :param gh_id: gh_id of the account to inspect
    :return: True if at least two type-9 messages were sent on the same
        day (local time), otherwise False
    """
    response = self.spider.update_msg_list(ghId=gh_id, index=None)

    # Walk down to response["data"]["data"] without assuming either level
    # is present or non-null.
    payload = response.get("data") if isinstance(response, dict) else None
    article_list = payload.get("data") if isinstance(payload, dict) else None
    if not article_list:
        return False

    seen_send_dates = set()
    for item in article_list:
        if not isinstance(item, dict):
            continue
        # ``or {}`` also covers keys that exist but hold None, which a
        # plain ``.get(key, default)`` would not.
        app_msg = item.get("AppMsg") or {}
        base_info = app_msg.get("BaseInfo") or {}
        if base_info.get("Type") != 9:
            continue

        # Only the first DetailInfo entry (the headline) is considered.
        detail_info = app_msg.get("DetailInfo") or []
        if not detail_info:
            continue
        send_time = detail_info[0].get("send_time")
        if send_time is None:
            continue

        send_date = datetime.datetime.fromtimestamp(send_time).date()
        if send_date in seen_send_dates:
            # Second group-sent message on the same day.
            return True
        seen_send_dates.add(send_date)

    return False
|
|
|
+
|
|
|
def get_seed_titles(self, run_date: datetime) -> List[Dict]:
|
|
|
"""
|
|
|
:return:
|
|
@@ -148,6 +172,12 @@ class AccountAssociationCrawler(object):
|
|
|
if self.is_bad_account(account_name):
|
|
|
self.account_name_filter_count += 1
|
|
|
continue
|
|
|
+
|
|
|
+ # 判断账号是否为官方账号
|
|
|
+ if self.is_account_official(gh_id):
|
|
|
+ self.official_accounts += 1
|
|
|
+ continue
|
|
|
+
|
|
|
try:
|
|
|
self.insert_account_into_database(
|
|
|
account_name=account_name,
|
|
@@ -211,8 +241,8 @@ class AccountAssociationCrawler(object):
|
|
|
"总共联想到的账号数": self.total_crawler_count,
|
|
|
"内部账号过滤": self.inner_account_count,
|
|
|
"账号名称过滤": self.account_name_filter_count,
|
|
|
+ "官方账号过滤": self.official_accounts,
|
|
|
"已经抓取账号": self.already_crawler_account_count,
|
|
|
"新增账号": self.crawler_account_count
|
|
|
}
|
|
|
)
|
|
|
-
|