소스 검색

weixin账号抓取

luojunhui 1 년 전
부모
커밋
2b9f379c52
2개의 변경된 파일, 74개의 추가 그리고 12개의 삭제
  1. 67 0
      coldStartTasks/crawler/weixinAccountCrawler.py
  2. 7 12
      coldStartTasks/crawler/weixinCategoryCrawler.py

+ 67 - 0
coldStartTasks/crawler/weixinAccountCrawler.py

@@ -0,0 +1,67 @@
+"""
+@author: luojunhui
+输入文章连接,输出账号信息,并且把账号存储到数据库中
+"""
+import datetime
+
+from tqdm import tqdm
+from applications import WeixinSpider, longArticlesMySQL
+
+
+class weixinAccountCrawler(object):
+    """
+    Resolve article links to accounts via WeixinSpider and insert each account
+    into long_articles_accounts; target_url_list is the links to process.
+    """
+
+    def __init__(self, target_url_list):
+        self.db_client = longArticlesMySQL
+        self.spider = WeixinSpider()
+        self.url_list = target_url_list
+
+    def get_account_detail(self, url):
+        """
+        Fetch the account behind an article link and insert it.
+        :param url: article link, passed to the spider as content_url
+        :return: None
+        """
+        account_detail = self.spider.get_account_by_url(content_url=url)
+        # Account fields sit under data -> data in the spider response.
+        # NOTE(review): a response without them raises here and stops deal().
+        account_obj = account_detail['data']['data']
+        self.insert_account_into_database(account_obj['account_name'], account_obj['wx_gh'])
+
+    def insert_account_into_database(self, account_name, gh_id, category=None):
+        """
+        Insert one account row with source "weixin" and today's date.
+        :param account_name: account name
+        :param gh_id: account gh_id (wx_gh in the spider response)
+        :param category: falsy values fall back to "daily-account-mining"
+        :return: None
+        """
+        # Plain string: the %s markers are bound by the client via params.
+        insert_sql = """
+            INSERT INTO long_articles_accounts
+            (gh_id, account_source, account_name, account_category, init_date)
+            values 
+            (%s, %s, %s, %s, %s)
+        """
+        init_date = datetime.date.today().isoformat()
+        row = (gh_id, "weixin", account_name, category or "daily-account-mining", init_date)
+        self.db_client.update(sql=insert_sql, params=row)
+
+    def deal(self):
+        """
+        Entry point: fetch and insert the account for every link in url_list.
+        :return: None
+        """
+        for url in tqdm(self.url_list):
+            self.get_account_detail(url)
+
+
+if __name__ == '__main__':
+    # Manual run: fetch and store the account behind one sample article link.
+    sample_article_urls = [
+        'https://mp.weixin.qq.com/s/Q9Je-eNKcHNjh8S-NqQLgg',
+    ]
+    weixinAccountCrawler(sample_article_urls).deal()

+ 7 - 12
coldStartTasks/crawler/weixinCategoryCrawler.py

@@ -7,7 +7,7 @@ import time
 
 from tqdm import tqdm
 
-from applications import WeixinSpider, Functions, DeNetMysql, PQMySQL
+from applications import WeixinSpider, Functions, DeNetMysql, longArticlesMySQL
 
 
 class weixinCategory(object):
@@ -16,7 +16,7 @@ class weixinCategory(object):
     """
 
     def __init__(self):
-        self.db_client_pq = PQMySQL()
+        self.db_client_lam = longArticlesMySQL
         self.db_client_dt = DeNetMysql()
         self.spider = WeixinSpider()
         self.function = Functions()
@@ -28,11 +28,11 @@ class weixinCategory(object):
         :return:
         """
         sql = f"""
-            select distinct gh_id, account_source, account_name, account_category, latest_update_time
+            select gh_id, account_source, account_name, account_category, latest_update_time
             from long_articles_accounts 
-            where account_category = '{account_category}';
+            where account_category = '{account_category}' and is_using = 1;
             """
-        account_tuple = self.db_client_pq.select(sql)
+        account_tuple = self.db_client_lam.select(sql)
         result = [
             {
                 "gh_id": i[0],
@@ -104,7 +104,7 @@ class weixinCategory(object):
             set latest_update_time = %s
             where gh_id = %s;
         """
-        self.db_client_pq.update(sql=update_sql, params=(dt_str, gh_id))
+        self.db_client_lam.update(sql=update_sql, params=(dt_str, gh_id))
 
     def updateEachAccountArticles(self, gh_id, category, latest_time_stamp, index=None):
         """
@@ -138,12 +138,7 @@ class weixinCategory(object):
 if __name__ == "__main__":
     wxCategory = weixinCategory()
     category_list = [
-        '军事',
-        '历史',
-        '娱乐八卦',
-        '情感生活',
-        '健康养生',
-        # '新闻媒体'
+        'daily-account-mining'
     ]
     for category in category_list:
         account_list = wxCategory.getAccountList(category)