Ver código fonte

冷启动--微信品类抓取

luojunhui 1 ano atrás
pai
commit
26642b1b43
1 arquivo alterado com 42 adições e 37 exclusões
  1. 42 37
      coldStartTasks/crawler/weixinCategoryCrawler.py

+ 42 - 37
coldStartTasks/crawler/weixinCategoryCrawler.py

@@ -7,8 +7,14 @@ import time
 
 from tqdm import tqdm
 
-from applications import WeixinSpider, Functions, DeNetMysql, longArticlesMySQL
+from applications import WeixinSpider, Functions, longArticlesMySQL
 
+# 常量
+ACCOUNT_GOOD_STATUS = 1
+DEFAULT_VIEW_COUNT = 0
+DEFAULT_LIKE_COUNT = 0
+DEFAULT_ARTICLE_STATUS = 1
+DEFAULT_TIMESTAMP = 1704038400
 
 class weixinCategory(object):
     """
@@ -16,12 +22,11 @@ class weixinCategory(object):
     """
 
     def __init__(self):
-        self.db_client_lam = longArticlesMySQL
-        self.db_client_dt = DeNetMysql()
+        self.db_client_lam = longArticlesMySQL()
         self.spider = WeixinSpider()
         self.function = Functions()
 
-    def getAccountList(self, account_category):
+    def get_account_list(self, account_category):
         """
         获取账号
         :param account_category 品类
@@ -30,7 +35,7 @@ class weixinCategory(object):
         sql = f"""
             select gh_id, account_source, account_name, account_category, latest_update_time
             from long_articles_accounts 
-            where account_category = '{account_category}' and is_using = 1;
+            where account_category = '{account_category}' and is_using = {ACCOUNT_GOOD_STATUS};
             """
         account_tuple = self.db_client_lam.select(sql)
         result = [
@@ -45,7 +50,7 @@ class weixinCategory(object):
         ]
         return result
 
-    def updateDataIntoMysql(self, gh_id, category, article_list):
+    def insert_data_into_db(self, gh_id, category, article_list):
         """
         将数据更新到数据库
         :return:
@@ -55,15 +60,15 @@ class weixinCategory(object):
             for obj in detail_article_list:
                 try:
                     show_stat = self.function.show_desc_to_sta(obj["ShowDesc"])
-                    show_view_count = show_stat.get("show_view_count", 0)
-                    show_like_count = show_stat.get("show_like_count", 0)
+                    show_view_count = show_stat.get("show_view_count", DEFAULT_VIEW_COUNT)
+                    show_like_count = show_stat.get("show_like_count", DEFAULT_LIKE_COUNT)
                     insert_sql = f"""
                         insert into crawler_meta_article
                         (platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
                         VALUES 
                         (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                     """
-                    self.db_client_dt.update(
+                    self.db_client_lam.update(
                         sql=insert_sql,
                         params=(
                             "weixin",
@@ -78,14 +83,14 @@ class weixinCategory(object):
                             obj["Digest"],
                             obj["send_time"],
                             int(time.time()),
-                            1,
+                            DEFAULT_ARTICLE_STATUS,
                             self.function.generateGzhId(obj["ContentUrl"]),
                         ),
                     )
                 except Exception as e:
                     print(e)
 
-    def updateLatestAccountTimeStamp(self, gh_id):
+    def update_latest_account_timestamp(self, gh_id):
         """
         更新账号的最新时间戳
         :return:
@@ -96,7 +101,7 @@ class weixinCategory(object):
             WHERE out_account_id = '{gh_id}'
             ORDER BY publish_time DESC LIMIT 1;
         """
-        result = self.db_client_dt.select(select_sql)
+        result = self.db_client_lam.select(select_sql)
         time_stamp = result[0][0]
         dt_str = self.function.time_stamp_to_str(time_stamp)
         update_sql = f"""
@@ -106,7 +111,7 @@ class weixinCategory(object):
         """
         self.db_client_lam.update(sql=update_sql, params=(dt_str, gh_id))
 
-    def updateEachAccountArticles(self, gh_id, category, latest_time_stamp, index=None):
+    def update_each_account(self, gh_id, category, latest_time_stamp, index=None):
         """
         更新账号文章
         :return:
@@ -115,13 +120,13 @@ class weixinCategory(object):
         msg_list = response.get("data", {}).get("data")
         if msg_list:
             last_article_in_this_msg = msg_list[-1]
-            self.updateDataIntoMysql(
+            self.insert_data_into_db(
                 gh_id=gh_id, category=category, article_list=msg_list
             )
             last_time_stamp_in_this_msg = last_article_in_this_msg["AppMsg"]["BaseInfo"]["UpdateTime"]
             if latest_time_stamp < last_time_stamp_in_this_msg:
                 next_cursor = response["data"]["next_cursor"]
-                return self.updateEachAccountArticles(
+                return self.update_each_account(
                     gh_id=gh_id,
                     latest_time_stamp=latest_time_stamp,
                     category=category,
@@ -129,32 +134,32 @@ class weixinCategory(object):
                 )
             else:
                 # 更新最近抓取时间
-                self.updateLatestAccountTimeStamp(gh_id=gh_id)
+                self.update_latest_account_timestamp(gh_id=gh_id)
                 print("账号时间更新成功")
         else:
             print("No more data")
 
+    def deal(self, category_list):
+        """
 
-if __name__ == "__main__":
-    wxCategory = weixinCategory()
-    category_list = [
-        'daily-account-mining'
-    ]
-    for category in category_list:
-        account_list = wxCategory.getAccountList(category)
-        for account in tqdm(account_list):
-            try:
-                gh_id = account['gh_id']
-                category = account['category']
+        :param category_list:
+        :return:
+        """
+        for category in category_list:
+            account_list = self.get_account_list(category)
+            for account in tqdm(account_list):
                 try:
-                    timestamp = int(account['latest_timestamp'].timestamp())
+                    gh_id = account['gh_id']
+                    category = account['category']
+                    try:
+                        timestamp = int(account['latest_timestamp'].timestamp())
+                    except Exception as e:
+                        timestamp = DEFAULT_TIMESTAMP
+                    self.update_each_account(
+                        gh_id=gh_id,
+                        category=category,
+                        latest_time_stamp=timestamp
+                    )
+                    print("success")
                 except Exception as e:
-                    timestamp = 1704038400
-                wxCategory.updateEachAccountArticles(
-                    gh_id=gh_id,
-                    category=category,
-                    latest_time_stamp=timestamp
-                )
-                print("success")
-            except Exception as e:
-                print("fail because of {}".format(e))
+                    print("fail because of {}".format(e))