Parcourir la source

developing account crawler

luojunhui il y a 7 mois
Parent
commit
7bfc9d89a3

+ 1 - 0
applications/pipeline/__init__.py

@@ -1,4 +1,5 @@
 """
 @author: luojunhui
 """
+from .account_pipeline import scrape_account_entities_process
 from .crawler_pipeline import scrape_video_entities_process

+ 35 - 0
applications/pipeline/account_pipeline.py

@@ -0,0 +1,35 @@
+"""
+@author: luojunhui
+@description: account crawler pipeline
+"""
+from applications.db import DatabaseConnector
+
+empty_dict = {}
+
+def whether_duplicate_account_id(account_id: str, platform: str, db_client: DatabaseConnector) -> bool:
+    """
+    whether duplicate account id
+    """
+    sql = f"""
+        select id from video_meta_accounts
+        where account_id = %s and platform = %s;
+    """
+    duplicate_id = db_client.fetch(query=sql, params=(account_id, platform))
+    if duplicate_id:
+        return True
+    return False
+
+def scrape_account_entities_process(account_item: dict, db_client: DatabaseConnector) -> dict:
+    """
+    scrape_account_entities_process,
+    """
+    account_id = account_item['account_id']
+    platform = account_item['platform']
+
+    # whether account exists
+    if whether_duplicate_account_id(account_id, platform, db_client):
+        return empty_dict
+
+    # account analysis
+
+    return account_item

+ 1 - 0
applications/utils/__init__.py

@@ -9,4 +9,5 @@ from .download_video import download_sph_video
 from .download_video import download_toutiao_video
 from .item import Item
 from .save_to_db import insert_into_single_video_source_table
+from .save_to_db import insert_into_video_meta_accounts_table
 from .upload import upload_to_oss

+ 27 - 0
applications/utils/item.py

@@ -2,6 +2,7 @@
 @author: luojunhui
 """
 
+
 import time
 
 default_single_video_table_fields = {
@@ -26,6 +27,17 @@ default_single_video_table_fields = {
     "video_oss_path": None,
 }
 
+default_account_table_fields = {
+    "platform": 'Not NULL',
+    "account_id": 'Not NULL',
+    "account_name": 'Not NULL',
+    "max_cursor": None,
+    "account_init_date": None,
+    "status": 0,
+    "priority": 0,
+
+}
+
 
 class Item(object):
     """
@@ -58,6 +70,19 @@ class Item(object):
         """
         return
 
+    def check_account_item(self):
+        """
+        check account item
+        """
+        fields = list(default_account_table_fields.keys())
+        for key in fields:
+            if self.item.get(key, None) is not None:
+                continue
+            elif default_account_table_fields[key] == 'Not NULL':
+                raise ValueError(f"{key} is not None, please check your account item")
+            else:
+                self.item[key] = default_account_table_fields[key]
+
     def check(self, source):
         """
         check item
@@ -67,3 +92,5 @@ class Item(object):
                 self.check_video_item()
             case "article":
                 self.check_article_item()
+            case "account":
+                self.check_account_item()

+ 35 - 0
applications/utils/save_to_db.py

@@ -50,3 +50,38 @@ def insert_into_single_video_source_table(db_client, video_item):
                 "oss_path": video_item["video_oss_path"],
             },
         )
+
+def insert_into_video_meta_accounts_table(db_client, account_item):
+    """
+    insert account into account meta table
+    """
+    insert_sql = f"""
+        insert into video_meta_accounts
+            (platform, account_id, account_name, max_cursor, account_init_date, status, priority)
+        values
+            (%s, %s, %s, %s, %s, %s, %s);
+    """
+    try:
+        db_client.save(
+            query=insert_sql,
+            params=(
+                account_item["platform"],
+                account_item["account_id"],
+                account_item["account_name"],
+                account_item["max_cursor"],
+                account_item["account_init_date"],
+                account_item["status"],
+                account_item["priority"],
+            ),
+        )
+    except Exception as e:
+        log(
+            task="{}_account_crawler".format(account_item["platform"]),
+            function="save_each_account",
+            message="save account failed",
+            data={
+                "error": str(e),
+                "traceback": traceback.format_exc(),
+                "account_id": account_item["account_id"],
+            },
+        )

+ 42 - 0
tasks/crawler_channel_accounts.py

@@ -0,0 +1,42 @@
+"""
+@author: luojunhui
+@description: crawler channel accounts
+"""
+
+from applications.api import WechatChannelAPI
+from applications.db import DatabaseConnector
+from applications.pipeline import scrape_account_entities_process
+from applications.utils import Item
+from applications.utils import insert_into_video_meta_accounts_table
+from config import long_articles_config
+
+class ChannelAccountCrawler:
+    """
+    crawler channel accounts
+    strategy:
+        1. try to get search keys and titles from database
+        2. try to get hot_points from web
+        2. use search api to get accounts
+    """
+    def __init__(self):
+        self.db_client = DatabaseConnector(db_config=long_articles_config)
+        self.db_client.connect()
+
+    def get_seed_keys_from_db(self):
+        """
+        get search keys from database
+        """
+        sql = "select * from datastat_sort_strategy limit 100;"
+        result = self.db_client.fetch(sql)
+        return result
+
+
+CA = ChannelAccountCrawler()
+result_list = CA.get_seed_keys_from_db()
+for item in result_list:
+    print(item)
+
+
+
+
+