luojunhui пре 7 месеци
родитељ
комит
fb21af8072
3 измењених фајлова са 171 додато и 26 уклоњено
  1. 63 6
      account_crawler_task.py
  2. 54 0
      applications/api/feishu_api.py
  3. 54 20
      tasks/crawler_accounts_by_association.py

+ 63 - 6
account_crawler_task.py

@@ -2,15 +2,72 @@
 @author: luojunhui
 @description: try to get some more accounts
 """
-from tasks.crawler_accounts_by_association import ChannelAccountCrawler
+
+import datetime
+
+from applications.api.feishu_api import FeishuSheetApi
+from tasks.crawler_accounts_by_association import ChannelsAccountCrawler
 from tasks.crawler_accounts_by_association import ToutiaoAccountCrawler
 
+document_token = "BGQCsOXwHhVRq5tswjgcI8NInqd"
+toutiao_sheet_id = "pIJSt7"
+channels_sheet_id = "ee0163"
+
+
+def insert_data_into_feishu_sheet(platform: str, data_list: list[list[str]]) -> None:
+    """
+    Write crawled video rows into the Feishu sheet that belongs to ``platform``.
+
+    Each row of ``data_list`` gets today's date (``%Y-%m-%d``) appended as one
+    extra trailing column before it is sent. Nothing is sent when ``data_list``
+    is empty.
+
+    :param platform: str, "channels" or "toutiao"; selects the target sheet id
+    :param data_list: list[list[str]], one inner sequence per sheet row
+    :raises RuntimeError: if ``platform`` is not a supported value
+    """
+    sheet_id_by_platform = {
+        "toutiao": toutiao_sheet_id,
+        "channels": channels_sheet_id,
+    }
+    # validate the platform first so a bad value fails before any network call
+    if platform not in sheet_id_by_platform:
+        raise RuntimeError("platform error")
+    sheet_id = sheet_id_by_platform[platform]
+
+    # nothing to write: return early so no stray placeholder row is prepended
+    if not data_list:
+        return
+
+    # format the date once so every row of this batch carries the same stamp
+    today = datetime.date.today().strftime("%Y-%m-%d")
+    video_array = [list(row) + [today] for row in data_list]
+
+    feishu_sheet = FeishuSheetApi()
+    feishu_sheet.fetch_token()
 
-if __name__ == '__main__':
+    # presumably inserts a placeholder row at A2 ahead of the bulk write;
+    # confirm the exact semantics against the Feishu values_prepend docs
+    feishu_sheet.prepend_value(
+        sheet_token=document_token,
+        sheet_id=sheet_id,
+        values=[["******"]],
+        ranges="A2:A2",
+    )
+    # NOTE(review): n rows starting at row 2 end at row n + 1, while this range
+    # ends at row n + 2 -- confirm the extra row is intended
+    feishu_sheet.insert_value(
+        sheet_token=document_token,
+        sheet_id=sheet_id,
+        values=video_array,
+        ranges="A2:I{}".format(2 + len(video_array)),
+    )
+
+
+if __name__ == "__main__":
     # crawler channels
-    channel_account_crawler = ChannelAccountCrawler()
-    channel_account_crawler.deal()
+    channels_account_crawler = ChannelsAccountCrawler()
+    channels_account_crawler.deal()
+
+    # insert data into channels sheet
+    # NOTE(review): the channels crawler adds items with platform "sph";
+    # confirm "channels" is the value actually stored in video_association
+    video_list = channels_account_crawler.get_video_list_with_score(platform="channels")
+    # channels rows belong in the channels sheet, not the toutiao one
+    insert_data_into_feishu_sheet(platform="channels", data_list=video_list)
+    # first selected column is the row id; flip exported rows from status 0 to 1
+    video_id_list = [i[0] for i in video_list]
+    channels_account_crawler.update_video_status(
+        video_id_tuple=tuple(video_id_list), ori_status=0, new_status=1
+    )
 
     # crawler toutiao
-    toutiao_crawler = ToutiaoAccountCrawler()
-    toutiao_crawler.deal()
+    toutiao_account_crawler = ToutiaoAccountCrawler()
+    toutiao_account_crawler.deal()
+
+    # insert data into toutiao sheet
+    video_list = toutiao_account_crawler.get_video_list_with_score(platform="toutiao")
+    insert_data_into_feishu_sheet(platform="toutiao", data_list=video_list)
+    # first selected column is the row id; flip exported rows from status 0 to 1
+    video_id_list = [i[0] for i in video_list]
+    toutiao_account_crawler.update_video_status(
+        video_id_tuple=tuple(video_id_list), ori_status=0, new_status=1
+    )

+ 54 - 0
applications/api/feishu_api.py

@@ -0,0 +1,54 @@
+import requests
+
+
+class Feishu:
+    """
+    Base Feishu client: requests a tenant access token and keeps it on the
+    instance for subclasses to use.
+    """
+
+    # seconds to wait on the auth endpoint; without a timeout the call can
+    # block indefinitely
+    TOKEN_REQUEST_TIMEOUT = 10
+
+    def __init__(self):
+        # stays None until fetch_token() has been called
+        self.token = None
+        self.headers = {"Content-Type": "application/json"}
+
+    def fetch_token(self):
+        """
+        Request a tenant access token and store it in ``self.token``.
+
+        :raises KeyError: if the response JSON has no "tenant_access_token"
+        :raises requests.exceptions.Timeout: if the endpoint does not answer
+            within ``TOKEN_REQUEST_TIMEOUT`` seconds
+        """
+        url = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal/"
+        # NOTE(review): app credentials are hard-coded in source; they should
+        # be loaded from configuration or a secret store and rotated
+        post_data = {
+            "app_id": "cli_a51114cf8bf8d00c",
+            "app_secret": "cNoTAqMpsAm7mPBcpCAXFfvOzCNL27fe",
+        }
+        response = requests.request(
+            "POST", url=url, data=post_data, timeout=self.TOKEN_REQUEST_TIMEOUT
+        )
+        self.token = response.json()["tenant_access_token"]
+
+
+class FeishuSheetApi(Feishu):
+    """
+    Feishu spreadsheet client for writing cell values through the sheets v2
+    endpoints. ``fetch_token()`` must be called first, because every request
+    sends ``self.token`` as a bearer token.
+    """
+
+    # seconds to wait on a sheet request; without a timeout the call can
+    # block indefinitely
+    SHEET_REQUEST_TIMEOUT = 30
+
+    def _send_values(self, method, url, sheet_id, ranges, values):
+        """
+        Send one ``valueRange`` payload to ``url`` and print the JSON reply.
+
+        :param method: HTTP verb, "POST" or "PUT"
+        :param url: fully formatted sheets endpoint
+        :param sheet_id: id of the sheet inside the spreadsheet
+        :param ranges: cell range without the sheet prefix, e.g. "A2:I10"
+        :param values: rows of cell values
+        """
+        headers = {
+            "Authorization": "Bearer " + self.token,
+            "contentType": "application/json; charset=utf-8",
+        }
+        body = {
+            "valueRange": {"range": "{}!{}".format(sheet_id, ranges), "values": values}
+        }
+        response = requests.request(
+            method,
+            url=url,
+            headers=headers,
+            json=body,
+            timeout=self.SHEET_REQUEST_TIMEOUT,
+        )
+        print(response.json())
+
+    def prepend_value(self, sheet_token, sheet_id, ranges, values):
+        """
+        POST ``values`` for ``ranges`` to the ``values_prepend`` endpoint.
+
+        :param sheet_token: spreadsheet token used in the URL
+        :param sheet_id: id of the sheet inside the spreadsheet
+        :param ranges: cell range without the sheet prefix
+        :param values: rows of cell values
+        """
+        insert_value_url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{}/values_prepend".format(
+            sheet_token
+        )
+        self._send_values("POST", insert_value_url, sheet_id, ranges, values)
+
+    def insert_value(self, sheet_token, sheet_id, ranges, values):
+        """
+        PUT ``values`` for ``ranges`` to the ``values`` endpoint.
+
+        :param sheet_token: spreadsheet token used in the URL
+        :param sheet_id: id of the sheet inside the spreadsheet
+        :param ranges: cell range without the sheet prefix
+        :param values: rows of cell values
+        """
+        insert_value_url = (
+            "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/{}/values".format(
+                sheet_token
+            )
+        )
+        self._send_values("PUT", insert_value_url, sheet_id, ranges, values)

+ 54 - 20
tasks/crawler_accounts_by_association.py

@@ -34,7 +34,7 @@ class CrawlerAccounts:
         self.db_client = DatabaseConnector(db_config=long_articles_config)
         self.db_client.connect()
 
-    def get_seed_keys(self):
+    def get_seed_keys(self)->list[dict]:
         """
         get search keys from database
         """
@@ -42,7 +42,7 @@ class CrawlerAccounts:
         result = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
         return result
 
-    def insert_video_into_recommend_table(self, item):
+    def insert_video_into_recommend_table(self, item: dict) -> None:
         # whether account exists
         final_item = scrape_account_entities_process(item, self.db_client)
         if not final_item:
@@ -97,8 +97,30 @@ class CrawlerAccounts:
         fetch_response = self.db_client.fetch(query=fetch_query, cursor_type=DictCursor)
         return fetch_response
 
+    def get_video_list_with_score(self, platform: str):
+        """
+        Select rows from video_association for one platform.
+
+        Only rows with score > 0.5 and status = 0 are selected, ordered by
+        account_name.
+
+        :param platform: value compared against the platform column
+        :return: whatever ``self.db_client.fetch`` returns for the query
+        """
+        # NOTE(review): platform is interpolated straight into the SQL text;
+        # only pass trusted values until this can be parameterized
+        sql = f"""
+            select id, account_name, recommend_video_id, title, read_cnt, duration, seed_account, seed_title
+            from video_association 
+            where score > 0.5 and platform = '{platform}' and status = 0
+            order by account_name;
+        """
+        return self.db_client.fetch(query=sql)
+
+    def update_video_status(self, video_id_tuple: tuple, ori_status: int, new_status: int) -> int:
+        """
+        Move the given video_association rows from one status to another.
+
+        Only rows whose id is in ``video_id_tuple`` and whose current status
+        equals ``ori_status`` are updated.
+
+        :param video_id_tuple: ids of the rows to update
+        :param ori_status: status a row must currently have
+        :param new_status: status to set
+        :return: affected row count reported by ``self.db_client.save``;
+            0 when ``video_id_tuple`` is empty
+        """
+        # an empty IN list is not valid SQL and there is nothing to update,
+        # so skip the database round trip
+        if not video_id_tuple:
+            return 0
+        update_query = """
+            update video_association
+            set status = %s
+            where id in %s and status = %s;
+        """
+        affected_rows = self.db_client.save(query=update_query, params=(new_status, video_id_tuple, ori_status))
+        return affected_rows
+
 
-class ChannelAccountCrawler(CrawlerAccounts):
+class ChannelsAccountCrawler(CrawlerAccounts):
     """
     crawler channel accounts
     strategy:
@@ -120,23 +142,35 @@ class ChannelAccountCrawler(CrawlerAccounts):
         search_video_response = get_channel_account_videos(account_id)
         video_list = search_video_response["data"]["object"]
         for video in video_list[:5]:
-            video_item = Item()
-            video_item.add("account_name", account_name)
-            video_item.add("account_id", account_id)
-            video_item.add("recommend_video_id", video["id"])
-            video_item.add("title", video["objectDesc"]["description"])
-            video_item.add("duration", video["objectDesc"]["media"][0]["VideoPlayLen"])
-            video_item.add("seed_account", "SearchWithOutAccount")
-            video_item.add("seed_title", seed_title)
-            video_item.add(
-                "recommend_date", datetime.datetime.today().strftime("%Y-%m-%d")
-            )
-            video_item.add("platform", "sph")
-            # check item
-            video_item.check(source="association")
+            try:
+                video_item = Item()
+                video_item.add("account_name", account_name)
+                video_item.add("account_id", account_id)
+                video_item.add("recommend_video_id", video["id"])
+                video_item.add("title", video["objectDesc"]["description"])
+                video_item.add("duration", video["objectDesc"]["media"][0]["VideoPlayLen"])
+                video_item.add("seed_account", "SearchWithOutAccount")
+                video_item.add("seed_title", seed_title)
+                video_item.add(
+                    "recommend_date", datetime.datetime.today().strftime("%Y-%m-%d")
+                )
+                video_item.add("platform", "sph")
+                # check item
+                video_item.check(source="association")
 
-            # save to db
-            self.insert_video_into_recommend_table(video_item.item)
+                # save to db
+                self.insert_video_into_recommend_table(video_item.item)
+            except Exception as e:
+                log(
+                    task="channel account crawler",
+                    function="process_each_video",
+                    message="create item and save to db failed",
+                    data={
+                        "video": video,
+                        "error": str(e),
+                        "traceback": traceback.format_exc()
+                    }
+                )
 
     def search_by_title_from_database(self, title: str) -> None:
         """
@@ -244,4 +278,4 @@ class ToutiaoAccountCrawler(CrawlerAccounts):
 
 class HaoKanAccountCrawler(CrawlerAccounts):
+    """
+    Placeholder subclass of CrawlerAccounts; deal() is not implemented yet.
+    """
+
     def deal(self):
-        return
+        # fail loudly instead of silently returning, so an accidental call
+        # is noticed
+        raise NotImplementedError()