
Optimization: for each keyword searched, crawl user info at 20 records per page, 400 pages

wangkun · 2 years ago
commit 08ce2f579b
6 changed files with 188 additions and 29 deletions
  1. README.md (+7 -6)
  2. main/common.py (+15 -16)
  3. main/copy_usersinfo.py (+154 -0)
  4. main/run.py (+11 -0)
  5. main/search_by_words.py (+1 -3)
  6. requirements.txt (+0 -4)
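
The change in main/search_by_words.py raises the per-keyword page count from 200 to 400; at 20 users per page that is up to 400 × 20 = 8,000 profiles per keyword per pass. A minimal sketch of the paginated fetch this drives, reusing Search.cursor and Search.search_users_v2 from main/search_by_words.py (the key_words list below is a placeholder, not the project's real keyword source):

    # Sketch of the per-keyword pagination after this commit.
    from main.search_by_words import Search

    key_words = ["nft", "web3"]      # placeholder keywords for illustration only
    for key_word in key_words:
        Search.cursor = ''           # reset the pagination cursor for each keyword
        for page in range(400):      # 400 pages * 20 users/page ~= 8,000 users
            Search.search_users_v2(key_word)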

+ 7 - 6
README.md

@@ -1,9 +1,10 @@
 twitter crawler
 
+python==3.10
+loguru==0.6.0
+python_dateutil==2.8.2
+requests==2.27.1
+urllib3==1.26.9
+
 1. Requires a network environment that can access twitter
-2. Scheduled restart at 8:00 every morning
-3.loguru==0.6.0
-4.python_dateutil==2.8.2
-5.requests==2.27.1
-6.urllib3==1.26.9
-7. Entry point: ./main/run.py
+2. Entry point: ./main/run.py

+ 15 - 16
main/common.py

@@ -52,27 +52,26 @@ class Common:
 
     # Clean up logs, keeping the 7 most recent files
     @classmethod
-    def del_logs(cls, d_dir):
+    def del_logs(cls):
         """
         Remove redundant log files
         :d_dir: path of the logs to delete
         :return: keep the 7 most recent logs
         """
-        if d_dir == "logs":
-            logs_dir = "./logs/"
-            all_files = sorted(os.listdir(logs_dir))
-            all_logs = []
-            for log in all_files:
-                name = os.path.splitext(log)[-1]
-                if name == ".log":
-                    all_logs.append(log)
-
-            if len(all_logs) <= 7:
-                pass
-            else:
-                for file in all_logs[:len(all_logs) - 7]:
-                    os.remove(logs_dir + file)
-            cls.logger().info("Redundant logs cleaned up")
+        logs_dir = "./logs/"
+        all_files = sorted(os.listdir(logs_dir))
+        all_logs = []
+        for log in all_files:
+            name = os.path.splitext(log)[-1]
+            if name == ".log":
+                all_logs.append(log)
+
+        if len(all_logs) <= 7:
+            pass
+        else:
+            for file in all_logs[:len(all_logs) - 7]:
+                os.remove(logs_dir + file)
+        cls.logger().info("Redundant logs cleaned up")
 
     # Wrapper for downloading a video or cover image
     @classmethod

+ 154 - 0
main/copy_usersinfo.py

@@ -0,0 +1,154 @@
+# -*- coding: utf-8 -*-
+# @Author: wangkun
+# @Time: 2022/6/20
+import time
+
+import requests
+import urllib3
+from main.common import Common
+from main.feishu_lib import Feishu
+proxies = {"http": None, "https": None}
+
+
+class CopyUsersInfo:
+
+    # Data already in the backup sheet
+    @classmethod
+    def copyed_data(cls):
+        try:
+            data_list = []
+            lists = Feishu.get_values_batch("twitter", "WPJILC")
+            for i in lists:
+                for j in i:
+                    # Skip empty cell contents
+                    if j is None:
+                        pass
+                    else:
+                        data_list.append(j)
+            return data_list
+        except Exception as e:
+            Common.logger().error("获取备份表数据异常:{}", e)
+
+    # Add, copy, or delete worksheets.
+    @classmethod
+    def sheets_batch_update(cls):
+        """
+        https://open.feishu.cn/document/ukTMukTMukTM/uYTMzUjL2EzM14iNxMTN
+        """
+        url = "https://open.feishu.cn/open-apis/sheets/v2/spreadsheets/shtcn6BYfYuqegIP13ORB6rI2dh/sheets_batch_update"
+        headers = {
+            "Authorization": "Bearer " + Feishu.get_token(),
+            "Content-Type": "application/json; charset=utf-8"
+        }
+        body = {
+            "requests": [
+                {
+                    "copySheet": {
+                        "source": {
+                            "sheetId": "db114c"
+                        },
+                        "destination": {
+                            "title": ""
+                        }
+                    }
+                }
+            ]
+        }
+        try:
+            urllib3.disable_warnings()
+            r = requests.post(url=url, headers=headers, json=body, proxies=proxies, verify=False)
+            print(r.json())
+            # Common.logger().info("增加工作表,复制工作表、删除工作表:{}", r.json()["msg"])
+        except Exception as e:
+            Common.logger().error("增加工作表,复制工作表、删除工作表异常:{}", e)
+
+    # Copy user info
+    @classmethod
+    def copy_usersinfo(cls):
+        try:
+            user_list = Feishu.get_values_batch("twitter", "db114c")
+            for i in range(1, len(user_list)):  # all data rows after the header row
+                uid = user_list[i][0]
+                key_word = user_list[i][1]
+                name = user_list[i][2]
+                screen_name = user_list[i][3]
+                person_url = user_list[i][4]
+                description = user_list[i][5]
+                location = user_list[i][6]
+                friends_count = user_list[i][7]
+                followers_count = user_list[i][8]
+                favourites_count = user_list[i][9]
+                listed_count = user_list[i][10]
+                statuses_count = user_list[i][11]
+                media_count = user_list[i][12]
+                display_url = user_list[i][13]
+                created_at = user_list[i][14]
+                profile_image_url = user_list[i][15]
+                profile_banner_url = user_list[i][16]
+                ext_has_nft_avatar = user_list[i][17]
+                verified = user_list[i][18]
+                created_time = user_list[i][19]
+                update_time = user_list[i][20]
+                # print(f"\n第{i}个用户信息")
+                # print(f"uid:{uid}")
+                # print(f"key_word:{key_word}")
+                # print(f"name:{name}")
+                # print(f"screen_name:{screen_name}")
+                # print(f"person_url:{person_url}")
+                # print(f"description:{description}")
+                # print(f"location:{location}")
+                # print(f"friends_count:{friends_count}")
+                # print(f"followers_count:{followers_count}")
+                # print(f"favourites_count:{favourites_count}")
+                # print(f"listed_count:{listed_count}")
+                # print(f"statuses_count:{statuses_count}")
+                # print(f"media_count:{media_count}")
+                # print(f"display_url:{display_url}")
+                # print(f"created_at:{created_at}")
+                # print(f"profile_image_url:{profile_image_url}")
+                # print(f"profile_banner_url:{profile_banner_url}")
+                # print(f"ext_has_nft_avatar:{ext_has_nft_avatar}, type:{type(ext_has_nft_avatar)}")
+                # print(f"verified:{verified}, type:{type(verified)}")
+                # print(f"created_time:{created_time}")
+                # print(f"update_time:{update_time}")
+                # print("\n")
+
+                if uid in cls.copyed_data():
+                    Common.logger().info("用户信息已存在")
+                    return
+                else:
+                    Common.logger().info("开始拷贝第{}个用户信息", i)
+                    time.sleep(1)
+                    Feishu.insert_columns("twitter", "WPJILC", "ROWS", 1, 2)
+                    values = [[uid,
+                               key_word,
+                               name,
+                               screen_name,
+                               person_url,
+                               description,
+                               location,
+                               friends_count,
+                               followers_count,
+                               favourites_count,
+                               listed_count,
+                               statuses_count,
+                               media_count,
+                               display_url,
+                               created_at,
+                               profile_image_url,
+                               profile_banner_url,
+                               str(ext_has_nft_avatar),
+                               str(verified),
+                               created_time,
+                               update_time]]
+                    time.sleep(1)
+                    Feishu.update_values("twitter", "WPJILC", "A2:U2", values)
+
+        except Exception as e:
+            Common.logger().error("复制用户信息异常:{}", e)
+
+
+if __name__ == "__main__":
+    copy = CopyUsersInfo()
+    copy.sheets_batch_update()
+    # copy.copy_usersinfo()

+ 11 - 0
main/run.py

@@ -1,16 +1,27 @@
 # -*- coding: utf-8 -*-
 # @Author: wangkun
 # @Time: 2022/6/1
+import datetime
 import os
 import sys
 
 sys.path.append(os.getcwd())
+from main.common import Common
 from search_by_words import Search
+# from main.copy_usersinfo import CopyUsersInfo
 
 
 def main_pord():
     while True:
         Search.search_users_by_key_words()
+        Common.del_logs()
+        # main_pord_time = datetime.datetime.now()
+        # while True:
+        #     if main_pord_time.hour == 23 and main_pord_time.minute <= 30:
+        #         CopyUsersInfo.copy_usersinfo()
+        #         break
+        #     else:
+        #         Search.search_users_by_key_words()
 
 
 if __name__ == "__main__":

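The commented-out block in main_pord sketches a daily backup window: between 23:00 and 23:30 copy the collected user info to the backup sheet, otherwise keep searching. A working version of that idea has to re-read the clock on every iteration (the draft reads datetime.datetime.now() once, before entering its inner loop); a minimal sketch, assuming the same imports as run.py:

    # Sketch only, not the shipped run.py.
    import datetime

    from main.common import Common
    from main.copy_usersinfo import CopyUsersInfo
    from search_by_words import Search


    def main_pord():
        while True:
            now = datetime.datetime.now()          # re-check the time every cycle
            if now.hour == 23 and now.minute <= 30:
                CopyUsersInfo.copy_usersinfo()     # daily backup into the Feishu sheet
            else:
                Search.search_users_by_key_words()
            Common.del_logs()
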
+ 1 - 3
main/search_by_words.py

@@ -164,10 +164,8 @@ class Search:
                 'x-twitter-client-language': 'zh-cn'
             }
             r = requests.get(url=url, headers=headers, proxies=proxies)
-            # Common.logger().info("response:{}", r.text)
             cls.cursor = r.json()["timeline"]["instructions"][-1]["addEntries"][
                 "entries"][-1]["content"]["operation"]["cursor"]["value"]
-            # Common.logger().info("cursor:{}", cls.cursor)
             users = r.json()["globalObjects"]["users"]
             if len(users) == 0:
                 Common.logger().info("本次请求无数据返回")
@@ -350,7 +348,7 @@ class Search:
             cls.cursor = ''
             time.sleep(1)
             start = time.time()
-            for i in range(200):
+            for i in range(400):
                 Common.logger().info("正在请求第{}页", i+1)
                 cls.search_users_v2(key_word)
             end_time = time.time()

+ 0 - 4
requirements.txt

@@ -1,4 +0,0 @@
-loguru==0.6.0
-python_dateutil==2.8.2
-requests==2.27.1
-urllib3==1.26.9