123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363 |
# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2022/5/23
import os
import sys
import time
import requests
from datetime import date, timedelta
from dateutil import parser
# Make project-local modules (common, feishu_lib) importable when the
# script is launched from the repository root.
sys.path.append(os.getcwd())
from common import Common
from feishu_lib import Feishu
# Local debugging proxy (disabled): {"http": "127.0.0.1:19180", "https": "127.0.0.1:19180"}
# None disables any system/environment proxy for requests.
proxies = {"http": None, "https": None}
class Search:
    """Search Twitter users by keywords kept in a Feishu spreadsheet and
    append any newly-found users to another Feishu sheet.

    All state is class-level; the pagination ``cursor`` is shared between
    successive ``search_users_v2`` calls for the same keyword.
    """

    # Day before yesterday as "YYYY-MM-DD".
    # BUG FIX: the original used timedelta(days=2), i.e. two days in the
    # FUTURE, contradicting both its own comment ("前天") and the
    # "last three days" search window this value is meant to bound.
    before_yesterday = (date.today() + timedelta(days=-2)).strftime("%Y-%m-%d")
    # Yesterday as "YYYY-MM-DD".
    yesterday = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")
    # Today as a datetime.date instance.
    today = date.today()
    # Pagination cursor returned by Twitter; "" means "request the first page".
    cursor = ""

    @classmethod
    def search_words(cls):
        """Return the list of search keywords read from the Feishu sheet.

        Empty cells and cells containing '#' (commented-out entries) are
        skipped.
        """
        time.sleep(1)
        rows = Feishu.get_values_batch("twitter", "PZGpSZ")
        # Flatten the sheet rows, dropping empty and '#'-marked cells.
        return [cell for row in rows for cell in row
                if cell is not None and "#" not in cell]

    @classmethod
    def update_user_info(cls, uid, key_word, values):
        """Refresh the stored row of an already-known user.

        :param uid: the user's id_str, used to locate the sheet row.
        :param key_word: search keyword that (re)discovered this user.
        :param values: columns C..S of the row; create/update timestamps are
            appended here before writing (mutates the caller's list, as the
            original did).
        """
        try:
            if len(Feishu.get_values_batch("twitter", "db114c")) == 1:
                Common.logger().info("无用户信息")
            else:
                time.sleep(1)
                row = Feishu.find_cell("twitter", "db114c", uid)
                user_words = Feishu.get_range_value(
                    "twitter", "db114c", "B" + str(row) + ":" + "B" + str(row))
                user_create_time = Feishu.get_range_value(
                    "twitter", "db114c", "T" + str(row) + ":" + "T" + str(row))[0]
                user_update_time = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(time.time()))
                # BUG FIX: keywords are stored comma-joined in a single cell
                # (see the write below), so membership must be tested against
                # the split word list.  The original `key_word in user_words`
                # compared against the whole cell text and only matched while
                # the cell held exactly one word.
                # NOTE(review): assumes get_range_value returns a list whose
                # first element is the cell text — confirm against feishu_lib.
                stored = str(user_words[0]) if user_words and user_words[0] else ""
                if key_word in stored.split(","):
                    Common.logger().info("满足条件:key_word已存在,更新当前用户信息:{}", uid)
                    time.sleep(1)
                    values.append(user_create_time)
                    values.append(user_update_time)
                    Common.logger().info("values:{}", values)
                    Feishu.update_values("twitter", "db114c",
                                         "C" + str(row) + ":" + "U" + str(row), [values])
                    Common.logger().info("用户:{}信息更新成功", uid)
                    return
                else:
                    Common.logger().info("满足条件:key_word不存在,更新当前用户信息:{}", uid)
                    # First update everything except the keyword column.
                    time.sleep(1)
                    values.append(user_create_time)
                    values.append(user_update_time)
                    Common.logger().info("values:{}", values)
                    Feishu.update_values("twitter", "db114c",
                                         "C" + str(row) + ":" + "U" + str(row), [values])
                    Common.logger().info("用户:{}信息更新成功", uid)
                    # Then append the new keyword to the comma-joined cell.
                    time.sleep(1)
                    words = user_words[0] + "," + key_word
                    Feishu.update_values("twitter", "db114c",
                                         "B" + str(row) + ":" + "B" + str(row),
                                         [[str(words)]])
                    Common.logger().info("用户key_word:{}更新成功", key_word)
                    return
        except Exception as e:
            Common.logger().error("更新用户信息异常:{}", e)

    @staticmethod
    def _display_url(userinfo):
        """Return the best URL variant from the user's entities, or "null".

        Preference order matches the original chain:
        display_url > expanded_url > url.
        """
        if "entities" not in userinfo:
            return "null"
        if "url" not in userinfo["entities"]:
            return "null"
        # NOTE(review): assumes entities.url.urls is non-empty when the "url"
        # key exists — the original indexed [0] unconditionally too; an empty
        # list raises and is caught by the caller's broad except.
        first = userinfo["entities"]["url"]["urls"][0]
        if "display_url" in first:
            return first["display_url"]
        if "expanded_url" in first:
            return first["expanded_url"]
        if "url" in first:
            return first["url"]
        return "null"

    @classmethod
    def search_users_v2(cls, key_word):
        """Fetch one page (up to 20) of Twitter user search results for
        *key_word* and append users not yet present in the Feishu sheet.

        Updates ``cls.cursor`` so the next call retrieves the following page.
        Any error is logged and swallowed (best-effort crawler loop).
        """
        try:
            cursor_params = ''
            if len(cls.cursor) > 0:
                cursor_params = '&cursor={}'.format(cls.cursor)
            # A variant restricted to the last three days appended
            # "until:<today> since:<before_yesterday>" to the q= query and
            # the referer; re-add if the windowed search is needed again.
            url = "https://twitter.com/i/api/2/search/adaptive.json?" \
                  "include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&" \
                  "include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&" \
                  "include_can_media_tag=1&include_ext_has_nft_avatar=1&skip_status=1&" \
                  "cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&" \
                  "include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&" \
                  "include_ext_media_color=true&include_ext_media_availability=true&" \
                  "include_ext_sensitive_media_warning=true&include_ext_trusted_friends_metadata=true&" \
                  "send_error_codes=true&simple_quoted_tweet=true&" \
                  "q=" + key_word + \
                  "&result_filter=user&count=20&query_source=typed_query" + cursor_params + \
                  "&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2ChasNftAvatar%2CvoiceInfo%2" \
                  "Cenrichments%2CsuperFollowMetadata%2CunmentionInfo"
            # NOTE(review): hard-coded session cookie / bearer / CSRF token —
            # these expire and are credentials; move them to configuration.
            headers = {
                'authority': 'twitter.com',
                'accept': '*/*',
                'accept-language': 'zh-CN,zh;q=0.9',
                'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz'
                                 '4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
                'cookie': 'guest_id_marketing=v1%3A164691584304284451; guest_id_ads=v1%3A164691584304284451;'
                          ' kdt=RGGgmMi1qsAE8ap8NlKFjpksuDXG9gdD1utIeK0u; des_opt_in=Y; _gcl_au=1.1.1066'
                          '77612.1647418528;'
                          ' g_state={"i_l":0}; _gid=GA1.2.645428048.1652699425;'
                          ' personalization_id="v1_zSZMfoG7rsTlMHQYwOA39Q=="; guest_id=v1%3A165294843395764407;'
                          ' auth_token=592dbe3e68ce355f31f8343d700215030fbcd817;'
                          ' ct0=df0294bd236bf2b599c0c62906066652be2f03658877d0fe982fbb0bb645270e8485ddb2f7f39a447'
                          'b9e7ab341e244415576d8303df6302876fb00b8a5c996871bcfc2703a5d1c1056545ab007de55be;'
                          ' twid=u%3D1501900092303101953; external_referer=padhuUp37zg6GVaBnLSoCA0layDKYA'
                          'Tn|0|8e8t2xd8A2w%3D; mbox=PC#3ffa21b420af400ca9e94d2b1b72525c.32_0#1716385856|s'
                          'ession#047c8af8f5e34fa585b247e05c6f0a6b#1653142916; _ga=GA1.2.659870250.1646915849;'
                          ' _ga_BYKEBDM7DS=GS1.1.1653201242.12.0.1653201242.0; _ga_34PHSZMC42=GS1.1.1653201242.5'
                          '8.0.1653201242.0; lang=zh-cn; _twitter_sess=BAh7CSIKZmxhc2hJQzonQWN0aW9uQ29udHJvbGxlcjo6R'
                          'mxhc2g6OkZsYXNo%250ASGFzaHsABjoKQHVzZWR7ADoPY3JlYXRlZF9hdGwrCMQBs%252BqAAToMY3NyZl9p%250AZC'
                          'IlYjJkNWIyOTZiMzhmMGVlNWM1NDY0MmUyNDM5NTJkNjg6B2lkIiVkZjNl%250AMWNkNTY5OTUwNDdiYzgzNDE1NG'
                          'UyNjA3ZWU1NA%253D%253D--b3450fa2f7a9503c9e5e8356aff22570d29a7912; guest_id=v1%3A16479480474'
                          '0239293; guest_id_ads=v1%3A164794804740239293; guest_id_marketing=v1%3A164794804740239293;'
                          ' personalization_id="v1_/1LnzKXLyeYnZl13Ri62bg=="',
                'referer': "https://twitter.com/search?q=" + key_word + "&src=typed_query&f=user",
                'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"macOS"',
                'sec-fetch-dest': 'empty',
                'sec-fetch-mode': 'cors',
                'sec-fetch-site': 'same-origin',
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko)'
                              ' Chrome/101.0.4951.64 Safari/537.36',
                'x-csrf-token': 'df0294bd236bf2b599c0c62906066652be2f03658877d0fe982fbb0bb645270e8485ddb2f'
                                '7f39a447b9e7ab341e244415576d8303df6302876fb00b8a5c996871bcfc2703a5d1c10565'
                                '45ab007de55be',
                'x-twitter-active-user': 'yes',
                'x-twitter-auth-type': 'OAuth2Session',
                'x-twitter-client-language': 'zh-cn'
            }
            r = requests.get(url=url, headers=headers, proxies=proxies)
            # Parse the response once (the original called r.json() twice).
            body = r.json()
            # Remember the pagination cursor for the next page request.
            cls.cursor = body["timeline"]["instructions"][-1]["addEntries"][
                "entries"][-1]["content"]["operation"]["cursor"]["value"]
            users = body["globalObjects"]["users"]
            if len(users) == 0:
                Common.logger().info("本次请求无数据返回")
                return
            # PERF FIX: snapshot the sheet's cells once per page instead of
            # re-fetching it for every user (the original issued one Feishu
            # call per user).  Safe because uids within one page are unique
            # dict keys, so the snapshot cannot miss a row added in this loop.
            known_cells = [cell
                           for row in Feishu.get_values_batch("twitter", "db114c")
                           for cell in row]
            for userinfo in users.values():
                # Every field falls back to the sentinel string "null",
                # matching the original's if/else chains.
                uid = userinfo.get("id_str", "null")
                name = userinfo.get("name", "null")
                screen_name = userinfo.get("screen_name", "null")
                if screen_name == "null":
                    person_url = "null"
                else:
                    person_url = "https://twitter.com/" + screen_name
                description = userinfo.get("description", "null")
                location = userinfo.get("location", "null")
                # NOTE(review): like the original, a missing count becomes the
                # string "null", and int("null") below raises into the outer
                # except — consider a numeric default if that is unintended.
                friends_count = userinfo.get("friends_count", "null")
                followers_count = userinfo.get("followers_count", "null")
                favourites_count = userinfo.get("favourites_count", "null")
                listed_count = userinfo.get("listed_count", "null")
                statuses_count = userinfo.get("statuses_count", "null")
                media_count = userinfo.get("media_count", "null")
                display_url = cls._display_url(userinfo)
                if "created_at" in userinfo:
                    created_at = str(parser.parse(userinfo["created_at"]).strftime("%Y/%m/%d %H:%M:%S"))
                else:
                    created_at = "null"
                profile_image_url = userinfo.get("profile_image_url", "null")
                profile_banner_url = userinfo.get("profile_banner_url", "null")
                ext_has_nft_avatar = userinfo.get("ext_has_nft_avatar", "null")
                verified = userinfo.get("verified", "null")
                # Skip users without a usable id.
                if uid == "" or uid == "null":
                    Common.logger().info("无效用户")
                # User already present in the Feishu doc.
                elif uid in known_cells:
                    Common.logger().info("用户已存在:{}", uid)
                # New user: insert a row and write all columns.
                else:
                    Common.logger().info("添加用户:{} 至云文档", name)
                    create_time = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(time.time()))
                    update_time = ""
                    # Insert row: https://w42nne6hzg.feishu.cn/sheets/shtcn6BYfYuqegIP13ORB6rI2dh?sheet=db114c
                    Feishu.insert_columns("twitter", "db114c", "ROWS", 1, 2)
                    # Write data: https://w42nne6hzg.feishu.cn/sheets/shtcn6BYfYuqegIP13ORB6rI2dh?sheet=db114c
                    values = [[str(uid),
                               str(key_word),
                               str(name),
                               str(screen_name),
                               str(person_url),
                               str(description),
                               str(location),
                               int(friends_count),
                               int(followers_count),
                               int(favourites_count),
                               int(listed_count),
                               int(statuses_count),
                               int(media_count),
                               str(display_url),
                               str(created_at),
                               str(profile_image_url),
                               str(profile_banner_url),
                               str(ext_has_nft_avatar),
                               str(verified),
                               str(create_time),
                               str(update_time)]]
                    time.sleep(1)
                    Feishu.update_values("twitter", "db114c", "A2:U2", values)
                    Common.logger().info("添加成功\n")
        except Exception as e:
            Common.logger().error("搜索用户异常:{}", e)

    @classmethod
    def search_users_by_key_words(cls):
        """Run the paged user search for every keyword from search_words()."""
        for key_word in cls.search_words():
            Common.logger().info("根据关键词:{} 搜索用户", key_word)
            # Reset pagination before starting a new keyword.
            cls.cursor = ''
            time.sleep(1)
            start = time.time()
            # Crawl budget: up to 200 pages of 20 users per keyword.
            for page in range(200):
                Common.logger().info("正在请求第{}页", page + 1)
                cls.search_users_v2(key_word)
            end_time = time.time()
            Common.logger().info("本次根据{}关键词搜索, 共耗时:{}秒", key_word, int(end_time - start))
if __name__ == "__main__":
    # All Search methods are classmethods — no instance state is needed,
    # so call directly on the class instead of instantiating it.
    Search.search_users_by_key_words()
|