# search_by_words.py
# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2022/5/23
import os
import sys
import time
import requests
from datetime import date, timedelta
from dateutil import parser
sys.path.append(os.getcwd())
from common import Common
from feishu_lib import Feishu

# Proxies are disabled by default; switch to the commented line below to route
# requests through a local proxy.
# proxies = {"http": "127.0.0.1:19180", "https": "127.0.0.1:19180"}
proxies = {"http": None, "https": None}


class Search:
    # The day before yesterday, <class 'str'>, e.g. "2022-04-12"
    before_yesterday = (date.today() + timedelta(days=-2)).strftime("%Y-%m-%d")
    # Yesterday, <class 'str'>, e.g. "2022-04-13"
    yesterday = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")
    # Today, <class 'datetime.date'>, e.g. 2022-04-14
    today = date.today()
    # Pagination cursor returned by the search API
    cursor = ""
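    # Note: cursor is class-level state shared by every call to search_users_v2;
    # search_users_by_key_words resets it to "" before starting a new keyword.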

    # Search term list
    @classmethod
    def search_words(cls):
        word_list = []
        # Read every sensitive word from the cloud doc and append it to the word list
        time.sleep(1)
        lists = Feishu.get_values_batch("twitter", "PZGpSZ")
        for i in lists:
            for j in i:
                # Skip empty cells and cells commented out with "#"
                if j is None or "#" in j:
                    continue
                word_list.append(j)
        return word_list
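    # For reference, a minimal sketch of the sheet contract assumed above (the
    # layout is inferred from this method, not from a documented schema): each
    # cell of sheet "PZGpSZ" holds one term, empty cells are None, and a "#"
    # anywhere in a cell marks it as commented out, e.g.
    #     [["web3", "#nft"], [None, "crypto"]]  ->  ["web3", "crypto"]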

    # Update a user's info
    @classmethod
    def update_user_info(cls, uid, key_word, values):
        try:
            if len(Feishu.get_values_batch("twitter", "db114c")) == 1:
                Common.logger().info("no user info in the sheet")
            else:
                time.sleep(1)
                i = Feishu.find_cell("twitter", "db114c", uid)
                user_words = Feishu.get_range_value("twitter", "db114c", "B" + str(i) + ":" + "B" + str(i))
                user_create_time = Feishu.get_range_value("twitter", "db114c", "T" + str(i) + ":" + "T" + str(i))[0]
                user_update_time = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(time.time()))
                if key_word in user_words:
                    Common.logger().info("condition met: key_word already exists, updating user info:{}", uid)
                    time.sleep(1)
                    values.append(user_create_time)
                    values.append(user_update_time)
                    Common.logger().info("values:{}", values)
                    Feishu.update_values("twitter", "db114c", "C" + str(i) + ":" + "U" + str(i), [values])
                    Common.logger().info("user:{} info updated successfully", uid)
                    return
                else:
                    Common.logger().info("condition met: key_word not present, updating user info:{}", uid)
                    # First update everything except key_word
                    time.sleep(1)
                    values.append(user_create_time)
                    values.append(user_update_time)
                    Common.logger().info("values:{}", values)
                    Feishu.update_values("twitter", "db114c", "C" + str(i) + ":" + "U" + str(i), [values])
                    Common.logger().info("user:{} info updated successfully", uid)
                    # Then append key_word to the stored word list
                    time.sleep(1)
                    words = user_words[0] + "," + key_word
                    Feishu.update_values("twitter", "db114c", "B" + str(i) + ":" + "B" + str(i),
                                         [[str(words)]])
                    Common.logger().info("user key_word:{} updated successfully", key_word)
                    return
        except Exception as e:
            Common.logger().error("update user info exception:{}", e)
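    # Note on the ranges above: "B<i>:B<i>" and "C<i>:U<i>" are A1-notation cell
    # ranges, so column B is assumed to hold the comma-separated key_word list and
    # columns C..U the 17 profile fields passed in via `values` plus the two
    # timestamps (19 columns). This is inferred from the writes in this file, not
    # from a schema.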

    # Search users by keyword
    @classmethod
    def search_users_v2(cls, key_word):
        try:
            cursor_params = ''
            if len(cls.cursor) > 0:
                cursor_params = '&cursor={}'.format(cls.cursor)
            # Variant that restricts the search to the last three days:
            # url = "https://twitter.com/i/api/2/search/adaptive.json?" \
            #       "include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&" \
            #       "include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&" \
            #       "include_can_media_tag=1&include_ext_has_nft_avatar=1&skip_status=1&" \
            #       "cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&" \
            #       "include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&" \
            #       "include_ext_media_color=true&include_ext_media_availability=true&" \
            #       "include_ext_sensitive_media_warning=true&include_ext_trusted_friends_metadata=true&" \
            #       "send_error_codes=true&simple_quoted_tweet=true&" \
            #       "q=(" + key_word + ")%20until%3A" + str(cls.today) + "%20since%3A" + str(cls.before_yesterday) + \
            #       "&result_filter=user&count=20&query_source=typed_query" + cursor_params + \
            #       "&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2ChasNftAvatar%2CvoiceInfo%2" \
            #       "Cenrichments%2CsuperFollowMetadata%2CunmentionInfo"
            url = "https://twitter.com/i/api/2/search/adaptive.json?" \
                  "include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&" \
                  "include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&" \
                  "include_can_media_tag=1&include_ext_has_nft_avatar=1&skip_status=1&" \
                  "cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&" \
                  "include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&" \
                  "include_ext_media_color=true&include_ext_media_availability=true&" \
                  "include_ext_sensitive_media_warning=true&include_ext_trusted_friends_metadata=true&" \
                  "send_error_codes=true&simple_quoted_tweet=true&" \
                  "q=" + key_word + \
                  "&result_filter=user&count=20&query_source=typed_query" + cursor_params + \
                  "&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2ChasNftAvatar%2CvoiceInfo%2" \
                  "Cenrichments%2CsuperFollowMetadata%2CunmentionInfo"
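            # key_word is concatenated into the query string as-is, so it is
            # assumed to contain no reserved URL characters; a term with "&" or
            # "#" would corrupt the query. urllib.parse.quote(key_word) would be
            # the cautious choice (a suggestion, not what the script does).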
            headers = {
                'authority': 'twitter.com',
                'accept': '*/*',
                'accept-language': 'zh-CN,zh;q=0.9',
                'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz'
                                 '4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
                'cookie': 'guest_id_marketing=v1%3A164691584304284451; guest_id_ads=v1%3A164691584304284451;'
                          ' kdt=RGGgmMi1qsAE8ap8NlKFjpksuDXG9gdD1utIeK0u; des_opt_in=Y; _gcl_au=1.1.1066'
                          '77612.1647418528;'
                          ' g_state={"i_l":0}; _gid=GA1.2.645428048.1652699425;'
                          ' personalization_id="v1_zSZMfoG7rsTlMHQYwOA39Q=="; guest_id=v1%3A165294843395764407;'
                          ' auth_token=592dbe3e68ce355f31f8343d700215030fbcd817;'
                          ' ct0=df0294bd236bf2b599c0c62906066652be2f03658877d0fe982fbb0bb645270e8485ddb2f7f39a447'
                          'b9e7ab341e244415576d8303df6302876fb00b8a5c996871bcfc2703a5d1c1056545ab007de55be;'
                          ' twid=u%3D1501900092303101953; external_referer=padhuUp37zg6GVaBnLSoCA0layDKYA'
                          'Tn|0|8e8t2xd8A2w%3D; mbox=PC#3ffa21b420af400ca9e94d2b1b72525c.32_0#1716385856|s'
                          'ession#047c8af8f5e34fa585b247e05c6f0a6b#1653142916; _ga=GA1.2.659870250.1646915849;'
                          ' _ga_BYKEBDM7DS=GS1.1.1653201242.12.0.1653201242.0; _ga_34PHSZMC42=GS1.1.1653201242.5'
                          '8.0.1653201242.0; lang=zh-cn; _twitter_sess=BAh7CSIKZmxhc2hJQzonQWN0aW9uQ29udHJvbGxlcjo6R'
                          'mxhc2g6OkZsYXNo%250ASGFzaHsABjoKQHVzZWR7ADoPY3JlYXRlZF9hdGwrCMQBs%252BqAAToMY3NyZl9p%250AZC'
                          'IlYjJkNWIyOTZiMzhmMGVlNWM1NDY0MmUyNDM5NTJkNjg6B2lkIiVkZjNl%250AMWNkNTY5OTUwNDdiYzgzNDE1NG'
                          'UyNjA3ZWU1NA%253D%253D--b3450fa2f7a9503c9e5e8356aff22570d29a7912; guest_id=v1%3A16479480474'
                          '0239293; guest_id_ads=v1%3A164794804740239293; guest_id_marketing=v1%3A164794804740239293;'
                          ' personalization_id="v1_/1LnzKXLyeYnZl13Ri62bg=="',
                # referer for the last-three-days variant:
                # 'referer': "https://twitter.com/search?q=(" + key_word + ")%20until%3A" + str(cls.today) +
                #            "%20since%3A" + str(cls.before_yesterday) + "&src=typed_query&f=user",
                'referer': "https://twitter.com/search?q=" + key_word + "&src=typed_query&f=user",
                'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"macOS"',
                'sec-fetch-dest': 'empty',
                'sec-fetch-mode': 'cors',
                'sec-fetch-site': 'same-origin',
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko)'
                              ' Chrome/101.0.4951.64 Safari/537.36',
                'x-csrf-token': 'df0294bd236bf2b599c0c62906066652be2f03658877d0fe982fbb0bb645270e8485ddb2f'
                                '7f39a447b9e7ab341e244415576d8303df6302876fb00b8a5c996871bcfc2703a5d1c10565'
                                '45ab007de55be',
                'x-twitter-active-user': 'yes',
                'x-twitter-auth-type': 'OAuth2Session',
                'x-twitter-client-language': 'zh-cn'
            }
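            # Note: the authorization bearer, cookie, and x-csrf-token above are
            # session credentials captured from a logged-in browser and will
            # expire; the operator is assumed to refresh them by hand. The
            # x-csrf-token mirrors the ct0 cookie value, and the two must match
            # for the request to be accepted.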
            r = requests.get(url=url, headers=headers, proxies=proxies)
            # Common.logger().info("response:{}", r.text)
            cls.cursor = r.json()["timeline"]["instructions"][-1]["addEntries"][
                "entries"][-1]["content"]["operation"]["cursor"]["value"]
            # Common.logger().info("cursor:{}", cls.cursor)
            users = r.json()["globalObjects"]["users"]
            if len(users) == 0:
                Common.logger().info("no data returned for this request")
                return
            else:
                for userid in users:
                    userinfo = users[userid]
                    # Fall back to "null" for any field missing from the payload
                    uid = userinfo.get("id_str", "null")
                    name = userinfo.get("name", "null")
                    screen_name = userinfo.get("screen_name", "null")
                    if screen_name == "null":
                        person_url = "null"
                    else:
                        person_url = "https://twitter.com/" + screen_name
                    description = userinfo.get("description", "null")
                    location = userinfo.get("location", "null")
                    friends_count = userinfo.get("friends_count", "null")
                    followers_count = userinfo.get("followers_count", "null")
                    favourites_count = userinfo.get("favourites_count", "null")
                    listed_count = userinfo.get("listed_count", "null")
                    statuses_count = userinfo.get("statuses_count", "null")
                    media_count = userinfo.get("media_count", "null")
                    # Prefer display_url, then expanded_url, then the raw url field
                    if "entities" not in userinfo or "url" not in userinfo["entities"]:
                        display_url = "null"
                    elif "display_url" in userinfo["entities"]["url"]["urls"][0]:
                        display_url = userinfo["entities"]["url"]["urls"][0]["display_url"]
                    elif "expanded_url" in userinfo["entities"]["url"]["urls"][0]:
                        display_url = userinfo["entities"]["url"]["urls"][0]["expanded_url"]
                    elif "url" in userinfo["entities"]["url"]["urls"][0]:
                        display_url = userinfo["entities"]["url"]["urls"][0]["url"]
                    else:
                        display_url = "null"
                    if "created_at" in userinfo:
                        created_at = str(parser.parse(userinfo["created_at"]).strftime("%Y/%m/%d %H:%M:%S"))
                    else:
                        created_at = "null"
                    profile_image_url = userinfo.get("profile_image_url", "null")
                    profile_banner_url = userinfo.get("profile_banner_url", "null")
                    ext_has_nft_avatar = userinfo.get("ext_has_nft_avatar", "null")
                    verified = userinfo.get("verified", "null")
                    # Filter out invalid users
                    if uid == "" or uid == "null":
                        Common.logger().info("invalid user")
                    # User already exists in the cloud doc
                    elif uid in [j for i in Feishu.get_values_batch("twitter", "db114c") for j in i]:
                        Common.logger().info("user already exists:{}", uid)
                        pass
                        # time.sleep(1)
                        # values = [str(name),
                        #           str(screen_name),
                        #           str(person_url),
                        #           str(description),
                        #           str(location),
                        #           int(friends_count),
                        #           int(followers_count),
                        #           int(favourites_count),
                        #           int(listed_count),
                        #           int(statuses_count),
                        #           int(media_count),
                        #           str(display_url),
                        #           str(created_at),
                        #           str(profile_image_url),
                        #           str(profile_banner_url),
                        #           str(ext_has_nft_avatar),
                        #           str(verified)]
                        # cls.update_user_info(uid, key_word, values)
                    # User not yet in the cloud doc
                    else:
                        Common.logger().info("adding user:{} to the cloud doc", name)
                        create_time = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(time.time()))
                        update_time = ""
                        # Insert a row into the cloud doc: https://w42nne6hzg.feishu.cn/sheets/shtcn6BYfYuqegIP13ORB6rI2dh?sheet=db114c
                        Feishu.insert_columns("twitter", "db114c", "ROWS", 1, 2)
                        # Write the row into the cloud doc: https://w42nne6hzg.feishu.cn/sheets/shtcn6BYfYuqegIP13ORB6rI2dh?sheet=db114c
                        values = [[str(uid),
                                   str(key_word),
                                   str(name),
                                   str(screen_name),
                                   str(person_url),
                                   str(description),
                                   str(location),
                                   int(friends_count),
                                   int(followers_count),
                                   int(favourites_count),
                                   int(listed_count),
                                   int(statuses_count),
                                   int(media_count),
                                   str(display_url),
                                   str(created_at),
                                   str(profile_image_url),
                                   str(profile_banner_url),
                                   str(ext_has_nft_avatar),
                                   str(verified),
                                   str(create_time),
                                   str(update_time)]]
                        time.sleep(1)
                        Feishu.update_values("twitter", "db114c", "A2:U2", values)
                        Common.logger().info("added successfully\n")
        except Exception as e:
            Common.logger().error("search users exception:{}", e)
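    # Caveat: int(...) in the values row raises ValueError when a count field was
    # missing and fell back to "null"; the enclosing try/except then logs the
    # error and skips the remaining users of that page. Guarding each cast, e.g.
    # int(friends_count) if friends_count != "null" else 0, would keep the row
    # (the 0 fallback is an assumption, not current behavior).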

    @classmethod
    def search_users_by_key_words(cls):
        for key_word in cls.search_words():
            Common.logger().info("searching users by keyword:{}", key_word)
            cls.cursor = ''
            time.sleep(1)
            start = time.time()
            for i in range(200):
                Common.logger().info("requesting page {}", i + 1)
                cls.search_users_v2(key_word)
            end_time = time.time()
            Common.logger().info("search for keyword {} took {} seconds in total", key_word, int(end_time - start))
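    # The loop above always issues 200 page requests per keyword. A minimal
    # early-stop sketch, assuming an exhausted search stops advancing the cursor
    # (an assumption, not confirmed by this file):
    #     for i in range(200):
    #         prev_cursor = cls.cursor
    #         cls.search_users_v2(key_word)
    #         if cls.cursor == prev_cursor:
    #             break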


if __name__ == "__main__":
    search = Search()
    # search.search_users("web3")
    search.search_users_by_key_words()