# search_by_words.py
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2022/5/23
  4. import os
  5. import sys
  6. import time
  7. import requests
  8. from datetime import date, timedelta
  9. from dateutil import parser
  10. sys.path.append(os.getcwd())
  11. from common import Common
  12. from feishu_lib import Feishu
  13. proxies = {"http": "127.0.0.1:19180", "https": "127.0.0.1:19180"}
  14. class Search:
  15. # 前天 <class 'str'> 2022-04-15
  16. before_yesterday = (date.today() + timedelta(days=2)).strftime("%Y-%m-%d")
  17. # 昨天 <class 'str'> 2022-04-13
  18. yesterday = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")
  19. # 今天 <class 'datetime.date'> 2022-04-14
  20. today = date.today()
  21. cursor = ""
  22. # 搜索词列表
  23. @classmethod
  24. def search_words(cls):
  25. # 搜索词
  26. word_list = []
  27. # 从云文档读取所有敏感词,添加到词库列表
  28. time.sleep(1)
  29. lists = Feishu.get_values_batch("twitter", "PZGpSZ")
  30. for i in lists:
  31. for j in i:
  32. # 过滤空的单元格内容
  33. if j is None:
  34. pass
  35. elif "#" in j:
  36. pass
  37. else:
  38. word_list.append(j)
  39. return word_list
  40. # 更新用户信息
  41. @classmethod
  42. def update_user_info(cls, uid, key_word, values):
  43. try:
  44. if len(Feishu.get_values_batch("twitter", "db114c")) == 1:
  45. Common.logger().info("无用户信息")
  46. else:
  47. time.sleep(1)
  48. i = Feishu.find_cell("twitter", "db114c", uid)
  49. user_words = Feishu.get_range_value("twitter", "db114c", "B" + str(i) + ":" + "B" + str(i))
  50. user_create_time = Feishu.get_range_value("twitter", "db114c", "T" + str(i) + ":" + "T" + str(i))[0]
  51. user_update_time = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(time.time()))
  52. if key_word in user_words:
  53. Common.logger().info("满足条件:key_word已存在,更新当前用户信息:{}", uid)
  54. time.sleep(1)
  55. values.append(user_create_time)
  56. values.append(user_update_time)
  57. Common.logger().info("values:{}", values)
  58. Feishu.update_values("twitter", "db114c", "C" + str(i) + ":" + "U" + str(i), [values])
  59. Common.logger().info("用户:{}信息更新成功", uid)
  60. return
  61. elif key_word not in user_words:
  62. Common.logger().info("满足条件:key_word不存在,更新当前用户信息:{}", uid)
  63. # 先更新除了 key_word 以外的信息
  64. time.sleep(1)
  65. values.append(user_create_time)
  66. values.append(user_update_time)
  67. Common.logger().info("values:{}", values)
  68. Feishu.update_values("twitter", "db114c", "C" + str(i) + ":" + "U" + str(i), [values])
  69. Common.logger().info("用户:{}信息更新成功", uid)
  70. # 再更新 key_word
  71. time.sleep(1)
  72. words = user_words[0]+","+key_word
  73. Feishu.update_values("twitter", "db114c", "B" + str(i) + ":" + "B" + str(i),
  74. [[str(words)]])
  75. Common.logger().info("用户key_word:{}更新成功", key_word)
  76. return
  77. except Exception as e:
  78. Common.logger().error("更新用户信息异常:{}", e)
  79. # 根据关键字搜索
  80. @classmethod
  81. def search_users_v2(cls, key_word):
  82. try:
  83. cursor_params = ''
  84. if len(cls.cursor) > 0:
  85. cursor_params = '&cursor={}'.format(cls.cursor)
  86. # 搜索最近三天的数据
  87. # url = "https://twitter.com/i/api/2/search/adaptive.json?" \
  88. # "include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&" \
  89. # "include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&" \
  90. # "include_can_media_tag=1&include_ext_has_nft_avatar=1&skip_status=1&" \
  91. # "cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&" \
  92. # "include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&" \
  93. # "include_ext_media_color=true&include_ext_media_availability=true&" \
  94. # "include_ext_sensitive_media_warning=true&include_ext_trusted_friends_metadata=true&" \
  95. # "send_error_codes=true&simple_quoted_tweet=true&" \
  96. # "q=(" + key_word + ")%20until%3A" + str(cls.today) + "%20since%3A" + str(cls.before_yesterday) + \
  97. # "&result_filter=user&count=20&query_source=typed_query" + cursor_params + \
  98. # "&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2ChasNftAvatar%2CvoiceInfo%2" \
  99. # "Cenrichments%2CsuperFollowMetadata%2CunmentionInfo"
  100. url = "https://twitter.com/i/api/2/search/adaptive.json?" \
  101. "include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&" \
  102. "include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&" \
  103. "include_can_media_tag=1&include_ext_has_nft_avatar=1&skip_status=1&" \
  104. "cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&" \
  105. "include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&" \
  106. "include_ext_media_color=true&include_ext_media_availability=true&" \
  107. "include_ext_sensitive_media_warning=true&include_ext_trusted_friends_metadata=true&" \
  108. "send_error_codes=true&simple_quoted_tweet=true&" \
  109. "q=" + key_word + \
  110. "&result_filter=user&count=20&query_source=typed_query" + cursor_params + \
  111. "&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2ChasNftAvatar%2CvoiceInfo%2" \
  112. "Cenrichments%2CsuperFollowMetadata%2CunmentionInfo"
  113. headers = {
  114. 'authority': 'twitter.com',
  115. 'accept': '*/*',
  116. 'accept-language': 'zh-CN,zh;q=0.9',
  117. 'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz'
  118. '4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
  119. 'cookie': 'guest_id_marketing=v1%3A164691584304284451; guest_id_ads=v1%3A164691584304284451;'
  120. ' kdt=RGGgmMi1qsAE8ap8NlKFjpksuDXG9gdD1utIeK0u; des_opt_in=Y; _gcl_au=1.1.1066'
  121. '77612.1647418528;'
  122. ' g_state={"i_l":0}; _gid=GA1.2.645428048.1652699425;'
  123. ' personalization_id="v1_zSZMfoG7rsTlMHQYwOA39Q=="; guest_id=v1%3A165294843395764407;'
  124. ' auth_token=592dbe3e68ce355f31f8343d700215030fbcd817;'
  125. ' ct0=df0294bd236bf2b599c0c62906066652be2f03658877d0fe982fbb0bb645270e8485ddb2f7f39a447'
  126. 'b9e7ab341e244415576d8303df6302876fb00b8a5c996871bcfc2703a5d1c1056545ab007de55be;'
  127. ' twid=u%3D1501900092303101953; external_referer=padhuUp37zg6GVaBnLSoCA0layDKYA'
  128. 'Tn|0|8e8t2xd8A2w%3D; mbox=PC#3ffa21b420af400ca9e94d2b1b72525c.32_0#1716385856|s'
  129. 'ession#047c8af8f5e34fa585b247e05c6f0a6b#1653142916; _ga=GA1.2.659870250.1646915849;'
  130. ' _ga_BYKEBDM7DS=GS1.1.1653201242.12.0.1653201242.0; _ga_34PHSZMC42=GS1.1.1653201242.5'
  131. '8.0.1653201242.0; lang=zh-cn; _twitter_sess=BAh7CSIKZmxhc2hJQzonQWN0aW9uQ29udHJvbGxlcjo6R'
  132. 'mxhc2g6OkZsYXNo%250ASGFzaHsABjoKQHVzZWR7ADoPY3JlYXRlZF9hdGwrCMQBs%252BqAAToMY3NyZl9p%250AZC'
  133. 'IlYjJkNWIyOTZiMzhmMGVlNWM1NDY0MmUyNDM5NTJkNjg6B2lkIiVkZjNl%250AMWNkNTY5OTUwNDdiYzgzNDE1NG'
  134. 'UyNjA3ZWU1NA%253D%253D--b3450fa2f7a9503c9e5e8356aff22570d29a7912; guest_id=v1%3A16479480474'
  135. '0239293; guest_id_ads=v1%3A164794804740239293; guest_id_marketing=v1%3A164794804740239293;'
  136. ' personalization_id="v1_/1LnzKXLyeYnZl13Ri62bg=="',
  137. # 搜索最近三天的
  138. # 'referer': "https://twitter.com/search?q=(" + key_word + ")%20until%3A" + str(cls.today) +
  139. # "%20since%3A" + str(cls.before_yesterday) + "&src=typed_query&f=user",
  140. 'referer': "https://twitter.com/search?q=" + key_word + "&src=typed_query&f=user",
  141. 'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"',
  142. 'sec-ch-ua-mobile': '?0',
  143. 'sec-ch-ua-platform': '"macOS"',
  144. 'sec-fetch-dest': 'empty',
  145. 'sec-fetch-mode': 'cors',
  146. 'sec-fetch-site': 'same-origin',
  147. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko)'
  148. ' Chrome/101.0.4951.64 Safari/537.36',
  149. 'x-csrf-token': 'df0294bd236bf2b599c0c62906066652be2f03658877d0fe982fbb0bb645270e8485ddb2f'
  150. '7f39a447b9e7ab341e244415576d8303df6302876fb00b8a5c996871bcfc2703a5d1c10565'
  151. '45ab007de55be',
  152. 'x-twitter-active-user': 'yes',
  153. 'x-twitter-auth-type': 'OAuth2Session',
  154. 'x-twitter-client-language': 'zh-cn'
  155. }
  156. r = requests.get(url=url, headers=headers, proxies=proxies)
  157. # Common.logger().info("response:{}", r.text)
  158. cls.cursor = r.json()["timeline"]["instructions"][-1]["addEntries"][
  159. "entries"][-1]["content"]["operation"]["cursor"]["value"]
  160. # Common.logger().info("cursor:{}", cls.cursor)
  161. users = r.json()["globalObjects"]["users"]
  162. if len(users) == 0:
  163. Common.logger().info("本次请求无数据返回")
  164. return
  165. else:
  166. userid_list = []
  167. for userid in users:
  168. userid_list.append(userid)
  169. for userinfo in userid_list:
  170. userinfo = users[userinfo]
  171. if "id_str" in userinfo:
  172. uid = userinfo["id_str"]
  173. else:
  174. uid = "null"
  175. if "name" in userinfo:
  176. name = userinfo["name"]
  177. else:
  178. name = "null"
  179. if "screen_name" in userinfo:
  180. screen_name = userinfo["screen_name"]
  181. else:
  182. screen_name = "null"
  183. if screen_name == "null":
  184. person_url = "null"
  185. else:
  186. person_url = "https://twitter.com/" + screen_name
  187. if "description" in userinfo:
  188. description = userinfo["description"]
  189. else:
  190. description = "null"
  191. if "location" in userinfo:
  192. location = userinfo["location"]
  193. else:
  194. location = "null"
  195. if "friends_count" in userinfo:
  196. friends_count = userinfo["friends_count"]
  197. else:
  198. friends_count = "null"
  199. if "followers_count" in userinfo:
  200. followers_count = userinfo["followers_count"]
  201. else:
  202. followers_count = "null"
  203. if "favourites_count" in userinfo:
  204. favourites_count = userinfo["favourites_count"]
  205. else:
  206. favourites_count = "null"
  207. if "listed_count" in userinfo:
  208. listed_count = userinfo["listed_count"]
  209. else:
  210. listed_count = "null"
  211. if "statuses_count" in userinfo:
  212. statuses_count = userinfo["statuses_count"]
  213. else:
  214. statuses_count = "null"
  215. if "media_count" in userinfo:
  216. media_count = userinfo["media_count"]
  217. else:
  218. media_count = "null"
  219. if "entities" not in userinfo:
  220. display_url = "null"
  221. elif "url" not in userinfo["entities"]:
  222. display_url = "null"
  223. elif "display_url" in userinfo["entities"]["url"]["urls"][0]:
  224. display_url = userinfo["entities"]["url"]["urls"][0]["display_url"]
  225. elif "expanded_url" in userinfo["entities"]["url"]["urls"][0]:
  226. display_url = userinfo["entities"]["url"]["urls"][0]["expanded_url"]
  227. elif "url" in userinfo["entities"]["url"]["urls"][0]:
  228. display_url = userinfo["entities"]["url"]["urls"][0]["url"]
  229. else:
  230. display_url = "null"
  231. if "created_at" in userinfo:
  232. created_at1 = userinfo["created_at"]
  233. created_at = str(parser.parse(created_at1).strftime("%Y/%m/%d %H:%M:%S"))
  234. else:
  235. created_at = "null"
  236. if "profile_image_url" in userinfo:
  237. profile_image_url = userinfo["profile_image_url"]
  238. else:
  239. profile_image_url = "null"
  240. if "profile_banner_url" in userinfo:
  241. profile_banner_url = userinfo["profile_banner_url"]
  242. else:
  243. profile_banner_url = "null"
  244. if "ext_has_nft_avatar" in userinfo:
  245. ext_has_nft_avatar = userinfo["ext_has_nft_avatar"]
  246. else:
  247. ext_has_nft_avatar = "null"
  248. if "verified" in userinfo:
  249. verified = userinfo["verified"]
  250. else:
  251. verified = "null"
  252. # 过滤无效用户
  253. if uid == "" or uid == "null":
  254. Common.logger().info("无效用户")
  255. # 用户已存在云文档中
  256. elif uid in [j for i in Feishu.get_values_batch("twitter", "db114c") for j in i]:
  257. Common.logger().info("用户已存在:{}", uid)
  258. time.sleep(1)
  259. values = [str(name),
  260. str(screen_name),
  261. str(person_url),
  262. str(description),
  263. str(location),
  264. int(friends_count),
  265. int(followers_count),
  266. int(favourites_count),
  267. int(listed_count),
  268. int(statuses_count),
  269. int(media_count),
  270. str(display_url),
  271. str(created_at),
  272. str(profile_image_url),
  273. str(profile_banner_url),
  274. str(ext_has_nft_avatar),
  275. str(verified)]
  276. cls.update_user_info(uid, key_word, values)
  277. # 用户未存在云文档中
  278. else:
  279. Common.logger().info("添加用户:{} 至云文档", name)
  280. create_time = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(time.time()))
  281. update_time = ""
  282. # 云文档插入行:https://w42nne6hzg.feishu.cn/sheets/shtcn6BYfYuqegIP13ORB6rI2dh?sheet=db114c
  283. Feishu.insert_columns("twitter", "db114c", "ROWS", 1, 2)
  284. # 云文档写入数据:https://w42nne6hzg.feishu.cn/sheets/shtcn6BYfYuqegIP13ORB6rI2dh?sheet=db114c
  285. values = [[str(uid),
  286. str(key_word),
  287. str(name),
  288. str(screen_name),
  289. str(person_url),
  290. str(description),
  291. str(location),
  292. int(friends_count),
  293. int(followers_count),
  294. int(favourites_count),
  295. int(listed_count),
  296. int(statuses_count),
  297. int(media_count),
  298. str(display_url),
  299. str(created_at),
  300. str(profile_image_url),
  301. str(profile_banner_url),
  302. str(ext_has_nft_avatar),
  303. str(verified),
  304. str(create_time),
  305. str(update_time)]]
  306. time.sleep(1)
  307. Feishu.update_values("twitter", "db114c", "A2:U2", values)
  308. Common.logger().info("添加成功\n")
  309. except Exception as e:
  310. Common.logger().error("搜索用户异常:{}", e)
  311. @classmethod
  312. def search_users_by_key_words(cls):
  313. for key_word in cls.search_words():
  314. Common.logger().info("根据关键词:{} 搜索用户", key_word)
  315. cls.cursor = ''
  316. time.sleep(1)
  317. start = time.time()
  318. for i in range(200):
  319. Common.logger().info("正在请求第{}页", i+1)
  320. cls.search_users_v2(key_word)
  321. end_time = time.time()
  322. Common.logger().info("本次根据{}关键词搜索, 共耗时:{}秒", key_word, int(end_time-start))
  323. if __name__ == "__main__":
  324. search = Search()
  325. # search.search_users("web3")
  326. search.search_users_by_key_words()