# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2022/5/23
import os
import sys
import time
import requests
from datetime import date, timedelta
from dateutil import parser
sys.path.append(os.getcwd())
from common import Common
from feishu_lib import Feishu, Bitable

# proxies = {"http": "127.0.0.1:19180", "https": "127.0.0.1:19180"}
proxies = {"http": None, "https": None}


class Search:
    # Day before yesterday <class 'str'> e.g. 2022-04-12
    before_yesterday = (date.today() + timedelta(days=-2)).strftime("%Y-%m-%d")
    # Yesterday <class 'str'> e.g. 2022-04-13
    yesterday = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")
    # Today <class 'datetime.date'> e.g. 2022-04-14
    today = date.today()
    # Pagination cursor returned by the previous search request; "" means first page
    cursor = ""
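
    # The "last three days" variant commented out in search_users_v2 below builds
    # a query of the form:
    #   q=(<key_word>) until:<today> since:<before_yesterday>
    # e.g. with today == 2022-04-14 the window is 2022-04-12 .. 2022-04-14
    # (illustrative dates).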

    # Build the list of search terms
    @classmethod
    def search_words(cls):
        # Search terms
        word_list = []
        # Read all sensitive words from the cloud sheet and add them to the word list
        time.sleep(1)
        lists = Feishu.get_values_batch("twitter", "PZGpSZ")
        for i in lists:
            for j in i:
                # Skip empty cells
                if j is None:
                    pass
                # Skip entries commented out with "#"
                elif "#" in j:
                    pass
                else:
                    word_list.append(j)
        return word_list
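
    # Example of the assumed sheet shape (illustrative, not read from the sheet):
    # Feishu.get_values_batch returns rows of cells, e.g.
    #   [["web3", None], ["# disabled", "nft"]]  ->  search_words() == ["web3", "nft"]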

    # Update an existing user's info
    @classmethod
    def update_user_info(cls, uid, key_word, values):
        try:
            if len(Feishu.get_values_batch("twitter", "db114c")) == 1:
                Common.logger().info("no user info yet")
            else:
                time.sleep(1)
                # Locate the row that holds this uid
                i = Feishu.find_cell("twitter", "db114c", uid)
                user_words = Feishu.get_range_value("twitter", "db114c", "B" + str(i) + ":" + "B" + str(i))
                user_create_time = Feishu.get_range_value("twitter", "db114c", "T" + str(i) + ":" + "T" + str(i))[0]
                user_update_time = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(time.time()))
                if key_word in user_words:
                    Common.logger().info("key_word already recorded; updating user info: {}", uid)
                    time.sleep(1)
                    values.append(user_create_time)
                    values.append(user_update_time)
                    Common.logger().info("values:{}", values)
                    Feishu.update_values("twitter", "db114c", "C" + str(i) + ":" + "U" + str(i), [values])
                    Common.logger().info("user {} info updated", uid)
                    return
                elif key_word not in user_words:
                    Common.logger().info("key_word not recorded yet; updating user info: {}", uid)
                    # First update everything except key_word
                    time.sleep(1)
                    values.append(user_create_time)
                    values.append(user_update_time)
                    Common.logger().info("values:{}", values)
                    Feishu.update_values("twitter", "db114c", "C" + str(i) + ":" + "U" + str(i), [values])
                    Common.logger().info("user {} info updated", uid)
                    # Then append the new key_word to column B
                    time.sleep(1)
                    words = user_words[0] + "," + key_word
                    Feishu.update_values("twitter", "db114c", "B" + str(i) + ":" + "B" + str(i),
                                         [[str(words)]])
                    Common.logger().info("user key_word {} updated", key_word)
                    return
        except Exception as e:
            Common.logger().error("update_user_info exception: {}", e)
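
    # Column layout implied by the ranges above and the A2:U2 write below:
    #   A=uid, B=key_words, C..S=profile fields, T=create_time, U=update_time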

    # Search users by keyword
    @classmethod
    def search_users_v2(cls, key_word):
        try:
            cursor_params = ''
            if len(cls.cursor) > 0:
                cursor_params = '&cursor={}'.format(cls.cursor)
            # Variant that searches only the last three days of data:
            # url = "https://twitter.com/i/api/2/search/adaptive.json?" \
            #       "include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&" \
            #       "include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&" \
            #       "include_can_media_tag=1&include_ext_has_nft_avatar=1&skip_status=1&" \
            #       "cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&" \
            #       "include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&" \
            #       "include_ext_media_color=true&include_ext_media_availability=true&" \
            #       "include_ext_sensitive_media_warning=true&include_ext_trusted_friends_metadata=true&" \
            #       "send_error_codes=true&simple_quoted_tweet=true&" \
            #       "q=(" + key_word + ")%20until%3A" + str(cls.today) + "%20since%3A" + str(cls.before_yesterday) + \
            #       "&result_filter=user&count=20&query_source=typed_query" + cursor_params + \
            #       "&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2ChasNftAvatar%2CvoiceInfo%2" \
            #       "Cenrichments%2CsuperFollowMetadata%2CunmentionInfo"
            url = "https://twitter.com/i/api/2/search/adaptive.json?" \
                  "include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&" \
                  "include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&" \
                  "include_can_media_tag=1&include_ext_has_nft_avatar=1&skip_status=1&" \
                  "cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&" \
                  "include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&" \
                  "include_ext_media_color=true&include_ext_media_availability=true&" \
                  "include_ext_sensitive_media_warning=true&include_ext_trusted_friends_metadata=true&" \
                  "send_error_codes=true&simple_quoted_tweet=true&" \
                  "q=" + key_word + \
                  "&result_filter=user&count=20&query_source=typed_query" + cursor_params + \
                  "&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2ChasNftAvatar%2CvoiceInfo%2" \
                  "Cenrichments%2CsuperFollowMetadata%2CunmentionInfo"
            headers = {
                'authority': 'twitter.com',
                'accept': '*/*',
                'accept-language': 'zh-CN,zh;q=0.9',
                'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz'
                                 '4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
                'cookie': 'guest_id_marketing=v1%3A164691584304284451; guest_id_ads=v1%3A164691584304284451;'
                          ' kdt=RGGgmMi1qsAE8ap8NlKFjpksuDXG9gdD1utIeK0u; des_opt_in=Y; _gcl_au=1.1.1066'
                          '77612.1647418528;'
                          ' g_state={"i_l":0}; _gid=GA1.2.645428048.1652699425;'
                          ' personalization_id="v1_zSZMfoG7rsTlMHQYwOA39Q=="; guest_id=v1%3A165294843395764407;'
                          ' auth_token=592dbe3e68ce355f31f8343d700215030fbcd817;'
                          ' ct0=df0294bd236bf2b599c0c62906066652be2f03658877d0fe982fbb0bb645270e8485ddb2f7f39a447'
                          'b9e7ab341e244415576d8303df6302876fb00b8a5c996871bcfc2703a5d1c1056545ab007de55be;'
                          ' twid=u%3D1501900092303101953; external_referer=padhuUp37zg6GVaBnLSoCA0layDKYA'
                          'Tn|0|8e8t2xd8A2w%3D; mbox=PC#3ffa21b420af400ca9e94d2b1b72525c.32_0#1716385856|s'
                          'ession#047c8af8f5e34fa585b247e05c6f0a6b#1653142916; _ga=GA1.2.659870250.1646915849;'
                          ' _ga_BYKEBDM7DS=GS1.1.1653201242.12.0.1653201242.0; _ga_34PHSZMC42=GS1.1.1653201242.5'
                          '8.0.1653201242.0; lang=zh-cn; _twitter_sess=BAh7CSIKZmxhc2hJQzonQWN0aW9uQ29udHJvbGxlcjo6R'
                          'mxhc2g6OkZsYXNo%250ASGFzaHsABjoKQHVzZWR7ADoPY3JlYXRlZF9hdGwrCMQBs%252BqAAToMY3NyZl9p%250AZC'
                          'IlYjJkNWIyOTZiMzhmMGVlNWM1NDY0MmUyNDM5NTJkNjg6B2lkIiVkZjNl%250AMWNkNTY5OTUwNDdiYzgzNDE1NG'
                          'UyNjA3ZWU1NA%253D%253D--b3450fa2f7a9503c9e5e8356aff22570d29a7912; guest_id=v1%3A16479480474'
                          '0239293; guest_id_ads=v1%3A164794804740239293; guest_id_marketing=v1%3A164794804740239293;'
                          ' personalization_id="v1_/1LnzKXLyeYnZl13Ri62bg=="',
                # Variant referer for the last-three-days search:
                # 'referer': "https://twitter.com/search?q=(" + key_word + ")%20until%3A" + str(cls.today) +
                #            "%20since%3A" + str(cls.before_yesterday) + "&src=typed_query&f=user",
                'referer': "https://twitter.com/search?q=" + key_word + "&src=typed_query&f=user",
                'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"macOS"',
                'sec-fetch-dest': 'empty',
                'sec-fetch-mode': 'cors',
                'sec-fetch-site': 'same-origin',
                'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko)'
                              ' Chrome/101.0.4951.64 Safari/537.36',
                'x-csrf-token': 'df0294bd236bf2b599c0c62906066652be2f03658877d0fe982fbb0bb645270e8485ddb2f'
                                '7f39a447b9e7ab341e244415576d8303df6302876fb00b8a5c996871bcfc2703a5d1c10565'
                                '45ab007de55be',
                'x-twitter-active-user': 'yes',
                'x-twitter-auth-type': 'OAuth2Session',
                'x-twitter-client-language': 'zh-cn'
            }
            r = requests.get(url=url, headers=headers, proxies=proxies)
            cls.cursor = r.json()["timeline"]["instructions"][-1]["addEntries"][
                "entries"][-1]["content"]["operation"]["cursor"]["value"]
            users = r.json()["globalObjects"]["users"]
            if len(users) == 0:
                Common.logger().info("no data returned for this request")
                return
            else:
                for userinfo in users.values():
                    # Missing fields fall back to the string "null"
                    uid = userinfo.get("id_str", "null")
                    name = userinfo.get("name", "null")
                    screen_name = userinfo.get("screen_name", "null")
                    if screen_name == "null":
                        person_url = "null"
                    else:
                        person_url = "https://twitter.com/" + screen_name
                    description = userinfo.get("description", "null")
                    location = userinfo.get("location", "null")
                    friends_count = userinfo.get("friends_count", "null")
                    followers_count = userinfo.get("followers_count", "null")
                    favourites_count = userinfo.get("favourites_count", "null")
                    listed_count = userinfo.get("listed_count", "null")
                    statuses_count = userinfo.get("statuses_count", "null")
                    media_count = userinfo.get("media_count", "null")
                    if "entities" not in userinfo:
                        display_url = "null"
                    elif "url" not in userinfo["entities"]:
                        display_url = "null"
                    elif "display_url" in userinfo["entities"]["url"]["urls"][0]:
                        display_url = userinfo["entities"]["url"]["urls"][0]["display_url"]
                    elif "expanded_url" in userinfo["entities"]["url"]["urls"][0]:
                        display_url = userinfo["entities"]["url"]["urls"][0]["expanded_url"]
                    elif "url" in userinfo["entities"]["url"]["urls"][0]:
                        display_url = userinfo["entities"]["url"]["urls"][0]["url"]
                    else:
                        display_url = "null"
                    if "created_at" in userinfo:
                        created_at1 = userinfo["created_at"]
                        # Normalize Twitter's timestamp to "YYYY/MM/DD HH:MM:SS"
                        created_at = str(parser.parse(created_at1).strftime("%Y/%m/%d %H:%M:%S"))
                    else:
                        created_at = "null"
                    profile_image_url = userinfo.get("profile_image_url", "null")
                    profile_banner_url = userinfo.get("profile_banner_url", "null")
                    ext_has_nft_avatar = userinfo.get("ext_has_nft_avatar", "null")
                    verified = userinfo.get("verified", "null")
                    # Skip invalid users
                    if uid == "" or uid == "null":
                        Common.logger().info("invalid user")
                    # User already exists in the cloud sheet
                    elif uid in [j for i in Feishu.get_values_batch("twitter", "db114c") for j in i]\
                            or str(uid) in [j for i in Feishu.get_values_batch("twitter", "db114c") for j in i]\
                            or uid in [j for i in Feishu.get_values_batch("twitter", "B9NIuU") for j in i]\
                            or str(uid) in [j for i in Feishu.get_values_batch("twitter", "B9NIuU") for j in i]:
                        Common.logger().info("user already exists: {}", uid)
                        pass
                        # time.sleep(1)
                        # values = [str(name),
                        #           str(screen_name),
                        #           str(person_url),
                        #           str(description),
                        #           str(location),
                        #           int(friends_count),
                        #           int(followers_count),
                        #           int(favourites_count),
                        #           int(listed_count),
                        #           int(statuses_count),
                        #           int(media_count),
                        #           str(display_url),
                        #           str(created_at),
                        #           str(profile_image_url),
                        #           str(profile_banner_url),
                        #           str(ext_has_nft_avatar),
                        #           str(verified)]
                        # cls.update_user_info(uid, key_word, values)
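                        # NOTE: the commented-out block above is the disabled update
                        # path; existing users are currently only logged. If it were
                        # re-enabled, it would feed cls.update_user_info() defined above.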
                    # User not yet in the cloud sheet
                    else:
                        Common.logger().info("adding user {} to the cloud sheet", name)
                        create_time = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(time.time()))
                        update_time = ""
                        # Insert a row into the sheet: https://w42nne6hzg.feishu.cn/sheets/shtcn6BYfYuqegIP13ORB6rI2dh?sheet=db114c
                        Feishu.insert_columns("twitter", "db114c", "ROWS", 1, 2)
                        # Write the data into the sheet: https://w42nne6hzg.feishu.cn/sheets/shtcn6BYfYuqegIP13ORB6rI2dh?sheet=db114c
                        # NOTE: the counts fall back to the string "null" above, so
                        # int() can raise ValueError here; the outer except logs it.
                        values = [[str(uid),
                                   str(key_word),
                                   str(name),
                                   str(screen_name),
                                   str(person_url),
                                   str(description),
                                   str(location),
                                   int(friends_count),
                                   int(followers_count),
                                   int(favourites_count),
                                   int(listed_count),
                                   int(statuses_count),
                                   int(media_count),
                                   str(display_url),
                                   str(created_at),
                                   str(profile_image_url),
                                   str(profile_banner_url),
                                   str(ext_has_nft_avatar),
                                   str(verified),
                                   str(create_time),
                                   str(update_time)]]
                        time.sleep(1)
                        Feishu.update_values("twitter", "db114c", "A2:U2", values)
                        Common.logger().info("added to the cloud sheet")
                        Common.logger().info("adding user {} to the Bitable", name)
                        fields = {
                            "fields": {
                                "uid": str(uid),
                                "key_words": str(key_word),
                                "name": str(name),
                                "screen_name": str(screen_name),
                                "person_url": {
                                    "link": str(person_url),
                                    "text": str(person_url)
                                },
                                "description": str(description),
                                "location": str(location),
                                "friends_count": int(friends_count),
                                "followers_count": int(followers_count),
                                "favourites_count": int(favourites_count),
                                "listed_count": int(listed_count),
                                "statuses_count": int(statuses_count),
                                "media_count": int(media_count),
                                "display_url": {
                                    "link": str(display_url),
                                    "text": str(display_url)
                                },
                                # Bitable date fields take epoch milliseconds
                                "created_at": int(time.mktime(time.strptime(created_at, "%Y/%m/%d %H:%M:%S"))) * 1000,
                                "profile_image_url": {
                                    "link": str(profile_image_url),
                                    "text": str(profile_image_url)
                                },
                                "profile_banner_url": {
                                    "link": str(profile_banner_url),
                                    "text": str(profile_banner_url)
                                },
                                "ext_has_nft_avatar": str(ext_has_nft_avatar),
                                "verified": str(verified),
                                # "记录创建时间" is the Bitable column name (record create time)
                                "记录创建时间": int(time.mktime(time.strptime(create_time, "%Y/%m/%d %H:%M:%S"))) * 1000,
                                # "记录修改时间": ""  (record modify time, unused)
                            }
                        }
                        Bitable.create_record(fields)
                        Common.logger().info("added to the Bitable\n")
        except Exception as e:
            Common.logger().error("search_users_v2 exception: {}", e)

    @classmethod
    def search_users_by_key_words(cls):
        for key_word in cls.search_words():
            Common.logger().info("searching users for keyword: {}", key_word)
            cls.cursor = ''
            time.sleep(1)
            start = time.time()
            # Page through up to 400 result pages per keyword
            for i in range(400):
                Common.logger().info("requesting page {}", i + 1)
                cls.search_users_v2(key_word)
            end_time = time.time()
            Common.logger().info("search for keyword {} finished, took {} seconds", key_word, int(end_time - start))
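
    # Possible guard (an assumption, not part of the original logic): stop paging
    # early once the cursor stops advancing instead of always issuing 400 requests:
    #   last_cursor = cls.cursor
    #   cls.search_users_v2(key_word)
    #   if cls.cursor == last_cursor:
    #       break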


if __name__ == "__main__":
    search = Search()
    # search.search_users("web3")
    search.search_users_by_key_words()
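
# Usage (assumed): run as a script from the project root, e.g.
#   python search_by_words.py
# so that common.py and feishu_lib.py resolve via sys.path.append(os.getcwd()).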