wxSpider.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. """
  2. @author: luojunhui
  3. """
  4. import json
  5. import time
  6. import asyncio
  7. import aiohttp
  8. import requests
  def retry_on_none_data(max_retries=5, wait_seconds=1):
      """
      Build a decorator that re-invokes a synchronous function until its
      response carries a non-None ``data`` entry.

      :param max_retries: maximum number of calls made to the wrapped
          function (default 5, the value that used to be hard-coded)
      :param wait_seconds: pause between two consecutive attempts, in
          seconds (default 1, the value that used to be hard-coded)
      :return: a decorator; the decorated function returns the first
          response whose ``data`` is not None, or None once every attempt
          has been used up
      """
      # Imported locally so the block does not rely on a module-level import.
      import functools

      def decorator(func):
          """
          :param func: callable expected to return a dict with a ``data`` key
          :return: the retrying wrapper around ``func``
          """
          @functools.wraps(func)  # keep the wrapped function's name/docstring
          def wrapper(*args, **kwargs):
              for attempt in range(max_retries):
                  response = func(*args, **kwargs)
                  # A reply that is not a dict, or that lacks 'data', counts
                  # as "no data" and is retried instead of raising
                  # KeyError/TypeError.
                  if isinstance(response, dict) and response.get('data') is not None:
                      return response
                  # Nothing follows the last attempt, so skip the final wait.
                  if attempt < max_retries - 1:
                      time.sleep(wait_seconds)
              return None
          return wrapper
      return decorator
  def retryAsyncOnNoneData(max_retries=5, wait_seconds=1):
      """
      Build a decorator that re-awaits a coroutine function until its
      response carries a non-None ``data`` entry.

      :param max_retries: maximum number of calls made to the wrapped
          coroutine function (default 5, the value that used to be hard-coded)
      :param wait_seconds: pause between two consecutive attempts, in
          seconds (default 1, the value that used to be hard-coded)
      :return: a decorator; the decorated coroutine returns the first
          response whose ``data`` is not None, or None once every attempt
          has been used up
      """
      # Imported locally so the block does not rely on a module-level import.
      import functools

      def decorator(func):
          """
          :param func: coroutine function expected to return a dict with a
              ``data`` key
          :return: the retrying async wrapper around ``func``
          """
          @functools.wraps(func)  # keep the wrapped function's name/docstring
          async def wrapper(*args, **kwargs):
              for attempt in range(max_retries):
                  response = await func(*args, **kwargs)
                  # A reply that is not a dict (e.g. None) counts as
                  # "no data" and is retried instead of raising
                  # AttributeError on ``.get``.
                  if isinstance(response, dict) and response.get('data') is not None:
                      return response
                  # Nothing follows the last attempt, so skip the final wait.
                  if attempt < max_retries - 1:
                      await asyncio.sleep(wait_seconds)
              return None
          return wrapper
      return decorator
  61. class ArticleManager(object):
  62. """
  63. Update account articles
  64. """
  @classmethod
  @retry_on_none_data()
  def search_articles(cls, title, *, timeout=60):
      """
      Search WeChat articles by keyword through the crawler service.

      The retry decorator re-issues the request while the reply's ``data``
      is None, so callers can receive None once the retries run out.

      :param title: text sent as the ``keyword`` field of the request body
      :param timeout: seconds to wait for the server before ``requests``
          raises a timeout error; pass None to wait indefinitely, which was
          the previous behaviour
      :return: the JSON-decoded response body
      """
      url = "http://47.98.154.124:8888/crawler/wei_xin/keyword"
      payload = json.dumps({
          "keyword": title,
          "cursor": "1"
      })
      headers = {
          'Content-Type': 'application/json'
      }
      # Without a timeout, requests blocks forever on an unresponsive server.
      response = requests.post(url, headers=headers, data=payload, timeout=timeout)
      return response.json()
  @classmethod
  @retry_on_none_data()
  def get_article_text(cls, content_link, *, timeout=60):
      """
      Fetch an article's detail from the crawler service by its link.

      The retry decorator re-issues the request while the reply's ``data``
      is None, so callers can receive None once the retries run out.

      :param content_link: article link sent as the ``content_link`` field
      :param timeout: seconds to wait for the server before ``requests``
          raises a timeout error; pass None to wait indefinitely, which was
          the previous behaviour
      :return: the JSON-decoded response body
      """
      url = "http://47.98.154.124:8888/crawler/wei_xin/detail"
      # ``is_count`` and ``is_ad`` are always sent as False; their exact
      # meaning is defined by the crawler service (not visible here).
      payload = json.dumps({
          "content_link": content_link,
          "is_count": False,
          "is_ad": False
      })
      headers = {
          'Content-Type': 'application/json'
      }
      # Without a timeout, requests blocks forever on an unresponsive server.
      response = requests.post(url, headers=headers, data=payload, timeout=timeout)
      return response.json()
  @classmethod
  @retry_on_none_data()
  def update_msg_list(cls, ghId, index, *, timeout=60):
      """
      Query the crawler's ``blogger`` endpoint for an account's message list.

      The retry decorator re-issues the request while the reply's ``data``
      is None, so callers can receive None once the retries run out.

      :param ghId: account identifier, sent as the ``account_id`` field
      :param index: value sent as the ``cursor`` field (presumably a paging
          position; confirm against the crawler service)
      :param timeout: seconds to wait for the server before ``requests``
          raises a timeout error; pass None to wait indefinitely, which was
          the previous behaviour
      :return: the JSON-decoded response body
      """
      url = 'http://47.98.154.124:8888/crawler/wei_xin/blogger'
      payload = {
          'account_id': ghId,
          'cursor': index,
      }
      headers = {
          'Content-Type': 'application/json'
      }
      # Without a timeout, requests blocks forever on an unresponsive server.
      response = requests.post(
          url, headers=headers, data=json.dumps(payload), timeout=timeout
      )
      return response.json()
  @classmethod
  @retryAsyncOnNoneData()
  async def get_account_by_url(cls, content_url):
      """
      Look up account information from an article link.

      :param content_url: article link sent as the ``content_link`` field
      :return: the JSON-decoded response body of the ``account_info`` call
      """
      endpoint = 'http://47.98.154.124:8888/crawler/wei_xin/account_info'
      request_headers = {'Content-Type': 'application/json'}
      body = {"content_link": content_url}
      # A fresh client session is opened for each call and closed on exit.
      async with aiohttp.ClientSession() as client, client.post(
          url=endpoint,
          headers=request_headers,
          json=body
      ) as reply:
          return await reply.json()
  133. # response = requests.request(
  134. # "POST",
  135. # url='http://8.217.190.241:8888/crawler/wei_xin/account_info',
  136. # headers={'Content-Type': 'application/json'},
  137. # json={"content_link": content_url}
  138. # )
  139. # return response.json()