wxSpiderApi.py

"""
@author: luojunhui
"""
import json
import time

import requests

from applications.aliyunLogApi import log
from applications.decoratorApi import retryOnNone


class WeixinSpider(object):
    """
    Update account articles
    """

    # ip = "8.217.190.241"
    # ip = "47.98.154.124"
    # port = "8888"
    base_url = "http://crawler-cn.aiddit.com/crawler/wei_xin"
    headers = {"Content-Type": "application/json"}

    @classmethod
    @retryOnNone()
    def search_articles(cls, title, page="1") -> dict:
        """
        search articles in wx
        :param title:
        :param page:
        :return:
        """
        url = "{}/keyword".format(cls.base_url)
        payload = json.dumps({"keyword": title, "cursor": page})
        response = requests.request(
            "POST", url, headers=cls.headers, data=payload, timeout=120
        )
        return response.json()

    @classmethod
    # @retryOnNone()
    def get_article_text(cls, content_link, is_count=False, is_cache=True) -> dict:
        """
        get article detail
        :param is_cache:
        :param is_count:
        :param content_link:
        :return:
        """
        url = "{}/detail".format(cls.base_url)
        payload = json.dumps(
            {
                "content_link": content_link,
                "is_count": is_count,
                "is_ad": False,
                "is_cache": is_cache,
            }
        )
        response = requests.request(
            "POST", url, headers=cls.headers, data=payload, timeout=120
        )
        return response.json()

    @classmethod
    @retryOnNone()
    def update_msg_list(cls, ghId, index) -> dict:
        """
        update the message list of an account
        :param ghId:
        :param index:
        :return:
        """
        url = "{}/blogger".format(cls.base_url)
        payload = {
            "account_id": ghId,
            "cursor": index,
        }
        response = requests.post(
            url=url, headers=cls.headers, data=json.dumps(payload), timeout=120
        )
        return response.json()

    @classmethod
    @retryOnNone()
    def get_account_by_url(cls, content_url) -> dict:
        """
        get account info via an article link
        :param content_url:
        :return:
        """
        url = "{}/account_info".format(cls.base_url)
        data = {"content_link": content_url}
        response = requests.request(
            "POST", url=url, headers=cls.headers, json=data, timeout=120
        )
        return response.json()

    @classmethod
    def get_recommend_articles(cls, content_link) -> dict:
        """
        use content link to get recommend articles
        :param content_link:
        :return:
        """
        url = "{}/recommend".format(cls.base_url)
        payload = json.dumps({"content_link": content_link})
        response = requests.request(
            "POST", url=url, headers=cls.headers, data=payload, timeout=120
        )
        response_json = response.json()
        if response_json["code"] != 0:
            # a non-zero code means the request failed; retry recursively
            return cls.get_recommend_articles(content_link)
        time.sleep(3)
        return response_json

    @classmethod
    def get_recommend_articles_v2(cls, content_link) -> dict:
        """
        use content link to get recommend articles (paid API)
        :param content_link:
        :return:
        """
        url = "http://datapi.top/wxapi/relatedarticle"
        payload = json.dumps(
            {"content_link": content_link, "token": "401e4d3c85068bb5"}
        )
        response = requests.request(
            "POST", url=url, headers=cls.headers, data=payload, timeout=120
        )
        response_json = response.json()
        log(
            task="article_association_crawler",
            function="get_recommend_articles_v2",
            message="fetch recommend links via paid API",
            data={"content_link": content_link, "response": response_json},
        )
        time.sleep(3)
        return response_json
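

# --- Illustrative usage sketch ---
# Assumes the crawler service behind base_url is reachable and that
# retryOnNone re-invokes a wrapped method when it returns None.
# The keyword and article link below are placeholders, not real data.
if __name__ == "__main__":
    # keyword search, first result page
    print(WeixinSpider.search_articles("example keyword", page="1"))
    # article detail for a single link (placeholder URL)
    print(WeixinSpider.get_article_text("https://mp.weixin.qq.com/s/xxxx"))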