spiderTool.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. """
  2. @author: luojunhui
  3. """
  4. import time
  5. import datetime
  6. from applications import WeixinSpider, Functions, PQMySQL, DeNetMysql
  class SpiderTools(object):
      """
      Common entry point for the long-article crawlers.

      Every method is a classmethod operating on the shared clients below,
      which are instantiated once, when the class body is executed.
      """

      # Client used to fetch an account's message list.
      spider_client = WeixinSpider()
      # Helper functions (show-description parsing, unique-index generation).
      function = Functions()
      # Client used to update the ``long_articles_accounts`` table.
      pq_mysql_client = PQMySQL()
      # Client used to read and write the ``crawler_meta_article`` table.
      denet_mysql_client = DeNetMysql()

      @classmethod
      def searchEachAccountArticlesSinglePage(cls, gh_id, category):
          """
          Crawl the first page of an account's message list and store it.

          When the page contains messages they are inserted via
          ``updateDataIntoMysql`` and the account's latest update time is
          refreshed; otherwise ``"No more data"`` is printed.

          :param gh_id: account id, sent to the spider as ``ghId`` and stored
              as ``out_account_id``
          :param category: category value stored with every article row
          :return: None
          """
          response = cls.spider_client.update_msg_list(ghId=gh_id, index=None)
          msg_list = response.get("data", {}).get("data")
          if not msg_list:
              print("No more data")
              return
          cls.updateDataIntoMysql(
              gh_id=gh_id,
              category=category,
              mode="account",
              article_list=msg_list,
          )
          cls.updateLatestAccountTimeStamp(gh_id=gh_id)

      @classmethod
      def searchEachAccountArticlesAllData(
          cls, gh_id, category, latest_time_stamp, index=None
      ):
          """
          Crawl an account page by page until ``latest_time_stamp`` is reached.

          Each fetched page is stored via ``updateDataIntoMysql``. Paging
          continues while ``latest_time_stamp`` is smaller than the
          ``UpdateTime`` of the last message on the page (presumably the
          oldest one on that page -- verify against the spider API). Once that
          is no longer the case, the account's latest update time is refreshed.

          Pages are walked in a loop rather than by recursion so that accounts
          with a long history cannot exhaust the interpreter's recursion limit.

          :param gh_id: account id, sent to the spider as ``ghId``
          :param category: category value stored with every article row
          :param latest_time_stamp: paging stops once the last message of a
              page has an ``UpdateTime`` that is not greater than this value
          :param index: cursor of the first page to fetch; ``None`` requests
              the first page
          :return: None
          """
          cursor = index
          while True:
              response = cls.spider_client.update_msg_list(ghId=gh_id, index=cursor)
              msg_list = response.get("data", {}).get("data")
              if not msg_list:
                  print("No more data")
                  return

              cls.updateDataIntoMysql(
                  gh_id=gh_id, category=category, article_list=msg_list, mode="account"
              )
              last_time_stamp_in_page = msg_list[-1]["AppMsg"]["BaseInfo"]["UpdateTime"]

              if latest_time_stamp < last_time_stamp_in_page:
                  cursor = response["data"].get("next_cursor")
                  if cursor is None:
                      # Without a cursor the next request would ask for the
                      # first page again (index=None) and could repeat forever;
                      # treat it like an empty next page instead.
                      print("No more data")
                      return
                  continue

              # Target timestamp reached: record the latest crawl time.
              cls.updateLatestAccountTimeStamp(gh_id=gh_id)
              return

      @classmethod
      def updateDataIntoMysql(cls, gh_id, category, mode, article_list):
          """
          Insert every article contained in ``article_list`` into
          ``crawler_meta_article``.

          Each element of ``article_list`` is read at
          ``["AppMsg"]["DetailInfo"]``, which yields the individual articles.
          Inserts are best-effort: an exception raised while handling one
          article is printed and the remaining articles are still processed.

          :param gh_id: account id stored as ``out_account_id``
          :param category: value stored in the ``category`` column
          :param mode: value stored in the ``mode`` column
          :param article_list: iterable of message objects as returned by the
              spider
          :return: None
          """
          # The statement has no interpolated values, so build it only once.
          insert_sql = """
              insert into crawler_meta_article
              (platform, mode, category, out_account_id, article_index, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, status, unique_index)
              VALUES
              (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
          """
          for article_obj in article_list:
              for obj in article_obj["AppMsg"]["DetailInfo"]:
                  try:
                      show_stat = cls.function.show_desc_to_sta(obj["ShowDesc"])
                      show_view_count = show_stat.get("show_view_count", 0)
                      show_like_count = show_stat.get("show_like_count", 0)
                      cls.denet_mysql_client.update(
                          sql=insert_sql,
                          params=(
                              "weixin",
                              mode,
                              category,
                              gh_id,
                              obj["ItemIndex"],
                              obj["Title"],
                              obj["ContentUrl"],
                              show_view_count,
                              show_like_count,
                              obj["Digest"],
                              obj["send_time"],
                              int(time.time()),
                              1,
                              cls.function.generateGzhId(obj["ContentUrl"]),
                          ),
                      )
                  except Exception as e:
                      # Deliberately broad: one failing article (missing key,
                      # rejected insert, ...) must not abort the whole batch.
                      print(e)

      @classmethod
      def updateLatestAccountTimeStamp(cls, gh_id):
          """
          Refresh ``long_articles_accounts.latest_update_time`` for an account.

          The newest ``publish_time`` stored for the account in
          ``crawler_meta_article`` is read, converted from a POSIX timestamp to
          local time and written as a ``YYYY-MM-DD HH:MM:SS`` string. Nothing
          is written when the account has no stored article.

          :param gh_id: account id (``out_account_id`` / ``account_id``)
          :return: None
          """
          # NOTE(review): gh_id is interpolated into the statement; switch to a
          # parameterized query if the select client supports parameters.
          select_sql = f"""
              SELECT publish_time
              From crawler_meta_article
              WHERE out_account_id = '{gh_id}'
              ORDER BY publish_time DESC LIMIT 1;
          """
          result = cls.denet_mysql_client.select(select_sql)
          if not result or result[0][0] is None:
              # No article stored for this account, so there is nothing to record.
              print(f"No publish_time found for account: {gh_id}")
              return

          time_stamp = result[0][0]
          # Build a timezone-aware UTC datetime before converting to local time.
          # A naive datetime from utcfromtimestamp() would be interpreted by
          # astimezone() as already being local, leaving the value unconverted.
          utc_dt = datetime.datetime.fromtimestamp(time_stamp, tz=datetime.timezone.utc)
          dt_string = utc_dt.astimezone().strftime('%Y-%m-%d %H:%M:%S')

          update_sql = """
              update long_articles_accounts
              set latest_update_time = %s
              where account_id = %s;
          """
          cls.pq_mysql_client.update(sql=update_sql, params=(dt_string, gh_id))