weixinzhishu_hot_search.py

# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
import os
import sys
from datetime import date, timedelta

from lxml import etree
import requests

sys.path.append(os.getcwd())
from common.scheduling_db import MysqlHelper
from common.common import Common

proxies = {"http": None, "https": None}


class HotSearch:
    # Date string, format YYYY-MM-DD (the 0-day offset makes shifting the target date easy)
    today = (date.today() + timedelta(days=0)).strftime("%Y-%m-%d")

    # Baidu hot search
    @classmethod
    def baidu_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://top.baidu.com/board?tab=realtime"
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"baidu_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            title_list_html = response_html.xpath('//*[@class="c-single-text-ellipsis"]')
            for title_html in title_list_html:
                source = "百度"
                # Double any single quotes so the title cannot break the quoted SQL literal below
                title = title_html.xpath("./text()")[0].strip().replace("'", "''")
                publish_time = cls.today
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values("{source}", '{title}', "{publish_time}")"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Database write succeeded\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"baidu_hot_search:{e}\n")
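
    # A minimal sketch of a safer alternative, not called anywhere in this
    # file: the f-string SQL above relies on manual escaping. With a PEP 249
    # cursor (e.g. from pymysql -- an assumption; the project's MysqlHelper
    # only exposes a raw-SQL interface), the driver escapes values itself:
    @classmethod
    def insert_hot_title_sketch(cls, cursor, source, title, publish_time):
        """Illustrative only; `cursor` is assumed to be a PEP 249 cursor."""
        cursor.execute(
            "insert into crawler_hot_title(source, title, publish_time) values (%s, %s, %s)",
            (source, title, publish_time),
        )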

    # Douyin hot search (scraped from tophub.today)
    @classmethod
    def douyin_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/K7GdaMgdQy"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"douyin_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "抖音"
                publish_day_text = tr.xpath("./td[1]/text()")
                if len(publish_day_text) == 0:
                    continue  # skip rows without a first cell, e.g. header rows
                # The first cell holds either a short rank label or a YYYY-MM-DD date;
                # short labels belong to today's list
                publish_day = publish_day_text[0]
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                # Only keep entries published today
                if publish_time != cls.today:
                    continue
                # Double any single quotes so the title cannot break the quoted SQL literal below
                title = tr.xpath("./td[2]/*[1]/text()")[0].strip().replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values("{source}", '{title}', "{publish_time}")"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Database write succeeded\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"douyin_hot_search:{e}\n")

    # Kuaishou hot search (scraped from tophub.today)
    @classmethod
    def kuaishou_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/MZd7PrPerO"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"kuaishou_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "快手"
                publish_day_text = tr.xpath("./td[1]/text()")
                if len(publish_day_text) == 0:
                    continue  # skip rows without a first cell, e.g. header rows
                # Short rank labels in the first cell belong to today's list
                publish_day = publish_day_text[0]
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                # Only keep entries published today
                if publish_time != cls.today:
                    continue
                # Double any single quotes so the title cannot break the quoted SQL literal below
                title = tr.xpath("./td[2]/*[1]/text()")[0].strip().replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values("{source}", '{title}', "{publish_time}")"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Database write succeeded\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"kuaishou_hot_search:{e}\n")

    # Weibo hot search (scraped from tophub.today)
    @classmethod
    def weibo_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/KqndgxeLl9"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"weibo_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "微博"
                publish_day_text = tr.xpath("./td[1]/text()")
                if len(publish_day_text) == 0:
                    continue  # skip rows without a first cell, e.g. header rows
                # Short rank labels in the first cell belong to today's list
                publish_day = publish_day_text[0]
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                # Only keep entries published today
                if publish_time != cls.today:
                    continue
                # Double any single quotes so the title cannot break the quoted SQL literal below
                title = tr.xpath("./td[2]/*[1]/text()")[0].strip().replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values("{source}", '{title}', "{publish_time}")"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Database write succeeded\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"weibo_hot_search:{e}\n")

    # Weixin (WeChat) hot search (scraped from tophub.today)
    @classmethod
    def weixin_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://tophub.today/n/W1VdJPZoLQ"
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"weixin_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                source = "微信"
                publish_day_text = tr.xpath("./td[1]/text()")
                if len(publish_day_text) == 0:
                    continue  # skip rows without a first cell, e.g. header rows
                # Short rank labels in the first cell belong to today's list
                publish_day = publish_day_text[0]
                if len(publish_day) < 10:
                    publish_day = cls.today
                publish_time = publish_day
                # Only keep entries published today
                if publish_time != cls.today:
                    continue
                # Double any single quotes so the title cannot break the quoted SQL literal below
                title = tr.xpath("./td[2]/*[1]/text()")[0].strip().replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values("{source}", '{title}', "{publish_time}")"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Database write succeeded\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"weixin_hot_search:{e}\n")
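
    # A hedged sketch, not wired in: the douyin/kuaishou/weibo/weixin methods
    # above share identical tophub.today scraping logic and differ only in URL
    # and source label, so they could delegate to a single helper like this
    # (the name `tophub_hot_search` is illustrative, not from the original file):
    @classmethod
    def tophub_hot_search(cls, log_type, crawler, env, url, source, log_name):
        # e.g. cls.tophub_hot_search(log_type, crawler, env,
        #                            "https://tophub.today/n/K7GdaMgdQy", "抖音", "douyin_hot_search")
        try:
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"{log_name}:{response}\n")
                return
            for tr in etree.HTML(response).xpath("//tr"):
                publish_day_text = tr.xpath("./td[1]/text()")
                if len(publish_day_text) == 0:
                    continue
                publish_day = publish_day_text[0]
                if len(publish_day) < 10:
                    publish_day = cls.today
                if publish_day != cls.today:
                    continue
                title = tr.xpath("./td[2]/*[1]/text()")[0].strip().replace("'", "''")
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values("{source}", '{title}', "{publish_day}")"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("Database write succeeded\n")
        except Exception as e:
            Common.logger(log_type, crawler).error(f"{log_name}:{e}\n")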


if __name__ == "__main__":
    HotSearch.baidu_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.douyin_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.kuaishou_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.weibo_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.weixin_hot_search("hot-search", "weixinzhishu", "dev")
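    # The calls above run each crawler once against the "dev" environment;
    # presumably a scheduler (e.g. a daily cron job) runs this script with the
    # production env name instead -- an assumption, the original file only
    # shows the dev invocation.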