# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
import os
import sys
from datetime import date

import requests
from lxml import etree

sys.path.append(os.getcwd())
from common.scheduling_db import MysqlHelper
from common.common import Common

proxies = {"http": None, "https": None}


class HotSearch:
    # Today's date, formatted YYYY-MM-DD
    today = date.today().strftime("%Y-%m-%d")

    # Write one hot-search entry into the crawler_hot_title table
    @classmethod
    def insert_title(cls, log_type, crawler, env, source, title, publish_time):
        # Escape backslashes and single quotes so a title cannot break the
        # statement; parameterized queries would be preferable if MysqlHelper
        # supports them
        title = title.replace("\\", "\\\\").replace("'", "''")
        insert_sql = f"""insert into crawler_hot_title(source, title, publish_time) values("{source}", '{title}', "{publish_time}")"""
        Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
        MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
        Common.logger(log_type, crawler).info("Database write succeeded\n")

    # Baidu hot search
    @classmethod
    def baidu_hot_search(cls, log_type, crawler, env):
        try:
            url = "https://top.baidu.com/board?tab=realtime"
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"baidu_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            title_list_html = response_html.xpath('//*[@class="c-single-text-ellipsis"]')
            for title_html in title_list_html:
                title = title_html.xpath("./text()")[0].strip()
                # Baidu's realtime board carries no date, so stamp with today
                cls.insert_title(log_type, crawler, env, "百度", title, cls.today)
        except Exception as e:
            Common.logger(log_type, crawler).error(f"baidu_hot_search:{e}\n")

    # Shared scraper for the tophub.today boards (Douyin, Kuaishou, Weibo, Weixin)
    @classmethod
    def tophub_hot_search(cls, log_type, crawler, env, source, url, func_name):
        try:
            headers = {
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies).text
            if len(response) == 0:
                Common.logger(log_type, crawler).error(f"{func_name}:{response}\n")
                return
            response_html = etree.HTML(response)
            tr_list = response_html.xpath("//tr")
            for tr in tr_list:
                # The first cell holds the publish date; skip header or
                # malformed rows that have no text there
                publish_day_list = tr.xpath("./td[1]/text()")
                if not publish_day_list:
                    continue
                publish_day = publish_day_list[0].strip()
                # Anything shorter than "YYYY-MM-DD" (e.g. a time of day) means today
                if len(publish_day) < 10:
                    publish_day = cls.today
                # Only keep entries published today
                if publish_day != cls.today:
                    continue
                title_list = tr.xpath("./td[2]/*[1]/text()")
                if not title_list:
                    continue
                title = title_list[0].strip()
                cls.insert_title(log_type, crawler, env, source, title, publish_day)
        except Exception as e:
            Common.logger(log_type, crawler).error(f"{func_name}:{e}\n")

    # Douyin hot search
    @classmethod
    def douyin_hot_search(cls, log_type, crawler, env):
        cls.tophub_hot_search(log_type, crawler, env, "抖音", "https://tophub.today/n/K7GdaMgdQy", "douyin_hot_search")

    # Kuaishou hot search
    @classmethod
    def kuaishou_hot_search(cls, log_type, crawler, env):
        cls.tophub_hot_search(log_type, crawler, env, "快手", "https://tophub.today/n/MZd7PrPerO", "kuaishou_hot_search")
    # Weibo hot search
    @classmethod
    def weibo_hot_search(cls, log_type, crawler, env):
        cls.tophub_hot_search(log_type, crawler, env, "微博", "https://tophub.today/n/KqndgxeLl9", "weibo_hot_search")

    # Weixin hot search
    @classmethod
    def weixin_hot_search(cls, log_type, crawler, env):
        cls.tophub_hot_search(log_type, crawler, env, "微信", "https://tophub.today/n/W1VdJPZoLQ", "weixin_hot_search")


if __name__ == "__main__":
    HotSearch.baidu_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.douyin_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.kuaishou_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.weibo_hot_search("hot-search", "weixinzhishu", "dev")
    HotSearch.weixin_hot_search("hot-search", "weixinzhishu", "dev")