# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/3/27
import os
import sys
from datetime import date, timedelta

import requests
from lxml import etree

# Make project-local packages (common.*) importable when run as a script.
# NOTE: this must run BEFORE the common.* imports below for it to matter.
sys.path.append(os.getcwd())

from common.common import Common
from common.scheduling_db import MysqlHelper

# Explicitly bypass any system-level HTTP(S) proxy for requests in this module.
proxies = {"http": None, "https": None}
class HotSearch:
    """Scrapes trending-search titles and writes them into the crawler_hot_title table."""

    # Today's date, formatted YYYY-MM-DD.
    today = date.today().strftime("%Y-%m-%d")

    # Baidu hot search
    @classmethod
    def baidu_hot_search(cls, log_type, crawler, env):
        """Fetch Baidu realtime hot-search titles and insert each one into MySQL.

        :param log_type: logger channel name passed to Common.logger
        :param crawler: crawler name passed to Common.logger / MysqlHelper
        :param env: environment flag forwarded to MysqlHelper.update_values
        """
        try:
            url = "https://top.baidu.com/board?tab=realtime"
            headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54"
            }
            response = requests.get(url=url, headers=headers, proxies=proxies).text
            if not response:
                Common.logger(log_type, crawler).error(f"baidu_hot_search:{response}\n")
                return
            response_html = etree.HTML(response)
            title_list_html = response_html.xpath('//*[@class="c-single-text-ellipsis"]')
            for title_html in title_list_html:
                source = "百度"
                title = title_html.xpath("./text()")[0].strip()
                publish_time = cls.today
                # Escape backslashes and double quotes so a scraped title cannot
                # break out of the SQL string literal (previously a title with a
                # quote would corrupt the statement — SQL injection risk).
                # TODO(review): switch to parameterized queries if MysqlHelper supports them.
                safe_title = title.replace("\\", "\\\\").replace('"', '\\"')
                insert_sql = f"""insert into crawler_hot_title(source, title, publish_time)
                values("{source}", "{safe_title}", "{publish_time}")"""
                Common.logger(log_type, crawler).info(f"insert_sql:{insert_sql}")
                MysqlHelper.update_values(log_type, crawler, insert_sql, env, action="")
                Common.logger(log_type, crawler).info("写入数据库成功\n")
        except Exception as e:
            # Failure path: log at error level (was incorrectly logged as info).
            Common.logger(log_type, crawler).error(f"baidu_hot_search:{e}\n")
- if __name__ == "__main__":
- HotSearch.baidu_hot_search("hot-search", "weixinzhishu", "dec")