123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204 |
- import os
- import xml.etree.ElementTree as ET
- from typing import Any, Dict, List, Literal, Optional, TypeAlias, Union
- import requests
- from pqai_agent.toolkit.base import BaseToolkit
- from pqai_agent.toolkit.function_tool import FunctionTool
class SearchToolkit(BaseToolkit):
    r"""A class representing a toolkit for web search.

    Provides scraping-based search over Baidu and the Chinese version of
    Bing (cn.bing.com), and exposes both methods as function tools via
    :meth:`get_tools`. Both methods return a plain ``dict`` so errors can
    be reported in-band (``{"error": ...}``) instead of raising.
    """

    def search_baidu(self, query: str, max_results: int = 5) -> Dict[str, Any]:
        r"""Search Baidu using web scraping to retrieve relevant search
        results. This method queries Baidu's search engine and extracts
        search results including titles, descriptions, and URLs.

        Args:
            query (str): Search query string to submit to Baidu.
            max_results (int): Maximum number of results to return.
                (default: :obj:`5`)

        Returns:
            Dict[str, Any]: A dictionary containing either:
                - 'results': A list of dicts with 'result_id', 'title',
                  'description' and 'url' keys.
                - or 'error': An error message if something went wrong.
        """
        from bs4 import BeautifulSoup

        try:
            url = "https://www.baidu.com/s"
            headers = {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36"
                ),
                "Referer": "https://www.baidu.com",
            }
            # "wd" is the query parameter; "rn" asks Baidu for at most
            # max_results entries per page.
            params = {"wd": query, "rn": str(max_results)}
            # Timeout prevents the caller from hanging on a slow or
            # unresponsive server (kept consistent with search_bing).
            response = requests.get(
                url, headers=headers, params=params, timeout=10
            )
            # A non-200 page (captcha, block page, outage) must not be
            # parsed as if it were search results.
            if response.status_code != 200:
                return {
                    "error": (
                        f"Baidu returned status code: "
                        f"{response.status_code}"
                    )
                }
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "html.parser")

            results = []
            # Each organic result is a ".result" container; selectors
            # below depend on Baidu's current HTML and may go stale.
            for idx, item in enumerate(soup.select(".result"), 1):
                title_element = item.select_one("h3 > a")
                title = (
                    title_element.get_text(strip=True) if title_element else ""
                )
                link = title_element["href"] if title_element else ""
                desc_element = item.select_one(".c-abstract, .c-span-last")
                desc = (
                    desc_element.get_text(strip=True) if desc_element else ""
                )
                results.append(
                    {
                        "result_id": idx,
                        "title": title,
                        "description": desc,
                        "url": link,
                    }
                )
                if len(results) >= max_results:
                    break

            if not results:
                # Best-effort diagnostic only; an empty result list is
                # still a valid (non-error) return value.
                print(
                    "Warning: No results found. Check "
                    "if Baidu HTML structure has changed."
                )
            return {"results": results}
        except Exception as e:
            return {"error": f"Baidu scraping error: {e!s}"}

    def search_bing(self, query: str, max_results: int = 5) -> Dict[str, Any]:
        r"""Use Bing search engine to search information for the given query.

        This function queries the Chinese version of Bing search engine
        (cn.bing.com) using web scraping to retrieve relevant search
        results. It extracts search results including titles, snippets,
        and URLs. This function is particularly useful when the query is
        in Chinese or when Chinese search results are desired.

        Args:
            query (str): The search query string to submit to Bing. Works
                best with Chinese queries or when Chinese results are
                preferred.
            max_results (int): Maximum number of results to return.
                (default: :obj:`5`)

        Returns:
            Dict[str, Any]: A dictionary containing either:
                - 'results': A list of dictionaries, each with:
                    - 'result_id': The index of the result.
                    - 'snippet': A brief description of the search result.
                    - 'title': The title of the search result.
                    - 'link': The URL of the search result.
                - or 'error': An error message if something went wrong.
        """
        from typing import Any, Dict, List, cast
        from urllib.parse import urlencode

        from bs4 import BeautifulSoup, Tag

        try:
            # urlencode handles percent-escaping of non-ASCII queries.
            query = urlencode({"q": query})
            url = f'https://cn.bing.com/search?{query}'
            headers = {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36"
                ),
            }
            # Add timeout to prevent hanging
            response = requests.get(url, headers=headers, timeout=10)

            # Check if the request was successful
            if response.status_code != 200:
                return {
                    "error": (
                        f"Bing returned status code: "
                        f"{response.status_code}"
                    )
                }

            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            # Organic results live inside <ol id="b_results">.
            b_results_element = soup.find("ol", id="b_results")
            if b_results_element is None:
                return {"results": []}

            # Ensure b_results is a Tag and find all li elements
            b_results_tag = cast(Tag, b_results_element)
            result_items = b_results_tag.find_all("li")

            results: List[Dict[str, Any]] = []
            for i in range(min(len(result_items), max_results)):
                row = result_items[i]
                if not isinstance(row, Tag):
                    continue

                # Entries without an <h2><a href=...> are ads/widgets;
                # skip them rather than emit partial rows.
                h2_element = row.find("h2")
                if h2_element is None:
                    continue
                h2_tag = cast(Tag, h2_element)

                title = h2_tag.get_text().strip()

                link_tag_element = h2_tag.find("a")
                if link_tag_element is None:
                    continue
                link_tag = cast(Tag, link_tag_element)

                link = link_tag.get("href")
                if link is None:
                    continue

                # Snippet is optional; fall back to an empty string.
                content_element = row.find("p", class_="b_algoSlug")
                content_text = ""
                if content_element is not None and isinstance(
                    content_element, Tag
                ):
                    content_text = content_element.get_text()

                row_data = {
                    "result_id": i + 1,
                    "snippet": content_text,
                    "title": title,
                    "link": link,
                }
                results.append(row_data)

            if not results:
                return {
                    "warning": "No results found. Check if "
                    "Bing HTML structure has changed."
                }

            return {"results": results}
        except Exception as e:
            return {"error": f"Bing scraping error: {e!s}"}

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the
        functions in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects
                representing the functions in the toolkit.
        """
        return [
            FunctionTool(self.search_baidu),
            FunctionTool(self.search_bing),
        ]
|