123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204 |
- import os
- import xml.etree.ElementTree as ET
- from typing import Any, Dict, List, Literal, Optional, TypeAlias, Union
- import requests
- from pqai_agent.toolkit.base import BaseToolkit
- from pqai_agent.toolkit.function_tool import FunctionTool
class SearchToolkit(BaseToolkit):
    r"""A class representing a toolkit for web search.

    Provides scraping-based search over Baidu and the Chinese version of
    Bing (cn.bing.com), and exposes both methods as function tools via
    :meth:`get_tools`. Both methods return a plain ``dict`` so errors can
    be reported in-band (``{"error": ...}``) instead of raising.
    """

    def search_baidu(self, query: str, max_results: int = 5) -> Dict[str, Any]:
        r"""Search Baidu using web scraping to retrieve relevant search
        results. This method queries Baidu's search engine and extracts
        search results including titles, descriptions, and URLs.

        Args:
            query (str): Search query string to submit to Baidu.
            max_results (int): Maximum number of results to return.
                (default: :obj:`5`)

        Returns:
            Dict[str, Any]: A dictionary containing either:
                - 'results': A list of dicts with 'result_id', 'title',
                  'description' and 'url' keys.
                - or 'error': An error message if something went wrong.
        """
        from bs4 import BeautifulSoup

        try:
            url = "https://www.baidu.com/s"
            headers = {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36"
                ),
                "Referer": "https://www.baidu.com",
            }
            # "wd" is the query parameter; "rn" asks Baidu for at most
            # max_results entries per page.
            params = {"wd": query, "rn": str(max_results)}
            # Timeout prevents the caller from hanging on a slow or
            # unresponsive server (kept consistent with search_bing).
            response = requests.get(
                url, headers=headers, params=params, timeout=10
            )
            # A non-200 page (captcha, block page, outage) must not be
            # parsed as if it were search results.
            if response.status_code != 200:
                return {
                    "error": (
                        f"Baidu returned status code: "
                        f"{response.status_code}"
                    )
                }
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "html.parser")

            results = []
            # Each organic result is a ".result" container; selectors
            # below depend on Baidu's current HTML and may go stale.
            for idx, item in enumerate(soup.select(".result"), 1):
                title_element = item.select_one("h3 > a")
                title = (
                    title_element.get_text(strip=True) if title_element else ""
                )
                link = title_element["href"] if title_element else ""
                desc_element = item.select_one(".c-abstract, .c-span-last")
                desc = (
                    desc_element.get_text(strip=True) if desc_element else ""
                )
                results.append(
                    {
                        "result_id": idx,
                        "title": title,
                        "description": desc,
                        "url": link,
                    }
                )
                if len(results) >= max_results:
                    break

            if not results:
                # Best-effort diagnostic only; an empty result list is
                # still a valid (non-error) return value.
                print(
                    "Warning: No results found. Check "
                    "if Baidu HTML structure has changed."
                )
            return {"results": results}
        except Exception as e:
            return {"error": f"Baidu scraping error: {e!s}"}

    def search_bing(self, query: str, max_results: int = 5) -> Dict[str, Any]:
        r"""Use Bing search engine to search information for the given query.

        This function queries the Chinese version of Bing search engine
        (cn.bing.com) using web scraping to retrieve relevant search
        results. It extracts search results including titles, snippets,
        and URLs. This function is particularly useful when the query is
        in Chinese or when Chinese search results are desired.

        Args:
            query (str): The search query string to submit to Bing. Works
                best with Chinese queries or when Chinese results are
                preferred.
            max_results (int): Maximum number of results to return.
                (default: :obj:`5`)

        Returns:
            Dict[str, Any]: A dictionary containing either:
                - 'results': A list of dictionaries, each with:
                    - 'result_id': The index of the result.
                    - 'snippet': A brief description of the search result.
                    - 'title': The title of the search result.
                    - 'link': The URL of the search result.
                - or 'error': An error message if something went wrong.
        """
        from typing import Any, Dict, List, cast
        from urllib.parse import urlencode

        from bs4 import BeautifulSoup, Tag

        try:
            # urlencode handles percent-escaping of non-ASCII queries.
            query = urlencode({"q": query})
            url = f'https://cn.bing.com/search?{query}'
            headers = {
                "User-Agent": (
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36"
                ),
            }
            # Add timeout to prevent hanging
            response = requests.get(url, headers=headers, timeout=10)

            # Check if the request was successful
            if response.status_code != 200:
                return {
                    "error": (
                        f"Bing returned status code: "
                        f"{response.status_code}"
                    )
                }

            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            # Organic results live inside <ol id="b_results">.
            b_results_element = soup.find("ol", id="b_results")
            if b_results_element is None:
                return {"results": []}

            # Ensure b_results is a Tag and find all li elements
            b_results_tag = cast(Tag, b_results_element)
            result_items = b_results_tag.find_all("li")

            results: List[Dict[str, Any]] = []
            for i in range(min(len(result_items), max_results)):
                row = result_items[i]
                if not isinstance(row, Tag):
                    continue

                # Entries without an <h2><a href=...> are ads/widgets;
                # skip them rather than emit partial rows.
                h2_element = row.find("h2")
                if h2_element is None:
                    continue
                h2_tag = cast(Tag, h2_element)

                title = h2_tag.get_text().strip()

                link_tag_element = h2_tag.find("a")
                if link_tag_element is None:
                    continue
                link_tag = cast(Tag, link_tag_element)

                link = link_tag.get("href")
                if link is None:
                    continue

                # Snippet is optional; fall back to an empty string.
                content_element = row.find("p", class_="b_algoSlug")
                content_text = ""
                if content_element is not None and isinstance(
                    content_element, Tag
                ):
                    content_text = content_element.get_text()

                row_data = {
                    "result_id": i + 1,
                    "snippet": content_text,
                    "title": title,
                    "link": link,
                }
                results.append(row_data)

            if not results:
                return {
                    "warning": "No results found. Check if "
                    "Bing HTML structure has changed."
                }

            return {"results": results}
        except Exception as e:
            return {"error": f"Bing scraping error: {e!s}"}

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the
        functions in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects
                representing the functions in the toolkit.
        """
        return [
            FunctionTool(self.search_baidu),
            FunctionTool(self.search_bing),
        ]
|