Browse Source

Add toolkit: web_extractor

StrayWarrior 4 weeks ago
parent
commit
46dd6c5d7c
1 changed files with 29 additions and 0 deletions
  1. 29 0
      pqai_agent/toolkit/web_extractor.py

+ 29 - 0
pqai_agent/toolkit/web_extractor.py

@@ -0,0 +1,29 @@
+from typing import Any, Dict, List
+import requests
+from bs4 import BeautifulSoup
+
+from pqai_agent.toolkit.base import BaseToolkit
+from pqai_agent.toolkit.function_tool import FunctionTool
+
+class WebExtractor(BaseToolkit):
+    def extract_web_content(self, url: str) -> str:
+        """
+        Extracts the content of a webpage given its URL.
+        Args:
+            url (str): The URL of the webpage to extract content from.
+        Returns:
+            str: The extracted content of the webpage.
+        """
+
+        try:
+            response = requests.get(url)
+            response.raise_for_status()  # Raise an error for bad responses
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            content = soup.get_text(separator='\n', strip=True)
+            return content
+        except Exception as e:
+            return f"Error occurred while extracting content: {str(e)}"
+
+    def get_tools(self) -> List[FunctionTool]:
+        return [FunctionTool(self.extract_web_content)]