|
@@ -0,0 +1,29 @@
|
|
|
|
+from typing import Any, Dict, List
|
|
|
|
+import requests
|
|
|
|
+from bs4 import BeautifulSoup
|
|
|
|
+
|
|
|
|
+from pqai_agent.toolkit.base import BaseToolkit
|
|
|
|
+from pqai_agent.toolkit.function_tool import FunctionTool
|
|
|
|
+
|
|
|
|
class WebExtractor(BaseToolkit):
    """Toolkit exposing plain-text extraction of a web page as a tool."""

    # Seconds to wait for the server to connect / send data before giving up.
    # requests applies no timeout by default, so without this an unresponsive
    # server would block the calling tool invocation indefinitely.
    _REQUEST_TIMEOUT_SECONDS = 10

    def extract_web_content(self, url: str) -> str:
        """
        Extracts the content of a webpage given its URL.
        Args:
            url (str): The URL of the webpage to extract content from.
        Returns:
            str: The extracted text content of the webpage, one text fragment per line. If the page cannot be fetched or parsed, a message starting with "Error occurred while extracting content:" is returned instead.
        """

        try:
            response = requests.get(url, timeout=self._REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
            soup = BeautifulSoup(response.text, 'html.parser')

            # Flatten the document to its text nodes, stripped and
            # newline-separated.
            return soup.get_text(separator='\n', strip=True)
        except Exception as e:
            # Deliberately broad: failures (including timeouts) are reported
            # to the caller as text rather than raised.
            return f"Error occurred while extracting content: {e}"

    def get_tools(self) -> List[FunctionTool]:
        """Return the tools provided by this toolkit.

        Returns:
            List[FunctionTool]: A single-element list wrapping
            ``extract_web_content``.
        """
        return [FunctionTool(self.extract_web_content)]
|