web_extractor.py 1005 B

1234567891011121314151617181920212223242526272829
  1. from typing import Any, Dict, List
  2. import requests
  3. from bs4 import BeautifulSoup
  4. from pqai_agent.toolkit.base import BaseToolkit
  5. from pqai_agent.toolkit.function_tool import FunctionTool
  6. class WebExtractor(BaseToolkit):
  7. def extract_web_content(self, url: str) -> str:
  8. """
  9. Extracts the content of a webpage given its URL.
  10. Args:
  11. url (str): The URL of the webpage to extract content from.
  12. Returns:
  13. str: The extracted content of the webpage.
  14. """
  15. try:
  16. response = requests.get(url)
  17. response.raise_for_status() # Raise an error for bad responses
  18. soup = BeautifulSoup(response.text, 'html.parser')
  19. content = soup.get_text(separator='\n', strip=True)
  20. return content
  21. except Exception as e:
  22. return f"Error occurred while extracting content: {str(e)}"
  23. def get_tools(self) -> List[FunctionTool]:
  24. return [FunctionTool(self.extract_web_content)]