# howard / Agent
"""
URL Pattern Matching - domain pattern matching utilities.

Used for the domain-filtering feature of tools. Glob patterns are supported,
for example:
- *.example.com
- www.example.*
- https://*.example.com/path/*
"""

import re
from typing import List, Optional
from urllib.parse import urlparse


def normalize_pattern(pattern: str) -> str:
	"""
	Normalize a URL pattern so that it always starts with a scheme part.

	A pattern that already begins with an explicit scheme (``https://``,
	``ftp://``, ``chrome-extension://``, in any letter case) or with a
	wildcard scheme (``*://``) is returned unchanged apart from whitespace
	trimming. Any other pattern is prefixed with ``*://`` so that it applies
	to every scheme.

	Args:
		pattern: URL pattern (may contain a scheme, wildcards, a path, ...)

	Returns:
		The trimmed pattern, guaranteed to start with ``<scheme>://``.
	"""
	pattern = pattern.strip()

	# Accept any leading scheme, not only lowercase http/https: otherwise
	# "ftp://host" or "HTTPS://host" would become "*://ftp://host", which can
	# never match a real URL. The scheme must sit at the very start, so a
	# "://" that only occurs later (e.g. inside a query string) does not count.
	if re.match(r"[a-z*][a-z0-9+.*-]*://", pattern, re.IGNORECASE):
		return pattern

	# No scheme given: match any scheme.
	return f"*://{pattern}"


def pattern_to_regex(pattern: str) -> re.Pattern:
	"""
	Convert a glob pattern into a compiled regular expression.

	Supported wildcards:
	- *  : matches any run of characters except "/"
	- ** : matches any run of characters, including "/"

	Every other character is matched literally. The resulting expression is
	case-insensitive and anchored at both ends, so it has to cover the whole
	input string.

	Args:
		pattern: glob pattern

	Returns:
		The compiled regular expression.
	"""
	fragments = []

	# Split on the wildcards while keeping them as tokens. "**" is listed
	# first in the alternation so it is never read as two single stars.
	# Translating token by token (instead of escaping everything and then
	# substituting a placeholder string) means literal pattern text can never
	# be mistaken for a wildcard marker.
	for token in re.split(r"(\*\*|\*)", pattern):
		if token == "**":
			fragments.append(".*")
		elif token == "*":
			fragments.append("[^/]*")
		else:
			fragments.append(re.escape(token))

	# \Z instead of $: "$" would also accept a trailing newline after the match.
	return re.compile("^" + "".join(fragments) + r"\Z", re.IGNORECASE)


def match_url_with_pattern(url: str, pattern: str) -> bool:
	"""
	Check whether a URL matches a glob pattern.

	The pattern is normalized with ``normalize_pattern`` and compiled with
	``pattern_to_regex``. The URL is reduced to ``scheme://host[:port]`` plus
	path and query; any ``user:password@`` part and the fragment are ignored.

	A pattern that contains a path (a "/" after the scheme separator) has to
	match that whole string. A pattern without a path is a domain pattern: it
	is additionally compared against ``scheme://host[:port]`` alone, so it
	matches every page on the domain.

	Args:
		url: URL to check
		pattern: URL pattern (wildcards supported)

	Returns:
		True if the URL matches the pattern, False otherwise.

	Examples:
		>>> match_url_with_pattern("https://google.com", "*.google.com")
		False
		>>> match_url_with_pattern("https://www.google.com", "*.google.com")
		True
		>>> match_url_with_pattern("https://www.google.com/search?q=1", "*.google.com")
		True
		>>> match_url_with_pattern("https://www.google.co.uk", "www.google.*")
		True
		>>> match_url_with_pattern("https://www.google.com@evil.com/", "www.google.*")
		False
		>>> match_url_with_pattern("https://github.com/user/repo", "https://github.com/**")
		True
	"""
	pattern = normalize_pattern(pattern)
	regex = pattern_to_regex(pattern)

	parsed_url = urlparse(url)

	# netloc may carry credentials ("user:pass@host"). They are not part of
	# the host, and leaving them in would let "https://www.google.com@evil.com"
	# satisfy a pattern such as "www.google.*".
	host = parsed_url.netloc.rpartition("@")[2]
	origin = f"{parsed_url.scheme}://{host}"

	# Full form used for matching: origin + path (+ query).
	url_str = f"{origin}{parsed_url.path}"
	if parsed_url.query:
		url_str += f"?{parsed_url.query}"

	if regex.match(url_str):
		return True

	# Domain-only pattern (nothing after the host): the path and query of the
	# URL are irrelevant. Without this fallback "*.example.com" would not even
	# match "https://www.example.com/" because of the trailing slash.
	if "/" not in pattern.partition("://")[2]:
		return bool(regex.match(origin))

	return False


def match_url_with_patterns(url: str, patterns: List[str]) -> bool:
	"""
	Check whether a URL matches at least one of the given patterns.

	Patterns are tried in order with ``match_url_with_pattern``; evaluation
	stops at the first match.

	Args:
		url: URL to check
		patterns: list of URL patterns

	Returns:
		True if any pattern matches, False otherwise (including when
		``patterns`` is empty).
	"""
	for candidate in patterns:
		if match_url_with_pattern(url, candidate):
			return True
	return False


def filter_by_url(
	items: List[dict],
	current_url: Optional[str],
	url_field: str = "url_patterns"
) -> List[dict]:
	"""
	Filter a list of items by the current URL.

	An item whose ``url_field`` value is missing or empty (``None``, ``[]``)
	carries no URL restriction and is always kept. An item with patterns is
	kept only when a current URL is given and it matches at least one of them.

	Args:
		items: list of items (dicts that may contain a ``url_field`` entry)
		current_url: current URL (None = keep only unrestricted items)
		url_field: name of the key holding the URL patterns

	Returns:
		The items that apply to ``current_url``, in their original order.
	"""
	filtered = []
	for item in items:
		patterns = item.get(url_field)
		if not patterns:
			# No restriction. An empty list is treated like a missing field
			# here as well, so the result is consistent whether or not a
			# current URL is available.
			filtered.append(item)
		elif current_url is not None and match_url_with_patterns(current_url, patterns):
			# Restricted item whose patterns match the current URL.
			filtered.append(item)

	return filtered