import json
import os
import re
import urllib.error
import urllib.parse
import urllib.request
from io import BytesIO

from PIL import Image
# Base URL of the ComfyUI examples site that is crawled for workflow images.
ROOT_URL = 'https://comfyanonymous.github.io/ComfyUI_examples/'
# Output directory for extracted workflows: <parent-of-script-dir>/data/comfyui_examples.
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'comfyui_examples')
def fetch_html(url):
    """Fetch *url* with a browser-like User-Agent and return the body as UTF-8 text."""
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(request) as resp:
        body = resp.read()
    return body.decode('utf-8')
def extract_api_from_image(img_data):
    """Extract a ComfyUI API workflow from PNG image bytes.

    ComfyUI embeds the API-format workflow as a JSON string under the
    'prompt' key of the image's metadata (a PNG text chunk, surfaced by
    Pillow via ``Image.info``).

    Args:
        img_data: Raw image bytes.

    Returns:
        The decoded workflow (typically a dict of node-id -> node), or
        None when the bytes are not a readable image, carry no 'prompt'
        metadata, or the metadata is not valid JSON.
    """
    try:
        img = Image.open(BytesIO(img_data))
        prompt = img.info.get('prompt')
        if prompt is not None:
            return json.loads(prompt)
    except Exception:
        # Best-effort extraction: many example images simply have no
        # embedded workflow, so any decode/parse failure maps to None.
        pass
    return None
def extract_api_from_webp(img_data):
    """Extract a ComfyUI API workflow from WebP image bytes.

    Mirrors extract_api_from_image: looks for a 'prompt' entry in the
    metadata Pillow exposes via ``Image.info``.

    Args:
        img_data: Raw image bytes.

    Returns:
        The decoded workflow dict, or None if no usable metadata is found.
    """
    try:
        img = Image.open(BytesIO(img_data))
        prompt = img.info.get('prompt')
        if isinstance(prompt, str):
            return json.loads(prompt)
        # NOTE(review): presumably some Pillow versions hand back an
        # already-decoded mapping here — pass it through instead of
        # dropping it (the original fell into a dead branch and lost it).
        if isinstance(prompt, dict):
            return prompt
    except Exception:
        # Best-effort: unreadable images / malformed metadata yield None.
        pass
    return None
def main():
    """Crawl the ComfyUI examples site and save embedded API workflows.

    Walks every example category linked from ROOT_URL, downloads the
    PNG/WebP images on each category page, and writes any embedded
    ComfyUI 'prompt' metadata to OUTPUT_DIR/<category>/<image>_api.json.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Fetching root page: {ROOT_URL}")
    try:
        root_html = fetch_html(ROOT_URL)
    except Exception as e:
        print("Failed to fetch root URL:", e)
        return
    # Find categories (href="subdir" or href="subdir/"); the leading [^/]
    # excludes absolute paths, and external/anchor/.html links are filtered.
    categories = re.findall(r'href="([^/][^"]+/?)"', root_html)
    valid_categories = []
    for c in categories:
        if c.startswith('http') or c.startswith('#') or '.html' in c:
            continue
        c = c.strip('/')
        if c and c not in valid_categories:
            valid_categories.append(c)
    print(f"Found {len(valid_categories)} example categories: {valid_categories}")
    total_workflows = 0
    for cat in valid_categories:
        cat_url = urllib.parse.urljoin(ROOT_URL, cat + '/')
        cat_dir = os.path.join(OUTPUT_DIR, cat)
        os.makedirs(cat_dir, exist_ok=True)
        print(f"\n--- Crawling Category: {cat} ---")
        try:
            cat_html = fetch_html(cat_url)
        except urllib.error.HTTPError as e:
            if e.code == 404:
                print(f" [Skip] {cat_url} returned 404.")
            else:
                print(f" [Error] {cat_url} failed: {e}")
            continue
        except Exception as e:
            print(f" [Error] {cat_url} failed: {e}")
            continue
        img_links = re.findall(r'src=["\']([^"\']+\.(?:png|webp|jpg))["\']', cat_html)
        seen_urls = set()
        for link in img_links:
            full_img_url = urllib.parse.urljoin(cat_url, link)
            lower_url = full_img_url.lower()
            # Skip duplicates, and .jpg outright — there is no metadata
            # extractor for JPEG, so downloading it would be wasted work.
            if full_img_url in seen_urls or not lower_url.endswith(('.png', '.webp')):
                continue
            seen_urls.add(full_img_url)
            try:
                req_img = urllib.request.Request(full_img_url, headers={'User-Agent': 'Mozilla/5.0'})
                with urllib.request.urlopen(req_img, timeout=10) as response_img:
                    img_data = response_img.read()
                if lower_url.endswith('.png'):
                    api_json = extract_api_from_image(img_data)
                else:
                    api_json = extract_api_from_webp(img_data)
                if api_json:
                    # <image stem>_api.json (splitext is safer than chained
                    # str.replace, which could hit a match mid-name).
                    filename = os.path.splitext(os.path.basename(link))[0] + '_api.json'
                    out_path = os.path.join(cat_dir, filename)
                    with open(out_path, 'w', encoding='utf-8') as f:
                        json.dump(api_json, f, indent=2)
                    # Bug fix: the original printed the literal "(unknown)"
                    # instead of the saved filename.
                    print(f" [Success] Saved {filename} ({len(api_json)} nodes)")
                    total_workflows += 1
                # Images without embedded metadata are expected; stay silent.
            except Exception as e:
                print(f" [Failed] Download/Parse {full_img_url}: {e}")
    print(f"\n✅ Crawl Complete! Extracted {total_workflows} API Workflows into {OUTPUT_DIR}")
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()