"""Crawl the ComfyUI examples site and extract embedded API workflows.

Downloads example images (PNG/WebP) from every category page under
ROOT_URL and saves any ComfyUI ``prompt`` JSON metadata found inside
them as ``<image>_api.json`` files under OUTPUT_DIR.
"""

import json
import os
import re
import urllib.error
import urllib.parse
import urllib.request
from io import BytesIO

from PIL import Image

ROOT_URL = 'https://comfyanonymous.github.io/ComfyUI_examples/'
# <parent of this file's directory>/data/comfyui_examples
OUTPUT_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    'data', 'comfyui_examples',
)

# Browser-like agent: some hosts reject urllib's default User-Agent.
USER_AGENT = 'Mozilla/5.0'


def fetch_html(url, timeout=10):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Absolute URL to fetch.
        timeout: Socket timeout in seconds (new, defaulted — callers that
            pass only ``url`` behave as before, except hangs now time out).

    Raises:
        urllib.error.URLError / HTTPError on network failure.
    """
    req = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().decode('utf-8')


def extract_api_from_image(img_data):
    """Return the ComfyUI API workflow embedded in a PNG, or None.

    ComfyUI writes the workflow as a JSON string under the ``prompt`` key
    of the PNG text chunks, which Pillow exposes via ``Image.info``.
    Best-effort: corrupt images or invalid JSON simply yield None.
    """
    try:
        img = Image.open(BytesIO(img_data))
        prompt = img.info.get('prompt')
        if prompt:
            return json.loads(prompt)
    except Exception:
        pass
    return None


def extract_api_from_webp(img_data):
    """Return the ComfyUI API workflow embedded in a WebP image, or None.

    First tries ``Image.info['prompt']`` (present when Pillow surfaces the
    metadata directly). Otherwise scans EXIF text fields: ComfyUI's WebP
    writers are understood to store the workflow as a ``"Prompt:{json}"``
    EXIF string — NOTE(review): verify against the ComfyUI save nodes;
    images without such metadata simply yield None.
    """
    try:
        img = Image.open(BytesIO(img_data))
        prompt = img.info.get('prompt')
        if isinstance(prompt, str):
            return json.loads(prompt)
        # Fallback: look for a "Prompt:"-prefixed JSON payload in EXIF.
        for value in img.getexif().values():
            if isinstance(value, bytes):
                value = value.decode('utf-8', errors='ignore')
            if isinstance(value, str) and value.startswith('Prompt:'):
                return json.loads(value[len('Prompt:'):])
    except Exception:
        # Best-effort: unreadable image / malformed JSON -> None.
        pass
    return None


def main():
    """Crawl every example category and save the extracted API workflows."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)  # exist_ok avoids check/create race

    print(f"Fetching root page: {ROOT_URL}")
    try:
        root_html = fetch_html(ROOT_URL)
    except Exception as e:
        print("Failed to fetch root URL:", e)
        return

    # Relative hrefs like 'lora/' are category pages; drop anchors,
    # absolute external links and .html pages.
    categories = re.findall(r'href="([^/][^"]+/?)"', root_html)
    valid_categories = []
    for c in categories:
        if c.startswith('http') or c.startswith('#') or '.html' in c:
            continue
        c = c.strip('/')
        if c and c not in valid_categories:
            valid_categories.append(c)

    print(f"Found {len(valid_categories)} example categories: {valid_categories}")

    total_workflows = 0
    for cat in valid_categories:
        cat_url = urllib.parse.urljoin(ROOT_URL, cat + '/')
        cat_dir = os.path.join(OUTPUT_DIR, cat)
        os.makedirs(cat_dir, exist_ok=True)

        print(f"\n--- Crawling Category: {cat} ---")
        try:
            cat_html = fetch_html(cat_url)
        except urllib.error.HTTPError as e:
            # 404s are expected for non-category hrefs that slipped through.
            if e.code == 404:
                print(f" [Skip] {cat_url} returned 404.")
            else:
                print(f" [Error] {cat_url} failed: {e}")
            continue
        except Exception as e:
            print(f" [Error] {cat_url} failed: {e}")
            continue

        img_links = re.findall(r'src=["\']([^"\']+\.(?:png|webp|jpg))["\']', cat_html)
        for link in img_links:
            full_img_url = urllib.parse.urljoin(cat_url, link)
            try:
                req_img = urllib.request.Request(
                    full_img_url, headers={'User-Agent': USER_AGENT})
                with urllib.request.urlopen(req_img, timeout=10) as response_img:
                    img_data = response_img.read()

                # Only PNG/WebP can carry ComfyUI metadata; JPGs are skipped.
                api_json = None
                lower_url = full_img_url.lower()
                if lower_url.endswith('.png'):
                    api_json = extract_api_from_image(img_data)
                elif lower_url.endswith('.webp'):
                    api_json = extract_api_from_webp(img_data)

                if api_json:
                    base = os.path.splitext(os.path.basename(link))[0]
                    filename = base + '_api.json'
                    out_path = os.path.join(cat_dir, filename)
                    with open(out_path, 'w', encoding='utf-8') as f:
                        json.dump(api_json, f, indent=2)
                    # FIX: original printed a literal "(unknown)" placeholder
                    # instead of the saved filename.
                    print(f" [Success] Saved {filename} ({len(api_json)} nodes)")
                    total_workflows += 1
                # Images without embedded metadata are silently skipped.
            except Exception as e:
                print(f" [Failed] Download/Parse {full_img_url}: {e}")

    print(f"\n✅ Crawl Complete! Extracted {total_workflows} API Workflows into {OUTPUT_DIR}")


if __name__ == "__main__":
    main()