| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125 |
import json
import os
import re
import urllib.error
import urllib.parse
import urllib.request
from io import BytesIO

from PIL import Image
# Base URL of the ComfyUI examples site whose pages are crawled for
# workflow-bearing images.
ROOT_URL = 'https://comfyanonymous.github.io/ComfyUI_examples/'
# Output directory: <two levels above this file>/data/comfyui_examples.
# Presumably this file lives in a scripts/ subdir of the repo root — TODO confirm.
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'comfyui_examples')
def fetch_html(url, timeout=30):
    """Fetch *url* and return the response body decoded as UTF-8.

    A browser-like User-Agent is sent because the default urllib UA is
    often rejected by static hosts/CDNs.

    Args:
        url: Absolute URL to fetch.
        timeout: Socket timeout in seconds. Previously no timeout was set,
            so a stalled connection could hang the crawl indefinitely; the
            default keeps existing callers working.

    Returns:
        The response body as a str.

    Raises:
        urllib.error.URLError / HTTPError: on network/HTTP failure.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().decode('utf-8')
def extract_api_from_image(img_data):
    """Extract the ComfyUI API workflow embedded in a PNG image's metadata.

    ComfyUI saves the API-format workflow as a JSON string in a PNG text
    chunk named 'prompt', which PIL exposes through ``Image.info``.

    Args:
        img_data: Raw image bytes.

    Returns:
        The parsed workflow (typically a dict of node-id -> node), or
        None when the image carries no 'prompt' entry or cannot be parsed.
    """
    try:
        img = Image.open(BytesIO(img_data))
        # Single lookup via .get instead of `in` + subscript.
        prompt = img.info.get('prompt')
        if prompt:
            return json.loads(prompt)
    except Exception:
        # Deliberate best-effort: corrupt images or malformed JSON simply
        # mean "no workflow embedded". (Unused `e` binding removed.)
        pass
    return None
def extract_api_from_webp(img_data):
    """Extract the ComfyUI API workflow embedded in a WebP image, if any.

    Recent PIL releases surface WebP metadata through ``Image.info`` the
    same way as PNG text chunks, so we look for a JSON string under the
    'prompt' key. EXIF-based extraction is not attempted; images without
    an accessible string 'prompt' entry yield None (same as before — the
    old ``elif img.format == 'WEBP': pass`` branch was dead code and the
    ``hasattr(img, 'info')`` guard was redundant, both removed).

    Args:
        img_data: Raw image bytes.

    Returns:
        The parsed workflow, or None if absent/unparseable.
    """
    try:
        img = Image.open(BytesIO(img_data))
        prompt = img.info.get('prompt')
        # Only a JSON *string* is parseable; any other value type is ignored.
        if isinstance(prompt, str):
            return json.loads(prompt)
    except Exception:
        # Best-effort: any decode/parse failure means "no workflow".
        pass
    return None
def main():
    """Crawl the ComfyUI examples site and save embedded API workflows.

    For every example category linked from ROOT_URL, downloads each
    png/webp image on the category page and, when the image carries an
    embedded ComfyUI 'prompt' payload, writes it to
    ``OUTPUT_DIR/<category>/<image-stem>_api.json``.
    """
    # exist_ok avoids the check-then-create race of the old exists()/makedirs pair.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print(f"Fetching root page: {ROOT_URL}")
    try:
        root_html = fetch_html(ROOT_URL)
    except Exception as e:
        print("Failed to fetch root URL:", e)
        return

    # Category links look like <a href="lora"> or <a href="lora/">.
    # Grab every relative href, then filter out anchors, absolute URLs
    # and plain .html pages; keep first-seen order, deduplicated.
    categories = re.findall(r'href="([^/][^"]+/?)"', root_html)
    valid_categories = []
    for c in categories:
        if c.startswith('http') or c.startswith('#') or '.html' in c:
            continue
        c = c.strip('/')
        if c and c not in valid_categories:
            valid_categories.append(c)

    print(f"Found {len(valid_categories)} example categories: {valid_categories}")
    total_workflows = 0
    for cat in valid_categories:
        cat_url = urllib.parse.urljoin(ROOT_URL, cat + '/')
        cat_dir = os.path.join(OUTPUT_DIR, cat)
        os.makedirs(cat_dir, exist_ok=True)

        print(f"\n--- Crawling Category: {cat} ---")
        try:
            cat_html = fetch_html(cat_url)
        except urllib.error.HTTPError as e:
            # Some hrefs on the root page are not real category pages.
            if e.code == 404:
                print(f" [Skip] {cat_url} returned 404.")
            else:
                print(f" [Error] {cat_url} failed: {e}")
            continue
        except Exception as e:
            print(f" [Error] {cat_url} failed: {e}")
            continue

        # Image sources. .jpg is matched but carries no extractor below,
        # so jpg files are downloaded and then skipped (api_json stays None).
        img_links = re.findall(r'src=["\']([^"\']+\.(?:png|webp|jpg))["\']', cat_html)

        for link in img_links:
            full_img_url = urllib.parse.urljoin(cat_url, link)
            try:
                req_img = urllib.request.Request(
                    full_img_url, headers={'User-Agent': 'Mozilla/5.0'})
                with urllib.request.urlopen(req_img, timeout=10) as response_img:
                    img_data = response_img.read()

                api_json = None
                lower_url = full_img_url.lower()  # hoisted: used twice
                if lower_url.endswith('.png'):
                    api_json = extract_api_from_image(img_data)
                elif lower_url.endswith('.webp'):
                    api_json = extract_api_from_webp(img_data)

                if api_json:
                    # splitext strips only the real extension; the old chained
                    # .replace() would also eat '.png'/'.webp' substrings
                    # appearing anywhere in the name.
                    stem = os.path.splitext(os.path.basename(link))[0]
                    filename = stem + '_api.json'
                    out_path = os.path.join(cat_dir, filename)
                    with open(out_path, 'w', encoding='utf-8') as f:
                        json.dump(api_json, f, indent=2)
                    # BUG FIX: this message used to print the literal
                    # "(unknown)" instead of the saved filename.
                    print(f" [Success] Saved {filename} ({len(api_json)} nodes)")
                    total_workflows += 1
                # Images without embedded metadata are silently skipped.
            except Exception as e:
                print(f" [Failed] Download/Parse {full_img_url}: {e}")

    print(f"\n✅ Crawl Complete! Extracted {total_workflows} API Workflows into {OUTPUT_DIR}")
if __name__ == "__main__":
    main()
|