# comfyui_example_scraper.py (4.5 KB)
import json
import os
import re
import urllib.error
import urllib.parse
import urllib.request
from io import BytesIO

from PIL import Image
  8. ROOT_URL = 'https://comfyanonymous.github.io/ComfyUI_examples/'
  9. OUTPUT_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'comfyui_examples')
  10. def fetch_html(url):
  11. req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
  12. with urllib.request.urlopen(req) as response:
  13. return response.read().decode('utf-8')
  14. def extract_api_from_image(img_data):
  15. try:
  16. img = Image.open(BytesIO(img_data))
  17. if 'prompt' in img.info:
  18. return json.loads(img.info['prompt'])
  19. except Exception as e:
  20. pass
  21. return None
  22. def extract_api_from_webp(img_data):
  23. # WebP metadata extraction for ComfyUI may require different approach,
  24. # but PIL >= 10.0 often maps EXIF or prompts locally.
  25. try:
  26. img = Image.open(BytesIO(img_data))
  27. if hasattr(img, 'info') and 'prompt' in img.info:
  28. if isinstance(img.info['prompt'], str):
  29. return json.loads(img.info['prompt'])
  30. elif img.format == 'WEBP':
  31. # specifically for webp exif holding json, usually PIL handles it but we can skip if it fails.
  32. pass
  33. except Exception:
  34. pass
  35. return None
  36. def main():
  37. if not os.path.exists(OUTPUT_DIR):
  38. os.makedirs(OUTPUT_DIR)
  39. print(f"Fetching root page: {ROOT_URL}")
  40. try:
  41. root_html = fetch_html(ROOT_URL)
  42. except Exception as e:
  43. print("Failed to fetch root URL:", e)
  44. return
  45. # Find categories (href="subdir" or href="subdir/")
  46. # Typically <a href="lora">Lora</a> or <a href="lora/">Lora</a>
  47. # We regex for href="([a-zA-Z0-9_-]+/?)"
  48. categories = re.findall(r'href="([^/][^"]+/?)"', root_html)
  49. # Filter out external links or weird ones
  50. valid_categories = []
  51. for c in categories:
  52. if c.startswith('http') or c.startswith('#') or '.html' in c:
  53. continue
  54. c = c.strip('/')
  55. if c and c not in valid_categories:
  56. valid_categories.append(c)
  57. print(f"Found {len(valid_categories)} example categories: {valid_categories}")
  58. total_workflows = 0
  59. for cat in valid_categories:
  60. cat_url = urllib.parse.urljoin(ROOT_URL, cat + '/')
  61. cat_dir = os.path.join(OUTPUT_DIR, cat)
  62. if not os.path.exists(cat_dir):
  63. os.makedirs(cat_dir)
  64. print(f"\n--- Crawling Category: {cat} ---")
  65. try:
  66. cat_html = fetch_html(cat_url)
  67. except urllib.error.HTTPError as e:
  68. if e.code == 404:
  69. print(f" [Skip] {cat_url} returned 404.")
  70. continue
  71. else:
  72. print(f" [Error] {cat_url} failed: {e}")
  73. continue
  74. except Exception as e:
  75. print(f" [Error] {cat_url} failed: {e}")
  76. continue
  77. # Find images
  78. img_links = re.findall(r'src=["\']([^"\']+\.(?:png|webp|jpg))["\']', cat_html)
  79. for link in img_links:
  80. # resolve url
  81. full_img_url = urllib.parse.urljoin(cat_url, link)
  82. # download image
  83. try:
  84. req_img = urllib.request.Request(full_img_url, headers={'User-Agent': 'Mozilla/5.0'})
  85. with urllib.request.urlopen(req_img, timeout=10) as response_img:
  86. img_data = response_img.read()
  87. api_json = None
  88. if full_img_url.lower().endswith('.png'):
  89. api_json = extract_api_from_image(img_data)
  90. elif full_img_url.lower().endswith('.webp'):
  91. api_json = extract_api_from_webp(img_data)
  92. if api_json:
  93. filename = os.path.basename(link).replace('.png', '').replace('.webp', '') + '_api.json'
  94. out_path = os.path.join(cat_dir, filename)
  95. with open(out_path, 'w', encoding='utf-8') as f:
  96. json.dump(api_json, f, indent=2)
  97. print(f" [Success] Saved {filename} ({len(api_json)} nodes)")
  98. total_workflows += 1
  99. else:
  100. # Not all images have metadata
  101. pass
  102. except Exception as e:
  103. print(f" [Failed] Download/Parse {full_img_url}: {e}")
  104. print(f"\n✅ Crawl Complete! Extracted {total_workflows} API Workflows into {OUTPUT_DIR}")
  105. if __name__ == "__main__":
  106. main()