"""Crawl the ComfyUI examples site and extract embedded API workflows.

Downloads example images (PNG/WebP) from every category page under
ROOT_URL and saves any ComfyUI ``prompt`` JSON metadata found inside
them as ``<image>_api.json`` files under OUTPUT_DIR.
"""

import json
import os
import re
import urllib.error
import urllib.parse
import urllib.request
from io import BytesIO

from PIL import Image

ROOT_URL = 'https://comfyanonymous.github.io/ComfyUI_examples/'
# <parent of this file's directory>/data/comfyui_examples
OUTPUT_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
    'data', 'comfyui_examples',
)

# Browser-like agent: some hosts reject urllib's default User-Agent.
USER_AGENT = 'Mozilla/5.0'


def fetch_html(url, timeout=10):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Absolute URL to fetch.
        timeout: Socket timeout in seconds (new, defaulted — callers that
            pass only ``url`` behave as before, except hangs now time out).

    Raises:
        urllib.error.URLError / HTTPError on network failure.
    """
    req = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().decode('utf-8')


def extract_api_from_image(img_data):
    """Return the ComfyUI API workflow embedded in a PNG, or None.

    ComfyUI writes the workflow as a JSON string under the ``prompt`` key
    of the PNG text chunks, which Pillow exposes via ``Image.info``.
    Best-effort: corrupt images or invalid JSON simply yield None.
    """
    try:
        img = Image.open(BytesIO(img_data))
        prompt = img.info.get('prompt')
        if prompt:
            return json.loads(prompt)
    except Exception:
        pass
    return None


def extract_api_from_webp(img_data):
    """Return the ComfyUI API workflow embedded in a WebP image, or None.

    First tries ``Image.info['prompt']`` (present when Pillow surfaces the
    metadata directly). Otherwise scans EXIF text fields: ComfyUI's WebP
    writers are understood to store the workflow as a ``"Prompt:{json}"``
    EXIF string — NOTE(review): verify against the ComfyUI save nodes;
    images without such metadata simply yield None.
    """
    try:
        img = Image.open(BytesIO(img_data))
        prompt = img.info.get('prompt')
        if isinstance(prompt, str):
            return json.loads(prompt)
        # Fallback: look for a "Prompt:"-prefixed JSON payload in EXIF.
        for value in img.getexif().values():
            if isinstance(value, bytes):
                value = value.decode('utf-8', errors='ignore')
            if isinstance(value, str) and value.startswith('Prompt:'):
                return json.loads(value[len('Prompt:'):])
    except Exception:
        # Best-effort: unreadable image / malformed JSON -> None.
        pass
    return None


def main():
    """Crawl every example category and save the extracted API workflows."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)  # exist_ok avoids check/create race

    print(f"Fetching root page: {ROOT_URL}")
    try:
        root_html = fetch_html(ROOT_URL)
    except Exception as e:
        print("Failed to fetch root URL:", e)
        return

    # Relative hrefs like 'lora/' are category pages; drop anchors,
    # absolute external links and .html pages.
    categories = re.findall(r'href="([^/][^"]+/?)"', root_html)
    valid_categories = []
    for c in categories:
        if c.startswith('http') or c.startswith('#') or '.html' in c:
            continue
        c = c.strip('/')
        if c and c not in valid_categories:
            valid_categories.append(c)

    print(f"Found {len(valid_categories)} example categories: {valid_categories}")

    total_workflows = 0
    for cat in valid_categories:
        cat_url = urllib.parse.urljoin(ROOT_URL, cat + '/')
        cat_dir = os.path.join(OUTPUT_DIR, cat)
        os.makedirs(cat_dir, exist_ok=True)

        print(f"\n--- Crawling Category: {cat} ---")
        try:
            cat_html = fetch_html(cat_url)
        except urllib.error.HTTPError as e:
            # 404s are expected for non-category hrefs that slipped through.
            if e.code == 404:
                print(f" [Skip] {cat_url} returned 404.")
            else:
                print(f" [Error] {cat_url} failed: {e}")
            continue
        except Exception as e:
            print(f" [Error] {cat_url} failed: {e}")
            continue

        img_links = re.findall(r'src=["\']([^"\']+\.(?:png|webp|jpg))["\']', cat_html)
        for link in img_links:
            full_img_url = urllib.parse.urljoin(cat_url, link)
            try:
                req_img = urllib.request.Request(
                    full_img_url, headers={'User-Agent': USER_AGENT})
                with urllib.request.urlopen(req_img, timeout=10) as response_img:
                    img_data = response_img.read()

                # Only PNG/WebP can carry ComfyUI metadata; JPGs are skipped.
                api_json = None
                lower_url = full_img_url.lower()
                if lower_url.endswith('.png'):
                    api_json = extract_api_from_image(img_data)
                elif lower_url.endswith('.webp'):
                    api_json = extract_api_from_webp(img_data)

                if api_json:
                    base = os.path.splitext(os.path.basename(link))[0]
                    filename = base + '_api.json'
                    out_path = os.path.join(cat_dir, filename)
                    with open(out_path, 'w', encoding='utf-8') as f:
                        json.dump(api_json, f, indent=2)
                    # FIX: original printed a literal "(unknown)" placeholder
                    # instead of the saved filename.
                    print(f" [Success] Saved {filename} ({len(api_json)} nodes)")
                    total_workflows += 1
                # Images without embedded metadata are silently skipped.
            except Exception as e:
                print(f" [Failed] Download/Parse {full_img_url}: {e}")

    print(f"\n✅ Crawl Complete! Extracted {total_workflows} API Workflows into {OUTPUT_DIR}")


if __name__ == "__main__":
    main()