#!/usr/bin/env python3
"""Benchmark an inpainting model: per-run latency, RAM, and GPU memory."""
import argparse
import os
import time

# Thread caps must be exported before numpy/torch load their BLAS/OpenMP
# runtimes; setting them after import has no effect.
NUM_THREADS = "4"
os.environ["OMP_NUM_THREADS"] = NUM_THREADS
os.environ["OPENBLAS_NUM_THREADS"] = NUM_THREADS
os.environ["MKL_NUM_THREADS"] = NUM_THREADS
os.environ["VECLIB_MAXIMUM_THREADS"] = NUM_THREADS
os.environ["NUMEXPR_NUM_THREADS"] = NUM_THREADS
if os.environ.get("CACHE_DIR"):
    os.environ["TORCH_HOME"] = os.environ["CACHE_DIR"]

import numpy as np
import nvidia_smi
import psutil
import torch

from sorawm.iopaint.model_manager import ModelManager
from sorawm.iopaint.schema import HDStrategy, InpaintRequest, SDSampler

try:
    # Disable the TorchScript JIT fusers so fusion does not perturb timings.
    torch._C._jit_override_can_fuse_on_cpu(False)
    torch._C._jit_override_can_fuse_on_gpu(False)
    torch._C._jit_set_texpr_fuser_enabled(False)
    torch._C._jit_set_nvfuser_enabled(False)
except Exception:
    pass


def run_model(model, size):
    # Synthetic inputs: a random RGB image and a random grayscale mask.
    image = np.random.randint(0, 256, (size[0], size[1], 3)).astype(np.uint8)
    mask = np.random.randint(0, 255, size).astype(np.uint8)
    config = InpaintRequest(
        ldm_steps=2,
        hd_strategy=HDStrategy.ORIGINAL,
        hd_strategy_crop_margin=128,
        hd_strategy_crop_trigger_size=128,
        hd_strategy_resize_limit=128,
        prompt="a fox is sitting on a bench",
        sd_steps=5,
        sd_sampler=SDSampler.ddim,
    )
    model(image, mask, config)


def benchmark(model, times: int, empty_cache: bool):
    sizes = [(512, 512)]

    nvidia_smi.nvmlInit()
    device_id = 0
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_id)

    def format_metrics(metrics):
        return f"{np.mean(metrics):.2f} ± {np.std(metrics):.2f}"

    process = psutil.Process(os.getpid())
    # Report latency, RAM, and GPU memory metrics for each size.
    for size in sizes:
        torch.cuda.empty_cache()
        time_metrics = []
        cpu_metrics = []
        memory_metrics = []
        gpu_memory_metrics = []
        for _ in range(times):
            if empty_cache:
                # Start each run from an empty CUDA caching allocator.
                torch.cuda.empty_cache()
            start = time.time()
            run_model(model, size)
            # Wait for all queued CUDA kernels before reading the clock.
            torch.cuda.synchronize()
            # cpu_metrics.append(process.cpu_percent())
            time_metrics.append((time.time() - start) * 1000)
            memory_metrics.append(process.memory_info().rss / 1024 / 1024)
            gpu_memory_metrics.append(
                nvidia_smi.nvmlDeviceGetMemoryInfo(handle).used / 1024 / 1024
            )

        print(f"size: {size}".center(80, "-"))
        # print(f"cpu: {format_metrics(cpu_metrics)}")
        print(f"latency: {format_metrics(time_metrics)} ms")
        print(f"memory: {format_metrics(memory_metrics)} MB")
        print(f"gpu memory: {format_metrics(gpu_memory_metrics)} MB")

    nvidia_smi.nvmlShutdown()


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--name", help="model name to load via ModelManager")
    parser.add_argument("--device", default="cuda", type=str)
    parser.add_argument("--times", default=10, type=int, help="number of timed runs")
    parser.add_argument(
        "--empty-cache",
        action="store_true",
        help="clear the CUDA cache before every run",
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    device = torch.device(args.device)
    model = ModelManager(
        name=args.name,
        device=device,
        disable_nsfw=True,
        sd_cpu_textencoder=True,
    )
    benchmark(model, args.times, args.empty_cache)
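
# Example invocation, assuming this file is saved as benchmark.py and that
# "lama" is a model name your ModelManager build supports:
#   python3 benchmark.py --name lama --device cuda --times 20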