benchmark.py

#!/usr/bin/env python3
import argparse
import os
import time

import numpy as np
import nvidia_smi
import psutil
import torch

from sorawm.iopaint.model_manager import ModelManager
from sorawm.iopaint.schema import HDStrategy, InpaintRequest, SDSampler
# Disable the Torch JIT fusers; these private hooks are not present on every
# torch build, so failures are ignored.
try:
    torch._C._jit_override_can_fuse_on_cpu(False)
    torch._C._jit_override_can_fuse_on_gpu(False)
    torch._C._jit_set_texpr_fuser_enabled(False)
    torch._C._jit_set_nvfuser_enabled(False)
except Exception:
    pass
# Cap the thread pools of the common numeric backends so CPU-side timings are
# reproducible across machines.
NUM_THREADS = str(4)
os.environ["OMP_NUM_THREADS"] = NUM_THREADS
os.environ["OPENBLAS_NUM_THREADS"] = NUM_THREADS
os.environ["MKL_NUM_THREADS"] = NUM_THREADS
os.environ["VECLIB_MAXIMUM_THREADS"] = NUM_THREADS
os.environ["NUMEXPR_NUM_THREADS"] = NUM_THREADS

if os.environ.get("CACHE_DIR"):
    os.environ["TORCH_HOME"] = os.environ["CACHE_DIR"]

def run_model(model, size):
    # Synthetic inputs: a random RGB image and a random single-channel mask.
    image = np.random.randint(0, 256, (size[0], size[1], 3)).astype(np.uint8)
    mask = np.random.randint(0, 255, size).astype(np.uint8)

    config = InpaintRequest(
        ldm_steps=2,
        hd_strategy=HDStrategy.ORIGINAL,
        hd_strategy_crop_margin=128,
        hd_strategy_crop_trigger_size=128,
        hd_strategy_resize_limit=128,
        prompt="a fox is sitting on a bench",
        sd_steps=5,
        sd_sampler=SDSampler.ddim,
    )
    model(image, mask, config)

def benchmark(model, times: int, empty_cache: bool):
    sizes = [(512, 512)]

    nvidia_smi.nvmlInit()
    device_id = 0
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_id)

    def fmt(metrics):
        return f"{np.mean(metrics):.2f} ± {np.std(metrics):.2f}"

    process = psutil.Process(os.getpid())
    # For each size, report latency, RAM, and GPU memory usage.
    for size in sizes:
        torch.cuda.empty_cache()
        time_metrics = []
        cpu_metrics = []
        memory_metrics = []
        gpu_memory_metrics = []
        for _ in range(times):
            start = time.time()
            run_model(model, size)
            torch.cuda.synchronize()

            # cpu_metrics.append(process.cpu_percent())
            time_metrics.append((time.time() - start) * 1000)
            memory_metrics.append(process.memory_info().rss / 1024 / 1024)
            gpu_memory_metrics.append(
                nvidia_smi.nvmlDeviceGetMemoryInfo(handle).used / 1024 / 1024
            )
            if empty_cache:
                # Release cached GPU blocks between runs so each measurement
                # starts from a clean allocator state.
                torch.cuda.empty_cache()

        print(f"size: {size}".center(80, "-"))
        # print(f"cpu: {fmt(cpu_metrics)}")
        print(f"latency: {fmt(time_metrics)}ms")
        print(f"memory: {fmt(memory_metrics)} MB")
        print(f"gpu memory: {fmt(gpu_memory_metrics)} MB")

    nvidia_smi.nvmlShutdown()

def get_args_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--name")
    parser.add_argument("--device", default="cuda", type=str)
    parser.add_argument("--times", default=10, type=int)
    parser.add_argument("--empty-cache", action="store_true")
    return parser.parse_args()

if __name__ == "__main__":
    args = get_args_parser()
    device = torch.device(args.device)
    model = ModelManager(
        name=args.name,
        device=device,
        disable_nsfw=True,
        sd_cpu_textencoder=True,
    )
    benchmark(model, args.times, args.empty_cache)
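
# Example invocation, using the flags defined in get_args_parser() above.
# The model name "lama" is illustrative; pass any name that this repo's
# ModelManager accepts:
#
#   python3 benchmark.py --name lama --device cuda --times 10 --empty-cache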