v2-inference-v.yaml

model:
  base_learning_rate: 1.0e-4
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    parameterization: "v"
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False # we set this to false because this is an inference only config
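
    # Denoising U-Net that operates on the 4-channel latent space
    # (image_size / channels above) and predicts v.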
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        use_checkpoint: True
        use_fp16: True
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False
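
    # KL autoencoder that encodes RGB images to latents and decodes them back;
    # its training loss is stubbed out with torch.nn.Identity since this config
    # is for inference only.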
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          #attn_type: "vanilla-xformers"
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
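
    # Frozen OpenCLIP text encoder; the "penultimate" layer output has
    # width 1024, matching the U-Net's context_dim.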
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: True
        layer: "penultimate"
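
A minimal sketch of how a config like this is typically consumed, assuming the ldm package from the Stability-AI/stablediffusion repo is on PYTHONPATH: OmegaConf loads the YAML, and ldm.util.instantiate_from_config builds the class named in model.target with model.params as constructor arguments. The checkpoint filename below is a placeholder, not part of this config.

import torch
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config

# Load the YAML above and build the LatentDiffusion model from it.
config = OmegaConf.load("v2-inference-v.yaml")
model = instantiate_from_config(config.model)

# Load pretrained weights; "state_dict" is the key used by checkpoints
# from this codebase. The .ckpt path is a placeholder.
ckpt = torch.load("v2-1_768-ema-pruned.ckpt", map_location="cpu")
model.load_state_dict(ckpt["state_dict"], strict=False)

# use_fp16: True in the U-Net config expects half-precision weights.
model = model.half().cuda().eval()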