anytext_sd15.yaml

model:
  target: sorawm.iopaint.model.anytext.cldm.cldm.ControlLDM
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "img"
    cond_stage_key: "caption"
    control_key: "hint"
    glyph_key: "glyphs"
    position_key: "positions"
    image_size: 64
    channels: 4
    cond_stage_trainable: true  # needs to be true when embedding_manager is valid
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    only_mid_control: False
    loss_alpha: 0  # perceptual loss, 0.003
    loss_beta: 0  # ctc loss
    latin_weight: 1.0  # Latin text lines may need a smaller weight
    with_step_weight: true
    use_vae_upsample: true
    embedding_manager_config:
      target: sorawm.iopaint.model.anytext.cldm.embedding_manager.EmbeddingManager
      params:
        valid: true  # v6
        emb_type: ocr  # ocr, vit, conv
        glyph_channels: 1
        position_channels: 1
        add_pos: false
        placeholder_string: '*'
    control_stage_config:
      target: sorawm.iopaint.model.anytext.cldm.cldm.ControlNet
      params:
        image_size: 32  # unused
        in_channels: 4
        model_channels: 320
        glyph_channels: 1
        position_channels: 1
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False
    unet_config:
      target: sorawm.iopaint.model.anytext.cldm.cldm.ControlledUnetModel
      params:
        image_size: 32  # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False
    first_stage_config:
      target: sorawm.iopaint.model.anytext.ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: sorawm.iopaint.model.anytext.ldm.modules.encoders.modules.FrozenCLIPEmbedderT3
      params:
        version: openai/clip-vit-large-patch14
        use_vision: false  # v6
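
# A minimal loading sketch for a config in this format. It assumes the standard
# latent-diffusion helpers (OmegaConf.load plus instantiate_from_config, which
# the ControlNet/AnyText codebases ship); the exact import path below is an
# assumption inferred from the `target` paths above and may differ in this fork:
#
#   from omegaconf import OmegaConf
#   from sorawm.iopaint.model.anytext.ldm.util import instantiate_from_config
#
#   config = OmegaConf.load("anytext_sd15.yaml")
#   # Builds the ControlLDM named in `model.target`; its constructor in turn
#   # instantiates the nested *_config blocks (ControlNet, UNet, VAE, CLIP).
#   model = instantiate_from_config(config.model)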