v1-inference.yaml

model:
  base_learning_rate: 1.0e-04
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 10000 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
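
For reference, this config is typically consumed with OmegaConf and the ldm helper instantiate_from_config, roughly as in the CompVis inference scripts. A minimal sketch follows; the checkpoint filename and config path are assumptions and should be adjusted to your local setup.

import torch
from omegaconf import OmegaConf
from ldm.util import instantiate_from_config

# Load the YAML config and build the LatentDiffusion model it describes.
config = OmegaConf.load("configs/stable-diffusion/v1-inference.yaml")
model = instantiate_from_config(config.model)

# Load pretrained weights (checkpoint path is an assumption, adjust as needed).
ckpt = torch.load("sd-v1-4.ckpt", map_location="cpu")
model.load_state_dict(ckpt["state_dict"], strict=False)
model.eval()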