---
# Training config: Net2NetTransformer conditioned on object bounding boxes
# (Open Images scenes). A frozen VQGAN provides the image token space; the
# GPT transformer models image tokens given the bbox conditioning sequence.
model:
  base_learning_rate: 4.5e-06
  target: taming.models.cond_transformer.Net2NetTransformer
  params:
    cond_stage_key: objects_bbox
    transformer_config:
      target: taming.modules.transformer.mingpt.GPT
      params:
        vocab_size: 8192
        block_size: 348  # = 256 + 92 = dim(vqgan_latent_space,16x16) + dim(conditional_builder.embedding_dim)
        n_layer: 36
        n_head: 16
        n_embd: 1536
        embd_pdrop: 0.1
        resid_pdrop: 0.1
        attn_pdrop: 0.1
    # Frozen first stage: pretrained VQGAN that tokenizes 256x256 images
    # into a 16x16 grid of codes from an 8192-entry codebook.
    first_stage_config:
      target: taming.models.vqgan.VQModel
      params:
        ckpt_path: /path/to/coco_oi_epoch12.ckpt  # https://heibox.uni-heidelberg.de/f/461d9a9f4fcf48ab84f4/
        embed_dim: 256
        n_embed: 8192
        ddconfig:
          double_z: false
          z_channels: 256
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 1
            - 2
            - 2
            - 4
          num_res_blocks: 2
          attn_resolutions:
            - 16
          dropout: 0.0
        lossconfig:
          target: taming.modules.losses.DummyLoss  # first stage is frozen; no loss needed
    # Conditioning stage is a pass-through: bbox tokens are built by the data
    # pipeline, so only a dummy stage forwarding `objects_bbox` is required.
    cond_stage_config:
      target: taming.models.dummy_cond_stage.DummyCondStage
      params:
        conditional_key: objects_bbox

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 6
    train:
      target: taming.data.annotated_objects_open_images.AnnotatedObjectsOpenImages
      params:
        data_path: data/open_images_annotations_100  # substitute with path to full dataset
        split: train
        keys: [image, objects_bbox, file_name, annotations]
        no_tokens: 8192
        target_image_size: 256
        category_allow_list_target: taming.data.open_images_helper.top_300_classes_plus_coco_compatibility
        category_mapping_target: taming.data.open_images_helper.open_images_unify_categories_for_coco
        min_object_area: 0.0001
        min_objects_per_image: 2
        max_objects_per_image: 30
        crop_method: random-2d
        random_flip: true
        use_group_parameter: true
        use_additional_parameters: true
        encode_crop: true
    validation:
      target: taming.data.annotated_objects_open_images.AnnotatedObjectsOpenImages
      params:
        data_path: data/open_images_annotations_100  # substitute with path to full dataset
        split: validation
        keys: [image, objects_bbox, file_name, annotations]
        no_tokens: 8192
        target_image_size: 256
        category_allow_list_target: taming.data.open_images_helper.top_300_classes_plus_coco_compatibility
        category_mapping_target: taming.data.open_images_helper.open_images_unify_categories_for_coco
        min_object_area: 0.0001
        min_objects_per_image: 2
        max_objects_per_image: 30
        crop_method: center  # deterministic crop + no flip for reproducible validation
        random_flip: false
        use_group_parameter: true
        use_additional_parameters: true
        encode_crop: true