{
  "_name_or_path": "clip-vit-large-patch14/",
  "architectures": [
    "CLIPModel"
  ],
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "clip",
  "projection_dim": 768,
  "text_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.0,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "quick_gelu",
    "hidden_size": 768,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "max_position_embeddings": 77,
    "min_length": 0,
    "model_type": "clip_text_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 12,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_hidden_layers": 12,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": 1,
    "prefix": null,
    "problem_type": null,
    "projection_dim": 768,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.16.0.dev0",
    "use_bfloat16": false,
    "vocab_size": 49408
  },
  "text_config_dict": {
    "hidden_size": 768,
    "intermediate_size": 3072,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "projection_dim": 768
  },
  "torch_dtype": "float32",
  "transformers_version": null,
  "vision_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.0,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "quick_gelu",
    "hidden_size": 1024,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "image_size": 224,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 4096,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "clip_vision_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 16,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_hidden_layers": 24,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_size": 14,
    "prefix": null,
    "problem_type": null,
    "projection_dim": 768,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.16.0.dev0",
    "use_bfloat16": false
  },
  "vision_config_dict": {
    "hidden_size": 1024,
    "intermediate_size": 4096,
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    "patch_size": 14,
    "projection_dim": 768
  }
}