test_inference_simple.py

import os

import pytest
import torch
from PIL import Image

import open_clip
from open_clip.factory import get_tokenizer

# Force CPU so the test does not depend on GPU availability.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

if hasattr(torch._C, '_jit_set_profiling_executor'):
    # legacy executor is too slow to compile large models for unit tests
    # no need for the fusion performance here
    torch._C._jit_set_profiling_executor(True)
    torch._C._jit_set_profiling_mode(False)


test_simple_models = [
    # model, pretrained, jit, force_custom_text
    ("ViT-B-32", "laion2b_s34b_b79k", False, False),
    ("ViT-B-32", "laion2b_s34b_b79k", True, False),
    ("ViT-B-32", "laion2b_s34b_b79k", True, True),
    ("roberta-ViT-B-32", "laion2b_s12b_b32k", False, False),
]


@pytest.mark.parametrize("model_type,pretrained,jit,force_custom_text", test_simple_models)
def test_inference_simple(
        model_type,
        pretrained,
        jit,
        force_custom_text,
):
    model, _, preprocess = open_clip.create_model_and_transforms(
        model_type,
        pretrained=pretrained,
        jit=jit,
        force_custom_text=force_custom_text,
    )
    tokenizer = get_tokenizer(model_type)

    current_dir = os.path.dirname(os.path.realpath(__file__))

    image = preprocess(Image.open(current_dir + "/../docs/CLIP.png")).unsqueeze(0)
    text = tokenizer(["a diagram", "a dog", "a cat"])

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)

        # Scaled image-text similarities, softmaxed into per-caption probabilities.
        text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    # The CLIP diagram image should match "a diagram" with saturating probability.
    assert text_probs.cpu().numpy()[0].tolist() == [1.0, 0.0, 0.0]
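For reference, here is a minimal standalone sketch of the same zero-shot inference outside the pytest harness. It assumes open_clip is installed and an image is available at "docs/CLIP.png" (the path is illustrative); unlike the test above, it L2-normalizes the features before the similarity, following the usual open_clip usage pattern.

# standalone_inference_sketch.py (hypothetical file, not part of the test suite)
import torch
import open_clip
from PIL import Image

model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")

image = preprocess(Image.open("docs/CLIP.png")).unsqueeze(0)  # assumed image path
text = tokenizer(["a diagram", "a dog", "a cat"])

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # Normalize so the dot product below is a cosine similarity.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print(text_probs)  # expected to put nearly all mass on "a diagram"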