
Commit 616db0d

Add support for more CLIP like models
1 parent ce823fc commit 616db0d

File tree: 1 file changed (+150 −48)

tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/openvino_clip_evaluator.py

Lines changed: 150 additions & 48 deletions
@@ -34,17 +34,19 @@
     open_clip = UnsupportedPackage('open_clip', clip_error.msg)
 
 try:
-    from transformers import AutoModel, AutoTokenizer
+    from transformers import AutoModel, AutoTokenizer, AutoProcessor
 except ImportError as transformers_error:
     AutoModel = UnsupportedPackage('AutoModel', transformers_error.msg)
     AutoTokenizer = UnsupportedPackage('AutoTokenizer', transformers_error.msg)
+    AutoProcessor = UnsupportedPackage('AutoProcessor', transformers_error.msg)
 
 try:
     import torch
     import torch.nn.functional as F
 except ImportError as torch_error:
     torch = UnsupportedPackage("torch", torch_error.msg)
 
+
 class OpenVinoClipEvaluator(BaseCustomEvaluator):
     def __init__(self, dataset_config, launcher, model, orig_config):
         super().__init__(dataset_config, launcher, orig_config)
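
The new AutoProcessor import follows the same optional-dependency guard as the existing transformers imports: the failure is captured at import time and only raised when a model class actually needs the package. A minimal standalone sketch of that pattern (MissingPackage is a simplified stand-in for accuracy_checker's UnsupportedPackage, not the real class):

class MissingPackage:
    def __init__(self, name, message):
        self.name, self.message = name, message

    def raise_error(self, consumer):
        # Surface the original import failure only when the package is actually needed.
        raise ImportError(f"{consumer} requires '{self.name}': {self.message}")

try:
    from transformers import AutoProcessor
except ImportError as err:
    AutoProcessor = MissingPackage('AutoProcessor', str(err))

# A model class checks the placeholder before first use, as setup_transformers_part does below.
if isinstance(AutoProcessor, MissingPackage):
    AutoProcessor.raise_error('TransformersClipModel')
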
@@ -53,11 +55,25 @@ def __init__(self, dataset_config, launcher, model, orig_config):
     @classmethod
     def from_configs(cls, config, delayed_model_loading=False, orig_config=None):
         dataset_config, launcher, _ = cls.get_dataset_and_launcher_info(config)
-        model = OpenVinoClipVitModel(
-            config.get('network_info', {}), launcher, config.get('_models', []),
-            config.get('_model_is_blob'),
-            delayed_model_loading, config
+        model_classes = {
+            'openvino_model': OptimumIntelModel,
+            'text_vision': OpenvinoTextVisionModel,
+            'default': OpenVinoClipVitModel
+        }
+
+        network_info = config.get('network_info', {})
+        if 'openvino_model' in network_info.keys():
+            model_class = model_classes['openvino_model']
+        elif 'text' in network_info.keys() and 'vision' in network_info.keys():
+            model_class = model_classes['text_vision']
+        else:
+            model_class = model_classes['default']
+
+        model = model_class(
+            network_info, launcher, config.get('_models', []),
+            config.get('_model_is_blob'), delayed_model_loading, config
         )
+
         return cls(dataset_config, launcher, model, orig_config)
 
     def _process(self, output_callback, calculate_metrics, progress_reporter, metric_config, csv_file):
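
The new from_configs picks the wrapper class from the keys present in network_info: a single converted model under 'openvino_model', separate 'text' and 'vision' models, or the original open_clip-based path otherwise. A standalone sketch of that dispatch rule (the file names and the fallback key below are hypothetical, and class names are returned as strings purely for illustration):

def pick_model_class(network_info):
    # Mirrors the key-based selection added to OpenVinoClipEvaluator.from_configs.
    if 'openvino_model' in network_info:
        return 'OptimumIntelModel'
    if 'text' in network_info and 'vision' in network_info:
        return 'OpenvinoTextVisionModel'
    return 'OpenVinoClipVitModel'

assert pick_model_class({'openvino_model': 'clip.xml'}) == 'OptimumIntelModel'
assert pick_model_class({'text': 'text.xml', 'vision': 'vision.xml'}) == 'OpenvinoTextVisionModel'
assert pick_model_class({'image_encoder': 'img.xml'}) == 'OpenVinoClipVitModel'
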
@@ -78,7 +94,6 @@ def _process(self, output_callback, calculate_metrics, progress_reporter, metric
                 element_identifiers=batch_identifiers, dataset_indices=batch_input_ids)
             self._update_progress(progress_reporter, metric_config, batch_id, len(batch_prediction), csv_file)
 
-
 class OpenVinoJinaClipEvaluator(OpenVinoClipEvaluator):
     @classmethod
     def from_configs(cls, config, delayed_model_loading=False, orig_config=None):
@@ -96,6 +111,20 @@ def from_configs(cls, config, delayed_model_loading=False, orig_config=None):
         return cls(dataset_config, launcher, model, orig_config)
 
 
+class TransformersClipEvaluator(OpenVinoClipEvaluator):
+    @classmethod
+    def from_configs(cls, config, delayed_model_loading=False, orig_config=None):
+        dataset_config, launcher = config["datasets"], None
+        delayed_model_loading = False
+
+        model = TransformersClipModel(
+            config.get('network_info', {}), launcher, config.get('_models', []),
+            config.get('_model_is_blob'),
+            delayed_model_loading, config
+        )
+        return cls(dataset_config, launcher, model, orig_config)
+
+
 class BaseOpenVinoClipModel(BaseCascadeModel):
     def __init__(self, network_info, launcher, models_args, is_blob, delayed_model_loading=False, config=None):
         super().__init__(network_info, launcher, delayed_model_loading)
@@ -104,7 +133,7 @@ def __init__(self, network_info, launcher, models_args, is_blob, delayed_model_l
         self.config = config or {}
         self.templates_file = None
         self.parameters_file = None
-        self.templates = ["a photo of a {classname}"]
+        self.templates = ["a photo of a {c}"]
         self.parts = network_info.keys()
         if launcher:
             network_info = self.fill_part_with_model(network_info, self.parts,
@@ -117,9 +146,6 @@ def __init__(self, network_info, launcher, models_args, is_blob, delayed_model_l
     def create_pipeline(self, launcher, network_info):
         raise NotImplementedError("Subclasses should implement this method")
 
-    def get_logits(self, image_features, zeroshot_weights):
-        raise NotImplementedError("Subclasses should implement this method")
-
     def predict(self, identifiers, input_data, zeroshot_weights):
         preds = []
         for idx, image_data in zip(identifiers, input_data):
@@ -154,8 +180,17 @@ def get_pretrained_model_params(path):
             params['beta'] = open_clip_params['beta']
         return params
 
+    def get_logits(self, image_features, zeroshot_weights):
+        image_features = self.normalize(image_features, axis=-1)
+        logits = 100. * image_features @ zeroshot_weights
+        return logits
+
     def get_class_embeddings(self, texts, params):
-        raise NotImplementedError("Subclasses should implement this method")
+        class_embeddings = self.encode_text(texts)
+        class_embedding = self.normalize(class_embeddings, axis=-1)
+        class_embedding = np.mean(class_embedding, axis=0)
+        class_embedding /= np.linalg.norm(class_embedding, ord=2)
+        return class_embedding
 
     def zero_shot_classifier(self, data_source):
         classnames = read_json(os.path.join(data_source, self.classnames_file))
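
get_class_embeddings and get_logits, now implemented once in the base class, follow the usual CLIP zero-shot recipe: normalize each prompt embedding, average over templates, re-normalize, stack one column per class, then score image features against the stacked weights. A self-contained NumPy sketch with random vectors standing in for the real text and image encoders:

import numpy as np

rng = np.random.default_rng(0)
dim, n_templates, n_classes = 512, 3, 5

def normalize(x, axis=-1):
    return x / np.linalg.norm(x, ord=2, axis=axis, keepdims=True)

zeroshot_weights = []
for _ in range(n_classes):
    class_embeddings = rng.normal(size=(n_templates, dim))      # stands in for encode_text(texts)
    class_embedding = normalize(class_embeddings, axis=-1).mean(axis=0)
    class_embedding /= np.linalg.norm(class_embedding, ord=2)
    zeroshot_weights.append(class_embedding)
weights = np.stack(zeroshot_weights, axis=1)                     # (dim, n_classes)

image_features = normalize(rng.normal(size=(1, dim)), axis=-1)   # stands in for encode_image(...)
logits = 100. * image_features @ weights                         # (1, n_classes), as in get_logits()
print(logits.argmax(axis=-1))
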
@@ -176,7 +211,7 @@ def zero_shot_classifier(self, data_source):
             iterator = tqdm(classnames, mininterval=2)
 
         for classname in iterator:
-            texts = [template.format(classname=classname) for template in templates]
+            texts = [template.format(c=classname) for template in templates]
             class_embeddings = self.get_class_embeddings(texts, params)
             zeroshot_weights.append(class_embeddings)
         return np.stack(zeroshot_weights, axis=1)
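
The placeholder rename ({classname} to {c}) only works because the default template and this .format() call are changed together; custom templates supplied via templates_file presumably need to use {c} as well. With the new default template the expansion looks like this:

template = "a photo of a {c}"
print(template.format(c="golden retriever"))   # -> a photo of a golden retriever
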
@@ -244,11 +279,6 @@ def create_pipeline(self, launcher, network_info):
         self.parameters_file = self.config.get("pretrained_model_params", None)
         self.tokenizer = open_clip.get_tokenizer(orig_model_name)
 
-    def get_logits(self, image_features, zeroshot_weights):
-        image_features = self.normalize(image_features, axis=-1)
-        logits = 100. * image_features @ zeroshot_weights
-        return logits
-
     def encode_image(self, image_data):
         image = np.expand_dims(image_data, axis=0)
         features = self.image_encoder(image)
@@ -268,60 +298,134 @@ def encode_text(self, texts, params):
         x = x[np.arange(x.shape[0]), np.argmax(indices, axis=-1)] @ params['text_projection']
         return x
 
-    def get_class_embeddings(self, texts, params):
-        class_embeddings = self.encode_text(texts, params)
-        class_embedding = self.normalize(class_embeddings, axis=-1)
-        class_embedding = np.mean(class_embedding, axis=0)
-        class_embedding /= np.linalg.norm(class_embedding, ord=2)
-        return class_embedding
-
-
-class OpenVinoJinaClipModel(BaseOpenVinoClipModel):
-    def create_pipeline(self, launcher, network_info):
+class TransformersClipModel(BaseOpenVinoClipModel):
+    def setup_transformers_part(self, check_point):
         if isinstance(AutoTokenizer, UnsupportedPackage):
             AutoTokenizer.raise_error(self.__class__.__name__)
-        if isinstance(AutoModel, UnsupportedPackage):
-            AutoModel.raise_error(self.__class__.__name__)
+        if isinstance(AutoProcessor, UnsupportedPackage):
+            AutoProcessor.raise_error(self.__class__.__name__)
         if isinstance(torch, UnsupportedPackage):
             torch.raise_error(self.__class__.__name__)
 
-        orig_model_name = self.config.get("orig_model_name", "jinaai/jina-clip-v1")
+        self.processor = AutoProcessor.from_pretrained(check_point, trust_remote_code=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(check_point, trust_remote_code=True)
 
-        model = AutoModel.from_pretrained(orig_model_name, trust_remote_code=True)
-        if launcher:
-            self.load_models(network_info, launcher, True)
-            self.text_encoder = launcher.ie_core.compile_model(self.text_model, launcher.device)
-            self.vision_encoder = launcher.ie_core.compile_model(self.vision_model, launcher.device)
-        else:
-            self.text_encoder = model.text_model
-            self.vision_encoder = model.vision_model
+    def create_pipeline(self, launcher, network_info):
+        check_point = self.config.get("orig_model_name", "jinaai/jina-clip-v1")
+        self.classnames_file = self.config.get("classnames", "classnames.json")
+        self.setup_transformers_part(check_point)
+
+        self.model = AutoModel.from_pretrained(check_point, trust_remote_code=True)
+        self.model.eval()
+
+    def encode_text(self, texts):
+        texts = self.tokenizer(texts).to('cpu')  # tokenize
+        with torch.no_grad():
+            text_embeddings = self.model.get_text_features(**texts).detach().numpy()
+        return text_embeddings
+
+    def encode_image(self, image_data):
+        image = Image.fromarray(image_data)
+        inputs = self.processor(images=[image], return_tensors="pt")
+        with torch.no_grad():
+            image_embeddings = self.model.get_image_features(**inputs).detach().numpy()
 
-        self.templates = ["{classname}"]
+        return image_embeddings
+
+
+class OptimumIntelModel(TransformersClipModel):
+    def create_pipeline(self, launcher, network_info):
+        check_point = self.config.get("orig_model_name", "jinaai/jina-clip-v1")
         self.classnames_file = self.config.get("classnames", "classnames.json")
-        self.tokenizer = AutoTokenizer.from_pretrained(orig_model_name, trust_remote_code=True)
-        self.processor = model.get_preprocess()
+        self.setup_transformers_part(check_point)
+
+        self.load_models(network_info, launcher, True)
+        self.model = launcher.ie_core.compile_model(self.openvino_model_model, launcher.device)
+
+        image_array = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
+        image = Image.fromarray(image_array)
+        text_descriptions = ["This is a random noise image"]
+        self.inputs = self.processor(text=text_descriptions, images=[image], return_tensors="pt", padding=True)
+
+
+    def encode_text(self, texts):
+        texts = self.tokenizer(texts).to('cpu')
+        input_dict = {k: v.cpu() for k, v in texts.items()}
+        if "pixel_values" not in input_dict:
+            input_dict["pixel_values"] = self.inputs["pixel_values"]
+        text_embeddings = self.model(input_dict)[2]
+        return text_embeddings
+
+    def encode_image(self, image_data):
+        image = Image.fromarray(image_data)
+        inputs = self.processor(images=[image], return_tensors="pt").to("cpu")
+        input_dict = {k: v.squeeze(1).cpu() for k, v in inputs.items()}
+        if "input_ids" not in input_dict:
+            input_dict["input_ids"] = self.inputs["input_ids"]
+        if "attention_mask" not in input_dict:
+            input_dict["attention_mask"] = self.inputs["attention_mask"]
+
+        image_embeddings = self.model(input_dict)[3]
+        return image_embeddings
+
+
+class OpenvinoTextVisionModel(TransformersClipModel):
+    def create_pipeline(self, launcher, network_info):
+        check_point = self.config.get("orig_model_name", None)
+        self.classnames_file = self.config.get("classnames", "classnames.json")
+        self.setup_transformers_part(check_point)
+
+        self.load_models(network_info, launcher, True)
+        self.text_encoder = launcher.ie_core.compile_model(self.text_model, launcher.device)
+        self.vision_encoder = launcher.ie_core.compile_model(self.vision_model, launcher.device)
+
+    def encode_text(self, texts):
+        text_input = self.tokenizer(texts).to("cpu")
+        text_embeddings = self.text_encoder(text_input["input_ids"])
+        if isinstance(text_embeddings, torch.Tensor):
+            text_embeddings = text_embeddings.detach().numpy()
+        else:
+            text_embeddings = text_embeddings[0]
+        return text_embeddings
 
     def encode_image(self, image_data):
         image = Image.fromarray(image_data)
         vision_input = self.processor(images=[image], return_tensors="pt")
         image_embeddings = self.vision_encoder(vision_input["pixel_values"])
-
         if isinstance(image_embeddings, torch.Tensor):
             image_embeddings = image_embeddings.detach().numpy()
         else:
             image_embeddings = image_embeddings[0]
-
         return image_embeddings
 
-    def encode_text(self, text_input):
-        text_embeddings = self.text_encoder(text_input["input_ids"])
+
+class OpenVinoJinaClipModel(OpenvinoTextVisionModel):
+    def create_pipeline(self, launcher, network_info):
+        if isinstance(AutoModel, UnsupportedPackage):
+            AutoModel.raise_error(self.__class__.__name__)
+        check_point = self.config.get("orig_model_name", None)
+        self.classnames_file = self.config.get("classnames", "classnames.json")
+        self.setup_transformers_part(check_point)
+        self.tokenizer = AutoTokenizer.from_pretrained(check_point, trust_remote_code=True)
 
+        model = AutoModel.from_pretrained(check_point, trust_remote_code=True)
+        if launcher:
+            self.load_models(network_info, launcher, True)
+            self.text_encoder = launcher.ie_core.compile_model(self.text_model, launcher.device)
+            self.vision_encoder = launcher.ie_core.compile_model(self.vision_model, launcher.device)
+        else:
+            self.text_encoder = model.text_model
+            self.vision_encoder = model.vision_model
+
+    def encode_text(self, texts):
+        text_input = self.tokenizer(texts, return_tensors="pt", padding="max_length",
+                                    max_length=512, truncation=True).to("cpu")
+
+        text_embeddings = self.text_encoder(text_input["input_ids"])
         if isinstance(text_embeddings, torch.Tensor):
             text_embeddings = text_embeddings.detach().numpy()
         else:
             text_embeddings = text_embeddings[0]
         return text_embeddings
 
+
     def get_logits(self, image_features, zeroshot_weights):
         text_embeddings = np.squeeze(zeroshot_weights)
         similarity = []
@@ -336,6 +440,4 @@ def get_logits(self, image_features, zeroshot_weights):
         return logits
 
     def get_class_embeddings(self, texts, params):
-        text_input = self.tokenizer(texts, return_tensors="pt", padding="max_length",
-                                    max_length=512, truncation=True).to("cpu")
-        return self.encode_text(text_input)
+        return self.encode_text(texts)
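
For reference, the encode pattern that TransformersClipModel mirrors can be exercised directly with the transformers Auto classes. This is a sketch, not part of the commit: the checkpoint id is only an example (the diff's default is jinaai/jina-clip-v1, which needs trust_remote_code=True), the input image is a dummy array, and downloading the weights requires network access.

import numpy as np
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor, AutoTokenizer

model_id = "openai/clip-vit-base-patch32"   # example checkpoint, not taken from this commit
model = AutoModel.from_pretrained(model_id).eval()
processor = AutoProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

texts = ["a photo of a cat", "a photo of a dog"]
text_inputs = tokenizer(texts, padding=True, return_tensors="pt")
image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))   # dummy image
image_inputs = processor(images=[image], return_tensors="pt")

with torch.no_grad():
    text_emb = model.get_text_features(**text_inputs).numpy()      # e.g. (2, 512)
    image_emb = model.get_image_features(**image_inputs).numpy()   # e.g. (1, 512)
print(text_emb.shape, image_emb.shape)
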
