    open_clip = UnsupportedPackage('open_clip', clip_error.msg)

try:
-    from transformers import AutoModel, AutoTokenizer
+    from transformers import AutoModel, AutoTokenizer, AutoProcessor
except ImportError as transformers_error:
    AutoModel = UnsupportedPackage('AutoModel', transformers_error.msg)
    AutoTokenizer = UnsupportedPackage('AutoTokenizer', transformers_error.msg)
+    AutoProcessor = UnsupportedPackage('AutoProcessor', transformers_error.msg)

try:
    import torch
    import torch.nn.functional as F
except ImportError as torch_error:
    torch = UnsupportedPackage("torch", torch_error.msg)

+

class OpenVinoClipEvaluator(BaseCustomEvaluator):
    def __init__(self, dataset_config, launcher, model, orig_config):
        super().__init__(dataset_config, launcher, orig_config)
@@ -53,11 +55,25 @@ def __init__(self, dataset_config, launcher, model, orig_config):
    @classmethod
    def from_configs(cls, config, delayed_model_loading=False, orig_config=None):
        dataset_config, launcher, _ = cls.get_dataset_and_launcher_info(config)
-        model = OpenVinoClipVitModel(
-            config.get('network_info', {}), launcher, config.get('_models', []),
-            config.get('_model_is_blob'),
-            delayed_model_loading, config
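+        # Pick the model wrapper according to the layout of network_info: a single
+        # converted model, separate text/vision models, or the default ViT pipeline.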
+        model_classes = {
+            'openvino_model': OptimumIntelModel,
+            'text_vision': OpenvinoTextVisionModel,
+            'default': OpenVinoClipVitModel
+        }
+
+        network_info = config.get('network_info', {})
+        if 'openvino_model' in network_info.keys():
+            model_class = model_classes['openvino_model']
+        elif 'text' in network_info.keys() and 'vision' in network_info.keys():
+            model_class = model_classes['text_vision']
+        else:
+            model_class = model_classes['default']
+
+        model = model_class(
+            network_info, launcher, config.get('_models', []),
+            config.get('_model_is_blob'), delayed_model_loading, config
        )
+
        return cls(dataset_config, launcher, model, orig_config)

    def _process(self, output_callback, calculate_metrics, progress_reporter, metric_config, csv_file):
@@ -78,7 +94,6 @@ def _process(self, output_callback, calculate_metrics, progress_reporter, metric
                element_identifiers=batch_identifiers, dataset_indices=batch_input_ids)
            self._update_progress(progress_reporter, metric_config, batch_id, len(batch_prediction), csv_file)

-
class OpenVinoJinaClipEvaluator(OpenVinoClipEvaluator):
    @classmethod
    def from_configs(cls, config, delayed_model_loading=False, orig_config=None):
@@ -96,6 +111,20 @@ def from_configs(cls, config, delayed_model_loading=False, orig_config=None):
        return cls(dataset_config, launcher, model, orig_config)


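+# Reference evaluator that runs the original checkpoint through HuggingFace Transformers
+# on CPU; no OpenVINO launcher is created.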
+class TransformersClipEvaluator(OpenVinoClipEvaluator):
+    @classmethod
+    def from_configs(cls, config, delayed_model_loading=False, orig_config=None):
+        dataset_config, launcher = config["datasets"], None
+        delayed_model_loading = False
+
+        model = TransformersClipModel(
+            config.get('network_info', {}), launcher, config.get('_models', []),
+            config.get('_model_is_blob'),
+            delayed_model_loading, config
+        )
+        return cls(dataset_config, launcher, model, orig_config)
+
+
class BaseOpenVinoClipModel(BaseCascadeModel):
    def __init__(self, network_info, launcher, models_args, is_blob, delayed_model_loading=False, config=None):
        super().__init__(network_info, launcher, delayed_model_loading)
@@ -104,7 +133,7 @@ def __init__(self, network_info, launcher, models_args, is_blob, delayed_model_l
        self.config = config or {}
        self.templates_file = None
        self.parameters_file = None
-        self.templates = ["a photo of a {classname}"]
+        self.templates = ["a photo of a {c}"]
        self.parts = network_info.keys()
        if launcher:
            network_info = self.fill_part_with_model(network_info, self.parts,
@@ -117,9 +146,6 @@ def __init__(self, network_info, launcher, models_args, is_blob, delayed_model_l
    def create_pipeline(self, launcher, network_info):
        raise NotImplementedError("Subclasses should implement this method")

-    def get_logits(self, image_features, zeroshot_weights):
-        raise NotImplementedError("Subclasses should implement this method")
-
    def predict(self, identifiers, input_data, zeroshot_weights):
        preds = []
        for idx, image_data in zip(identifiers, input_data):
@@ -154,8 +180,17 @@ def get_pretrained_model_params(path):
        params['beta'] = open_clip_params['beta']
        return params

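+    # Cosine-similarity logits between image features and the zero-shot class weights,
+    # scaled by the standard CLIP factor of 100.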
+    def get_logits(self, image_features, zeroshot_weights):
+        image_features = self.normalize(image_features, axis=-1)
+        logits = 100. * image_features @ zeroshot_weights
+        return logits
+
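+    # Prompt-ensemble class embedding: encode every template for the class, normalize,
+    # average, and re-normalize to a single unit-length vector.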
    def get_class_embeddings(self, texts, params):
-        raise NotImplementedError("Subclasses should implement this method")
+        class_embeddings = self.encode_text(texts)
+        class_embedding = self.normalize(class_embeddings, axis=-1)
+        class_embedding = np.mean(class_embedding, axis=0)
+        class_embedding /= np.linalg.norm(class_embedding, ord=2)
+        return class_embedding

    def zero_shot_classifier(self, data_source):
        classnames = read_json(os.path.join(data_source, self.classnames_file))
@@ -176,7 +211,7 @@ def zero_shot_classifier(self, data_source):
        iterator = tqdm(classnames, mininterval=2)

        for classname in iterator:
-            texts = [template.format(classname=classname) for template in templates]
+            texts = [template.format(c=classname) for template in templates]
            class_embeddings = self.get_class_embeddings(texts, params)
            zeroshot_weights.append(class_embeddings)
        return np.stack(zeroshot_weights, axis=1)
@@ -244,11 +279,6 @@ def create_pipeline(self, launcher, network_info):
        self.parameters_file = self.config.get("pretrained_model_params", None)
        self.tokenizer = open_clip.get_tokenizer(orig_model_name)

-    def get_logits(self, image_features, zeroshot_weights):
-        image_features = self.normalize(image_features, axis=-1)
-        logits = 100. * image_features @ zeroshot_weights
-        return logits
-
    def encode_image(self, image_data):
        image = np.expand_dims(image_data, axis=0)
        features = self.image_encoder(image)
@@ -268,60 +298,134 @@ def encode_text(self, texts, params):
        x = x[np.arange(x.shape[0]), np.argmax(indices, axis=-1)] @ params['text_projection']
        return x

-    def get_class_embeddings(self, texts, params):
-        class_embeddings = self.encode_text(texts, params)
-        class_embedding = self.normalize(class_embeddings, axis=-1)
-        class_embedding = np.mean(class_embedding, axis=0)
-        class_embedding /= np.linalg.norm(class_embedding, ord=2)
-        return class_embedding
-
-
-class OpenVinoJinaClipModel(BaseOpenVinoClipModel):
-    def create_pipeline(self, launcher, network_info):
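+# Baseline backend: the checkpoint is loaded with transformers' AutoModel and run in
+# PyTorch; subclasses reuse setup_transformers_part() and swap in OpenVINO-compiled encoders.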
+class TransformersClipModel(BaseOpenVinoClipModel):
+    def setup_transformers_part(self, check_point):
        if isinstance(AutoTokenizer, UnsupportedPackage):
            AutoTokenizer.raise_error(self.__class__.__name__)
-        if isinstance(AutoModel, UnsupportedPackage):
-            AutoModel.raise_error(self.__class__.__name__)
+        if isinstance(AutoProcessor, UnsupportedPackage):
+            AutoProcessor.raise_error(self.__class__.__name__)
        if isinstance(torch, UnsupportedPackage):
            torch.raise_error(self.__class__.__name__)

-        orig_model_name = self.config.get("orig_model_name", "jinaai/jina-clip-v1")
+        self.processor = AutoProcessor.from_pretrained(check_point, trust_remote_code=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(check_point, trust_remote_code=True)

-        model = AutoModel.from_pretrained(orig_model_name, trust_remote_code=True)
-        if launcher:
-            self.load_models(network_info, launcher, True)
-            self.text_encoder = launcher.ie_core.compile_model(self.text_model, launcher.device)
-            self.vision_encoder = launcher.ie_core.compile_model(self.vision_model, launcher.device)
-        else:
-            self.text_encoder = model.text_model
-            self.vision_encoder = model.vision_model
+    def create_pipeline(self, launcher, network_info):
+        check_point = self.config.get("orig_model_name", "jinaai/jina-clip-v1")
+        self.classnames_file = self.config.get("classnames", "classnames.json")
+        self.setup_transformers_part(check_point)
+
+        self.model = AutoModel.from_pretrained(check_point, trust_remote_code=True)
+        self.model.eval()
+
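+    # Text/image features come straight from the HF model's get_text_features /
+    # get_image_features and are detached to NumPy for the evaluator.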
+    def encode_text(self, texts):
+        texts = self.tokenizer(texts).to('cpu')  # tokenize
+        with torch.no_grad():
+            text_embeddings = self.model.get_text_features(**texts).detach().numpy()
+        return text_embeddings
+
+    def encode_image(self, image_data):
+        image = Image.fromarray(image_data)
+        inputs = self.processor(images=[image], return_tensors="pt")
+        with torch.no_grad():
+            image_embeddings = self.model.get_image_features(**inputs).detach().numpy()

-        self.templates = ["{classname}"]
+        return image_embeddings
+
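+# Handles the 'openvino_model' case: one compiled OpenVINO graph (e.g. an Optimum-Intel
+# export) serves both encoders, while preprocessing still uses the HF processor/tokenizer.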
+class OptimumIntelModel(TransformersClipModel):
+    def create_pipeline(self, launcher, network_info):
+        check_point = self.config.get("orig_model_name", "jinaai/jina-clip-v1")
        self.classnames_file = self.config.get("classnames", "classnames.json")
-        self.tokenizer = AutoTokenizer.from_pretrained(orig_model_name, trust_remote_code=True)
-        self.processor = model.get_preprocess()
+        self.setup_transformers_part(check_point)
+
+        self.load_models(network_info, launcher, True)
+        self.model = launcher.ie_core.compile_model(self.openvino_model_model, launcher.device)
+
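+        # The single graph expects both text and image inputs, so cache placeholder
+        # inputs built from a dummy image/caption and reuse them when only one
+        # modality is being encoded.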
+        image_array = np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8)
+        image = Image.fromarray(image_array)
+        text_descriptions = ["This is a random noise image"]
+        self.inputs = self.processor(text=text_descriptions, images=[image], return_tensors="pt", padding=True)
+
+
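+    # Missing image inputs are filled from the cached placeholders; output index 2 is
+    # assumed to be the text embedding of the compiled graph.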
+    def encode_text(self, texts):
+        texts = self.tokenizer(texts).to('cpu')
+        input_dict = {k: v.cpu() for k, v in texts.items()}
+        if "pixel_values" not in input_dict:
+            input_dict["pixel_values"] = self.inputs["pixel_values"]
+        text_embeddings = self.model(input_dict)[2]
+        return text_embeddings
+
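+    # Same placeholder trick for the text side; output index 3 is assumed to be the
+    # image embedding.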
+    def encode_image(self, image_data):
+        image = Image.fromarray(image_data)
+        inputs = self.processor(images=[image], return_tensors="pt").to("cpu")
+        input_dict = {k: v.squeeze(1).cpu() for k, v in inputs.items()}
+        if "input_ids" not in input_dict:
+            input_dict["input_ids"] = self.inputs["input_ids"]
+        if "attention_mask" not in input_dict:
+            input_dict["attention_mask"] = self.inputs["attention_mask"]
+
+        image_embeddings = self.model(input_dict)[3]
+        return image_embeddings
+
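+# Handles the separate 'text'/'vision' case: two OpenVINO IRs compiled through the
+# launcher, with tokenization and preprocessing from the HF processor/tokenizer.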
+class OpenvinoTextVisionModel(TransformersClipModel):
+    def create_pipeline(self, launcher, network_info):
+        check_point = self.config.get("orig_model_name", None)
+        self.classnames_file = self.config.get("classnames", "classnames.json")
+        self.setup_transformers_part(check_point)
+
+        self.load_models(network_info, launcher, True)
+        self.text_encoder = launcher.ie_core.compile_model(self.text_model, launcher.device)
+        self.vision_encoder = launcher.ie_core.compile_model(self.vision_model, launcher.device)
+
+    def encode_text(self, texts):
+        text_input = self.tokenizer(texts).to("cpu")
+        text_embeddings = self.text_encoder(text_input["input_ids"])
+        if isinstance(text_embeddings, torch.Tensor):
+            text_embeddings = text_embeddings.detach().numpy()
+        else:
+            text_embeddings = text_embeddings[0]
+        return text_embeddings

    def encode_image(self, image_data):
        image = Image.fromarray(image_data)
        vision_input = self.processor(images=[image], return_tensors="pt")
        image_embeddings = self.vision_encoder(vision_input["pixel_values"])
-
        if isinstance(image_embeddings, torch.Tensor):
            image_embeddings = image_embeddings.detach().numpy()
        else:
            image_embeddings = image_embeddings[0]
-
        return image_embeddings

-    def encode_text(self, text_input):
-        text_embeddings = self.text_encoder(text_input["input_ids"])
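+# Jina CLIP variant: with a launcher the exported text/vision IRs are compiled,
+# otherwise it falls back to the torch submodules of the original HF model.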
+class OpenVinoJinaClipModel(OpenvinoTextVisionModel):
+    def create_pipeline(self, launcher, network_info):
+        if isinstance(AutoModel, UnsupportedPackage):
+            AutoModel.raise_error(self.__class__.__name__)
+        check_point = self.config.get("orig_model_name", None)
+        self.classnames_file = self.config.get("classnames", "classnames.json")
+        self.setup_transformers_part(check_point)
+        self.tokenizer = AutoTokenizer.from_pretrained(check_point, trust_remote_code=True)

+        model = AutoModel.from_pretrained(check_point, trust_remote_code=True)
+        if launcher:
+            self.load_models(network_info, launcher, True)
+            self.text_encoder = launcher.ie_core.compile_model(self.text_model, launcher.device)
+            self.vision_encoder = launcher.ie_core.compile_model(self.vision_model, launcher.device)
+        else:
+            self.text_encoder = model.text_model
+            self.vision_encoder = model.vision_model
+
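+    # Tokenize with fixed 512-token padding/truncation before running the text encoder.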
+    def encode_text(self, texts):
+        text_input = self.tokenizer(texts, return_tensors="pt", padding="max_length",
+                                     max_length=512, truncation=True).to("cpu")
+
+        text_embeddings = self.text_encoder(text_input["input_ids"])
        if isinstance(text_embeddings, torch.Tensor):
            text_embeddings = text_embeddings.detach().numpy()
        else:
            text_embeddings = text_embeddings[0]
        return text_embeddings

+
    def get_logits(self, image_features, zeroshot_weights):
        text_embeddings = np.squeeze(zeroshot_weights)
        similarity = []
@@ -336,6 +440,4 @@ def get_logits(self, image_features, zeroshot_weights):
        return logits

    def get_class_embeddings(self, texts, params):
-        text_input = self.tokenizer(texts, return_tensors="pt", padding="max_length",
-                                     max_length=512, truncation=True).to("cpu")
-        return self.encode_text(text_input)
+        return self.encode_text(texts)