@@ -168,18 +168,23 @@ def cache_linear_coefs(self, X: ArrayLike, model=None, tokenizer_embeddings=None
            print('\t Nothing to update!')
            return

-        # compute embeddings
-        """
-        # Faster version that needs more memory
-        tokens = tokenizer(ngrams_list, padding=args.padding,
-                           truncation=True, return_tensors="pt")
-        tokens = tokens.to(device)
+        embs = self._get_embs(ngrams_list, model, tokenizer_embeddings)
+        if self.normalize_embs:
+            embs = self.normalizer.transform(embs)

-        output = model(**tokens) # this takes a while....
-        embs = output['pooler_output'].cpu().detach().numpy()
-        return embs
+        # save coefs
+        coef_embs = self.linear.coef_.squeeze().transpose()
+        linear_coef = embs @ coef_embs
+        self.coefs_dict_ = {
+            **coefs_dict_old,
+            **{ngrams_list[i]: linear_coef[i]
+               for i in range(len(ngrams_list))}
+        }
+        print('coefs_dict_ len', len(self.coefs_dict_))
+
+    def _get_embs(self, ngrams_list, model, tokenizer_embeddings):
+        """Get embeddings for a list of ngrams (not summed!)
        """
-        # Slower way to run things but won't run out of mem
        embs = []
        for i in tqdm(range(len(ngrams_list))):
            tokens = tokenizer_embeddings(
@@ -191,18 +196,19 @@ def cache_linear_coefs(self, X: ArrayLike, model=None, tokenizer_embeddings=None
            emb = emb.mean(axis=1)
            embs.append(emb)
        embs = np.array(embs).squeeze()
-        if self.normalize_embs:
-            embs = self.normalizer.transform(embs)
+        return embs

-        # save coefs
-        coef_embs = self.linear.coef_.squeeze().transpose()
-        linear_coef = embs @ coef_embs
-        self.coefs_dict_ = {
-            **coefs_dict_old,
-            **{ngrams_list[i]: linear_coef[i]
-               for i in range(len(ngrams_list))}
-        }
-        print('coefs_dict_ len', len(self.coefs_dict_))
+        """
+        # Faster version that needs more memory
+        tokens = tokenizer(ngrams_list, padding=args.padding,
+                           truncation=True, return_tensors="pt")
+        tokens = tokens.to(device)
+
+        output = model(**tokens) # this takes a while....
+        embs = output['pooler_output'].cpu().detach().numpy()
+        return embs
+        """
+

    def _get_ngrams_list(self, X):
        all_ngrams = set()
@@ -251,7 +257,7 @@ def _predict_cached(self, X, warn):
        n_unseen_ngrams = 0
        for x in X:
            pred = 0
-            seqs = imodelsx.embgam.embed.generate_ngrams_list(
+            seqs = imodelsx.embgam.embed.generate_ngrams_list(
                x,
                ngrams=self.ngrams,
                tokenizer_ngrams=self.tokenizer_ngrams,
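For context, the caching path in the first two hunks embeds each ngram, mean-pools over tokens, and projects the result onto the fitted linear model's coefficients. Below is a minimal standalone sketch of that computation, not the imodelsx API: the function name cache_ngram_coefs, the bert-base-uncased checkpoint, and the use of last_hidden_state are illustrative assumptions.

import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer

def cache_ngram_coefs(ngrams_list, linear_coef, checkpoint="bert-base-uncased"):
    """Hypothetical helper: one cached coefficient per ngram (embedding @ linear coef)."""
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)
    embs = []
    with torch.no_grad():
        for ngram in ngrams_list:
            tokens = tokenizer(ngram, return_tensors="pt")
            out = model(**tokens)
            # mean-pool token embeddings, mirroring emb.mean(axis=1) in _get_embs above
            embs.append(out.last_hidden_state.mean(dim=1).squeeze(0).numpy())
    embs = np.array(embs)           # shape (n_ngrams, hidden_dim)
    coefs = embs @ linear_coef      # linear_coef plays the role of self.linear.coef_.squeeze().transpose()
    return {ngram: coefs[i] for i, ngram in enumerate(ngrams_list)}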
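The prediction path touched in the last hunk then sums those cached coefficients over the ngrams generated from each input. Again a sketch under stated assumptions: generate_ngrams and intercept stand in for the library's ngram generator and fitted intercept.

import numpy as np

def predict_cached(texts, coefs_dict, generate_ngrams, intercept=0.0):
    """Hypothetical sketch of _predict_cached: sum cached per-ngram contributions."""
    preds = []
    for x in texts:
        pred = 0.0
        for seq in generate_ngrams(x):
            # ngrams missing from the cache contribute nothing (counted via n_unseen_ngrams above)
            pred += coefs_dict.get(seq, 0.0)
        preds.append(pred + intercept)
    return np.array(preds)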