

 class GloVeModel:
+    """Class to interact with GloVe embeddings.
+
+    Attributes
+    ----------
+    binarized_vectors : dict[str, list[str]]
+        Dict of word: binarized_vector pairs.
+    vec_file : str
+        Path to GloVe embeddings file.
+    vectors : dict[str, np.ndarray]
+        Dict of word: vector pairs.
+    """
+
     def __init__(self, vec_file: str):
         self.vec_file = vec_file
         self._load_vectors_from_file(vec_file)
+        self._binarize_vectors()
 
     def __repr__(self) -> str:
         return f"GloVeModel(vec_file={self.vec_file})"
@@ -51,7 +64,7 @@ def _load_vectors_from_file(self, vec_file: str) -> None:
         """Load vectors from gzipped txt file in word2vec format.
 
         The first line of the file contains the header which is the vocabulary size
-        (i.e. number of vectors) and the dimenisions of the vectors.
+        (i.e. number of vectors) and the dimensions of the vectors.
 
         All remaining rows contain the token followed by the numeric elements of the
         vector, separated by a space
@@ -61,7 +74,7 @@ def _load_vectors_from_file(self, vec_file: str) -> None:
         vec_file : str
             File to load vectors from.
         """
-        vectors = {}
+        self.vectors = {}
         with as_file(files(__package__) / vec_file) as p:
             with gzip.open(p, "rt") as f:
                 # Read first line as header
@@ -73,6 +86,41 @@ def _load_vectors_from_file(self, vec_file: str) -> None:
                     parts = line.rstrip().split()
                     token = parts[0]
                     vector = np.array([float(v) for v in parts[1:]], dtype=np.float32)
-                    vectors[token] = vector
+                    self.vectors[token] = vector
+
+    def _binarize_vectors(self) -> None:
+        """Binarize word vectors by converting continuous values into discrete values.
+
+        For each word vector, calculate the average value of the positive elements and
+        the negative elements. Replace each element of each word vector according to:
+        if value < negative_average:
+            "VNEG"
+        elif value > positive_average:
+            "VPOS"
+        else:
+            "V0"
 
-        self.vectors = vectors
+        The resulting word vectors are stored in the binarized_vectors attribute.
+
+        References
+        ----------
+        J. Guo, W. Che, H. Wang, and T. Liu, ‘Revisiting Embedding Features for Simple
+        Semi-supervised Learning’, in Proceedings of the 2014 Conference on Empirical
+        Methods in Natural Language Processing (EMNLP), Doha, Qatar: Association for
+        Computational Linguistics, 2014, pp. 110–120. doi: 10.3115/v1/D14-1012.
+        """
+        self.binarized_vectors = {}
+        for word, vec in self.vectors.items():
+            positive_avg = np.mean(vec[vec > 0])
+            negative_avg = np.mean(vec[vec < 0])
+
+            binarized_vec = []
+            for value in vec:
+                if value < negative_avg:
+                    binarized_vec.append("VNEG")
+                elif value > positive_avg:
+                    binarized_vec.append("VPOS")
+                else:
+                    binarized_vec.append("V0")
+
+            self.binarized_vectors[word] = binarized_vec
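For reference, here is a minimal, self-contained sketch of the word2vec-style text file that _load_vectors_from_file expects: a header line with the vocabulary size and vector dimensions, followed by one token-plus-values row per word. The filename and the header-parsing step are illustrative assumptions, since the diff above elides the loader's actual header-handling lines.

import gzip

import numpy as np

# Illustrative only: build a tiny gzipped file in the format described in the
# _load_vectors_from_file docstring (header = "vocab_size dimensions", then
# "token v1 v2 ... vN" rows). The filename is arbitrary.
with gzip.open("toy_vectors.txt.gz", "wt") as f:
    f.write("2 3\n")
    f.write("coffee 0.9 0.1 -0.8\n")
    f.write("tea 0.2 -0.4 0.6\n")

# Parse it the same way the loader above parses each row; the header handling
# here is an assumption, since those lines are not shown in the diff.
with gzip.open("toy_vectors.txt.gz", "rt") as f:
    vocab_size, dims = (int(x) for x in f.readline().split())
    vectors = {}
    for line in f:
        parts = line.rstrip().split()
        vectors[parts[0]] = np.array([float(v) for v in parts[1:]], dtype=np.float32)

print(vocab_size, dims)    # 2 3
print(vectors["coffee"])   # [ 0.9  0.1 -0.8]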
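And a runnable toy illustration of the binarization rule that _binarize_vectors applies: per-vector thresholds at the mean of the positive elements and the mean of the negative elements, following Guo et al. (2014). The values below are made up purely to show the labelling.

import numpy as np

vec = np.array([0.9, 0.1, -0.05, -0.8], dtype=np.float32)

positive_avg = np.mean(vec[vec > 0])   # (0.9 + 0.1) / 2 = 0.5
negative_avg = np.mean(vec[vec < 0])   # (-0.05 + -0.8) / 2 = -0.425

binarized = [
    "VNEG" if v < negative_avg else "VPOS" if v > positive_avg else "V0"
    for v in vec
]
print(binarized)  # ['VPOS', 'V0', 'V0', 'VNEG']

In the class above the same labels end up in model.binarized_vectors, keyed by token, so a word's discrete features can be looked up alongside its continuous vector in model.vectors.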