@@ -7,16 +7,13 @@ class Aligner:
7
7
8
8
def __init__(self):
    """Create the helper objects the alignment steps rely on."""
    # Text front end: the methods below call its parser(),
    # combine_lemmaAndPosTags() and get_ner().
    self.text_nor = Text_processing()
    # Shared list helpers: get_commonNeighboringWords() and isSublist().
    self.util = Util()
11
11
def align_sentences (self ,sentence1 ,sentence2 ):
12
12
13
13
sentence1ParseResult = self .text_nor .parser (sentence1 )
14
- # print "sentence1 parse ", sentence1ParseResult
15
- # print ""
16
14
sentence2ParseResult = self .text_nor .parser (sentence2 )
17
15
18
16
sentence1LemmasAndPosTags = self .text_nor .combine_lemmaAndPosTags (sentence1ParseResult )
19
- # print "sentce1 ", sentence1LemmasAndPosTags
20
17
sentence2LemmasAndPosTags = self .text_nor .combine_lemmaAndPosTags (sentence2ParseResult )
21
18
22
19
self .sourceWordIndices = [i + 1 for i in xrange (len (sentence1LemmasAndPosTags ))]
@@ -31,24 +28,29 @@ def align_sentences(self,sentence1,sentence2):
31
28
self .sourcePosTags = [item [4 ] for item in sentence1LemmasAndPosTags ]
32
29
self .targetPosTags = [item [4 ] for item in sentence2LemmasAndPosTags ]
33
30
34
- myWordAlignments = self .alignWords (sentence1LemmasAndPosTags , sentence2LemmasAndPosTags , \
35
- sentence1ParseResult , sentence2ParseResult )
31
+
32
+ myWordAlignments = self .alignWords (sentence1LemmasAndPosTags , sentence2LemmasAndPosTags ,
33
+ sentence1ParseResult , sentence2ParseResult )
36
34
35
+
37
36
align = []
38
37
for i in myWordAlignments :
39
38
align .append ([self .sourceWords [i [0 ]- 1 ], self .targetWords [i [1 ]- 1 ] ])
40
- # print "align words ", align
39
+ print "align words " , align
41
40
42
41
return align
43
42
43
+
44
44
'''
45
45
sourceSent and targetSent are lists of:
46
46
[[character begin offset, character end offset], word index, word, lemma, pos tag]
47
47
sourceParseResult and targetParseResult are lists of:
48
48
Parse Tree(Constituency tree), Text, Dependencies, words(NE, CharacOffsetEn, CharOffsetBeg,
49
49
POS, Lemma)
50
50
1. Align the punctuations first
51
- 2. Align named entities
51
+ 2. Align common NeighboringWords (at least a bigram)
52
+ 3. Align Hyphenated Words
53
+ 4. Align named entities
52
54
'''
53
55
54
56
@@ -59,14 +61,30 @@ def alignWords(self,sourceSent, targetSent, sourceParseResult, targetParseResult
59
61
srcWordAlreadyAligned = [] #sourceWordAlreadyAligned
60
62
tarWordAlreadyAligned = [] #TargetWordAlreadyAligned
61
63
62
- # align the punctuations
64
+ # 1. align the punctuations
65
+ alignments , srcWordAlreadyAligned , tarWordAlreadyAligned = self .align_punctuations (self .sourceWords ,
66
+ self .targetWords , alignments , srcWordAlreadyAligned , tarWordAlreadyAligned ,sourceSent ,targetSent )
67
+
68
+ #2. align commonNeighboringWords (atleast bigram, or more)
63
69
alignments , srcWordAlreadyAligned , tarWordAlreadyAligned = \
64
- self .align_punctuations (self .sourceWords ,self .targetWords , \
65
- alignments , srcWordAlreadyAligned , tarWordAlreadyAligned ,sourceSent ,targetSent )
66
- # align named entities
67
- neAlignments = self .align_namedEntities (sourceSent , targetSent , \
68
- sourceParseResult , targetParseResult , alignments , srcWordAlreadyAligned , tarWordAlreadyAligned )
69
-
70
+ self .align_commonNeighboringWords (self .sourceWords , self .targetWords , \
71
+ srcWordAlreadyAligned , tarWordAlreadyAligned , alignments )
72
+ #3. align Hyphenated words
73
+ checkSourceWordsInTarget = True # check if Source Words have any hyphen words
74
+ alignments , srcWordAlreadyAligned , tarWordAlreadyAligned = \
75
+ self .align_hyphenWords (self .sourceWordIndices , self .sourceWords ,\
76
+ srcWordAlreadyAligned , alignments ,\
77
+ tarWordAlreadyAligned , checkSourceWordsInTarget )
78
+
79
+ checkSourceWordsInTarget = False # check if target Words have any hyphen words
80
+ alignments , srcWordAlreadyAligned , tarWordAlreadyAligned = \
81
+ self .align_hyphenWords (self .targetWordIndices , self .targetWords , tarWordAlreadyAligned , alignments , \
82
+ tarWordAlreadyAligned ,checkSourceWordsInTarget )
83
+
84
+ #4. align named entities
85
+ neAlignments = self .align_namedEntities (sourceSent , targetSent , sourceParseResult , \
86
+ targetParseResult , alignments , srcWordAlreadyAligned , tarWordAlreadyAligned )
87
+
70
88
for item in neAlignments :
71
89
if item not in alignments :
72
90
alignments .append (item )
@@ -84,20 +102,22 @@ def alignWords(self,sourceSent, targetSent, sourceParseResult, targetParseResult
84
102
'''
85
103
86
104
87
- def align_punctuations (self ,sourceWords , targetWords , alignments , \
105
+ def align_punctuations (self ,sourceWords , targetWords , alignments ,
88
106
srcWordAlreadyAligned , tarWordAlreadyAligned , sourceSent , targetSent ):
89
107
90
108
global punctuations
91
109
92
110
# if last word of source sentence is . or ! and last of target sent is . or ! or both are equal
93
- if (sourceWords [len (sourceSent )- 1 ] in ['.' ,'!' ] and targetWords [len (targetSent )- 1 ]\
94
- in ['.' ,'!' ]) or (sourceWords [len (sourceSent )- 1 ]== targetWords [len (targetSent )- 1 ]):
111
+ if (sourceWords [len (sourceSent )- 1 ] in ['.' ,'!' ] and targetWords [len (targetSent )- 1 ] \
112
+ in ['.' ,'!' ]) or (sourceWords [len (sourceSent )- 1 ]== targetWords [len (targetSent )- 1 ]):
95
113
96
114
alignments .append ([len (sourceSent ), len (targetSent )])
97
115
srcWordAlreadyAligned .append (len (sourceSent ))
98
116
tarWordAlreadyAligned .append (len (targetSent ))
99
117
# or if second last of source sent. is . or ! and last word of target sent is . or ! then append too
100
- elif (sourceWords [len (sourceSent )- 2 ] in ['.' , '!' ] and targetWords [len (targetSent )- 1 ] in ['.' , '!' ]):
118
+ elif (sourceWords [len (sourceSent )- 2 ] in ['.' , '!' ] and \
119
+ targetWords [len (targetSent )- 1 ] in ['.' , '!' ]):
120
+
101
121
alignments .append ([len (sourceSent ), len (targetSent )])
102
122
srcWordAlreadyAligned .append (len (sourceSent ))
103
123
tarWordAlreadyAligned .append (len (targetSent ))
@@ -115,6 +135,92 @@ def align_punctuations(self,sourceWords, targetWords, alignments, \
115
135
return alignments , srcWordAlreadyAligned , tarWordAlreadyAligned
116
136
117
137
138
+ '''
139
+ Input: sourceWords, targetWords
140
+ Returns: aligned words that are bigram, trigram,..(not unigram)
141
+ of content words
142
+ '''
143
+
144
+
145
def align_commonNeighboringWords(self, sourceWords, targetWords, srcWordAlreadyAligned,
                                 tarWordAlreadyAligned, alignments):
    """Align runs of two or more neighbouring words shared by both sentences.

    A shared run is aligned position by position unless it is a unigram or
    consists solely of stop words / punctuation.

    Parameters:
        sourceWords, targetWords: word lists of the two sentences.
        srcWordAlreadyAligned, tarWordAlreadyAligned: 1-based indices that
            are already aligned; extended in place.
        alignments: list of [sourceIndex, targetIndex] pairs (1-based);
            extended in place.

    Returns:
        (alignments, srcWordAlreadyAligned, tarWordAlreadyAligned) -- the
        same list objects that were passed in.

    Relies on the module-level ``stopwords`` and ``punctuations``
    collections.
    """
    # NOTE(review): each entry is used as [sourcePositions, targetPositions]
    # with 0-based positions -- inferred from the indexing below; confirm
    # against Util.get_commonNeighboringWords.
    commonNeighboringWords = self.util.get_commonNeighboringWords(sourceWords, targetWords)

    def is_content_word(word):
        # Test the raw and the lower-cased form so a capitalised stop word
        # ("The") is still recognised as one.
        for form in (word, word.lower()):
            if form in stopwords or form in punctuations:
                return False
        return True

    for commonWords in commonNeighboringWords:
        srcPositions, tarPositions = commonWords[0], commonWords[1]

        # Single shared words are too ambiguous to align at this stage.
        if len(srcPositions) < 2:
            continue

        # Look the words up through their positions: the entry itself holds
        # index lists, so testing it directly against the stop-word list
        # never matches and would let pure stop-word runs through.
        if not any(is_content_word(sourceWords[pos]) for pos in srcPositions):
            continue

        for srcPos, tarPos in zip(srcPositions, tarPositions):
            srcIndex, tarIndex = srcPos + 1, tarPos + 1  # switch to 1-based
            if srcIndex in srcWordAlreadyAligned or tarIndex in tarWordAlreadyAligned:
                continue
            if [srcIndex, tarIndex] in alignments:
                continue
            alignments.append([srcIndex, tarIndex])
            srcWordAlreadyAligned.append(srcIndex)
            tarWordAlreadyAligned.append(tarIndex)

    return alignments, srcWordAlreadyAligned, tarWordAlreadyAligned
173
+
174
+
175
+ '''
176
+ Input: wordIndices(srcWordIndices/tarWordIndices) depends upon whether we check sourceWords
177
+ in targetWords, or other way
178
+ Words(sourceWords/targetWords), the word list that wordIndices refers to
179
+ srcWordAlreadyAligned, alignments, tarWordAlreadyAligned,
180
+ flag: true, then we check sourceWords in targetWords,
181
+ else we check targetWords in sourceWords
182
+ Returns: aligned hyphen Words(alignments, srcWordAlreadyAligned, tarWordAlreadyAligned)
183
+ '''
184
+
185
+
186
def align_hyphenWords(self, wordIndices, Words, srcWordAlreadyAligned, alignments,
                      tarWordAlreadyAligned, flag):
    """Align a hyphenated word with the run of words it spells out opposite.

    Each not-yet-aligned word in ``Words`` that contains a hyphen (but is
    not a bare '-') is split on '-' and its parts are matched against the
    other sentence. When at least two parts match, the hyphenated word is
    aligned with every matched word on the other side.

    Parameters:
        wordIndices: 1-based indices into ``Words``.
        Words: word list of the sentence being scanned for hyphenated words.
        srcWordAlreadyAligned, tarWordAlreadyAligned: 1-based indices that
            are already aligned; extended in place.
        alignments: list of [sourceIndex, targetIndex] pairs; extended in
            place.
        flag: True when ``Words`` is the source sentence (matched against
            self.targetWords); False when it is the target sentence
            (matched against self.sourceWords).

    Returns:
        (alignments, srcWordAlreadyAligned, tarWordAlreadyAligned) -- the
        same list objects that were passed in.
    """
    if flag:
        scannedAlreadyAligned = srcWordAlreadyAligned
        otherWords = self.targetWords
    else:
        scannedAlreadyAligned = tarWordAlreadyAligned
        otherWords = self.sourceWords

    for i in wordIndices:
        if i in scannedAlreadyAligned:
            continue

        word = Words[i - 1]
        if '-' not in word or word == '-':
            continue

        tokens = word.split('-')
        # NOTE(review): each entry is used as [tokenPositions, wordPositions]
        # with 0-based positions -- inferred from the indexing below; confirm
        # against Util.get_commonNeighboringWords.
        for pairs in self.util.get_commonNeighboringWords(tokens, otherWords):
            if len(pairs[0]) < 2:
                continue
            for j in pairs[1]:
                # Alignments are always stored as [source, target]; when the
                # target sentence is scanned, i is the target index. The
                # earlier code stored a literal 1 there and tested the
                # duplicate guard with the un-swapped pair.
                pair = [i, j + 1] if flag else [j + 1, i]
                if pair in alignments:
                    continue
                alignments.append(pair)
                srcWordAlreadyAligned.append(pair[0])
                tarWordAlreadyAligned.append(pair[1])

    return alignments, srcWordAlreadyAligned, tarWordAlreadyAligned
222
+
223
+
118
224
'''
119
225
Input: source Sentence, target sentence,
120
226
sourceParseResult, targetParseResult,
@@ -127,13 +233,13 @@ def align_punctuations(self,sourceWords, targetWords, alignments, \
127
233
'''
128
234
129
235
130
- def align_namedEntities (self , sourceSent , targetSent , sourceParseResult , \
131
- targetParseResult , existingAlignments , srcWordAlreadyAligned , tarWordAlreadyAligned ):
236
+ def align_namedEntities (self , sourceSent , targetSent , sourceParseResult , targetParseResult ,
237
+ existingAlignments , srcWordAlreadyAligned , tarWordAlreadyAligned ):
132
238
133
239
134
240
sourceNE = self .text_nor .get_ner (sourceParseResult )
135
241
targetNE = self .text_nor .get_ner (targetParseResult )
136
-
242
+ # print "before sourceNE ", sourceNE
137
243
sourceNE , sourceWords = self .learn_NamedEntities (sourceSent , sourceNE , targetNE )
138
244
targetNE , targetWords = self .learn_NamedEntities (targetSent , targetNE , sourceNE )
139
245
@@ -142,7 +248,7 @@ def align_namedEntities(self, sourceSent, targetSent, sourceParseResult, \
142
248
143
249
# Align all full matches
144
250
alignment_list , sourceNamedEntitiesAlreadyAligned , targetNamedEntitiesAlreadyAligned = \
145
- self .align_full_matches (sourceNE , targetNE )
251
+ self .align_full_matches (sourceNE , targetNE )
146
252
147
253
# Align Acronyms
148
254
for item in sourceNE :
@@ -169,7 +275,7 @@ def align_namedEntities(self, sourceSent, targetSent, sourceParseResult, \
169
275
# align subset matches
170
276
for item in sourceNE :
171
277
if item [3 ] not in ['PERSON' , 'ORGANIZATION' , 'LOCATION' ] or item in \
172
- sourceNamedEntitiesAlreadyAligned :
278
+ sourceNamedEntitiesAlreadyAligned :
173
279
continue
174
280
175
281
# do not align if the current source entity is present more than once
@@ -182,7 +288,7 @@ def align_namedEntities(self, sourceSent, targetSent, sourceParseResult, \
182
288
183
289
for jtem in targetNE :
184
290
if jtem [3 ] not in ['PERSON' , 'ORGANIZATION' , 'LOCATION' ] or jtem in \
185
- targetNamedEntitiesAlreadyAligned :
291
+ targetNamedEntitiesAlreadyAligned :
186
292
continue
187
293
188
294
if item [3 ] != jtem [3 ]:
@@ -197,7 +303,7 @@ def align_namedEntities(self, sourceSent, targetSent, sourceParseResult, \
197
303
if count_words > 1 :
198
304
continue
199
305
200
- if isSublist (item [2 ], jtem [2 ]):
306
+ if self . util . isSublist (item [2 ], jtem [2 ]):
201
307
unalignedWordIndicesInTheLongerName = []
202
308
for ktem in jtem [1 ]:
203
309
unalignedWordIndicesInTheLongerName .append (ktem )
@@ -216,13 +322,12 @@ def align_namedEntities(self, sourceSent, targetSent, sourceParseResult, \
216
322
break
217
323
if jtem [1 ][l ] not in unalignedWordIndicesInTheLongerName or alreadyInserted :
218
324
continue
219
- if [item [1 ][k ], jtem [1 ][l ]] not in alignment_list and \
220
- targetSent [jtem [1 ][l ]- 1 ][2 ] not in sourceWords and \
221
- item [2 ][k ] not in punctuations and jtem [2 ][l ] not in punctuations :
222
-
325
+ if [item [1 ][k ], jtem [1 ][l ]] not in alignment_list and targetSent [jtem [1 ][l ]- 1 ][2 ] \
326
+ not in sourceWords and item [2 ][k ] not in punctuations and jtem [2 ][l ] \
327
+ not in punctuations :
223
328
alignment_list .append ([item [1 ][k ], jtem [1 ][l ]])
224
329
# else find if the second is a part of the first
225
- elif isSublist (jtem [2 ], item [2 ]):
330
+ elif self . util . isSublist (jtem [2 ], item [2 ]):
226
331
unalignedWordIndicesInTheLongerName = []
227
332
for ktem in item [1 ]:
228
333
unalignedWordIndicesInTheLongerName .append (ktem )
@@ -241,9 +346,9 @@ def align_namedEntities(self, sourceSent, targetSent, sourceParseResult, \
241
346
break
242
347
if item [1 ][l ] not in unalignedWordIndicesInTheLongerName or alreadyInserted :
243
348
continue
244
- if [item [1 ][l ], jtem [1 ][k ]] not in alignment_list and \
245
- sourceSent [ item [1 ][ k ] - 1 ][ 2 ] not in targetWords and \
246
- item [ 2 ][ l ] not in punctuations and jtem [ 2 ][ k ] not in punctuations :
349
+ if [item [1 ][l ], jtem [1 ][k ]] not in alignment_list and sourceSent [ item [ 1 ][ k ] - 1 ][ 2 ] \
350
+ not in targetWords and item [2 ][ l ] not in punctuations and jtem [ 2 ][ k ] \
351
+ not in punctuations :
247
352
248
353
alignment_list .append ([item [1 ][l ], jtem [1 ][k ]])
249
354
@@ -291,7 +396,7 @@ def learn_NamedEntities(self,SentParam, LearnNE, knownNE):
291
396
#construct new item([ [charbegin,charEnd], [sourceWordIndex], [sourceWord], [targetWordNE] ])
292
397
# we replace NE of sourceWord with NE of targetWord
293
398
newItem = [[i [0 ]], [i [1 ]], [i [2 ]], k [3 ]]
294
- print "matched"
399
+ # print "matched"
295
400
partOfABiggerName = False
296
401
for p in xrange (len (LearnNE )):
297
402
if LearnNE [p ][1 ][len (LearnNE [p ][1 ])- 1 ] == newItem [1 ][0 ] - 1 :
@@ -322,7 +427,6 @@ def learn_NamedEntities(self,SentParam, LearnNE, knownNE):
322
427
323
428
def align_full_matches (self ,sourceNE , targetNE ):
324
429
325
-
326
430
# Align all full matches
327
431
sourceNamedEntitiesAlreadyAligned = []
328
432
targetNamedEntitiesAlreadyAligned = []
0 commit comments