Skip to content

Commit 35f4bc6

Browse files
committed
align_commonNeighboringWords and align_hyphenWords functions added
1 parent e25b1fe commit 35f4bc6

File tree

1 file changed

+140
-36
lines changed

1 file changed

+140
-36
lines changed

monolingualWordAligner/wordAligner.py

Lines changed: 140 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,13 @@ class Aligner:
77

88
def __init__(self):
99
self.text_nor = Text_processing()
10-
10+
self.util = Util()
1111
def align_sentences(self,sentence1,sentence2):
1212

1313
sentence1ParseResult = self.text_nor.parser(sentence1)
14-
# print "sentence1 parse ", sentence1ParseResult
15-
# print ""
1614
sentence2ParseResult = self.text_nor.parser(sentence2)
1715

1816
sentence1LemmasAndPosTags = self.text_nor.combine_lemmaAndPosTags(sentence1ParseResult)
19-
# print "sentce1 ", sentence1LemmasAndPosTags
2017
sentence2LemmasAndPosTags = self.text_nor.combine_lemmaAndPosTags(sentence2ParseResult)
2118

2219
self.sourceWordIndices = [i+1 for i in xrange(len(sentence1LemmasAndPosTags))]
@@ -31,24 +28,29 @@ def align_sentences(self,sentence1,sentence2):
3128
self.sourcePosTags = [item[4] for item in sentence1LemmasAndPosTags]
3229
self.targetPosTags = [item[4] for item in sentence2LemmasAndPosTags]
3330

34-
myWordAlignments = self.alignWords(sentence1LemmasAndPosTags, sentence2LemmasAndPosTags, \
35-
sentence1ParseResult, sentence2ParseResult)
31+
32+
myWordAlignments = self.alignWords(sentence1LemmasAndPosTags, sentence2LemmasAndPosTags,
33+
sentence1ParseResult, sentence2ParseResult)
3634

35+
3736
align = []
3837
for i in myWordAlignments:
3938
align.append([self.sourceWords[i[0]-1], self.targetWords[i[1]-1] ])
40-
# print "align words ", align
39+
print "align words ", align
4140

4241
return align
4342

43+
4444
'''
4545
sourceSent and targetSent is list of:
4646
[[character begin offset, character end offset], word index, word, lemma, pos tag]
4747
sourceParseResult and targetParseResult is list of:
4848
Parse Tree(Constituency tree), Text, Dependencies, words(NE, CharacOffsetEn, CharOffsetBeg,
4949
POS, Lemma)
5050
1. Align the punctuations first
51-
2. Align named entities
51+
2. Align common neighboring words (at least a bigram)
52+
3. Align Hyphenated Words
53+
4. Align named entities
5254
'''
5355

5456

@@ -59,14 +61,30 @@ def alignWords(self,sourceSent, targetSent, sourceParseResult, targetParseResult
5961
srcWordAlreadyAligned = [] #sourceWordAlreadyAligned
6062
tarWordAlreadyAligned = [] #TargetWordAlreadyAligned
6163

62-
# align the punctuations
64+
# 1. align the punctuations
65+
alignments, srcWordAlreadyAligned, tarWordAlreadyAligned = self.align_punctuations(self.sourceWords,
66+
self.targetWords, alignments, srcWordAlreadyAligned, tarWordAlreadyAligned,sourceSent,targetSent)
67+
68+
#2. align commonNeighboringWords (atleast bigram, or more)
6369
alignments, srcWordAlreadyAligned, tarWordAlreadyAligned = \
64-
self.align_punctuations(self.sourceWords,self.targetWords, \
65-
alignments, srcWordAlreadyAligned, tarWordAlreadyAligned,sourceSent,targetSent)
66-
# align named entities
67-
neAlignments = self.align_namedEntities(sourceSent, targetSent, \
68-
sourceParseResult, targetParseResult, alignments, srcWordAlreadyAligned, tarWordAlreadyAligned)
69-
70+
self.align_commonNeighboringWords(self.sourceWords, self.targetWords, \
71+
srcWordAlreadyAligned, tarWordAlreadyAligned, alignments)
72+
#3. align Hyphenated words
73+
checkSourceWordsInTarget = True # check if Source Words have any hyphen words
74+
alignments, srcWordAlreadyAligned, tarWordAlreadyAligned = \
75+
self.align_hyphenWords(self.sourceWordIndices, self.sourceWords,\
76+
srcWordAlreadyAligned, alignments,\
77+
tarWordAlreadyAligned, checkSourceWordsInTarget)
78+
79+
checkSourceWordsInTarget = False # check if target Words have any hyphen words
80+
alignments, srcWordAlreadyAligned, tarWordAlreadyAligned = \
81+
self.align_hyphenWords(self.targetWordIndices, self.targetWords, tarWordAlreadyAligned, alignments, \
82+
tarWordAlreadyAligned,checkSourceWordsInTarget)
83+
84+
#4. align named entities
85+
neAlignments = self.align_namedEntities(sourceSent, targetSent, sourceParseResult, \
86+
targetParseResult, alignments, srcWordAlreadyAligned, tarWordAlreadyAligned)
87+
7088
for item in neAlignments:
7189
if item not in alignments:
7290
alignments.append(item)
@@ -84,20 +102,22 @@ def alignWords(self,sourceSent, targetSent, sourceParseResult, targetParseResult
84102
'''
85103

86104

87-
def align_punctuations(self,sourceWords, targetWords, alignments, \
105+
def align_punctuations(self,sourceWords, targetWords, alignments,
88106
srcWordAlreadyAligned, tarWordAlreadyAligned, sourceSent, targetSent):
89107

90108
global punctuations
91109

92110
# if last word of source sentence is . or ! and last of target sent is . or ! or both are equal
93-
if (sourceWords[len(sourceSent)-1] in ['.','!'] and targetWords[len(targetSent)-1]\
94-
in ['.','!']) or (sourceWords[len(sourceSent)-1]==targetWords[len(targetSent)-1]):
111+
if (sourceWords[len(sourceSent)-1] in ['.','!'] and targetWords[len(targetSent)-1] \
112+
in ['.','!']) or (sourceWords[len(sourceSent)-1]==targetWords[len(targetSent)-1]):
95113

96114
alignments.append([len(sourceSent), len(targetSent)])
97115
srcWordAlreadyAligned.append(len(sourceSent))
98116
tarWordAlreadyAligned.append(len(targetSent))
99117
# or if second last of source sent. is . or ! and last word of target sent is . or ! then append too
100-
elif (sourceWords[len(sourceSent)-2] in ['.', '!'] and targetWords[len(targetSent)-1] in ['.', '!']):
118+
elif (sourceWords[len(sourceSent)-2] in ['.', '!'] and \
119+
targetWords[len(targetSent)-1] in ['.', '!']):
120+
101121
alignments.append([len(sourceSent), len(targetSent)])
102122
srcWordAlreadyAligned.append(len(sourceSent))
103123
tarWordAlreadyAligned.append(len(targetSent))
@@ -115,6 +135,92 @@ def align_punctuations(self,sourceWords, targetWords, alignments, \
115135
return alignments, srcWordAlreadyAligned, tarWordAlreadyAligned
116136

117137

138+
'''
139+
Input: sourceWords, targetWords
140+
Returns: aligned words that are bigram, trigram,..(not unigram)
141+
of content words
142+
'''
143+
144+
145+
def align_commonNeighboringWords(self, sourceWords, targetWords, srcWordAlreadyAligned,
                                 tarWordAlreadyAligned, alignments):
    '''
    Align word sequences (bigram or longer) that occur in both sentences.

    Input:  sourceWords, targetWords: word lists of the two sentences
            srcWordAlreadyAligned, tarWordAlreadyAligned: 1-based indices of
                words that already take part in an alignment (extended in place)
            alignments: list of [source index, target index] pairs, 1-based
                (extended in place)
    Returns: alignments, srcWordAlreadyAligned, tarWordAlreadyAligned

    A common sequence is skipped when it is a unigram, or when every word in
    it is a stop word or a punctuation mark.
    '''
    # NOTE(review): each entry is used as [source positions, target positions]
    # with 0-based positions -- confirm against Util.get_commonNeighboringWords.
    commonNeighboringWords = self.util.get_commonNeighboringWords(sourceWords, targetWords)

    for commonWords in commonNeighboringWords:
        srcPositions, tarPositions = commonWords[0], commonWords[1]

        # a single shared word is too ambiguous to align at this stage
        if len(srcPositions) < 2:
            continue

        # The previous version compared the position lists themselves with
        # the stop word list, which can never match, so nothing was filtered.
        # Look at the actual words instead.
        # NOTE(review): the comparison is case-sensitive; confirm whether the
        # stop word list expects lower-cased input.
        onlyFunctionWords = all(
            sourceWords[pos] in stopwords or sourceWords[pos] in punctuations
            for pos in srcPositions)
        if onlyFunctionWords:
            continue

        for srcPos, tarPos in zip(srcPositions, tarPositions):
            pair = [srcPos + 1, tarPos + 1]  # alignments are 1-based
            if pair[0] in srcWordAlreadyAligned or pair[1] in tarWordAlreadyAligned:
                continue
            if pair in alignments:
                continue
            alignments.append(pair)
            srcWordAlreadyAligned.append(pair[0])
            tarWordAlreadyAligned.append(pair[1])

    return alignments, srcWordAlreadyAligned, tarWordAlreadyAligned
173+
174+
175+
'''
176+
Input: wordIndices(srcWordIndices/tarWordIndices) depends upon whether we check sourceWords
177+
in targetWords, or other way
178+
Words(sourceWords/targetWords)
179+
srcWordAlreadyAligned, alignments, tarWordAlreadyAligned,
180+
flag: true, then we check sourceWords in targetWords,
181+
else we check targetWords in sourceWords
182+
Returns: aligned hyphen Words(alignments, srcWordAlreadyAligned, tarWordAlreadyAligned)
183+
'''
184+
185+
186+
def align_hyphenWords(self, wordIndices, Words, srcWordAlreadyAligned, alignments,
                      tarWordAlreadyAligned, flag):
    '''
    Align hyphenated words with the matching word sequence of the other sentence.

    Input:  wordIndices: 1-based indices of the sentence being scanned
            Words: words of the sentence being scanned
            srcWordAlreadyAligned, tarWordAlreadyAligned: 1-based indices of
                words that already take part in an alignment (extended in place)
            alignments: list of [source index, target index] pairs, 1-based
                (extended in place)
            flag: True  -> Words is the source sentence, its hyphenated words
                           are looked up in self.targetWords
                  False -> Words is the target sentence, its hyphenated words
                           are looked up in self.sourceWords
    Returns: alignments, srcWordAlreadyAligned, tarWordAlreadyAligned
    '''
    for i in wordIndices:
        # skip words of the scanned sentence that are already aligned
        if flag:
            if i in srcWordAlreadyAligned:
                continue
        elif i in tarWordAlreadyAligned:
            continue

        word = Words[i - 1]
        # a lone '-' is punctuation, not a hyphenated word
        if '-' not in word or word == '-':
            continue

        tokens = word.split('-')
        otherWords = self.targetWords if flag else self.sourceWords

        # NOTE(review): each entry is used as [positions in tokens, positions
        # in otherWords] with 0-based positions -- confirm against
        # Util.get_commonNeighboringWords.
        for pairs in self.util.get_commonNeighboringWords(tokens, otherWords):
            # at least two parts of the hyphenated word have to match
            if len(pairs[0]) <= 1:
                continue

            for j in pairs[1]:
                # Alignments are always [source index, target index].  The
                # target-direction branch used to append [j+1, 1] and test
                # [i, j+1] for duplicates; both must be [j+1, i].
                if flag:
                    pair = [i, j + 1]
                else:
                    pair = [j + 1, i]

                if pair in alignments:
                    continue

                alignments.append(pair)
                srcWordAlreadyAligned.append(pair[0])
                tarWordAlreadyAligned.append(pair[1])

    return alignments, srcWordAlreadyAligned, tarWordAlreadyAligned
222+
223+
118224
'''
119225
Input: source Sentence, target sentence,
120226
sourceParseResult, targetParseResult,
@@ -127,13 +233,13 @@ def align_punctuations(self,sourceWords, targetWords, alignments, \
127233
'''
128234

129235

130-
def align_namedEntities(self, sourceSent, targetSent, sourceParseResult, \
131-
targetParseResult, existingAlignments, srcWordAlreadyAligned, tarWordAlreadyAligned):
236+
def align_namedEntities(self, sourceSent, targetSent, sourceParseResult, targetParseResult,
237+
existingAlignments, srcWordAlreadyAligned, tarWordAlreadyAligned):
132238

133239

134240
sourceNE = self.text_nor.get_ner(sourceParseResult)
135241
targetNE = self.text_nor.get_ner(targetParseResult)
136-
242+
# print "before sourceNE ", sourceNE
137243
sourceNE, sourceWords = self.learn_NamedEntities(sourceSent, sourceNE, targetNE)
138244
targetNE, targetWords = self.learn_NamedEntities(targetSent, targetNE, sourceNE)
139245

@@ -142,7 +248,7 @@ def align_namedEntities(self, sourceSent, targetSent, sourceParseResult, \
142248

143249
# Align all full matches
144250
alignment_list, sourceNamedEntitiesAlreadyAligned, targetNamedEntitiesAlreadyAligned = \
145-
self.align_full_matches(sourceNE, targetNE)
251+
self.align_full_matches(sourceNE, targetNE)
146252

147253
# Align Acronyms
148254
for item in sourceNE:
@@ -169,7 +275,7 @@ def align_namedEntities(self, sourceSent, targetSent, sourceParseResult, \
169275
# align subset matches
170276
for item in sourceNE:
171277
if item[3] not in ['PERSON', 'ORGANIZATION', 'LOCATION'] or item in \
172-
sourceNamedEntitiesAlreadyAligned:
278+
sourceNamedEntitiesAlreadyAligned:
173279
continue
174280

175281
# do not align if the current source entity is present more than once
@@ -182,7 +288,7 @@ def align_namedEntities(self, sourceSent, targetSent, sourceParseResult, \
182288

183289
for jtem in targetNE:
184290
if jtem[3] not in ['PERSON', 'ORGANIZATION', 'LOCATION'] or jtem in \
185-
targetNamedEntitiesAlreadyAligned:
291+
targetNamedEntitiesAlreadyAligned:
186292
continue
187293

188294
if item[3] != jtem[3]:
@@ -197,7 +303,7 @@ def align_namedEntities(self, sourceSent, targetSent, sourceParseResult, \
197303
if count_words > 1:
198304
continue
199305

200-
if isSublist(item[2], jtem[2]):
306+
if self.util.isSublist(item[2], jtem[2]):
201307
unalignedWordIndicesInTheLongerName = []
202308
for ktem in jtem[1]:
203309
unalignedWordIndicesInTheLongerName.append(ktem)
@@ -216,13 +322,12 @@ def align_namedEntities(self, sourceSent, targetSent, sourceParseResult, \
216322
break
217323
if jtem[1][l] not in unalignedWordIndicesInTheLongerName or alreadyInserted:
218324
continue
219-
if [item[1][k], jtem[1][l]] not in alignment_list and \
220-
targetSent[jtem[1][l]-1][2] not in sourceWords and \
221-
item[2][k] not in punctuations and jtem[2][l] not in punctuations:
222-
325+
if [item[1][k], jtem[1][l]] not in alignment_list and targetSent[jtem[1][l]-1][2] \
326+
not in sourceWords and item[2][k] not in punctuations and jtem[2][l] \
327+
not in punctuations:
223328
alignment_list.append([item[1][k], jtem[1][l]])
224329
# else find if the second is a part of the first
225-
elif isSublist(jtem[2], item[2]):
330+
elif self.util.isSublist(jtem[2], item[2]):
226331
unalignedWordIndicesInTheLongerName = []
227332
for ktem in item[1]:
228333
unalignedWordIndicesInTheLongerName.append(ktem)
@@ -241,9 +346,9 @@ def align_namedEntities(self, sourceSent, targetSent, sourceParseResult, \
241346
break
242347
if item[1][l] not in unalignedWordIndicesInTheLongerName or alreadyInserted:
243348
continue
244-
if [item[1][l], jtem[1][k]] not in alignment_list and \
245-
sourceSent[item[1][k]-1][2] not in targetWords and \
246-
item[2][l] not in punctuations and jtem[2][k] not in punctuations:
349+
if [item[1][l], jtem[1][k]] not in alignment_list and sourceSent[item[1][k]-1][2] \
350+
not in targetWords and item[2][l] not in punctuations and jtem[2][k] \
351+
not in punctuations:
247352

248353
alignment_list.append([item[1][l], jtem[1][k]])
249354

@@ -291,7 +396,7 @@ def learn_NamedEntities(self,SentParam, LearnNE, knownNE):
291396
#construct new item([ [charbegin,charEnd], [sourceWordIndex], [sourceWord], [targetWordNE] ])
292397
# we replace NE of sourceWord with NE of targetWord
293398
newItem = [[i[0]], [i[1]], [i[2]], k[3]]
294-
print "matched"
399+
# print "matched"
295400
partOfABiggerName = False
296401
for p in xrange(len(LearnNE)):
297402
if LearnNE[p][1][len(LearnNE[p][1])-1] == newItem[1][0] - 1:
@@ -322,7 +427,6 @@ def learn_NamedEntities(self,SentParam, LearnNE, knownNE):
322427

323428
def align_full_matches(self,sourceNE, targetNE):
324429

325-
326430
# Align all full matches
327431
sourceNamedEntitiesAlreadyAligned = []
328432
targetNamedEntitiesAlreadyAligned = []

0 commit comments

Comments
 (0)