Skip to content

Commit ba12b27

Browse files
committed
align noun module completed and example added
1 parent 87e1e49 commit ba12b27

File tree

2 files changed

+271
-3
lines changed

2 files changed

+271
-3
lines changed

monolingualWordAligner/example_align.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
# -*- coding: utf-8 -*-
33
from wordAligner import *
44

5-
sentence1 = "They are washing clothes. I heard some noise. He ran quickly. I work on Sunday"
6-
sentence2 = "I have washed my clothes. They run very slowly. I am hearing songs. I am working tomorrow"
5+
sentence1 = "Ramesh and rubanraj are doing Masters in Autonomus System. Camel and cow gives milk "
6+
sentence2 = "Rubanraj and ramesh are doing their research and development project. Cow and camel produces milk"
77

88
print "sentence1 = ", sentence1
99
print "sentence2 = ", sentence2

monolingualWordAligner/wordAligner.py

Lines changed: 269 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,20 @@ def alignWords(self,sourceSent, targetSent, sourceParseResult, targetParseResult
111111
if item[1] not in tarWordAlreadyAligned:
112112
tarWordAlreadyAligned.append(item[1])
113113

114+
aligned_nouns = self.align_Nouns(self.sourceWordIndices, self.targetWordIndices, sourceWords, targetWords,
115+
self.sourceLemmas, self.targetLemmas, self.sourcePosTags, self.targetPosTags,
116+
sourceParseResult, targetParseResult, alignments, srcWordAlreadyAligned, tarWordAlreadyAligned)
117+
118+
print "aligned nouns ", aligned_nouns
119+
120+
for item in aligned_nouns:
121+
if item not in alignments:
122+
alignments.append(item)
123+
if item[0] not in srcWordAlreadyAligned:
124+
srcWordAlreadyAligned.append(item[0])
125+
if item[1] not in tarWordAlreadyAligned:
126+
tarWordAlreadyAligned.append(item[1])
127+
114128
return alignments
115129

116130

@@ -745,4 +759,258 @@ def align_mainVerbs(self, srcWordIndices, tarWordIndices, srcWords, tarWords, sr
745759
else:
746760
break
747761

748-
return AlignedVerbs
762+
return AlignedVerbs
763+
764+
765+
'''
766+
Returns Aligned Nouns
767+
'''
768+
769+
def align_Nouns(self, srcWordIndices, tarWordIndices, srcWords, tarWords, srcLemmas,
                tarLemmas, srcPosTags, tarPosTags, sourceParseResult, targetParseResult,
                existingalignments, srcWordAlreadyAligned, tarWordAlreadyAligned):
    """Align nouns of the source sentence with nouns of the target sentence.

    Phase 1 collects, for every still-unaligned noun/pronoun pair (i, j)
    whose word similarity reaches ``ppdbSim``, contextual evidence from the
    dependency parses: pairs of children, pairs of parents, and
    parent/child pairs in opposite directions that are similar and linked
    through equivalent dependency relations.

    Phase 2 greedily picks, one pair per pass, the noun pair maximising
    ``theta1 * similarity + (1 - theta1) * evidence`` and also aligns the
    relatives that supplied its evidence.

    Word indices are 1-based: position ``i`` is looked up as ``seq[i-1]``.
    Dependency relatives returned by ``self.util.findParents`` /
    ``findChildren`` are used as ``(index, word, relation)`` triples.

    Parameters:
        srcWordIndices, tarWordIndices: word positions to consider.
        srcWords, tarWords: surface forms, indexed by position - 1.
        srcLemmas, tarLemmas: lemmas, indexed by position - 1.
        srcPosTags, tarPosTags: POS tags, indexed by position - 1.
        sourceParseResult, targetParseResult: parser output handed to
            ``self.util.dependencyTreeWithOffSets``.
        existingalignments: alignments produced by earlier stages.
        srcWordAlreadyAligned, tarWordAlreadyAligned: positions already
            aligned; both lists are extended IN PLACE with every index
            aligned here.

    Returns:
        list of ``[sourceIndex, targetIndex]`` pairs aligned by this call.
    """
    computeScore = self.word_similarity.computeWordSimilarityScore

    def pairSimilarity(srcWord, srcIndex, tarWord, tarIndex):
        # Best of surface-form similarity and lemma similarity.
        srcTag = srcPosTags[srcIndex - 1]
        tarTag = tarPosTags[tarIndex - 1]
        return max(computeScore(srcWord, srcTag, tarWord, tarTag),
                   computeScore(srcLemmas[srcIndex - 1], srcTag,
                                tarLemmas[tarIndex - 1], tarTag))

    def isNounOrPronoun(posTag):
        return posTag[0].lower() == 'n' or posTag.lower() == 'prp'

    def relationsEquivalent(srcRelation, tarRelation, relationGroups):
        # Identical relations, or both members of one equivalence group.
        if srcRelation == tarRelation:
            return True
        return any(srcRelation in group and tarRelation in group
                   for group in relationGroups)

    # Equivalence groups of dependency relations (hoisted: loop-invariant).
    # FIX: the original literals read 'nn' 'prep_of' and 'tmod' 'prep_in'
    # (missing commas), which Python concatenates into the single strings
    # 'nnprep_of' / 'tmodprep_in', so those relations never matched.
    childRelationGroups = (
        ['pos', 'nn', 'prep_of', 'prep_in', 'prep_at', 'prep_for'],  # noun child
        ['infmod', 'partmod', 'rcmod'],                              # verb child
        ['amod', 'rcmod'],                                           # adjective child
    )
    parentRelationGroups = (
        ['pos', 'nn', 'prep_of', 'prep_in', 'prep_at', 'prep_for'],  # noun parent
        ['agent', 'nsubj', 'xsubj'],                                 # verb parent, group 1
        ['ccomp', 'dobj', 'nsubjpass', 'rel', 'partmod'],            # verb parent, group 2
        ['tmod', 'prep_in', 'prep_at', 'prep_on'],                   # verb parent, group 3
        ['iobj', 'prep_to'],                                         # verb parent, group 4
    )
    # [relations seen from the parent side, relations seen from the child side]
    oppositeAdjective = [['nsubj'], ['amod', 'rcmod']]
    oppositeVerb = [['ccomp', 'dobj', 'nsubjpass', 'rel', 'partmod'],
                    ['infmod', 'partmod', 'rcmod']]
    oppositeNoun1 = [['conj_and'], ['conj_and']]
    oppositeNoun2 = [['conj_or'], ['conj_or']]
    oppositeNoun3 = [['conj_nor'], ['conj_nor']]

    nounAlignments = []
    numberofNounsInSource = 0
    evidenceCountMatrix = {}        # (i, j) -> accumulated evidence score
    relativeAlignmentsMatrix = {}   # (i, j) -> [srcIdx, tarIdx] relatives that gave evidence
    wordSimilarity = {}             # (i, j) -> similarity score of the candidate pair

    def collectEvidence(pair, srcRelatives, tarRelatives, relationGroups):
        for k in srcRelatives:
            for l in tarRelatives:
                score = pairSimilarity(k[1], k[0], l[1], l[0])
                # NOTE(review): precedence is "known or (similar and related)",
                # exactly as in the original code; the intended grouping may be
                # "(known or similar) and related" - confirm. Also, a tuple is
                # tested against lists that hold [i, j] lists in nounAlignments,
                # so the membership test may never succeed - confirm.
                if (k[0], l[0]) in existingalignments + nounAlignments or \
                        (score >= ppdbSim and
                         relationsEquivalent(k[2], l[2], relationGroups)):
                    evidenceCountMatrix[pair] = evidenceCountMatrix.get(pair, 0) + score
                    relativeAlignmentsMatrix.setdefault(pair, []).append([k[0], l[0]])

    sourceDependencyParse = self.util.dependencyTreeWithOffSets(sourceParseResult)
    targetDependencyParse = self.util.dependencyTreeWithOffSets(targetParseResult)

    # ---- Phase 1: gather similarity and contextual evidence ----
    for i in srcWordIndices:
        if i in srcWordAlreadyAligned or not isNounOrPronoun(srcPosTags[i - 1]):
            continue

        numberofNounsInSource += 1

        for j in tarWordIndices:
            if j in tarWordAlreadyAligned or not isNounOrPronoun(tarPosTags[j - 1]):
                continue

            similarityScore = pairSimilarity(srcWords[i - 1], i, tarWords[j - 1], j)
            if similarityScore < ppdbSim:
                continue
            wordSimilarity[(i, j)] = similarityScore

            sourceWordParents = self.util.findParents(sourceDependencyParse, i, srcWords[i - 1])
            sourceWordChildren = self.util.findChildren(sourceDependencyParse, i, srcWords[i - 1])
            targetWordParents = self.util.findParents(targetDependencyParse, j, tarWords[j - 1])
            targetWordChildren = self.util.findChildren(targetDependencyParse, j, tarWords[j - 1])

            # common or equivalent children
            collectEvidence((i, j), sourceWordChildren, targetWordChildren, childRelationGroups)

            # common or equivalent parents
            # FIX: group 4 used to test k[2] twice and never looked at l[2].
            collectEvidence((i, j), sourceWordParents, targetWordParents, parentRelationGroups)

            # equivalent parent-child relations (source parent vs. target child)
            evidenceCountMatrix, relativeAlignmentsMatrix = self.findEquivalentRelationAlignNouns(
                i, j, sourceWordParents, targetWordChildren,
                nounAlignments, existingalignments,
                srcPosTags, tarPosTags, srcLemmas, tarLemmas,
                oppositeAdjective[0], oppositeAdjective[1],
                oppositeVerb[0], oppositeVerb[1],
                oppositeNoun1[0], oppositeNoun1[1],
                oppositeNoun2[0], oppositeNoun2[1],
                oppositeNoun3[0], oppositeNoun3[1],
                evidenceCountMatrix, relativeAlignmentsMatrix)

            # equivalent child-parent relations (source child vs. target parent)
            evidenceCountMatrix, relativeAlignmentsMatrix = self.findEquivalentRelationAlignNouns(
                i, j, sourceWordChildren, targetWordParents,
                nounAlignments, existingalignments,
                srcPosTags, tarPosTags, srcLemmas, tarLemmas,
                oppositeAdjective[1], oppositeAdjective[0],
                oppositeVerb[1], oppositeVerb[0],
                oppositeNoun1[1], oppositeNoun1[0],
                oppositeNoun2[1], oppositeNoun2[0],
                oppositeNoun3[1], oppositeNoun3[0],
                evidenceCountMatrix, relativeAlignmentsMatrix)

    # ---- Phase 2: greedily align using the collected statistics ----
    # At most one new noun pair is chosen per pass.
    # NOTE(review): phase 1 also admits 'prp' tags, but this phase only
    # selects tags starting with 'n'; kept as in the original - confirm.
    for _ in range(numberofNounsInSource):
        maxEvidenceCountForCurrentPass = 0
        maxOverallValueForCurrentPass = 0
        indexPairWithStrongestTieForCurrentPass = [-1, -1]

        for i in srcWordIndices:
            if i in srcWordAlreadyAligned or srcPosTags[i - 1][0].lower() != 'n' or \
                    srcLemmas[i - 1] in stopwords:
                continue

            for j in tarWordIndices:
                if j in tarWordAlreadyAligned or tarPosTags[j - 1][0].lower() != 'n' or \
                        tarLemmas[j - 1] in stopwords:
                    continue

                if (i, j) not in evidenceCountMatrix:
                    continue

                overallValue = theta1 * wordSimilarity[(i, j)] + \
                    (1 - theta1) * evidenceCountMatrix[(i, j)]
                if overallValue > maxOverallValueForCurrentPass:
                    maxOverallValueForCurrentPass = overallValue
                    maxEvidenceCountForCurrentPass = evidenceCountMatrix[(i, j)]
                    indexPairWithStrongestTieForCurrentPass = [i, j]

        if maxEvidenceCountForCurrentPass <= 0:
            # no further noun pair is supported by evidence
            break

        srcIndex, tarIndex = indexPairWithStrongestTieForCurrentPass
        nounAlignments.append(indexPairWithStrongestTieForCurrentPass)
        srcWordAlreadyAligned.append(srcIndex)
        tarWordAlreadyAligned.append(tarIndex)

        for item in relativeAlignmentsMatrix[(srcIndex, tarIndex)]:
            # index 0 is skipped so that Root-0 is never stored as an alignment
            if item[0] != 0 and item[1] != 0 and item[0] not in srcWordAlreadyAligned and \
                    item[1] not in tarWordAlreadyAligned:
                nounAlignments.append(item)
                srcWordAlreadyAligned.append(item[0])
                tarWordAlreadyAligned.append(item[1])

    return nounAlignments
967+
968+
969+
'''
970+
Auxiliary method that finds equivalent parent-child / child-parent relations for align_Nouns
971+
'''
972+
973+
974+
def findEquivalentRelationAlignNouns(self, i, j, sourceDepenency, targetDependency, nounAlignments, existingalignments,
                                     srcPosTags, tarPosTags, srcLemmas, tarLemmas, AdjParentAndChildSrc, AdjParentAndChildTar,
                                     OppDirecVerbParentAndChildSrc, OppDirecVerbParentAndChildTar,
                                     group1OppDirectNounParentAndChildSrc, group1OppDirectNounParentAndChildTar,
                                     group2OppDirectNounParentAndChildSrc, group2OppDirectNounParentAndChildTar,
                                     group3OppDirectNounParentAndChildSrc, group3OppDirectNounParentAndChildTar,
                                     evidenceCountMatrix, relativeAlignmentsMatrix):
    """Collect opposite-direction dependency evidence for the noun pair (i, j).

    Every relative ``k`` from ``sourceDepenency`` is paired with every
    relative ``l`` from ``targetDependency``; both are used as
    ``(index, word, relation)`` triples with 1-based indices. A pair
    contributes evidence when it is already aligned, or when the two words
    are similar enough (score >= ``ppdbSim``) and their relations are
    identical or fall into one of the supplied source/target relation
    groups.

    Parameters:
        i, j: source and target positions of the noun pair being scored.
        sourceDepenency, targetDependency: relatives of word i and word j.
        nounAlignments, existingalignments: alignments known so far.
        srcPosTags, tarPosTags, srcLemmas, tarLemmas: per-word POS tags and
            lemmas, indexed by position - 1.
        *Src / *Tar group arguments: relation names accepted on the source
            side and on the target side respectively for each equivalence
            group (adjective, verb, and three noun groups).
        evidenceCountMatrix: dict (i, j) -> accumulated evidence score;
            updated in place.
        relativeAlignmentsMatrix: dict (i, j) -> list of [srcIdx, tarIdx]
            relatives that supplied evidence; updated in place.

    Returns:
        The same ``(evidenceCountMatrix, relativeAlignmentsMatrix)`` objects.
    """
    computeScore = self.word_similarity.computeWordSimilarityScore

    # (source-side relations, target-side relations) per equivalence group
    relationGroupPairs = (
        (AdjParentAndChildSrc, AdjParentAndChildTar),
        (OppDirecVerbParentAndChildSrc, OppDirecVerbParentAndChildTar),
        (group1OppDirectNounParentAndChildSrc, group1OppDirectNounParentAndChildTar),
        (group2OppDirectNounParentAndChildSrc, group2OppDirectNounParentAndChildTar),
        # FIX: the original tested k[2] against BOTH group-3 lists, so the
        # target relation l[2] was never checked for this group.
        (group3OppDirectNounParentAndChildSrc, group3OppDirectNounParentAndChildTar),
    )

    for k in sourceDepenency:
        for l in targetDependency:
            srcTag = srcPosTags[k[0] - 1]
            tarTag = tarPosTags[l[0] - 1]
            # best of surface-form and lemma similarity, computed once per pair
            score = max(computeScore(k[1], srcTag, l[1], tarTag),
                        computeScore(srcLemmas[k[0] - 1], srcTag,
                                     tarLemmas[l[0] - 1], tarTag))

            relationsMatch = k[2] == l[2] or any(
                k[2] in srcGroup and l[2] in tarGroup
                for srcGroup, tarGroup in relationGroupPairs)

            # NOTE(review): precedence is "known or (similar and related)",
            # exactly as in the original code; the intended grouping may be
            # "(known or similar) and related" - confirm with the author.
            if (k[0], l[0]) in existingalignments + nounAlignments or \
                    (score >= ppdbSim and relationsMatch):
                evidenceCountMatrix[(i, j)] = evidenceCountMatrix.get((i, j), 0) + score
                relativeAlignmentsMatrix.setdefault((i, j), []).append([k[0], l[0]])

    return evidenceCountMatrix, relativeAlignmentsMatrix

0 commit comments

Comments
 (0)