Skip to content

Commit 9530c29

Browse files
committed
Use Snowball's english-stemmer.js
1 parent aaf9e07 commit 9530c29

File tree

2 files changed

+2
-374
lines changed

2 files changed

+2
-374
lines changed

sphinx/search/en.py

Lines changed: 1 addition & 187 deletions
Original file line number | Diff line number | Diff line change
@@ -7,197 +7,11 @@
77
from sphinx.search import SearchLanguage
88
from sphinx.search._stopwords.en import ENGLISH_STOPWORDS
99

10-
js_porter_stemmer = """
11-
/**
12-
* Porter Stemmer
13-
*/
14-
var Stemmer = function() {
15-
16-
var step2list = {
17-
ational: 'ate',
18-
tional: 'tion',
19-
enci: 'ence',
20-
anci: 'ance',
21-
izer: 'ize',
22-
bli: 'ble',
23-
alli: 'al',
24-
entli: 'ent',
25-
eli: 'e',
26-
ousli: 'ous',
27-
ization: 'ize',
28-
ation: 'ate',
29-
ator: 'ate',
30-
alism: 'al',
31-
iveness: 'ive',
32-
fulness: 'ful',
33-
ousness: 'ous',
34-
aliti: 'al',
35-
iviti: 'ive',
36-
biliti: 'ble',
37-
logi: 'log'
38-
};
39-
40-
var step3list = {
41-
icate: 'ic',
42-
ative: '',
43-
alize: 'al',
44-
iciti: 'ic',
45-
ical: 'ic',
46-
ful: '',
47-
ness: ''
48-
};
49-
50-
var c = "[^aeiou]"; // consonant
51-
var v = "[aeiouy]"; // vowel
52-
var C = c + "[^aeiouy]*"; // consonant sequence
53-
var V = v + "[aeiou]*"; // vowel sequence
54-
55-
var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0
56-
var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1
57-
var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1
58-
var s_v = "^(" + C + ")?" + v; // vowel in stem
59-
60-
this.stemWord = function (w) {
61-
var stem;
62-
var suffix;
63-
var firstch;
64-
var origword = w;
65-
66-
if (w.length < 3)
67-
return w;
68-
69-
var re;
70-
var re2;
71-
var re3;
72-
var re4;
73-
74-
firstch = w.substr(0,1);
75-
if (firstch == "y")
76-
w = firstch.toUpperCase() + w.substr(1);
77-
78-
// Step 1a
79-
re = /^(.+?)(ss|i)es$/;
80-
re2 = /^(.+?)([^s])s$/;
81-
82-
if (re.test(w))
83-
w = w.replace(re,"$1$2");
84-
else if (re2.test(w))
85-
w = w.replace(re2,"$1$2");
86-
87-
// Step 1b
88-
re = /^(.+?)eed$/;
89-
re2 = /^(.+?)(ed|ing)$/;
90-
if (re.test(w)) {
91-
var fp = re.exec(w);
92-
re = new RegExp(mgr0);
93-
if (re.test(fp[1])) {
94-
re = /.$/;
95-
w = w.replace(re,"");
96-
}
97-
}
98-
else if (re2.test(w)) {
99-
var fp = re2.exec(w);
100-
stem = fp[1];
101-
re2 = new RegExp(s_v);
102-
if (re2.test(stem)) {
103-
w = stem;
104-
re2 = /(at|bl|iz)$/;
105-
re3 = new RegExp("([^aeiouylsz])\\\\1$");
106-
re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
107-
if (re2.test(w))
108-
w = w + "e";
109-
else if (re3.test(w)) {
110-
re = /.$/;
111-
w = w.replace(re,"");
112-
}
113-
else if (re4.test(w))
114-
w = w + "e";
115-
}
116-
}
117-
118-
// Step 1c
119-
re = /^(.+?)y$/;
120-
if (re.test(w)) {
121-
var fp = re.exec(w);
122-
stem = fp[1];
123-
re = new RegExp(s_v);
124-
if (re.test(stem))
125-
w = stem + "i";
126-
}
127-
128-
// Step 2
129-
re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\
130-
ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
131-
if (re.test(w)) {
132-
var fp = re.exec(w);
133-
stem = fp[1];
134-
suffix = fp[2];
135-
re = new RegExp(mgr0);
136-
if (re.test(stem))
137-
w = stem + step2list[suffix];
138-
}
139-
140-
// Step 3
141-
re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
142-
if (re.test(w)) {
143-
var fp = re.exec(w);
144-
stem = fp[1];
145-
suffix = fp[2];
146-
re = new RegExp(mgr0);
147-
if (re.test(stem))
148-
w = stem + step3list[suffix];
149-
}
150-
151-
// Step 4
152-
re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|\
153-
iti|ous|ive|ize)$/;
154-
re2 = /^(.+?)(s|t)(ion)$/;
155-
if (re.test(w)) {
156-
var fp = re.exec(w);
157-
stem = fp[1];
158-
re = new RegExp(mgr1);
159-
if (re.test(stem))
160-
w = stem;
161-
}
162-
else if (re2.test(w)) {
163-
var fp = re2.exec(w);
164-
stem = fp[1] + fp[2];
165-
re2 = new RegExp(mgr1);
166-
if (re2.test(stem))
167-
w = stem;
168-
}
169-
170-
// Step 5
171-
re = /^(.+?)e$/;
172-
if (re.test(w)) {
173-
var fp = re.exec(w);
174-
stem = fp[1];
175-
re = new RegExp(mgr1);
176-
re2 = new RegExp(meq1);
177-
re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
178-
if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
179-
w = stem;
180-
}
181-
re = /ll$/;
182-
re2 = new RegExp(mgr1);
183-
if (re.test(w) && re2.test(w)) {
184-
re = /.$/;
185-
w = w.replace(re,"");
186-
}
187-
188-
// and turn initial Y back to y
189-
if (firstch == "y")
190-
w = firstch.toLowerCase() + w.substr(1);
191-
return w;
192-
}
193-
}
194-
"""
195-
19610

19711
class SearchEnglish(SearchLanguage):
19812
lang = 'en'
19913
language_name = 'English'
200-
js_stemmer_code = js_porter_stemmer
14+
js_stemmer_rawcode = 'english-stemmer.js'
20115
stopwords = ENGLISH_STOPWORDS
20216

20317
def init(self, options: dict[str, str]) -> None:

sphinx/search/zh.py

Lines changed: 1 addition & 187 deletions
Original file line number | Diff line number | Diff line change
@@ -33,199 +33,13 @@ def cut_for_search(sentence: str, HMM: bool = True) -> Iterator[str]:
3333
)
3434
del jieba
3535

36-
js_porter_stemmer = """
37-
/**
38-
* Porter Stemmer
39-
*/
40-
var Stemmer = function() {
41-
42-
var step2list = {
43-
ational: 'ate',
44-
tional: 'tion',
45-
enci: 'ence',
46-
anci: 'ance',
47-
izer: 'ize',
48-
bli: 'ble',
49-
alli: 'al',
50-
entli: 'ent',
51-
eli: 'e',
52-
ousli: 'ous',
53-
ization: 'ize',
54-
ation: 'ate',
55-
ator: 'ate',
56-
alism: 'al',
57-
iveness: 'ive',
58-
fulness: 'ful',
59-
ousness: 'ous',
60-
aliti: 'al',
61-
iviti: 'ive',
62-
biliti: 'ble',
63-
logi: 'log'
64-
};
65-
66-
var step3list = {
67-
icate: 'ic',
68-
ative: '',
69-
alize: 'al',
70-
iciti: 'ic',
71-
ical: 'ic',
72-
ful: '',
73-
ness: ''
74-
};
75-
76-
var c = "[^aeiou]"; // consonant
77-
var v = "[aeiouy]"; // vowel
78-
var C = c + "[^aeiouy]*"; // consonant sequence
79-
var V = v + "[aeiou]*"; // vowel sequence
80-
81-
var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0
82-
var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1
83-
var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1
84-
var s_v = "^(" + C + ")?" + v; // vowel in stem
85-
86-
this.stemWord = function (w) {
87-
var stem;
88-
var suffix;
89-
var firstch;
90-
var origword = w;
91-
92-
if (w.length < 3)
93-
return w;
94-
95-
var re;
96-
var re2;
97-
var re3;
98-
var re4;
99-
100-
firstch = w.substr(0,1);
101-
if (firstch == "y")
102-
w = firstch.toUpperCase() + w.substr(1);
103-
104-
// Step 1a
105-
re = /^(.+?)(ss|i)es$/;
106-
re2 = /^(.+?)([^s])s$/;
107-
108-
if (re.test(w))
109-
w = w.replace(re,"$1$2");
110-
else if (re2.test(w))
111-
w = w.replace(re2,"$1$2");
112-
113-
// Step 1b
114-
re = /^(.+?)eed$/;
115-
re2 = /^(.+?)(ed|ing)$/;
116-
if (re.test(w)) {
117-
var fp = re.exec(w);
118-
re = new RegExp(mgr0);
119-
if (re.test(fp[1])) {
120-
re = /.$/;
121-
w = w.replace(re,"");
122-
}
123-
}
124-
else if (re2.test(w)) {
125-
var fp = re2.exec(w);
126-
stem = fp[1];
127-
re2 = new RegExp(s_v);
128-
if (re2.test(stem)) {
129-
w = stem;
130-
re2 = /(at|bl|iz)$/;
131-
re3 = new RegExp("([^aeiouylsz])\\\\1$");
132-
re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
133-
if (re2.test(w))
134-
w = w + "e";
135-
else if (re3.test(w)) {
136-
re = /.$/;
137-
w = w.replace(re,"");
138-
}
139-
else if (re4.test(w))
140-
w = w + "e";
141-
}
142-
}
143-
144-
// Step 1c
145-
re = /^(.+?)y$/;
146-
if (re.test(w)) {
147-
var fp = re.exec(w);
148-
stem = fp[1];
149-
re = new RegExp(s_v);
150-
if (re.test(stem))
151-
w = stem + "i";
152-
}
153-
154-
// Step 2
155-
re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\
156-
ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
157-
if (re.test(w)) {
158-
var fp = re.exec(w);
159-
stem = fp[1];
160-
suffix = fp[2];
161-
re = new RegExp(mgr0);
162-
if (re.test(stem))
163-
w = stem + step2list[suffix];
164-
}
165-
166-
// Step 3
167-
re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
168-
if (re.test(w)) {
169-
var fp = re.exec(w);
170-
stem = fp[1];
171-
suffix = fp[2];
172-
re = new RegExp(mgr0);
173-
if (re.test(stem))
174-
w = stem + step3list[suffix];
175-
}
176-
177-
// Step 4
178-
re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|\
179-
iti|ous|ive|ize)$/;
180-
re2 = /^(.+?)(s|t)(ion)$/;
181-
if (re.test(w)) {
182-
var fp = re.exec(w);
183-
stem = fp[1];
184-
re = new RegExp(mgr1);
185-
if (re.test(stem))
186-
w = stem;
187-
}
188-
else if (re2.test(w)) {
189-
var fp = re2.exec(w);
190-
stem = fp[1] + fp[2];
191-
re2 = new RegExp(mgr1);
192-
if (re2.test(stem))
193-
w = stem;
194-
}
195-
196-
// Step 5
197-
re = /^(.+?)e$/;
198-
if (re.test(w)) {
199-
var fp = re.exec(w);
200-
stem = fp[1];
201-
re = new RegExp(mgr1);
202-
re2 = new RegExp(meq1);
203-
re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
204-
if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
205-
w = stem;
206-
}
207-
re = /ll$/;
208-
re2 = new RegExp(mgr1);
209-
if (re.test(w) && re2.test(w)) {
210-
re = /.$/;
211-
w = w.replace(re,"");
212-
}
213-
214-
// and turn initial Y back to y
215-
if (firstch == "y")
216-
w = firstch.toLowerCase() + w.substr(1);
217-
return w;
218-
}
219-
}
220-
"""
221-
22236

22337
class SearchChinese(SearchLanguage):
22438
"""Chinese search implementation"""
22539

22640
lang = 'zh'
22741
language_name = 'Chinese'
228-
js_stemmer_code = js_porter_stemmer
42+
js_stemmer_rawcode = 'english-stemmer.js'
22943
stopwords = ENGLISH_STOPWORDS
23044
latin1_letters = re.compile(r'[a-zA-Z0-9_]+')
23145

0 commit comments

Comments
 (0)