Skip to content
This repository was archived by the owner on Mar 9, 2023. It is now read-only.

Commit 620f7c3

Browse files
authored
Cython-based optimization (#123)
* Remove unnecessary deep copy * Add lru_cache on get_word_info * Add lru_cache to get_word_info This seems to be a small speedup. * Basic Cythonization Unlike the other branch the tests pass on this one. Benchmark time went down by a third compared to the previous commit. I'm not sure the _c functions are necessary here - I think that's what cpdef functions are for, but I had difficulty getting them working. Will need to give that another look. * Use cpdef functions Didn't have any issues this time, and it's cleaner with no clear performance difference. * Move build_lattice to Cython, intern some slow parts This should cut execution time by roughly 25% compared to the last commit. * Don't use deepcopy This is not an appropriate use of deepcopy and it's slow. * Add cython to setup_requires * Fix setup.py * Make INHIBITED_CONNECTION literal Minor speed boost. * Bring the matrix into the lattice building This provides a notable speedup. * Various cythonizations Improvements are relatively minor compared to the previous commit, but there are a few seconds of speedup. * Inline function for small speed boost * Change import order, make lru cache size explicit Maybe this will make Travis happy? * Add a build command * Use INT_MAX * Remove comment Missed this before, this is fine.
1 parent 4d50586 commit 620f7c3

14 files changed

+148
-84
lines changed

.travis.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ python:
66
- '3.7'
77
install:
88
- pip install flake8 flake8-import-order flake8-builtins && pip install -r requirements.txt
9+
- python setup.py build_ext --inplace
910
before_script:
1011
- cp .travis/system.dic.test tests/resources/system.dic && cp .travis/user.dic.test tests/resources/user.dic
1112
script:

setup.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,17 @@
1414

1515
from setuptools import setup, find_packages
1616

17+
from distutils.extension import Extension
18+
19+
extensions = [
20+
Extension('sudachipy.latticenode', ['sudachipy/latticenode.pyx']),
21+
Extension('sudachipy.lattice', ['sudachipy/lattice.pyx']),
22+
Extension('sudachipy.tokenizer', ['sudachipy/tokenizer.pyx']),
23+
]
24+
1725
setup(name="SudachiPy",
1826
use_scm_version=True,
19-
setup_requires=['setuptools_scm'],
27+
setup_requires=['setuptools_scm', 'cython'],
2028
description="Python version of Sudachi, the Japanese Morphological Analyzer",
2129
long_description=open('README.md', encoding='utf-8').read(),
2230
long_description_content_type="text/markdown",
@@ -33,4 +41,5 @@
3341
"sortedcontainers~=2.1.0",
3442
'dartsclone~=0.9.0',
3543
],
44+
ext_modules=extensions,
3645
)

sudachipy/dictionarylib/lexiconset.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from functools import lru_cache
1516
from typing import List
1617

1718
from .lexicon import Lexicon
@@ -57,6 +58,7 @@ def get_cost(self, word_id: int) -> int:
5758
return self.lexicons[self.get_dictionary_id(word_id)]\
5859
.get_cost(self.get_word_id1(word_id))
5960

61+
@lru_cache(1024)
6062
def get_word_info(self, word_id: int) -> 'WordInfo': # noqa: F821
6163
dic_id = self.get_dictionary_id(word_id)
6264
winfo = self.lexicons[dic_id].get_word_info(self.get_word_id1(word_id))

sudachipy/dictionarylib/wordinfolist.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import struct
16+
from functools import lru_cache
1617

1718
from .wordinfo import WordInfo
1819

@@ -23,6 +24,7 @@ def __init__(self, bytes_, offset, word_size):
2324
self.offset = offset
2425
self._word_size = word_size
2526

27+
@lru_cache(2048)
2628
def get_word_info(self, word_id):
2729
orig_pos = self.bytes.tell()
2830
index = self.word_id_to_offset(word_id)

sudachipy/lattice.pxd

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
from .latticenode cimport LatticeNode
2+
3+
cdef extern from "limits.h":
4+
cdef int INT_MAX
5+
6+
cdef class Lattice:
7+
8+
cdef int size
9+
cdef int capacity
10+
cdef LatticeNode eos_node
11+
12+
cdef list end_lists
13+
cdef object grammar
14+
cdef object eos_params
15+
cdef const short[:,:] connect_costs
16+
17+
cpdef void resize(self, int size)
18+
cpdef void insert(self, int begin, int end, LatticeNode node)
19+
cdef void connect_node(self, LatticeNode r_node)
20+
cdef void connect_eos_node(self)

sudachipy/lattice.py renamed to sudachipy/lattice.pyx

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,26 +15,26 @@
1515
from typing import List, Optional
1616

1717
from .dictionarylib.grammar import Grammar
18-
from .latticenode import LatticeNode
18+
from .latticenode cimport LatticeNode
1919

20+
cdef class Lattice:
2021

21-
class Lattice:
22+
def __init__(self, grammar: Grammar):
23+
self.size = 0
24+
self.capacity = 0
2225

23-
size = 0
24-
capacity = 0
25-
eos_node = None
2626

27-
def __init__(self, grammar: Grammar):
2827
self.end_lists = []
2928
self.grammar = grammar
3029
self.eos_params = grammar.get_eos_parameter()
31-
bos_node = LatticeNode()
30+
cdef LatticeNode bos_node = LatticeNode()
3231
bos_params = grammar.get_bos_parameter()
3332
bos_node.set_parameter(bos_params[0], bos_params[1], bos_params[2])
3433
bos_node.is_connected_to_bos = True
3534
self.end_lists.append([bos_node])
35+
self.connect_costs = self.grammar._matrix_view
3636

37-
def resize(self, size: int) -> None:
37+
cpdef void resize(self, int size):
3838
if size > self.capacity:
3939
self.expand(size)
4040
self.size = size
@@ -69,7 +69,7 @@ def get_minumum_node(self, begin: int, end: int) -> Optional[LatticeNode]:
6969
min_arg = node
7070
return min_arg
7171

72-
def insert(self, begin: int, end: int, node: LatticeNode) -> None:
72+
cpdef void insert(self, int begin, int end, LatticeNode node):
7373
self.end_lists[end].append(node)
7474
node.begin = begin
7575
node.end = end
@@ -85,15 +85,20 @@ def create_node() -> LatticeNode:
8585
def has_previous_node(self, index: int) -> bool:
8686
return bool(self.end_lists[index])
8787

88-
def connect_node(self, r_node: LatticeNode) -> None:
88+
cdef void connect_node(self, LatticeNode r_node):
8989
begin = r_node.begin
90-
r_node.total_cost = float('inf')
90+
r_node.total_cost = INT_MAX
91+
92+
cdef LatticeNode l_node
93+
cdef int connect_cost
9194
for l_node in self.end_lists[begin]:
9295
if not l_node.is_connected_to_bos:
9396
continue
9497
# right_id and left_id look reversed, but it works ...
95-
connect_cost = self.grammar.get_connect_cost(l_node.right_id, r_node.left_id)
96-
if connect_cost == Grammar.INHIBITED_CONNECTION:
98+
connect_cost = self.connect_costs[l_node.right_id, r_node.left_id]
99+
100+
# 0x7fff == Grammar.INHIBITED_CONNECTION:
101+
if connect_cost == 0x7fff:
97102
continue
98103
cost = l_node.total_cost + connect_cost
99104
if cost < r_node.total_cost:
@@ -103,7 +108,7 @@ def connect_node(self, r_node: LatticeNode) -> None:
103108
r_node.is_connected_to_bos = r_node.best_previous_node is not None
104109
r_node.total_cost += r_node.cost
105110

106-
def connect_eos_node(self) -> None:
111+
cdef void connect_eos_node(self):
107112
self.connect_node(self.eos_node)
108113

109114
def get_best_path(self) -> List[LatticeNode]:

sudachipy/latticenode.pxd

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
cdef class LatticeNode:
2+
3+
cdef int begin
4+
cdef int end
5+
cdef int total_cost
6+
cdef int word_id
7+
cdef bint _is_oov
8+
cdef LatticeNode best_previous_node
9+
cdef bint is_connected_to_bos
10+
cdef object extra_word_info
11+
cdef object undefined_word_info
12+
cdef bint is_defined
13+
cdef object lexicon
14+
cdef int left_id
15+
cdef int right_id
16+
cdef int cost
17+

sudachipy/latticenode.py renamed to sudachipy/latticenode.pyx

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# cython: profile=True
2+
13
# Copyright (c) 2019 Works Applications Co., Ltd.
24
#
35
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,27 +17,22 @@
1517
from .dictionarylib.wordinfo import WordInfo
1618

1719
__NULL_SURFACE = '(null)'
18-
UNK = WordInfo(__NULL_SURFACE, 0, -1, __NULL_SURFACE, -1,
19-
__NULL_SURFACE, __NULL_SURFACE, [], [], [])
20-
20+
UNK =\
21+
WordInfo(__NULL_SURFACE, 0, -1, __NULL_SURFACE, -1,
22+
__NULL_SURFACE, __NULL_SURFACE, [], [], [])
2123

22-
class LatticeNode:
23-
24-
begin = 0
25-
end = 0
26-
total_cost = 0
27-
word_id = 0
28-
_is_oov = False
29-
best_previous_node = None
30-
is_connected_to_bos = None
31-
extra_word_info = None
32-
lexicon = None
33-
left_id = None
34-
right_id = None
35-
cost = None
24+
cdef class LatticeNode:
3625

3726
def __init__(self, lexicon=None, left_id=None, right_id=None, cost=None, word_id=None):
3827

28+
self.begin = 0
29+
self.end = 0
30+
self.word_id = 0
31+
self._is_oov = False
32+
self.best_previous_node = None
33+
self.is_connected_to_bos = False
34+
self.extra_word_info = None
35+
3936
self.is_defined = True
4037
if lexicon is left_id is right_id is cost is word_id is None:
4138
self.is_defined = False
@@ -54,9 +51,15 @@ def set_parameter(self, left_id: int, right_id: int, cost: int) -> None:
5451
def get_begin(self) -> int:
5552
return self.begin
5653

54+
def set_begin(self, begin) -> None:
55+
self.begin = begin
56+
5757
def get_end(self) -> int:
5858
return self.end
5959

60+
def set_end(self, end) -> None:
61+
self.end = end
62+
6063
def set_range(self, begin: int, end: int) -> None:
6164
self.begin = begin
6265
self.end = end

sudachipy/plugin/oov/mecab_oov_plugin.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,9 @@ def __init__(self):
3232

3333
class OOV:
3434
def __init__(self):
35-
self.left_id = None
36-
self.right_id = None
37-
self.cost = None
35+
self.left_id = -1
36+
self.right_id = -1
37+
self.cost = -1
3838
self.pos_id = None
3939

4040
def __init__(self, json_obj=None):

sudachipy/plugin/oov/oov_provider_plugin.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ def provide_oov(self, input_text: UTF8InputText, offset: int, has_other_words: b
3333
def get_oov(self, input_text: UTF8InputText, offset: int, has_other_words: bool) -> List[LatticeNode]:
3434
nodes = self.provide_oov(input_text, offset, has_other_words)
3535
for node in nodes:
36-
node.begin = offset
37-
node.end = offset + node.get_word_info().length()
36+
node.set_begin(offset)
37+
node.set_end(offset + node.get_word_info().length())
3838
return nodes
3939

4040
@staticmethod

sudachipy/plugin/path_rewrite/join_katakana_oov_plugin.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,4 +80,4 @@ def can_oov_bow_node(self, text, node):
8080

8181
@staticmethod
8282
def is_shorter(length: int, text: UTF8InputText, node: LatticeNode):
83-
return text.code_point_count(node.begin, node.end) < length
83+
return text.code_point_count(node.get_begin(), node.get_end()) < length

sudachipy/tokenizer.py renamed to sudachipy/tokenizer.pyx

Lines changed: 48 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,58 @@
2020
from .dictionarylib.categorytype import CategoryType
2121
from .dictionarylib.grammar import Grammar
2222
from .dictionarylib.lexicon import Lexicon
23-
from .lattice import Lattice
24-
from .latticenode import LatticeNode
23+
from .lattice cimport Lattice
24+
from .latticenode cimport LatticeNode
2525
from .morphemelist import MorphemeList
2626
from .plugin.input_text import InputTextPlugin
2727
from .plugin.path_rewrite import PathRewritePlugin
2828
from .utf8inputtext import UTF8InputText
2929
from .utf8inputtextbuilder import UTF8InputTextBuilder
3030

3131

32+
cdef void build_lattice_c(object tokenizer, object input_):
33+
bytes_ = input_.get_byte_text()
34+
35+
cdef Lattice lattice = tokenizer._lattice
36+
lattice.resize(len(bytes_))
37+
38+
cdef unsigned int i, word_id, end, idx
39+
cdef int left_id, right_id, cost
40+
cdef object lexicon = tokenizer._lexicon
41+
cdef list oov_provider_plugins = tokenizer._oov_provider_plugins
42+
43+
for i in range(len(bytes_)):
44+
if not input_.can_bow(i) or not lattice.has_previous_node(i):
45+
continue
46+
iterator = lexicon.lookup(bytes_, i)
47+
has_words = False
48+
for word_id, end in iterator:
49+
if (end < len(bytes_)) and (not input_.can_bow(end)):
50+
continue
51+
has_words = True
52+
53+
lex = lexicon.lexicons[word_id >> 28]
54+
idx = (0x0FFFFFFF & word_id) * 3 # 3 is ELEMENT_SIZE_AS_SHORT
55+
left_id, right_id, cost = lex.word_params._array_view[idx:idx+3]
56+
n = LatticeNode(lexicon, left_id, right_id, cost, word_id)
57+
58+
lattice.insert(i, end, n)
59+
60+
# OOV
61+
if CategoryType.NOOOVBOW not in input_.get_char_category_types(i):
62+
for oov_plugin in tokenizer._oov_provider_plugins:
63+
for node in oov_plugin.get_oov(input_, i, has_words):
64+
has_words = True
65+
lattice.insert(node.get_begin(), node.get_end(), node)
66+
if not has_words and tokenizer.default_oov_provider:
67+
for node in tokenizer.default_oov_provider.get_oov(input_, i, has_words):
68+
has_words = True
69+
lattice.insert(node.get_begin(), node.get_end(), node)
70+
71+
if not has_words:
72+
raise RuntimeError("there is no morpheme at " + str(i))
73+
lattice.connect_node(lattice.eos_node)
74+
3275
class Tokenizer:
3376
""" tokenizer of morphological analysis
3477
@@ -124,38 +167,7 @@ def tokenize(self, text: str, mode=None, logger=None) -> MorphemeList:
124167
return ml
125168

126169
def _build_lattice(self, input_: UTF8InputText):
127-
bytes_ = input_.get_byte_text()
128-
self._lattice.resize(len(bytes_))
129-
for i in range(len(bytes_)):
130-
if not input_.can_bow(i) or not self._lattice.has_previous_node(i):
131-
continue
132-
iterator = self._lexicon.lookup(bytes_, i)
133-
has_words = False
134-
for word_id, end in iterator:
135-
if (end < len(bytes_)) and (not input_.can_bow(end)):
136-
continue
137-
has_words = True
138-
n = LatticeNode(self._lexicon,
139-
self._lexicon.get_left_id(word_id),
140-
self._lexicon.get_right_id(word_id),
141-
self._lexicon.get_cost(word_id),
142-
word_id)
143-
self._lattice.insert(i, end, n)
144-
145-
# OOV
146-
if CategoryType.NOOOVBOW not in input_.get_char_category_types(i):
147-
for oov_plugin in self._oov_provider_plugins:
148-
for node in oov_plugin.get_oov(input_, i, has_words):
149-
has_words = True
150-
self._lattice.insert(node.get_begin(), node.get_end(), node)
151-
if not has_words and self.default_oov_provider:
152-
for node in self.default_oov_provider.get_oov(input_, i, has_words):
153-
has_words = True
154-
self._lattice.insert(node.get_begin(), node.get_end(), node)
155-
156-
if not has_words:
157-
raise RuntimeError("there is no morpheme at " + str(i))
158-
self._lattice.connect_eos_node()
170+
build_lattice_c(self, input_)
159171

160172
def _split_path(self, path: List[LatticeNode], mode: SplitMode) -> List[LatticeNode]:
161173
if mode == self.SplitMode.C:
@@ -172,9 +184,9 @@ def _split_path(self, path: List[LatticeNode], mode: SplitMode) -> List[LatticeN
172184
offset = node.get_begin()
173185
for wid in wids:
174186
n = LatticeNode(self._lexicon, 0, 0, 0, wid)
175-
n.begin = offset
187+
n.set_begin(offset)
176188
offset += n.get_word_info().head_word_length
177-
n.end = offset
189+
n.set_end(offset)
178190
new_path.append(n)
179191
return new_path
180192

0 commit comments

Comments
 (0)