7
7
import json
8
8
import logging
9
9
import pkg_resources
10
+ import os
11
+ import multiprocessing
10
12
11
13
# Semi-dependencies
12
14
try :
@@ -106,25 +108,36 @@ def clean_text(text, lang='en_US'):
106
108
return _get_classifier (lang ).clean_text (text )
107
109
108
110
111
def _clean_text_lang(lang):
    """
    Return the text-cleaning callable of the classifier for a language.

    The ``clean_text`` attribute of the object returned by
    ``_get_classifier(lang)`` is handed back without being called, so the
    caller can apply it to each text itself.

    @:param lang (str) language code forwarded to ``_get_classifier``
    """
    classifier = _get_classifier(lang)
    cleaner = classifier.clean_text
    return cleaner
113
+
114
+
109
115
###############################################################################
110
116
def train_classifier (parameters = None ,
111
117
ngram_range = (1 , 1 ),
112
118
store = True ,
113
- lang = 'en_US' ):
119
+ lang = 'en_US' ,
120
+ n_jobs = len (os .sched_getaffinity (0 ))):
114
121
"""
115
122
Train the intent classifier
116
123
TODO auto invoke if sklearn version is new or first install or sth
117
124
@:param store (bool) store classifier in clf.joblib
118
125
"""
126
+ _LOGGER .info ("Started training, parallelized with {} jobs" .format (n_jobs ))
119
127
_LOGGER .info ("Loading training set" )
120
128
training_set = load .training_set (lang )
121
129
target_names = list (frozenset ([i ['unit' ] for i in training_set ]))
122
130
123
131
_LOGGER .info ("Preparing training set" )
124
- train_data , train_target = [], []
125
- for example in training_set :
126
- train_data .append (clean_text (example ['text' ], lang ))
127
- train_target .append (target_names .index (example ['unit' ]))
132
+
133
+ if n_jobs > 1 :
134
+ with multiprocessing .Pool (processes = n_jobs ) as p :
135
+ train_data = p .map (_clean_text_lang (lang ), [ex ['text' ] for ex in training_set ])
136
+ else :
137
+ # This allows for classifier training in the interactive python shell
138
+ train_data = [_clean_text_lang (lang )(ex ['text' ]) for ex in training_set ]
139
+
140
+ train_target = [target_names .index (example ['unit' ]) for example in training_set ]
128
141
129
142
tfidf_model = TfidfVectorizer (
130
143
sublinear_tf = True ,
@@ -139,7 +152,7 @@ def train_classifier(parameters=None,
139
152
'loss' : 'log' ,
140
153
'penalty' : 'l2' ,
141
154
'tol' : 1e-3 ,
142
- 'n_jobs' : - 1 ,
155
+ 'n_jobs' : n_jobs ,
143
156
'alpha' : 0.0001 ,
144
157
'fit_intercept' : True ,
145
158
'random_state' : 0 ,
0 commit comments