
Commit 84f6905

Release 2.0.1 (#234)
* Switched away from pyramid ARIMA due to stability issues
* Now supports uncompressed pickle files (rather than just bzip2-compressed)
* Using Python 3.7.3
* Handles imported data when dates are stored as strings rather than Timestamp objects
* Corrected unigram handling
1 parent 776dda3 commit 84f6905

25 files changed (+516, -126 lines)

.travis.yml

Lines changed: 13 additions & 11 deletions
@@ -2,17 +2,17 @@ language: python
 
 matrix:
   include:
-    # Use the built in venv for linux builds
-    - os: linux
-      sudo: required
-      python: "3.6.6"
-      dist: trusty
+    # Use the built in venv for linux builds
+    - os: linux
+      sudo: required
+      python: "3.7.3"
+      dist: xenial
 
-    # Use generic language for osx; taken from https://pythonhosted.org/CodeChat/.travis.yml.html
+    # Use generic language for osx; taken from https://pythonhosted.org/CodeChat/.travis.yml.html
 #   - os: osx
 #     language: generic
 #     env: PYTHON=3.6.6
-
+
 before_install: |
   if [ "$TRAVIS_OS_NAME" == "osx" ]; then
     brew update
@@ -21,26 +21,26 @@ before_install: |
     # See https://docs.travis-ci.com/user/osx-ci-environment/#A-note-on-upgrading-packages.
     # I didn't do this above because it works and I'm lazy.
     brew outdated pyenv || brew upgrade pyenv
-
+
     # virtualenv doesn't work without pyenv knowledge. venv in Python 3.3
     # doesn't provide Pip by default. So, use `pyenv-virtualenv <https://github.com/yyuu/pyenv-virtualenv/blob/master/README.md>`_.
     brew install pyenv-virtualenv
     pyenv install $PYTHON
-
+
     # I would expect something like ``pyenv init; pyenv local $PYTHON`` or
     # ``pyenv shell $PYTHON`` would work, but ``pyenv init`` doesn't seem to
     # modify the Bash environment. ??? So, I hand-set the variables instead.
     export PYENV_VERSION=$PYTHON
     export PATH="/Users/travis/.pyenv/shims:${PATH}"
     pyenv-virtualenv venv
     source venv/bin/activate
-
+
     # A manual check that the correct version of Python is running.
     python --version
   fi
 
   export BOTO_CONFIG=/dev/null
-
+
 install:
   - python --version
   - python -m pip install -U pip
@@ -53,6 +53,8 @@ install:
 script:
   # for codecov support
   - pip install pytest pytest-cov
+  # to report installed packages
+  - pip freeze
   # command to run tests
   - pytest --cov-config .coveragerc --cov=./ tests/
 

appveyor.yml

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ build: none
 environment:
   matrix:
     - PYTHON: "C:\\Python36-x64"
-      PYTHON_VERSION: 3.6.6
+      PYTHON_VERSION: 3.7.3
       PYTHON_ARCH: 64
 init:
   - ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%

config/stopwords_glob.txt

Lines changed: 1 addition & 0 deletions
@@ -82,6 +82,7 @@ everybody
 everyone
 everything
 everywhere
+excess
 f
 few
 find

config/stopwords_n.txt

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 situation
 consist
 first
-plurality
+plurality
+second

config/stopwords_uni.txt

Lines changed: 3 additions & 1 deletion
@@ -1 +1,3 @@
-etc
+etc
+cover
+adjacent

pygrams.py

Lines changed: 1 addition & 2 deletions
@@ -128,7 +128,6 @@ def get_args(command_line_arguments):
 
     args = parser.parse_args(command_line_arguments)
 
-    args.path = 'data'
     return args
 
 
@@ -165,7 +164,7 @@ def main(supplied_args):
                         pickled_tf_idf_file_name=pickled_tf_idf_path,
                         output_name=args.outputs_name, emerging_technology=args.emerging_technology)
 
-    pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=50)
+    pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=args.num_ngrams_report)
 
     # emtech integration
     if args.emerging_technology:

scripts/algorithms/arima.py

Lines changed: 63 additions & 14 deletions
@@ -1,27 +1,76 @@
+import warnings
+
+import numpy as np
 from numpy import clip, inf
-from pyramid.arima import auto_arima
+from sklearn.metrics import mean_squared_error
+from statsmodels.tsa.arima_model import ARIMA
 
 
 class ARIMAForecast(object):
 
-    def __init__(self, data_in, num_prediction_periods):
-        if not all(isinstance(x, float) for x in data_in):
-            raise ValueError('Time series must be all float values')
+    def __evaluate_models(self, dataset, p_values, d_values, q_values):
+        dataset = np.array(dataset)
+        dataset = dataset.astype('float32')
+        best_score, best_cfg = float("inf"), None
+        for p in p_values:
+            for d in d_values:
+                for q in q_values:
+                    order = (p, d, q)
+                    try:
+                        mse = self.__evaluate_arima_model(dataset, order, ground_truth_in_history=True)
+                        if mse < best_score:
+                            best_score = mse
+                            best_cfg = order
+                    except:
+                        continue
+        return best_cfg, best_score
+
+    def __evaluate_arima_model(self, X, arima_order, ground_truth_in_history=False):
+
+        train_ratio = 0.8
+        train_size = int(len(X) * train_ratio)
+        train, test = X[0:train_size], X[train_size:]
+        history = [x for x in train]
+        predictions = list()
 
-        self.__history = data_in
-        self.__num_prediction_periods = num_prediction_periods
+        for t in range(len(test)):
+            model = ARIMA(history, order=arima_order)
+            model_fit = model.fit(disp=0, maxiter=200)
+            yhat = model_fit.forecast()[0][0]
+            predictions.append(yhat)
+            history.append(test[t] if ground_truth_in_history else yhat)
+        error = mean_squared_error(test, predictions)
+        return error
 
-        self.__stepwise_model = auto_arima(
-            data_in,
-            seasonal=False,
-            error_action='ignore', suppress_warnings=True, stepwise=True
-        )
+    def __arima_model_predict(self, X, arima_order, steps_ahead):
+        # make predictions
+        predictions = list()
+        try:
+            for t in range(steps_ahead):
+                model = ARIMA(X, order=arima_order)
+                model_fit = model.fit(disp=0)
+                yhat = model_fit.forecast()[0][0]
+                predictions.append(yhat)
+                X = np.append(X, yhat)
+        except:
+            predictions.extend([np.nan] * (steps_ahead - len(predictions)))
+
+        return predictions
+
+    def __init__(self, data_in, num_prediction_periods):
+        if not all(isinstance(x, float) for x in data_in):
+            raise ValueError('Time series must be all float values')
 
-        self.__stepwise_model.fit(data_in)
+        p_values = [0, 1, 2, 4, 6]
+        d_values = range(0, 3)
+        q_values = range(0, 3)
+        warnings.filterwarnings("ignore")
+        self.__order, score = self.__evaluate_models(data_in, p_values, d_values, q_values)
+        self.__predictions = self.__arima_model_predict(data_in, self.__order, num_prediction_periods)
 
     @property
     def configuration(self):
-        return self.__stepwise_model.order
+        return self.__order
 
     def predict_counts(self):
-        return clip(self.__stepwise_model.predict(n_periods=self.__num_prediction_periods), 0, inf)
+        return clip(self.__predictions, 0, inf)
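In place of pyramid's auto_arima, the class now grid-searches (p, d, q) orders with statsmodels' ARIMA, scores each candidate on a held-out 20% split, and computes the forecast up front in __init__. A hedged usage sketch with synthetic counts (not part of the commit):

```python
# Illustrative only: synthetic time series; real callers pass term counts per time period.
from scripts.algorithms.arima import ARIMAForecast

counts = [2.0, 3.0, 5.0, 4.0, 6.0, 8.0, 7.0, 9.0, 11.0, 10.0, 13.0, 12.0]
forecast = ARIMAForecast(counts, num_prediction_periods=4)

print(forecast.configuration)     # best (p, d, q) order found by the grid search
print(forecast.predict_counts())  # 4 forecast values, clipped to be non-negative
```

Note that if every candidate order fails to fit, __arima_model_predict pads the result with NaN, so callers should be prepared for NaN forecasts rather than an exception.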

scripts/data_factory.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ def get(doc_source_file_name):
     if not os.path.isfile(doc_source_file_name):
         raise PygramsException('file: ' + doc_source_file_name + ' does not exist in data folder')
 
-    if doc_source_file_name.endswith('.pkl.bz2'):
+    if doc_source_file_name.endswith('.pkl.bz2') or doc_source_file_name.endswith('.pkl'):
         return read_pickle(doc_source_file_name)
     elif doc_source_file_name.endswith('.xls'):
         return read_excel(doc_source_file_name)
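This is the change behind "Now supports uncompressed pickle files": the factory dispatches on the file extension, so a plain .pkl written by pandas is accepted alongside .pkl.bz2. A small sketch with an illustrative file name and columns (not taken from the commit):

```python
# Hypothetical data file; pyGrams expects a text column (and optionally a date column).
import os
import pandas as pd
import scripts.data_factory as datafactory

os.makedirs('data', exist_ok=True)
df = pd.DataFrame({'abstract': ['A fuel cell stack...', 'A neural network...'],
                   'publication_date': ['2017-05-01', '2018-11-23']})
df.to_pickle('data/sample.pkl')      # uncompressed pickle
df.to_pickle('data/sample.pkl.bz2')  # bzip2-compressed (pandas infers from the extension)

# Both forms now load through the same factory call
assert datafactory.get('data/sample.pkl').equals(datafactory.get('data/sample.pkl.bz2'))
```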

scripts/pipeline.py

Lines changed: 29 additions & 6 deletions
@@ -2,7 +2,8 @@
 import pickle
 from os import makedirs, path
 
-from pandas import read_pickle
+from pandas import read_pickle, to_datetime
+from pandas.api.types import is_string_dtype
 from tqdm import tqdm
 
 import scripts.data_factory as datafactory
@@ -11,7 +12,7 @@
 from scripts.documents_filter import DocumentsFilter
 from scripts.documents_weights import DocumentsWeights
 from scripts.filter_terms import FilterTerms
-from scripts.text_processing import LemmaTokenizer
+from scripts.text_processing import LemmaTokenizer, WordAnalyzer, lowercase_strip_accents_and_ownership
 from scripts.tfidf_mask import TfidfMask
 from scripts.tfidf_reduce import TfidfReduce
 from scripts.tfidf_wrapper import TFIDF
@@ -21,14 +22,24 @@
 from scripts.vandv.predictor import evaluate_prediction
 
 
-def checkdf( df, emtec, docs_mask_dict, text_header):
+def checkdf(df, emtec, docs_mask_dict, text_header, term_counts):
     app_exit = False
 
-    if emtec or docs_mask_dict['time'] or docs_mask_dict['date'] is not None:
+    if emtec or docs_mask_dict['time'] or docs_mask_dict['date'] is not None or term_counts:
         if docs_mask_dict['date_header'] not in df.columns:
             print(f"date_header '{docs_mask_dict['date_header']}' not in dataframe")
             app_exit = True
 
+    if docs_mask_dict['date_header'] is not None:
+        if is_string_dtype(df[docs_mask_dict['date_header']]):
+            df[docs_mask_dict['date_header']] = to_datetime(df[docs_mask_dict['date_header']])
+
+        min_date = min(df[docs_mask_dict['date_header']])
+        max_date = max(df[docs_mask_dict['date_header']])
+        print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}')
+    else:
+        print('Document dates not specified')
+
     if text_header not in df.columns:
         print(f"text_header '{text_header}' not in dataframe")
         app_exit = True
@@ -61,7 +72,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
         if pickled_tf_idf_file_name is None:
 
             self.__dataframe = datafactory.get(data_filename)
-            checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header)
+            checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header, term_counts)
 
             remove_empty_documents(self.__dataframe, text_header)
             self.__tfidf_obj = TFIDF(text_series=self.__dataframe[text_header], ngram_range=ngram_range,
@@ -70,7 +81,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
             self.__text_lengths = self.__dataframe[text_header].map(len).tolist()
             self.__dataframe.drop(columns=[text_header], inplace=True)
 
-            tfidf_filename = path.join('outputs', 'tfidf', output_name + '-tfidf.pkl.bz2')
+            tfidf_filename = path.join('outputs', 'tfidf', output_name + f'-tfidf-mdf-{max_df}.pkl.bz2')
             makedirs(path.dirname(tfidf_filename), exist_ok=True)
             with bz2.BZ2File(tfidf_filename, 'wb') as pickle_file:
                 pickle.dump(
@@ -81,6 +92,17 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
         else:
            print(f'Reading document and TFIDF from pickle {pickled_tf_idf_file_name}')
            self.__tfidf_obj, self.__dataframe, self.__text_lengths = read_pickle(pickled_tf_idf_file_name)
+           if docs_mask_dict['date_header'] is None:
+               print('Document dates not specified')
+           else:
+               min_date = min(self.__dataframe[docs_mask_dict['date_header']])
+               max_date = max(self.__dataframe[docs_mask_dict['date_header']])
+               print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}')
+
+        WordAnalyzer.init(
+            tokenizer=LemmaTokenizer(),
+            preprocess=lowercase_strip_accents_and_ownership,
+            ngram_range=ngram_range)
 
         # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
         # the original. We're really just filtering down.
@@ -140,6 +162,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range
                                                  docs_mask_dict['date_header'])
         # if other outputs
         self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
+        self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n)
 
         # todo: no output method; just if statements to call output functions...?
         # Only supply what they each directly require
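The new block in checkdf is what handles imported data whose dates are stored as strings rather than Timestamp objects: a string-typed date column is converted in place with to_datetime before the date range is reported. A minimal sketch of that conversion, with an assumed column name:

```python
# Assumed column name 'publication_date'; checkdf reads it from docs_mask_dict['date_header'].
from pandas import DataFrame, to_datetime
from pandas.api.types import is_string_dtype

df = DataFrame({'publication_date': ['2017-05-01', '2018-11-23']})  # dates stored as strings
if is_string_dtype(df['publication_date']):
    df['publication_date'] = to_datetime(df['publication_date'])

min_date, max_date = min(df['publication_date']), max(df['publication_date'])
print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}')
```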

scripts/text_processing.py

Lines changed: 10 additions & 24 deletions
@@ -31,6 +31,7 @@
 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGE.
 """
+import scripts.utils.utils as ut
 import string
 
 from nltk import word_tokenize, PorterStemmer, pos_tag
@@ -86,6 +87,7 @@ class WordAnalyzer(object):
     stemmed_stop_word_set_n = None
     stemmed_stop_word_set_uni = None
 
+
     @staticmethod
     def init(tokenizer, preprocess, ngram_range):
         WordAnalyzer.tokenizer = tokenizer
@@ -110,39 +112,23 @@ def init(tokenizer, preprocess, ngram_range):
     def analyzer(doc):
         """based on VectorizerMixin._word_ngrams in sklearn/feature_extraction/text.py,
         from scikit-learn; extended to prevent generation of n-grams containing stop words"""
-        tokens = WordAnalyzer.tokenizer(WordAnalyzer.preprocess(doc))
-
-        # handle token n-grams
         min_n, max_n = WordAnalyzer.ngram_range
-        if max_n != 1:
-            original_tokens = tokens
-            if min_n == 1:
-                # no need to do any slicing for unigrams
-                # just iterate through the original tokens
-                tokens = [w for w in tokens if w not in WordAnalyzer.stemmed_stop_word_set_uni and not w.isdigit()]
-                # tokens = list(original_tokens)
-                min_n += 1
-            else:
-                tokens = []
+        original_tokens = WordAnalyzer.tokenizer(WordAnalyzer.preprocess(doc))
+        tokens = original_tokens if min_n == 1 else []
 
+        # handle token n-grams
+        if max_n > 1:
+            min_phrase = max(min_n, 2)
             n_original_tokens = len(original_tokens)
 
             # bind method outside of loop to reduce overhead
             tokens_append = tokens.append
             space_join = " ".join
 
-            for n in range(min_n, min(max_n + 1, n_original_tokens + 1)):
+            for n in range(min_phrase, min(max_n + 1, n_original_tokens + 1)):
                 for i in range(n_original_tokens - n + 1):
                     candidate_ngram = original_tokens[i: i + n]
-                    hasdigit = False
-                    for ngram in candidate_ngram:
-                        if ngram.isdigit():
-                            hasdigit = True
+                    tokens_append(space_join(candidate_ngram))
 
-                    ngram_stop_word_set = set(candidate_ngram) & WordAnalyzer.stemmed_stop_word_set_n
-                    if len(ngram_stop_word_set) == 0 and not hasdigit:
-                        tokens_append(space_join(candidate_ngram))
+        return ut.stop(tokens, WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n)
 
-            return tokens
-        else:
-            return [w for w in tokens if w not in WordAnalyzer.stemmed_stop_word_set_uni]
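This is the "Corrected unigram handling" item: analyzer now keeps the raw unigrams when min_n == 1, always builds phrases from length 2 upwards, and defers all stop-word and digit filtering to ut.stop in scripts/utils/utils.py, which is not shown in this commit. A rough self-contained sketch of the intended behaviour, using invented tokens and a simplified stand-in for ut.stop:

```python
# Stand-in for scripts.utils.utils.stop -- an assumption about its behaviour, not its actual code:
# drop digit terms, drop stop-word unigrams, and drop n-grams containing a phrase stop word.
def stop(tokens, stop_uni, stop_n):
    kept = []
    for term in tokens:
        words = term.split()
        if any(w.isdigit() for w in words):
            continue
        if len(words) == 1 and words[0] in stop_uni:
            continue
        if len(words) > 1 and set(words) & stop_n:
            continue
        kept.append(term)
    return kept

# What analyzer now generates for ngram_range (1, 2): unigrams plus all bigrams, unfiltered.
ngrams = ['the', 'fuel', 'cell', '2000', 'the fuel', 'fuel cell', 'cell 2000']
print(stop(ngrams, stop_uni={'the'}, stop_n={'the'}))  # -> ['fuel', 'cell', 'fuel cell']
```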

scripts/tfidf_mask.py

Lines changed: 3 additions & 3 deletions
@@ -7,7 +7,7 @@ def __init__(self, tfidf_obj, ngram_range=(2, 3), uni_factor=0.8):
         self.__feature_names = tfidf_obj.feature_names
         self.__tfidf_mask = self.__tfidf_matrix.copy()
         self.__tfidf_mask.data = np.ones(len(self.__tfidf_matrix.data))
-        self.__vectorizer = tfidf_obj.vectorizer
+        self.__vocabulary = tfidf_obj.vocabulary
         self.__uni_factor = uni_factor
         self.__idf = tfidf_obj.idf
 
@@ -88,8 +88,8 @@ def __unbias_ngrams(self, max_ngram_length):
             ngram_minus_front = ' '.join(big_ngram_terms[1:])
             ngram_minus_back = ' '.join(big_ngram_terms[:len(big_ngram_terms) - 1])
 
-            idx_ngram_minus_front = self.__vectorizer.vocabulary_.get(ngram_minus_front)
-            idx_ngram_minus_back = self.__vectorizer.vocabulary_.get(ngram_minus_back)
+            idx_ngram_minus_front = self.__vocabulary.get(ngram_minus_front)
+            idx_ngram_minus_back = self.__vocabulary.get(ngram_minus_back)
 
             indices_slice = self.__tfidf_matrix.indices[start_idx_ptr:end_idx_ptr]
             ngram_counts = self.__tfidf_matrix.data[j] / self.__idf[col_idx]

scripts/tfidf_wrapper.py

Lines changed: 2 additions & 2 deletions
@@ -34,8 +34,8 @@ def tfidf_matrix(self):
         return self.__tfidf_matrix
 
     @property
-    def vectorizer(self):
-        return self.__vectorizer
+    def vocabulary(self):
+        return self.__vectorizer.vocabulary_
 
     @property
     def feature_names(self):
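Together with the tfidf_mask.py change above, the wrapper now exposes only the fitted term-to-column-index mapping rather than the whole vectorizer. That mapping is sklearn's standard vocabulary_ dict; a brief illustration with an invented corpus:

```python
# Shows the kind of dict the new `vocabulary` property returns.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
vectorizer.fit(['fuel cell stack', 'fuel cell vehicle'])

vocab = vectorizer.vocabulary_   # term -> column index in the TF-IDF matrix
print(vocab.get('fuel cell'))    # column index of the bigram
print(vocab.get('battery'))      # None when the term is absent, as __unbias_ngrams expects
```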
