Commit 46f1314

more fixes to metadata filtering
1 parent 524fb5c commit 46f1314


python/philologic/runtime/Query.py

Lines changed: 6 additions & 21 deletions
@@ -161,7 +161,6 @@ def __rows_as_void(rows):
         void_dtype = np.dtype((np.void, arr_contiguous.dtype.itemsize * arr_contiguous.shape[1]))
         return arr_contiguous.view(void_dtype).ravel()
 
-
     corpus_philo_ids_void = __rows_as_void(corpus_philo_ids)
     philo_ids_void = __rows_as_void(philo_ids)
     matching_indices_void = np.isin(philo_ids_void, corpus_philo_ids_void)
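
For reference, the context lines above rely on NumPy's void-view trick so that np.isin can test whole philo_id rows for membership in a single vectorized call. A minimal standalone illustration with made-up data (not part of the commit):

import numpy as np

def as_void(rows):
    # one opaque np.void scalar per row, so np.isin compares entire rows at once
    return rows.view(np.dtype((np.void, rows.dtype.itemsize * rows.shape[1]))).ravel()

corpus = np.ascontiguousarray([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
hits = np.ascontiguousarray([[4, 5, 6], [7, 8, 9]], dtype=np.int32)
print(np.isin(as_void(hits), as_void(corpus)))  # [ True False] -- only [4, 5, 6] appears in the corpus
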
@@ -228,15 +227,10 @@ def search_word(db_path, hitlist_filename, corpus_file=None):
 def search_phrase(db_path, hitlist_filename, corpus_file=None):
     """Phrase searches where words need to be in a specific order"""
     word_groups = get_word_groups(f"{hitlist_filename}.terms")
-    object_level = None
-    corpus_philo_ids = None
-    if corpus_file is not None:
-        corpus_philo_ids, object_level = get_corpus_philo_ids(corpus_file)
     common_object_ids = get_cooccurrence_groups(
-        db_path, word_groups, corpus_philo_ids=corpus_philo_ids, object_level=object_level, cooc_order=True
+        db_path, word_groups, corpus_file=corpus_file, cooc_order=True
     )
     mapping_order = next(common_object_ids)
-
     with open(hitlist_filename, "wb") as output_file:
         for philo_id_groups in common_object_ids:
             for group_combination in product(*philo_id_groups):
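
The two hunks that follow apply the same change to search_within_word_span and search_within_text_object. In every caller the refactor boils down to the pattern below, restated schematically from the diff (it assumes it sits inside the module where get_corpus_philo_ids and get_cooccurrence_groups are defined):

# before: each search function resolved the metadata filter itself
def old_style(db_path, word_groups, corpus_file):
    object_level = None
    corpus_philo_ids = None
    if corpus_file is not None:
        corpus_philo_ids, object_level = get_corpus_philo_ids(corpus_file)
    return get_cooccurrence_groups(
        db_path, word_groups, corpus_philo_ids=corpus_philo_ids, object_level=object_level, cooc_order=True
    )

# after: the corpus file is passed through unresolved
def new_style(db_path, word_groups, corpus_file):
    return get_cooccurrence_groups(db_path, word_groups, corpus_file=corpus_file, cooc_order=True)
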
@@ -256,12 +250,8 @@ def search_phrase(db_path, hitlist_filename, corpus_file=None):
 def search_within_word_span(db_path, hitlist_filename, n, cooc_order, exact_distance, corpus_file=None):
     """Search for co-occurrences of multiple words within n words of each other in the database."""
     word_groups = get_word_groups(f"{hitlist_filename}.terms")
-    object_level = None
-    corpus_philo_ids = None
-    if corpus_file is not None:
-        corpus_philo_ids, object_level = get_corpus_philo_ids(corpus_file)
     common_object_ids = get_cooccurrence_groups(
-        db_path, word_groups, corpus_philo_ids=corpus_philo_ids, object_level=object_level, cooc_order=cooc_order
+        db_path, word_groups, corpus_file=corpus_file, cooc_order=cooc_order
     )
 
     if cooc_order is True:
@@ -302,16 +292,11 @@ def search_within_word_span(db_path, hitlist_filename, n, cooc_order, exact_dist
 def search_within_text_object(db_path, hitlist_filename, level, cooc_order, corpus_file=None):
     """Search for co-occurrences of multiple words in the same sentence in the database."""
     word_groups = get_word_groups(f"{hitlist_filename}.terms")
-    object_level = None
-    corpus_philo_ids = None
-    if corpus_file is not None:
-        corpus_philo_ids, object_level = get_corpus_philo_ids(corpus_file)
     common_object_ids = get_cooccurrence_groups(
         db_path,
         word_groups,
         level=level,
-        corpus_philo_ids=corpus_philo_ids,
-        object_level=object_level,
+        corpus_file=corpus_file,
         cooc_order=cooc_order,
     )
 
@@ -358,7 +343,7 @@ def get_word_groups(terms_file):
 
 
 def get_cooccurrence_groups(
-    db_path, word_groups, level="sent", corpus_philo_ids=None, object_level=None, cooc_order=False
+    db_path, word_groups, level="sent", corpus_file=None, cooc_order=False
 ):
     cooc_slice = 6
     if level == "para":
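
This signature change is the heart of the commit: get_cooccurrence_groups now receives the corpus file directly instead of pre-resolved corpus_philo_ids and object_level. The context also shows cooc_slice = 6 for sentence-level grouping; the snippet below is an assumed illustration of what that slice selects (the 7-part philo_id layout is standard PhiloLogic, but the paragraph-level slice width is inferred, not shown in this diff):

# assumed illustration: a philo_id addresses (doc, div1, div2, div3, para, sent, word),
# so truncating it yields the text object that co-occurring hits must share
philo_id = (2, 1, 3, 0, 4, 7, 12)
sentence_key = philo_id[:6]   # level="sent" -> cooc_slice = 6, as in the context above
paragraph_key = philo_id[:5]  # level="para" -> presumably a shorter slice (inferred)
print(sentence_key, paragraph_key)
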
@@ -396,8 +381,8 @@ def one_word_generator(word):
             else:
                 group_generators.append(merge_word_group(txn, words, chunk_size=36 * 1000))
 
-        if corpus_philo_ids is not None:
-            first_group_data = filter_philo_ids(first_group_data, corpus_philo_ids, object_level)
+        if corpus_file is not None:
+            first_group_data = filter_philo_ids(corpus_file, first_group_data)
 
         group_data = [None for _ in range(len(word_groups) - 1)]  # Start with None for each group
         break_out = False
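
Finally, the corpus restriction is applied where the hits are produced: filter_philo_ids now takes the corpus file itself rather than pre-resolved ids. The updated body of filter_philo_ids is not shown in this commit, so the sketch below is a hypothetical stand-in (the names filter_hits_by_corpus and object_level_width are invented) showing how a corpus restriction can be applied with the row-matching trick from the first hunk; the real code presumably loads the corpus ids via get_corpus_philo_ids first.

import numpy as np

def rows_as_void(rows):
    # view each row as one opaque np.void scalar so np.isin can match whole rows
    arr = np.ascontiguousarray(rows)
    return arr.view(np.dtype((np.void, arr.dtype.itemsize * arr.shape[1]))).ravel()

def filter_hits_by_corpus(corpus_philo_ids, hits, object_level_width):
    # keep only hits whose first object_level_width components appear in the corpus
    corpus = np.asarray(corpus_philo_ids, dtype=np.int32)[:, :object_level_width]
    candidates = np.asarray(hits, dtype=np.int32)[:, :object_level_width]
    keep = np.isin(rows_as_void(candidates), rows_as_void(corpus))
    return [hit for hit, kept in zip(hits, keep) if kept]

# e.g. a corpus file that resolved to document 2 (object level = document, width 1):
corpus_ids = [(2, 0, 0, 0, 0, 0, 0)]
hits = [(2, 1, 3, 0, 4, 7, 12), (5, 1, 1, 0, 2, 3, 8)]
print(filter_hits_by_corpus(corpus_ids, hits, object_level_width=1))  # only the document-2 hit survives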
