@@ -161,7 +161,6 @@ def __rows_as_void(rows):
161
161
void_dtype = np .dtype ((np .void , arr_contiguous .dtype .itemsize * arr_contiguous .shape [1 ]))
162
162
return arr_contiguous .view (void_dtype ).ravel ()
163
163
164
-
165
164
corpus_philo_ids_void = __rows_as_void (corpus_philo_ids )
166
165
philo_ids_void = __rows_as_void (philo_ids )
167
166
matching_indices_void = np .isin (philo_ids_void , corpus_philo_ids_void )
@@ -228,15 +227,10 @@ def search_word(db_path, hitlist_filename, corpus_file=None):
228
227
def search_phrase (db_path , hitlist_filename , corpus_file = None ):
229
228
"""Phrase searches where words need to be in a specific order"""
230
229
word_groups = get_word_groups (f"{ hitlist_filename } .terms" )
231
- object_level = None
232
- corpus_philo_ids = None
233
- if corpus_file is not None :
234
- corpus_philo_ids , object_level = get_corpus_philo_ids (corpus_file )
235
230
common_object_ids = get_cooccurrence_groups (
236
- db_path , word_groups , corpus_philo_ids = corpus_philo_ids , object_level = object_level , cooc_order = True
231
+ db_path , word_groups , corpus_file = corpus_file , cooc_order = True
237
232
)
238
233
mapping_order = next (common_object_ids )
239
-
240
234
with open (hitlist_filename , "wb" ) as output_file :
241
235
for philo_id_groups in common_object_ids :
242
236
for group_combination in product (* philo_id_groups ):
@@ -256,12 +250,8 @@ def search_phrase(db_path, hitlist_filename, corpus_file=None):
256
250
def search_within_word_span (db_path , hitlist_filename , n , cooc_order , exact_distance , corpus_file = None ):
257
251
"""Search for co-occurrences of multiple words within n words of each other in the database."""
258
252
word_groups = get_word_groups (f"{ hitlist_filename } .terms" )
259
- object_level = None
260
- corpus_philo_ids = None
261
- if corpus_file is not None :
262
- corpus_philo_ids , object_level = get_corpus_philo_ids (corpus_file )
263
253
common_object_ids = get_cooccurrence_groups (
264
- db_path , word_groups , corpus_philo_ids = corpus_philo_ids , object_level = object_level , cooc_order = cooc_order
254
+ db_path , word_groups , corpus_file = corpus_file , cooc_order = cooc_order
265
255
)
266
256
267
257
if cooc_order is True :
@@ -302,16 +292,11 @@ def search_within_word_span(db_path, hitlist_filename, n, cooc_order, exact_dist
302
292
def search_within_text_object (db_path , hitlist_filename , level , cooc_order , corpus_file = None ):
303
293
"""Search for co-occurrences of multiple words in the same sentence in the database."""
304
294
word_groups = get_word_groups (f"{ hitlist_filename } .terms" )
305
- object_level = None
306
- corpus_philo_ids = None
307
- if corpus_file is not None :
308
- corpus_philo_ids , object_level = get_corpus_philo_ids (corpus_file )
309
295
common_object_ids = get_cooccurrence_groups (
310
296
db_path ,
311
297
word_groups ,
312
298
level = level ,
313
- corpus_philo_ids = corpus_philo_ids ,
314
- object_level = object_level ,
299
+ corpus_file = corpus_file ,
315
300
cooc_order = cooc_order ,
316
301
)
317
302
@@ -358,7 +343,7 @@ def get_word_groups(terms_file):
358
343
359
344
360
345
def get_cooccurrence_groups (
361
- db_path , word_groups , level = "sent" , corpus_philo_ids = None , object_level = None , cooc_order = False
346
+ db_path , word_groups , level = "sent" , corpus_file = None , cooc_order = False
362
347
):
363
348
cooc_slice = 6
364
349
if level == "para" :
@@ -396,8 +381,8 @@ def one_word_generator(word):
396
381
else :
397
382
group_generators .append (merge_word_group (txn , words , chunk_size = 36 * 1000 ))
398
383
399
- if corpus_philo_ids is not None :
400
- first_group_data = filter_philo_ids (first_group_data , corpus_philo_ids , object_level )
384
+ if corpus_file is not None :
385
+ first_group_data = filter_philo_ids (corpus_file , first_group_data )
401
386
402
387
group_data = [None for _ in range (len (word_groups ) - 1 )] # Start with None for each group
403
388
break_out = False
0 commit comments