Skip to content

Commit 258839b

Browse files
author
Thinh Nguyen
committed
split find_valid_full_path into find_full_path and find_root_directory
1 parent 4e824cf commit 258839b

File tree

3 files changed

+79
-57
lines changed

3 files changed

+79
-57
lines changed

element_array_ephys/__init__.py

Lines changed: 44 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,29 +5,51 @@
55
dj.config['enable_python_native_blobs'] = True
66

77

def find_full_path(root_directories, relative_path):
    """
    Given a relative path, search and return the full-path
    from the provided potential root directories (in the given order).

    :param root_directories: str/path or list of str/paths - potential root directory(ies)
    :param relative_path: the relative path to resolve against the root directories
    :return: full-path (pathlib.Path) of the first existing combination found
    :raises FileNotFoundError: if no root directory combined with
        relative_path points to an existing location
    """
    relative_path = pathlib.Path(relative_path)

    # the path may already be valid as given (absolute, or relative to
    # the current working directory) - use it directly in that case
    if relative_path.exists():
        return relative_path

    # turn to list if only a single root directory is provided
    if isinstance(root_directories, (str, pathlib.Path)):
        root_directories = [root_directories]

    for root_dir in root_directories:
        # build the candidate once instead of twice per iteration
        full_path = pathlib.Path(root_dir) / relative_path
        if full_path.exists():
            return full_path

    raise FileNotFoundError('No valid full-path found (from {})'
                            ' for {}'.format(root_directories, relative_path))
def find_root_directory(root_directories, full_path):
    """
    Given multiple potential root directories and a full-path,
    search and return one directory that is the parent of the given path.

    :param root_directories: str/path or list of str/paths - potential root directory(ies)
    :param full_path: the full path to search the root directory for
    :return: the first root directory (returned as provided, unmodified)
        that is a parent of full_path
    :raises FileNotFoundError: if full_path does not exist, or none of
        the root directories is a parent of it
    """
    full_path = pathlib.Path(full_path)

    if not full_path.exists():
        raise FileNotFoundError(f'{full_path} does not exist!')

    # turn to list if only a single root directory is provided
    if isinstance(root_directories, (str, pathlib.Path)):
        root_directories = [root_directories]

    try:
        # NOTE: PurePath.is_relative_to requires Python 3.9+
        return next(root_dir for root_dir in root_directories
                    if full_path.is_relative_to(root_dir))
    except StopIteration:
        raise FileNotFoundError('No valid root directory found (from {})'
                                ' for {}'.format(root_directories, full_path))

element_array_ephys/ephys.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import importlib
99

1010
from .readers import spikeglx, kilosort, openephys
11-
from . import probe, find_valid_full_path
11+
from . import probe, find_full_path, find_root_directory
1212

1313
schema = dj.schema()
1414

@@ -186,7 +186,7 @@ def make(self, key):
186186
'acq_software': acq_software,
187187
'sampling_rate': spikeglx_meta.meta['imSampRate']})
188188

189-
_, root_dir = find_valid_full_path(get_ephys_root_data_dir(), meta_filepath)
189+
root_dir = find_root_directory(get_ephys_root_data_dir(), meta_filepath)
190190
self.EphysFile.insert1({
191191
**key,
192192
'file_path': meta_filepath.relative_to(root_dir).as_posix()})
@@ -221,7 +221,7 @@ def make(self, key):
221221
'acq_software': acq_software,
222222
'sampling_rate': probe_data.ap_meta['sample_rate']})
223223

224-
_, root_dir = find_valid_full_path(
224+
root_dir = find_root_directory(
225225
get_ephys_root_data_dir(),
226226
probe_data.recording_info['recording_files'][0])
227227
self.EphysFile.insert([{**key,
@@ -417,7 +417,7 @@ class Clustering(dj.Imported):
417417
def make(self, key):
418418
task_mode, output_dir = (ClusteringTask & key).fetch1(
419419
'task_mode', 'clustering_output_dir')
420-
kilosort_dir, _ = find_valid_full_path(get_ephys_root_data_dir(), output_dir)
420+
kilosort_dir = find_full_path(get_ephys_root_data_dir(), output_dir)
421421

422422
if task_mode == 'load':
423423
kilosort_dataset = kilosort.Kilosort(kilosort_dir) # check if the directory is a valid Kilosort output
@@ -455,7 +455,7 @@ def create1_from_clustering_task(self, key, curation_note=''):
455455

456456
task_mode, output_dir = (ClusteringTask & key).fetch1(
457457
'task_mode', 'clustering_output_dir')
458-
kilosort_dir, _ = find_valid_full_path(get_ephys_root_data_dir(), output_dir)
458+
kilosort_dir = find_full_path(get_ephys_root_data_dir(), output_dir)
459459

460460
creation_time, is_curated, is_qc = kilosort.extract_clustering_info(kilosort_dir)
461461
# Synthesize curation_id
@@ -487,7 +487,7 @@ class Unit(dj.Part):
487487

488488
def make(self, key):
489489
output_dir = (Curation & key).fetch1('curation_output_dir')
490-
kilosort_dir, _ = find_valid_full_path(get_ephys_root_data_dir(), output_dir)
490+
kilosort_dir = find_full_path(get_ephys_root_data_dir(), output_dir)
491491

492492
kilosort_dataset = kilosort.Kilosort(kilosort_dir)
493493
acq_software = (EphysRecording & key).fetch1('acq_software')
@@ -561,7 +561,7 @@ class UnitElectrode(dj.Part):
561561

562562
def make(self, key):
563563
output_dir = (Curation & key).fetch1('curation_output_dir')
564-
kilosort_dir, _ = find_valid_full_path(get_ephys_root_data_dir(), output_dir)
564+
kilosort_dir = find_full_path(get_ephys_root_data_dir(), output_dir)
565565

566566
kilosort_dataset = kilosort.Kilosort(kilosort_dir)
567567

@@ -645,8 +645,8 @@ def get_spikeglx_meta_filepath(ephys_recording_key):
645645
& 'file_path LIKE "%.ap.meta"').fetch1('file_path')
646646

647647
try:
648-
spikeglx_meta_filepath, _ = find_valid_full_path(get_ephys_root_data_dir(),
649-
spikeglx_meta_filepath)
648+
spikeglx_meta_filepath = find_full_path(get_ephys_root_data_dir(),
649+
spikeglx_meta_filepath)
650650
except FileNotFoundError:
651651
# if not found, search in session_dir again
652652
if not spikeglx_meta_filepath.exists():

element_array_ephys/readers/kilosort.py

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
class Kilosort:
1414

15-
ks_files = [
15+
kilosort_files = [
1616
'params.py',
1717
'amplitudes.npy',
1818
'channel_map.npy',
@@ -36,18 +36,18 @@ class Kilosort:
3636
]
3737

3838
# keys to self.files, .data are file name e.g. self.data['params'], etc.
39-
ks_keys = [path.splitext(ks_file)[0] for ks_file in ks_files]
39+
kilosort_keys = [path.splitext(kilosort_file)[0] for kilosort_file in kilosort_files]
4040

41-
def __init__(self, ks_dir):
42-
self._ks_dir = pathlib.Path(ks_dir)
41+
def __init__(self, kilosort_dir):
42+
self._kilosort_dir = pathlib.Path(kilosort_dir)
4343
self._files = {}
4444
self._data = None
4545
self._clusters = None
4646

47-
params_filepath = ks_dir / 'params.py'
47+
params_filepath = kilosort_dir / 'params.py'
4848

4949
if not params_filepath.exists():
50-
raise FileNotFoundError(f'No Kilosort output found in: {ks_dir}')
50+
raise FileNotFoundError(f'No Kilosort output found in: {kilosort_dir}')
5151

5252
self._info = {'time_created': datetime.fromtimestamp(params_filepath.stat().st_ctime),
5353
'time_modified': datetime.fromtimestamp(params_filepath.stat().st_mtime)}
@@ -64,42 +64,44 @@ def info(self):
6464

6565
def _stat(self):
6666
self._data = {}
67-
for ks_filename in Kilosort.ks_files:
68-
ks_filepath = self._ks_dir / ks_filename
67+
for kilosort_filename in Kilosort.kilosort_files:
68+
kilosort_filepath = self._kilosort_dir / kilosort_filename
6969

70-
if not ks_filepath.exists():
71-
log.debug('skipping {} - does not exist'.format(ks_filepath))
70+
if not kilosort_filepath.exists():
71+
log.debug('skipping {} - does not exist'.format(kilosort_filepath))
7272
continue
7373

74-
base, ext = path.splitext(ks_filename)
75-
self._files[base] = ks_filepath
74+
base, ext = path.splitext(kilosort_filename)
75+
self._files[base] = kilosort_filepath
7676

77-
if ks_filename == 'params.py':
78-
log.debug('loading params.py {}'.format(ks_filepath))
77+
if kilosort_filename == 'params.py':
78+
log.debug('loading params.py {}'.format(kilosort_filepath))
7979
# params.py is a 'key = val' file
8080
params = {}
81-
for line in open(ks_filepath, 'r').readlines():
81+
for line in open(kilosort_filepath, 'r').readlines():
8282
k, v = line.strip('\n').split('=')
8383
params[k.strip()] = convert_to_number(v.strip())
8484
log.debug('params: {}'.format(params))
8585
self._data[base] = params
8686

8787
if ext == '.npy':
88-
log.debug('loading npy {}'.format(ks_filepath))
89-
d = np.load(ks_filepath, mmap_mode='r', allow_pickle=False, fix_imports=False)
88+
log.debug('loading npy {}'.format(kilosort_filepath))
89+
d = np.load(kilosort_filepath, mmap_mode='r',
90+
allow_pickle=False, fix_imports=False)
9091
self._data[base] = (np.reshape(d, d.shape[0])
9192
if d.ndim == 2 and d.shape[1] == 1 else d)
9293

9394
# Read the Cluster Groups
9495
for cluster_pattern, cluster_col_name in zip(['cluster_groups.*', 'cluster_KSLabel.*'],
9596
['group', 'KSLabel']):
9697
try:
97-
cluster_file = next(self._ks_dir.glob(cluster_pattern))
98-
cluster_file_suffix = cluster_file.suffix
99-
assert cluster_file_suffix in ('.csv', '.tsv', '.xlsx')
100-
break
98+
cluster_file = next(self._kilosort_dir.glob(cluster_pattern))
10199
except StopIteration:
102100
pass
101+
102+
cluster_file_suffix = cluster_file.suffix
103+
assert cluster_file_suffix in ('.csv', '.tsv', '.xlsx')
104+
break
103105
else:
104106
raise FileNotFoundError(
105107
'Neither "cluster_groups" nor "cluster_KSLabel" file found!')
@@ -118,7 +120,7 @@ def get_best_channel(self, unit):
118120
template_idx = self.data['spike_templates'][
119121
np.where(self.data['spike_clusters'] == unit)[0][0]]
120122
channel_templates = self.data['templates'][template_idx, :, :]
121-
max_channel_idx = np.abs(np.abs(channel_templates).max(axis=0)).argmax()
123+
max_channel_idx = np.abs(channel_templates).max(axis=0).argmax()
122124
max_channel = self.data['channel_map'][max_channel_idx]
123125

124126
return max_channel, max_channel_idx
@@ -174,12 +176,10 @@ def extract_clustering_info(cluster_output_dir):
174176

175177
# ---- Quality control? ----
176178
metric_filepath = cluster_output_dir / 'metrics.csv'
177-
if metric_filepath.exists():
178-
is_qc = True
179+
is_qc = metric_filepath.exists()
180+
if is_qc:
179181
if creation_time is None:
180182
creation_time = datetime.fromtimestamp(metric_filepath.stat().st_ctime)
181-
else:
182-
is_qc = False
183183

184184
if creation_time is None:
185185
spiketimes_filepath = next(cluster_output_dir.glob('spike_times.npy'))

0 commit comments

Comments
 (0)