Skip to content

Commit 94686f5

Browse files
author
Thinh Nguyen
committed
prototype design for multiple curations
1 parent f76086c commit 94686f5

File tree

2 files changed

+92
-48
lines changed

2 files changed

+92
-48
lines changed

elements_ephys/ephys.py

Lines changed: 77 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -319,16 +319,6 @@ def insert_new_params(cls, processing_method: str, paramset_idx: int, paramset_d
319319
cls.insert1(param_dict)
320320

321321

322-
@schema
323-
class ClusteringTask(dj.Manual):
324-
definition = """
325-
-> EphysRecording
326-
-> ClusteringParamSet
327-
---
328-
clustering_output_dir: varchar(255) # clustering output directory relative to root data directory
329-
"""
330-
331-
332322
@schema
333323
class ClusterQualityLabel(dj.Lookup):
334324
definition = """
@@ -345,39 +335,89 @@ class ClusterQualityLabel(dj.Lookup):
345335
]
346336

347337

338+
@schema
class ClusteringTask(dj.Manual):
    """Manually inserted task: one clustering run per (EphysRecording, ClusteringParamSet).

    `task_mode` selects whether `Clustering.make` loads pre-computed results
    from `clustering_output_dir` or triggers the computation itself.
    """
    definition = """
    -> EphysRecording
    -> ClusteringParamSet
    ---
    clustering_output_dir: varchar(255)  # clustering output directory relative to root data directory
    task_mode='load': enum('load', 'trigger')  # 'load': load computed analysis results, 'trigger': trigger computation
    """
347+
348+
348349
@schema
class Clustering(dj.Imported):
    """
    A processing table to handle each ClusteringTask:
    + If `task_mode == "trigger"`: trigger clustering analysis according to the ClusteringParamSet (e.g. launch a kilosort job)
    + If `task_mode == "load"`: verify output and create a corresponding entry in the Curation table
    """
    definition = """
    -> ClusteringTask
    ---
    clustering_time: datetime  # time of generation of this set of clustering results
    """

    def make(self, key):
        """Process one ClusteringTask according to its `task_mode`.

        In 'load' mode, validates the Kilosort output directory, records the
        clustering time, and seeds the Curation table with an initial entry
        pointing at the same output directory.
        """
        root_dir = pathlib.Path(get_ephys_root_data_dir())
        task_mode, output_dir = (ClusteringTask & key).fetch1('task_mode', 'clustering_output_dir')
        ks_dir = root_dir / output_dir

        if task_mode == 'load':
            ks = kilosort.Kilosort(ks_dir)  # check if the directory is a valid Kilosort output
            creation_time, is_curated, is_qc = kilosort.extract_clustering_info(ks_dir)
            # Synthesize curation_id: one past the current max for this task
            # (or 1 if no curation exists yet — aggr yields None on empty set)
            curation_id = (dj.U().aggr(Curation & key, n='max(curation_id)').fetch1('n') or 0) + 1

            self.insert1({**key, 'clustering_time': creation_time})
            Curation.insert1({**key, 'curation_id': curation_id,
                              'curation_time': creation_time, 'curation_output_dir': output_dir,
                              'quality_control': is_qc, 'manual_curation': is_curated})
        elif task_mode == 'trigger':
            raise NotImplementedError('Automatic triggering of clustering analysis is not yet supported')
        else:
            raise ValueError(f'Unknown task mode: {task_mode}')
381+
382+
383+
@schema
class Curation(dj.Manual):
    """One curated set of clustering results for a ClusteringTask.

    Multiple curations per task are supported via `curation_id`; the first
    entry is auto-created by `Clustering.make` in 'load' mode.
    """
    definition = """
    -> ClusteringTask
    curation_id: int
    ---
    curation_time: datetime  # time of generation of this set of curated clustering results
    curation_output_dir: varchar(255)  # output directory of the curated results, relative to root data directory
    quality_control: bool  # has this clustering result undergone quality control?
    manual_curation: bool  # has manual curation been performed on this clustering result?
    curation_note='': varchar(2000)
    """
358395

359-
class Unit(dj.Part):
360-
definition = """
361-
-> master
362-
unit: int
363-
---
364-
-> probe.ElectrodeConfig.Electrode # electrode on the probe that this unit has highest response amplitude
365-
-> ClusterQualityLabel
366-
spike_count: int # how many spikes in this recording of this unit
367-
spike_times: longblob # (s) spike times of this unit, relative to the start of the EphysRecording
368-
spike_sites : longblob # array of electrode associated with each spike
369-
spike_depths : longblob # (um) array of depths associated with each spike, relative to the (0, 0) of the probe
370-
"""
396+
397+
@schema
398+
class Unit(dj.Imported):
399+
definition = """
400+
-> Curation
401+
unit: int
402+
---
403+
-> probe.ElectrodeConfig.Electrode # electrode on the probe that this unit has highest response amplitude
404+
-> ClusterQualityLabel
405+
spike_count: int # how many spikes in this recording of this unit
406+
spike_times: longblob # (s) spike times of this unit, relative to the start of the EphysRecording
407+
spike_sites : longblob # array of electrode associated with each spike
408+
spike_depths : longblob # (um) array of depths associated with each spike, relative to the (0, 0) of the probe
409+
"""
410+
411+
@property
412+
def key_source(self):
413+
return Curation()
371414

372415
def make(self, key):
373416
root_dir = pathlib.Path(get_ephys_root_data_dir())
374-
ks_dir = root_dir / (ClusteringTask & key).fetch1('clustering_output_dir')
417+
ks_dir = root_dir / (Curation & key).fetch1('curation_output_dir')
375418
ks = kilosort.Kilosort(ks_dir)
376419
acq_software = (EphysRecording & key).fetch1('acq_software')
377420

378-
# ---------- Clustering ----------
379-
creation_time, is_curated, is_qc = kilosort.extract_clustering_info(ks_dir)
380-
381421
# ---------- Unit ----------
382422
# -- Remove 0-spike units
383423
withspike_idx = [i for i, u in enumerate(ks.data['cluster_ids']) if (ks.data['spike_clusters'] == u).any()]
@@ -413,15 +453,13 @@ def make(self, key):
413453
'spike_sites': spike_sites[ks.data['spike_clusters'] == unit],
414454
'spike_depths': spike_depths[ks.data['spike_clusters'] == unit]})
415455

416-
self.insert1({**key, 'clustering_time': creation_time,
417-
'quality_control': is_qc, 'manual_curation': is_curated})
418-
self.Unit.insert([{**key, **u} for u in units])
456+
self.insert([{**key, **u} for u in units])
419457

420458

421459
@schema
422460
class Waveform(dj.Imported):
423461
definition = """
424-
-> Clustering.Unit
462+
-> Unit
425463
---
426464
peak_chn_waveform_mean: longblob # mean over all spikes at the peak channel for this unit
427465
"""
@@ -437,11 +475,11 @@ class Electrode(dj.Part):
437475

438476
@property
439477
def key_source(self):
440-
return Clustering()
478+
return Curation()
441479

442480
def make(self, key):
443481
root_dir = pathlib.Path(get_ephys_root_data_dir())
444-
ks_dir = root_dir / (ClusteringTask & key).fetch1('clustering_output_dir')
482+
ks_dir = root_dir / (Curation & key).fetch1('curation_output_dir')
445483
ks = kilosort.Kilosort(ks_dir)
446484

447485
acq_software, probe_sn = (EphysRecording * ProbeInsertion & key).fetch1('acq_software', 'probe')
@@ -450,10 +488,10 @@ def make(self, key):
450488
rec_key = (EphysRecording & key).fetch1('KEY')
451489
chn2electrodes = get_neuropixels_chn2electrode_map(rec_key, acq_software)
452490

453-
is_qc = (Clustering & key).fetch1('quality_control')
491+
is_qc = (Curation & key).fetch1('quality_control')
454492

455493
# Get all units
456-
units = {u['unit']: u for u in (Clustering.Unit & key).fetch(as_dict=True, order_by='unit')}
494+
units = {u['unit']: u for u in (Unit & key).fetch(as_dict=True, order_by='unit')}
457495

458496
unit_waveforms, unit_peak_waveforms = [], []
459497
if is_qc:
@@ -494,7 +532,7 @@ def make(self, key):
494532
@schema
495533
class ClusterQualityMetrics(dj.Imported):
496534
definition = """
497-
-> Clustering.Unit
535+
-> Unit
498536
---
499537
amp: float
500538
snr: float

elements_ephys/readers/kilosort.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from os import path
22
from datetime import datetime
3+
import pathlib
34
import pandas as pd
45
import numpy as np
56
import re
@@ -37,14 +38,19 @@ class Kilosort:
3738
# keys to self.files, .data are file name e.g. self.data['params'], etc.
3839
ks_keys = [path.splitext(i)[0] for i in ks_files]
3940

40-
def __init__(self, ks_dir):
    """Load Kilosort output metadata from *ks_dir*.

    Accepts a str or pathlib.Path; raises FileNotFoundError if the
    directory does not contain a `params.py` (i.e. is not Kilosort output).
    """
    self._ks_dir = pathlib.Path(ks_dir)
    self._files = {}
    self._data = None
    self._clusters = None

    # BUGFIX: use the normalized Path, not the raw argument — `ks_dir / 'params.py'`
    # raises TypeError when callers pass a plain string, which the
    # pathlib.Path(ks_dir) conversion above exists to support.
    params_fp = self._ks_dir / 'params.py'

    if not params_fp.exists():
        raise FileNotFoundError(f'No Kilosort output found in: {ks_dir}')

    self._info = {'time_created': datetime.fromtimestamp(params_fp.stat().st_ctime),
                  'time_modified': datetime.fromtimestamp(params_fp.stat().st_mtime)}
4854

4955
@property
5056
def data(self):
@@ -59,7 +65,7 @@ def info(self):
5965
def _stat(self):
6066
self._data = {}
6167
for i in Kilosort.ks_files:
62-
f = self._dname / i
68+
f = self._ks_dir / i
6369

6470
if not f.exists():
6571
log.debug('skipping {} - doesnt exist'.format(f))
@@ -84,12 +90,12 @@ def _stat(self):
8490
self._data[base] = np.reshape(d, d.shape[0]) if d.ndim == 2 and d.shape[1] == 1 else d
8591

8692
# Read the Cluster Groups
87-
if (self._dname / 'cluster_groups.csv').exists():
88-
df = pd.read_csv(self._dname / 'cluster_groups.csv', delimiter='\t')
93+
if (self._ks_dir / 'cluster_groups.csv').exists():
94+
df = pd.read_csv(self._ks_dir / 'cluster_groups.csv', delimiter= '\t')
8995
self._data['cluster_groups'] = np.array(df['group'].values)
9096
self._data['cluster_ids'] = np.array(df['cluster_id'].values)
91-
elif (self._dname / 'cluster_KSLabel.tsv').exists():
92-
df = pd.read_csv(self._dname / 'cluster_KSLabel.tsv', sep = "\t", header = 0)
97+
elif (self._ks_dir / 'cluster_KSLabel.tsv').exists():
98+
df = pd.read_csv(self._ks_dir / 'cluster_KSLabel.tsv', sep = "\t", header = 0)
9399
self._data['cluster_groups'] = np.array(df['KSLabel'].values)
94100
self._data['cluster_ids'] = np.array(df['cluster_id'].values)
95101
else:

0 commit comments

Comments
 (0)