add QC to ephys_no_curation

Thinh Nguyen · Thinh Nguyen · commit db448f72a1f7 · 2022-09-29T10:47:02.000-05:00
diff --git a/element_array_ephys/ephys_no_curation.py b/element_array_ephys/ephys_no_curation.py
@@ -6,6 +6,7 @@
 import importlib
 import gc
 from decimal import Decimal
+import pandas as pd
 
 from element_interface.utils import find_root_directory, find_full_path, dict_to_uuid
 
@@ -683,7 +684,7 @@ class Unit(dj.Part):
         spike_count: int         # how many spikes in this recording for this unit
         spike_times: longblob    # (s) spike times of this unit, relative to the start of the EphysRecording
         spike_sites : longblob   # array of electrode associated with each spike
-        spike_depths : longblob  # (um) array of depths associated with each spike, relative to the (0, 0) of the probe    
+        spike_depths=null : longblob  # (um) array of depths associated with each spike, relative to the (0, 0) of the probe    
         """
 
     def make(self, key):
@@ -845,6 +846,74 @@ def yield_unit_waveforms():
                 self.Waveform.insert(unit_electrode_waveforms, ignore_extra_fields=True)
 
 
+@schema
+class QualityMetrics(dj.Imported):
+    definition = """
+    # Clusters and waveforms metrics
+    -> CuratedClustering    
+    """
+
+    class Cluster(dj.Part):
+        definition = """   
+        # Cluster metrics for a particular unit
+        -> master
+        -> CuratedClustering.Unit
+        ---
+        firing_rate=null: float # (Hz) firing rate for a unit 
+        snr=null: float  # signal-to-noise ratio for a unit
+        presence_ratio=null: float  # fraction of time in which spikes are present
+        isi_violation=null: float   # rate of ISI violation as a fraction of overall rate
+        number_violation=null: int  # total number of ISI violations
+        amplitude_cutoff=null: float  # estimate of miss rate based on amplitude histogram
+        isolation_distance=null: float  # distance to nearest cluster in Mahalanobis space
+        l_ratio=null: float  # 
+        d_prime=null: float  # Classification accuracy based on LDA
+        nn_hit_rate=null: float  # Fraction of neighbors for target cluster that are also in target cluster
+        nn_miss_rate=null: float # Fraction of neighbors outside target cluster that are in target cluster
+        silhouette_score=null: float  # Standard metric for cluster overlap
+        max_drift=null: float  # Maximum change in spike depth throughout recording
+        cumulative_drift=null: float  # Cumulative change in spike depth throughout recording 
+        contamination_rate=null: float # 
+        """
+
+    class Waveform(dj.Part):
+        definition = """   
+        # Waveform metrics for a particular unit
+        -> master
+        -> CuratedClustering.Unit
+        ---
+        amplitude: float  # (uV) absolute difference between waveform peak and trough
+        duration: float  # (ms) time between waveform peak and trough
+        halfwidth=null: float  # (ms) spike width at half max amplitude
+        pt_ratio=null: float  # absolute amplitude of peak divided by absolute amplitude of trough relative to 0
+        repolarization_slope=null: float  # the repolarization slope was defined by fitting a regression line to the first 30us from trough to peak
+        recovery_slope=null: float  # the recovery slope was defined by fitting a regression line to the first 30us from peak to tail
+        spread=null: float  # (um) the range with amplitude above 12% of the maximum amplitude along the probe
+        velocity_above=null: float  # (s/m) inverse velocity of waveform propagation from the soma toward the top of the probe
+        velocity_below=null: float  # (s/m) inverse velocity of waveform propagation from the soma toward the bottom of the probe
+        """
+
+    def make(self, key):
+        output_dir = (ClusteringTask & key).fetch1('clustering_output_dir')
+        kilosort_dir = find_full_path(get_ephys_root_data_dir(), output_dir)
+
+        metric_fp = kilosort_dir / 'metrics.csv'
+
+        if not metric_fp.exists():
+            raise FileNotFoundError(f'QC metrics file not found: {metric_fp}')
+
+        metrics_df = pd.read_csv(metric_fp)
+        metrics_df.set_index('cluster_id', inplace=True)
+
+        metrics_list = [
+            dict(metrics_df.loc[unit_key['unit']], **unit_key)
+            for unit_key in (CuratedClustering.Unit & key).fetch('KEY')]
+
+        self.insert1(key)
+        self.Cluster.insert(metrics_list, ignore_extra_fields=True)
+        self.Waveform.insert(metrics_list, ignore_extra_fields=True)
+
+
 # ---------------- HELPER FUNCTIONS ----------------
 
 def get_spikeglx_meta_filepath(ephys_recording_key):