Skip to content

io refactored, tests expanded, selection of different datasets possible #38

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
288 changes: 136 additions & 152 deletions src/iblphotometry/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,103 @@
)


def from_array(
times: np.ndarray, data: np.ndarray, channel_names: list[str] | None = None
def from_raw_neurophotometrics_file_to_raw_df(
path: str | Path,
validate=True,
) -> pd.DataFrame:
return pd.DataFrame(data, index=times, columns=channel_names)
path = Path(path) if isinstance(path, str) else path
match path.suffix:
case '.csv':
raw_df = pd.read_csv(path)
case '.pqt':
raw_df = pd.read_parquet(path)

if validate:
raw_df = _validate_neurophotometrics_df(raw_df)

return raw_df


def from_raw_neurophotometrics_df_to_ibl_df(
    raw_df: pd.DataFrame, rois: list[str] | None = None, drop_first: bool = True
) -> pd.DataFrame:
    """Convert a raw neurophotometrics dataframe into the ibl format.

    Keeps the fiber data columns, adds a ``times`` column taken from the
    acquisition timestamps, and decodes the ``LedState`` column into per-frame
    ``name``, ``color`` and ``wavelength`` columns using the
    ``LIGHT_SOURCE_MAP`` / ``LED_STATES`` lookup tables.

    Args:
        raw_df (pd.DataFrame): raw dataframe, e.g. as returned by
            ``from_raw_neurophotometrics_file_to_raw_df``.
        rois (list[str], optional): names of the data columns. If None, they
            are inferred via ``infer_data_columns``. Defaults to None.
        drop_first (bool, optional): if True, the first frame is dropped.
            Defaults to True.

    Returns:
        pd.DataFrame: dataframe in the ibl format (as stored in alyx .pqt files).
    """
    if rois is None:
        rois = infer_data_columns(raw_df)

    # keep only the data columns, with columns in sorted order
    ibl_df = raw_df.filter(items=rois, axis=1).sort_index(axis=1)
    # 'SystemTimestamp' is preferred when present; 'Timestamp' is presumably
    # the column name in other/older exports — TODO confirm
    timestamp_name = (
        'SystemTimestamp' if 'SystemTimestamp' in raw_df.columns else 'Timestamp'
    )
    ibl_df['times'] = raw_df[timestamp_name]
    ibl_df['wavelength'] = np.nan
    ibl_df['name'] = ''
    ibl_df['color'] = ''

    # TODO the names column in channel_meta_map should actually be user defined (experiment description file?)
    channel_meta_map = pd.DataFrame(LIGHT_SOURCE_MAP)
    led_states = pd.DataFrame(LED_STATES).set_index('Condition')
    states = raw_df['LedState']

    # decode each distinct LedState value into (name, color, wavelength)
    for state in states.unique():
        # locate the state value in the LED_STATES table; `ic` holds the
        # matching light-source column index when there is a unique match
        ir, ic = np.where(led_states == state)
        # if not present, multiple LEDs are active
        if ic.size == 0:
            # find row
            ir = np.argmax(led_states['No LED ON'] > state) - 1
            # find active combo
            possible_led_combos = [(1, 2), (1, 3), (2, 3), (1, 2, 3)]
            for combo in possible_led_combos: # drop enumerate
                # a combined state is assumed to be the sum of the individual
                # LED state values in that row — TODO confirm this encoding
                if state == sum([led_states.iloc[ir, c] for c in combo]):
                    name = '+'.join([channel_meta_map['name'][c] for c in combo])
                    color = '+'.join([channel_meta_map['color'][c] for c in combo])
                    # wavelength is ambiguous when several LEDs are on at once
                    wavelength = np.nan
                    ibl_df.loc[states == state, ['name', 'color', 'wavelength']] = (
                        name,
                        color,
                        wavelength,
                    )
        else:
            # unique match: copy name/color/wavelength from the light source map
            for cn in ['name', 'color', 'wavelength']:
                ibl_df.loc[states == state, cn] = channel_meta_map.iloc[ic[0]][cn]

    # drop first frame
    if drop_first:
        # NOTE(review): reset_index() keeps the old integer index as an
        # 'index' column — confirm this is intended (vs. reset_index(drop=True))
        ibl_df = ibl_df.iloc[1:].reset_index()

    return ibl_df


def from_raw_neurophotometrics_file_to_ibl_df(
    path: str | Path,
    drop_first=True,
    validate=True,
) -> pd.DataFrame:
    """Read a raw neurophotometrics file and convert it to the ibl format.

    Thin convenience wrapper: loads the raw dataframe from disk, then
    reformats it into the ibl layout.

    Args:
        path (str | Path): path to the raw .csv or .pqt neurophotometrics file.
        drop_first (bool, optional): if True, the first frame is dropped.
            Defaults to True.
        validate (bool, optional): if True, the raw dataframe is validated.
            Defaults to True.

    Returns:
        pd.DataFrame: dataframe in the ibl format.
    """
    raw = from_raw_neurophotometrics_file_to_raw_df(path, validate=validate)
    return from_raw_neurophotometrics_df_to_ibl_df(raw, drop_first=drop_first)


def from_ibl_pqt_to_ibl_df(path: str | Path, validate=False):
if validate is True:
# TODO
raise NotImplementedError
return pd.read_parquet(path)


def from_ibl_dataframe(
raw_df: pd.DataFrame,
ibl_df: pd.DataFrame,
data_columns: list[str] | None = None,
time_column: str | None = None,
channel_column: str = 'name',
channel_names: list[str] | None = None,
rename: dict | None = None,
) -> dict:
"""reads in a pandas.DataFrame and converts it into nap.TsdDataframes. Performs the time demultiplexing operation.
"""main function to convert to analysis ready format


Args:
raw_df (pd.DataFrame): the dataframe, as stored in the photometry.signal.pqt
ibl_df (pd.DataFrame): the dataframe, as stored in the photometry.signal.pqt
data_columns (list[str], optional): The names of the columns in the dataframe that contain the signals of different fibers. By default, they are named RegionXX. If None is provided, All columns that start with `Region` are treated as data columns. Defaults to None.
time_column (str, optional): The name of the column that contains the timestamps. If None is provided, it is assumed that `time` is in the name. Defaults to None.
channel_column (str, optional): The name of the column that contains. Defaults to 'name'.
Expand All @@ -43,75 +121,46 @@ def from_ibl_dataframe(
# data_columns is a list of str that specifies the names of the column that hold the actual data, like 'RegionXX'
# channel_column is the column that specifies the temporally multiplexed acquisition channels

# infer if not explicitly provided: defaults to everything that starts with 'Region'
if data_columns is None:
# this hacky parser currently deals with the inconsistency between carolinas and alejandros extraction
# https://github.com/int-brain-lab/ibl-photometry/issues/35
data_columns = [
col
for col in raw_df.columns
if col.startswith('Region') or col.startswith('G')
]
data_columns = infer_data_columns(ibl_df) if data_columns is None else data_columns

# infer name of time column if not provided
if time_column is None:
time_columns = [col for col in raw_df.columns if 'time' in col.lower()]
time_columns = [col for col in ibl_df.columns if 'time' in col.lower()]
assert len(time_columns) == 1
time_column = time_columns[0]

# infer channel names if they are not explicitly provided
if channel_names is None:
channel_names = raw_df[channel_column].unique()
channel_names = ibl_df[channel_column].unique()

# drop empty acquisition channels
to_drop = ['None', '']
channel_names = [ch for ch in channel_names if ch not in to_drop]

raw_dfs = {}
dfs = {}
for channel in channel_names:
# get the data for the band
df = raw_df.groupby(channel_column).get_group(channel)
df = ibl_df.groupby(channel_column).get_group(channel)
# if rename dict is passed, rename Region0X to the corresponding brain region
if rename is not None:
df = df.rename(columns=rename)
data_columns = rename.values()
raw_dfs[channel] = df.set_index(time_column)[data_columns]

return raw_dfs
dfs[channel] = df.set_index(time_column)[data_columns]


def from_ibl_dataframes(raw_df: pd.DataFrame, locations_df: pd.DataFrame):
data_columns = (list(locations_df.index),)
rename = locations_df['brain_region'].to_dict()

read_config = dict(
data_columns=data_columns,
time_column='times',
channel_column='name',
rename=rename,
)

return from_ibl_dataframe(raw_df, **read_config)
return dfs


def from_ibl_pqt(
signal_pqt_path: str | Path,
locations_pqt_path: Optional[str | Path] = None,
):
"""reads in a photometry.signal.pqt files as they are registered in alyx.
# read from a single pqt
# if both are provided, do both

Args:
signal_pqt_path (str | Path): _description_
locations_pqt_path (str | Path, optional): _description_. Defaults to None.

Returns:
_type_: _description_
"""

raw_df = pd.read_parquet(signal_pqt_path)
ibl_df = pd.read_parquet(signal_pqt_path)
if locations_pqt_path is not None:
locations_df = pd.read_parquet(locations_pqt_path)
return from_ibl_dataframes(raw_df, locations_df)
return from_ibl_dataframes(ibl_df, locations_df)
else:
warnings.warn(
'loading a photometry.signal.pqt file without its corresponding photometryROI.locations.pqt'
Expand All @@ -126,135 +175,61 @@ def from_ibl_pqt(
rename=rename,
)

return from_ibl_dataframe(raw_df, **read_config)


def from_raw_neurophotometrics_ibl_df(
raw_df: pd.DataFrame, rois=None, drop_first=True
) -> pd.DataFrame:
"""reads in parses the output of the neurophotometrics FP3002

Args:
raw_df (pd.DataFrame): _description_
rois (_type_, optional): _description_. Defaults to None.

Returns:
pd.DataFrame: a dataframe in the same format as stored in alyx as pqt.
"""
if rois is None:
rois = [col for col in raw_df.columns if col.startswith('G')]

df = raw_df.filter(items=rois, axis=1).sort_index(axis=1)
timestamp_name = (
'SystemTimestamp' if 'SystemTimestamp' in raw_df.columns else 'Timestamp'
)
df['times'] = raw_df[timestamp_name]
df['wavelength'] = np.nan
df['name'] = ''
df['color'] = ''

# TODO the names column in channel_meta_map should actually be user defined (experiment description file?)
channel_meta_map = pd.DataFrame(LIGHT_SOURCE_MAP)
led_states = pd.DataFrame(LED_STATES).set_index('Condition')
states = raw_df['LedState']
return from_ibl_dataframe(ibl_df, **read_config)

for state in states.unique():
ir, ic = np.where(led_states == state)
# if not present, multiple LEDs are active
if ic.size == 0:
# find row
ir = np.argmax(led_states['No LED ON'] > state) - 1
# find active combo
possible_led_combos = [(1, 2), (1, 3), (2, 3), (1, 2, 3)]
for combo in possible_led_combos: # drop enumerate
if state == sum([led_states.iloc[ir, c] for c in combo]):
name = '+'.join([channel_meta_map['name'][c] for c in combo])
color = '+'.join([channel_meta_map['color'][c] for c in combo])
wavelength = np.nan
df.loc[states == state, ['name', 'color', 'wavelength']] = (
name,
color,
wavelength,
)
else:
for cn in ['name', 'color', 'wavelength']:
df.loc[states == state, cn] = channel_meta_map.iloc[ic[0]][cn]

# drop first frame
if drop_first:
df = df.iloc[1:].reset_index()

return df


def from_raw_neurophotometrics_file_to_ibl_df(
path: str | Path,
drop_first=True,
validate=True,
) -> pd.DataFrame:
"""reads a raw neurophotometrics file (in .csv or .pqt format) as they are written by the neurophotometrics software

Args:
path (str | Path): path to either the .csv file as written by the neurophotometrics bonsai workflow, or a path to a .pqt file as stored in alyx
drop_first (bool, optional): The first frame is all LEDs on. If true, this frame is dropped. Defaults to True.
validate (bool, optional): if true, enforces pydantic validation of the datatypes. Defaults to TRue

Raises:
NotImplementedError: _description_
def from_ibl_dataframes(ibl_df: pd.DataFrame, locations_df: pd.DataFrame):
# if locations are present
data_columns = (list(locations_df.index),)
rename = locations_df['brain_region'].to_dict()

Returns:
nap.TsdFrame: _description_ # FIXME
"""
warnings.warn(
'loading photometry from raw neurophotometrics output. The data will _not_ be synced and is being split into channels by LedState (converted to LED wavelength in nm)'
read_config = dict(
data_columns=data_columns,
time_column='times',
channel_column='name',
rename=rename,
)
if isinstance(path, str):
path = Path(path)
if path.suffix == '.csv':
# really raw as it comes out of the device
# todo figure out the header
raw_df = pd.read_csv(path)
elif path.suffix == '.pqt':
# as it is stored
raw_df = pd.read_parquet(path)
else:
raise NotImplementedError

if validate:
raw_df = _validate_ibl_dataframe(raw_df)

df = from_raw_neurophotometrics_ibl_df(raw_df)

# drop first frame
if drop_first:
df = df.iloc[1:].reset_index()

return df
return from_ibl_dataframe(ibl_df, **read_config)


def from_raw_neurophotometrics_file(
    path: str | Path,
    drop_first=True,
    validate=True,
) -> dict:
    """Read a raw neurophotometrics file directly into analysis-ready format.

    Bypasses the intermediate ibl-format .pqt files: the raw file is converted
    to an ibl-format dataframe, which is then demultiplexed into one dataframe
    per acquisition channel.

    Args:
        path (str | Path): path to the raw .csv or .pqt neurophotometrics file.
        drop_first (bool, optional): if True, the first frame is dropped.
            Defaults to True.
        validate (bool, optional): if True, the raw dataframe is validated.
            Defaults to True.

    Returns:
        dict: mapping of channel name to its demultiplexed dataframe.
    """
    ibl_df = from_raw_neurophotometrics_file_to_ibl_df(
        path, drop_first=drop_first, validate=validate
    )
    # data columns are inferred downstream by from_ibl_dataframe
    # (removed dead commented-out data_columns code)
    read_config = dict(
        time_column='times',
        channel_column='name',
    )
    return from_ibl_dataframe(ibl_df, **read_config)


"""
## ## ### ## #### ######## ### ######## #### ####### ## ##
## ## ## ## ## ## ## ## ## ## ## ## ## ## ### ##
## ## ## ## ## ## ## ## ## ## ## ## ## ## #### ##
## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ##
## ## ######### ## ## ## ## ######### ## ## ## ## ## ####
## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ###
### ## ## ######## #### ######## ## ## ## #### ####### ## ##
"""


def validate_ibl_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    # TODO: not implemented — placeholder stub. It currently returns None
    # despite the annotated return type; implement pandera validation
    # analogous to _validate_neurophotometrics_df.
    ...


def _validate_neurophotometrics_df(
df: pd.DataFrame,
data_columns=None,
) -> pd.DataFrame:
if data_columns is None:
data_columns = [col for col in df.columns if col.startswith('G')]
data_columns = infer_data_columns(df) if data_columns is None else data_columns

schema_raw_data = pandera.DataFrameSchema(
columns=dict(
Expand All @@ -280,3 +255,12 @@ def _validate_neurophotometrics_digital_inputs(df: pd.DataFrame) -> pd.DataFrame
)
)
return schema_digital_inputs.validate(df)


def infer_data_columns(df: pd.DataFrame) -> list[str]:
    """Return the columns of `df` that hold fiber signal data.

    Data columns are identified purely by name prefix: 'Region' (one
    extraction convention) or 'G' (the other). This hacky parser deals with
    the inconsistency between carolinas and alejandros extraction, see
    https://github.com/int-brain-lab/ibl-photometry/issues/35
    """
    prefixes = ('Region', 'G')
    return [col for col in df.columns if col.startswith(prefixes)]
Loading
Loading