Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
95586e5
feat: add function to calculate feature correlation matrix
strixy16 Dec 3, 2024
5193fb4
feat: add function to generate a heatmap plot figure from a correlati…
strixy16 Dec 3, 2024
a0771c6
feat: add init file to analyze directory
strixy16 Dec 3, 2024
cf26afc
feat: add error handling in getFeatureCorrelations
strixy16 Dec 3, 2024
e643349
feat: add general loading file, add loading config and data file func…
strixy16 Dec 3, 2024
f5882da
feat: add file for loading functions related to feature files
strixy16 Dec 3, 2024
5495550
build: add numpy and seaborn for correlation code
strixy16 Dec 3, 2024
decf8e5
refactor: remove so far unused imports
strixy16 Dec 3, 2024
fcc1b9e
feat: started test function for getFeatureCorrelations
strixy16 Dec 3, 2024
a708182
feat: make files for better function organization
strixy16 Dec 3, 2024
d706863
Merge remote-tracking branch 'origin/main' into katys/integrate-analy…
strixy16 Dec 6, 2024
d63a1c5
fix: remove duplicate tool.pixi.dependencies from merge
strixy16 Dec 6, 2024
484c12e
build: add seaborn for correlation plot functions, need to specify nu…
strixy16 Dec 6, 2024
c6b945f
feat: add init files for new directories
strixy16 Dec 6, 2024
fc83d69
feat: add function to calculate feature correlations and a function t…
strixy16 Dec 6, 2024
46f0773
feat: add function to drop a set of features at the beginning of a pa…
strixy16 Dec 6, 2024
fe56257
fix: set continuous setting in StructureSetToSegmentation to False
strixy16 Dec 6, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9,718 changes: 6,544 additions & 3,174 deletions pixi.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ channels = ["conda-forge"]
platforms = ["linux-64", "osx-arm64"]

[tool.pixi.dependencies]
numpy = "1.26.4.*"
seaborn = ">=0.13.2,<0.14"


[tool.pixi.pypi-dependencies]
Expand Down Expand Up @@ -191,3 +193,4 @@ publish-test = { cmd = [
], depends-on = [
"build",
], description = "Publish to test PyPI" }

1 change: 0 additions & 1 deletion src/readii/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# read version from installed package
from importlib.metadata import version
__version__ = "1.18.0"

Empty file added src/readii/analyze/__init__.py
Empty file.
143 changes: 143 additions & 0 deletions src/readii/analyze/correlation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import pandas as pd
from typing import Optional
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.linalg import issymmetric


def getFeatureCorrelations(vertical_features:pd.DataFrame,
                           horizontal_features:pd.DataFrame,
                           method:str = "pearson",
                           vertical_feature_name:str = "",
                           horizontal_feature_name:str = ""):
    """ Function to calculate correlation between two sets of features.

    Parameters
    ----------
    vertical_features : pd.DataFrame
        Dataframe containing features to calculate correlations with. Index must be the same as the index of the horizontal_features dataframe.
    horizontal_features : pd.DataFrame
        Dataframe containing features to calculate correlations with. Index must be the same as the index of the vertical_features dataframe.
    method : str
        Method to use for calculating correlations. One of "pearson", "spearman", or "kendall". Default is "pearson".
    vertical_feature_name : str
        Name of the vertical features to use as suffix in correlation dataframe. Default is blank "".
    horizontal_feature_name : str
        Name of the horizontal features to use as suffix in correlation dataframe. Default is blank "".

    Returns
    -------
    correlation_matrix : pd.DataFrame
        Dataframe containing correlation values for the joined feature set.

    Raises
    ------
    TypeError
        If either feature set is not a pandas DataFrame.
    ValueError
        If the method is not recognized, the indices do not match, or the
        correlation computation itself fails.
    """
    # Validate input types with explicit exceptions instead of asserts,
    # which are silently stripped when Python runs with -O.
    if not isinstance(vertical_features, pd.DataFrame):
        raise TypeError("vertical_features must be a pandas DataFrame")
    if not isinstance(horizontal_features, pd.DataFrame):
        raise TypeError("horizontal_features must be a pandas DataFrame")

    if method not in ["pearson", "spearman", "kendall"]:
        raise ValueError("Correlation method must be one of 'pearson', 'spearman', or 'kendall'.")

    if not vertical_features.index.equals(horizontal_features.index):
        raise ValueError("Vertical and horizontal features must have the same index to calculate correlation. Set the index to the intersection of patient IDs.")

    # Add _ to beginning of feature names if they are not blank so they can be
    # used as suffixes without leaving a stray underscore for blank names
    if vertical_feature_name:
        vertical_feature_name = f"_{vertical_feature_name}"
    if horizontal_feature_name:
        horizontal_feature_name = f"_{horizontal_feature_name}"

    # Join the features into one dataframe
    # Use inner join to keep only the rows that have a value in both vertical and horizontal features
    # Suffixes are only applied by pandas to column names present in both dataframes
    features_to_correlate = vertical_features.join(horizontal_features,
                                                   how='inner',
                                                   lsuffix=vertical_feature_name,
                                                   rsuffix=horizontal_feature_name)

    try:
        # Calculate correlation between vertical features and horizontal features
        correlation_matrix = features_to_correlate.corr(method=method)
    except Exception as e:
        # Chain the original exception so the root cause stays visible to callers
        raise ValueError(f"Error calculating correlation matrix: {e}") from e

    return correlation_matrix


def plotCorrelationHeatmap(correlation_matrix_df:pd.DataFrame,
                           diagonal:bool = False,
                           triangle:str = "lower",
                           cmap:str = "nipy_spectral",
                           xlabel:str = "",
                           ylabel:str = "",
                           title:str = "",
                           subtitle:str = "",
                           show_tick_labels:bool = False
                           ):
    """Function to plot a correlation heatmap.

    Parameters
    ----------
    correlation_matrix_df : pd.DataFrame
        Dataframe containing the correlation matrix to plot.
    diagonal : bool, optional
        Whether to only plot half of the matrix. The default is False.
    triangle : str, optional
        Which triangle half of the matrix to plot, "lower" or "upper". Only
        used when diagonal is True. The default is "lower".
    cmap : str, optional
        Matplotlib colormap name to use for the heatmap. The default is "nipy_spectral".
    xlabel : str, optional
        Label for the x-axis. The default is "".
    ylabel : str, optional
        Label for the y-axis. The default is "".
    title : str, optional
        Title for the plot. The default is "".
    subtitle : str, optional
        Subtitle for the plot. The default is "".
    show_tick_labels : bool, optional
        Whether to show the tick labels on the x and y axes. These would be the feature names. The default is False.

    Returns
    -------
    corr_fig : matplotlib.pyplot.figure
        Figure object containing a Seaborn heatmap.

    Raises
    ------
    ValueError
        If diagonal is True and triangle is neither "lower" nor "upper".
    """

    if diagonal:
        # Build a boolean triangle mask (including the diagonal) from a
        # matrix of ones rather than from the correlation values themselves:
        # masking on the values would leave cells with a correlation of
        # exactly 0 unmasked in the hidden triangle.
        if triangle == "lower":
            # Mask out the upper right triangle half of the matrix
            mask = np.triu(np.ones_like(correlation_matrix_df, dtype=bool))
        elif triangle == "upper":
            # Mask out the lower left triangle half of the matrix
            mask = np.tril(np.ones_like(correlation_matrix_df, dtype=bool))
        else:
            raise ValueError("If diagonal is True, triangle must be either 'lower' or 'upper'.")
    else:
        # The entire correlation matrix will be visible in the plot
        mask = None

    # Set a default title if one is not provided
    if not title:
        title = "Correlation Heatmap"

    # Set up figure and axes for the plot
    corr_fig, corr_ax = plt.subplots()

    # Plot the correlation matrix; vmin/vmax pin the colour scale to the
    # full [-1, 1] correlation range so colours are comparable across plots
    corr_ax = sns.heatmap(correlation_matrix_df,
                          mask = mask,
                          cmap=cmap,
                          vmin=-1.0,
                          vmax=1.0)

    if not show_tick_labels:
        # Remove the individual feature names from the axes
        corr_ax.set_xticklabels(labels=[])
        corr_ax.set_yticklabels(labels=[])

    # Set axis labels
    corr_ax.set_xlabel(xlabel)
    corr_ax.set_ylabel(ylabel)

    # Set title and subtitle
    # Suptitle is the super title, which will be above the title
    plt.title(subtitle, fontsize=12)
    plt.suptitle(title, fontsize=14)

    return corr_fig
Empty file added src/readii/data/labelling.py
Empty file.
43 changes: 43 additions & 0 deletions src/readii/data/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from pandas import DataFrame
from typing import Optional

def dropUpToFeature(dataframe:DataFrame,
                    feature_name:str,
                    keep_feature_name_column:bool = False
                    ):
    """ Function to drop all columns up to and possibly including the specified feature.

    Parameters
    ----------
    dataframe : DataFrame
        Dataframe to drop columns from.
    feature_name : str
        Name of the feature to drop up to.
    keep_feature_name_column : bool, optional
        Whether to keep the specified feature name column in the dataframe or drop it. The default is False.

    Returns
    -------
    dataframe : DataFrame
        Dataframe with all columns up to (and, unless keep_feature_name_column
        is True, including) the specified feature dropped. If the feature is
        not found, the original dataframe is returned unchanged.
    """
    # Locate the feature column; only a missing column is an expected,
    # recoverable failure, so nothing broader than KeyError is caught here.
    try:
        feature_index = dataframe.columns.get_loc(feature_name)
    except KeyError:
        print(f"Feature {feature_name} was not found as a column in dataframe. No columns dropped.")
        return dataframe

    if keep_feature_name_column:
        # Drop the column names up to but not including the specified feature
        column_names = dataframe.columns.to_list()[:feature_index]
    else:
        # Drop the column names up to and including the specified feature
        column_names = dataframe.columns.to_list()[:feature_index+1]

    return dataframe.drop(columns=column_names)
Comment on lines +41 to +43
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Consider removing general except clause or provide specific handling

Catching a general exception and returning None can obscure unexpected errors and make debugging difficult.

Apply this diff to remove the general except clause or handle specific exceptions:

         except KeyError:
             print(f"Feature {feature_name} was not found as a column in dataframe. No columns dropped.")
             return dataframe
-        except Exception as e:
-            print(f"An error occurred: {e}")
-            return None

Alternatively, specify the exceptions you expect:

         except KeyError:
             print(f"Feature {feature_name} was not found as a column in dataframe. No columns dropped.")
             return dataframe
         except SomeSpecificException as e:
             print(f"An error occurred: {e}")
             return None

Committable suggestion skipped: line range outside the PR's diff.

Empty file added src/readii/io/__init__.py
Empty file.
Empty file.
70 changes: 70 additions & 0 deletions src/readii/io/loaders/features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import os
import pandas as pd

from typing import Optional, Dict

from readii.io.loaders.general import loadFileToDataFrame


def loadFeatureFilesFromImageTypes(extracted_feature_dir:str,
                                   image_types:Optional[list] = None,
                                   drop_labels:bool = True,
                                   labels_to_drop:Optional[list] = None) -> Dict[str,pd.DataFrame]:
    """Function to load in all the extracted imaging feature sets from a directory and return them as a dictionary of dataframes.

    Parameters
    ----------
    extracted_feature_dir : str
        Path to the directory containing the extracted feature csv files
    image_types : list, optional
        List of image types to load in. The default is ['original'].
    drop_labels : bool, optional
        Whether to drop the labels from the dataframes. Use when loading labelled data from data_setup_for_modeling.ipynb. The default is True.
    labels_to_drop : list, optional
        List of labels to drop from the dataframes. The default is ["patient_ID","survival_time_in_years","survival_event_binary"] based on code
        in data_setup_for_modeling.ipynb.

    Returns
    -------
    feature_sets : dict
        Dictionary of dataframes containing the extracted radiomics features.
    """
    # Initialize mutable defaults inside the function: list defaults in the
    # signature are evaluated once and shared across all calls.
    if image_types is None:
        image_types = ['original']
    if labels_to_drop is None:
        labels_to_drop = ["patient_ID", "survival_time_in_years", "survival_event_binary"]

    # Initialize dictionary to store the feature sets
    feature_sets = {}

    feature_file_list = os.listdir(extracted_feature_dir)

    # Loop through all the files in the directory
    for image_type in image_types:
        # Extract the image type feature csv file from the feature directory.
        # This should match exactly one file; the first match is used.
        matching_files = [file for file in feature_file_list if (image_type in file) and (file.endswith(".csv"))]
        if not matching_files:
            print(f"No {image_type} feature csv files found in {extracted_feature_dir}")
            # Skip to the next image type
            continue
        image_type_feature_file = matching_files[0]
        # Remove the image type file from the list of feature files so it is
        # not matched again by a later image type
        feature_file_list.remove(image_type_feature_file)

        # Get the full path to the feature file
        feature_file_path = os.path.join(extracted_feature_dir, image_type_feature_file)

        # Load the feature data into a pandas dataframe
        raw_feature_data = loadFileToDataFrame(feature_file_path)

        # Drop the labels from the dataframe if specified
        if drop_labels:
            try:
                # Data is now only extracted features
                raw_feature_data.drop(labels_to_drop, axis=1, inplace=True)
            except KeyError:
                # Only a missing label column is expected and recoverable here
                print(f"{feature_file_path} does not have the labels {labels_to_drop} to drop.")
                # Skip to the next image type
                continue

        # Save the dataframe to the feature_sets dictionary
        feature_sets[image_type] = raw_feature_data

    return feature_sets
70 changes: 70 additions & 0 deletions src/readii/io/loaders/general.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import os
import pandas as pd
import yaml

from typing import Optional, Dict, Union


def loadImageDatasetConfig(dataset_name:str,
                           config_dir_path:str) -> Optional[dict]:
    """Load the configuration file for a given dataset. Expects the configuration file to be named <dataset_name>.yaml.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset to load the configuration file for.
    config_dir_path : str
        Path to the directory containing the configuration files.

    Returns
    -------
    dict or None
        Dictionary containing the configuration settings for the dataset,
        or None if the configuration file does not exist.

    Examples
    --------
    >>> config = loadImageDatasetConfig("NSCLC_Radiogenomics", "config/")
    """
    # Make full path to config file
    config_file_path = os.path.join(config_dir_path, f"{dataset_name}.yaml")

    # Check if config file exists
    if not os.path.exists(config_file_path):
        print(f"Config file {config_file_path} does not exist.")
        return None

    # Load the config file; the context manager guarantees the file handle
    # is closed even if parsing raises.
    with open(config_file_path, "r") as config_file:
        config = yaml.safe_load(config_file)
    return config



def loadFileToDataFrame(file_path:str) -> Optional[pd.DataFrame]:
    """Load data from a csv or xlsx file into a pandas dataframe.

    Parameters
    ----------
    file_path (str): Path to the data file. Must end in .csv or .xlsx.

    Returns
    -------
    pd.DataFrame: Dataframe containing the data from the file, or None if the
    file format is unsupported or the file could not be read.
    """
    # Get the file extension
    _, file_extension = os.path.splitext(file_path)

    # Reject unsupported formats up front. The original code raised a
    # ValueError inside its own try block and immediately caught it with
    # `except Exception`, making the raise dead code; the printed message
    # and None return are preserved here.
    if file_extension not in ('.csv', '.xlsx'):
        print("An error occurred: Unsupported file format. Please provide a .csv or .xlsx file.")
        return None

    try:
        # Check if the file is an Excel file
        if file_extension == '.xlsx':
            return pd.read_excel(file_path)
        # Otherwise the file is a CSV file
        return pd.read_csv(file_path)
    except Exception as e:
        # Best-effort loader: report read/parse failures and return None,
        # matching this module's convention of not raising to the caller.
        print(f"An error occurred: {e}")
        return None
Empty file added src/readii/io/loaders/images.py
Empty file.
Empty file.
6 changes: 5 additions & 1 deletion src/readii/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,11 @@ def loadRTSTRUCTSITK(

# Set up segmentation loader
logger.debug(f"Making mask using ROI names: {roiNames}")
makeMask = StructureSetToSegmentation(roi_names=roiNames)

# Initialize med-imagetools loader to convert RTSTRUCT point cloud to a segmentation
# Set continous to False to ensure indices are integers and not floats
makeMask = StructureSetToSegmentation(roi_names=roiNames,
continuous=False)

try:
# Get the individual ROI masks
Expand Down
Loading
Loading