Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
95586e5
feat: add function to calculate feature correlation matrix
strixy16 Dec 3, 2024
5193fb4
feat: add function to generate a heatmap plot figure from a correlati…
strixy16 Dec 3, 2024
a0771c6
feat: add init file to analyze directory
strixy16 Dec 3, 2024
cf26afc
feat: add error handling in getFeatureCorrelations
strixy16 Dec 3, 2024
e643349
feat: add general loading file, add loading config and data file func…
strixy16 Dec 3, 2024
f5882da
feat: add file for loading functions related to feature files
strixy16 Dec 3, 2024
5495550
build: add numpy and seaborn for correlation code
strixy16 Dec 3, 2024
decf8e5
refactor: remove so far unused imports
strixy16 Dec 3, 2024
fcc1b9e
feat: started test function for getFeatureCorrelations
strixy16 Dec 3, 2024
a708182
feat: make files for better function organization
strixy16 Dec 3, 2024
d706863
Merge remote-tracking branch 'origin/main' into katys/integrate-analy…
strixy16 Dec 6, 2024
d63a1c5
fix: remove duplicate tool.pixi.dependencies from merge
strixy16 Dec 6, 2024
484c12e
build: add seaborn for correlation plot functions, need to specify nu…
strixy16 Dec 6, 2024
c6b945f
feat: add init files for new directories
strixy16 Dec 6, 2024
fc83d69
feat: add function to calculate feature correlations and a function t…
strixy16 Dec 6, 2024
46f0773
feat: add function to drop a set of features at the beginning of a pa…
strixy16 Dec 6, 2024
fe56257
fix: set continuous setting in StructureSetToSegmentation to False
strixy16 Dec 6, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9,718 changes: 6,544 additions & 3,174 deletions pixi.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ channels = ["conda-forge"]
platforms = ["linux-64", "osx-arm64"]

[tool.pixi.dependencies]
numpy = "1.26.4.*"
seaborn = ">=0.13.2,<0.14"


[tool.pixi.pypi-dependencies]
Expand Down Expand Up @@ -191,3 +193,4 @@ publish-test = { cmd = [
], depends-on = [
"build",
], description = "Publish to test PyPI" }

1 change: 0 additions & 1 deletion src/readii/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# read version from installed package
from importlib.metadata import version
__version__ = "1.18.0"

Empty file added src/readii/analyze/__init__.py
Empty file.
143 changes: 143 additions & 0 deletions src/readii/analyze/correlation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import pandas as pd
from typing import Optional
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.linalg import issymmetric


def getFeatureCorrelations(vertical_features:pd.DataFrame,
                           horizontal_features:pd.DataFrame,
                           method:str = "pearson",
                           vertical_feature_name:str = "",
                           horizontal_feature_name:str = ""):
    """ Function to calculate correlation between two sets of features.

    Parameters
    ----------
    vertical_features : pd.DataFrame
        Dataframe containing features to calculate correlations with. Index must be the same as the index of the horizontal_features dataframe.
    horizontal_features : pd.DataFrame
        Dataframe containing features to calculate correlations with. Index must be the same as the index of the vertical_features dataframe.
    method : str
        Method to use for calculating correlations. One of "pearson", "spearman", or "kendall". Default is "pearson".
    vertical_feature_name : str
        Name of the vertical features to use as suffix in correlation dataframe. Default is blank "".
    horizontal_feature_name : str
        Name of the horizontal features to use as suffix in correlation dataframe. Default is blank "".

    Returns
    -------
    correlation_matrix : pd.DataFrame
        Dataframe containing correlation values for the joined feature set.

    Raises
    ------
    TypeError
        If either feature set is not a pandas DataFrame.
    ValueError
        If the method is not recognized, the indices do not match, or the
        correlation computation itself fails.
    """
    # Validate input types with explicit exceptions instead of asserts,
    # which are silently stripped when Python runs with -O.
    if not isinstance(vertical_features, pd.DataFrame):
        raise TypeError("vertical_features must be a pandas DataFrame")
    if not isinstance(horizontal_features, pd.DataFrame):
        raise TypeError("horizontal_features must be a pandas DataFrame")

    if method not in ["pearson", "spearman", "kendall"]:
        raise ValueError("Correlation method must be one of 'pearson', 'spearman', or 'kendall'.")

    if not vertical_features.index.equals(horizontal_features.index):
        raise ValueError("Vertical and horizontal features must have the same index to calculate correlation. Set the index to the intersection of patient IDs.")

    # Add _ to beginning of feature names if they are not blank so they can be
    # used as suffixes without leaving a stray underscore for blank names
    if vertical_feature_name:
        vertical_feature_name = f"_{vertical_feature_name}"
    if horizontal_feature_name:
        horizontal_feature_name = f"_{horizontal_feature_name}"

    # Join the features into one dataframe
    # Use inner join to keep only the rows that have a value in both vertical and horizontal features
    # Suffixes are only applied by pandas to column names present in both dataframes
    features_to_correlate = vertical_features.join(horizontal_features,
                                                   how='inner',
                                                   lsuffix=vertical_feature_name,
                                                   rsuffix=horizontal_feature_name)

    try:
        # Calculate correlation between vertical features and horizontal features
        correlation_matrix = features_to_correlate.corr(method=method)
    except Exception as e:
        # Chain the original exception so the root cause stays visible to callers
        raise ValueError(f"Error calculating correlation matrix: {e}") from e

    return correlation_matrix


def plotCorrelationHeatmap(correlation_matrix_df:pd.DataFrame,
                           diagonal:bool = False,
                           triangle:str = "lower",
                           cmap:str = "nipy_spectral",
                           xlabel:str = "",
                           ylabel:str = "",
                           title:str = "",
                           subtitle:str = "",
                           show_tick_labels:bool = False
                           ):
    """Function to plot a correlation heatmap.

    Parameters
    ----------
    correlation_matrix_df : pd.DataFrame
        Dataframe containing the correlation matrix to plot.
    diagonal : bool, optional
        Whether to only plot half of the matrix. The default is False.
    triangle : str, optional
        Which triangle half of the matrix to plot, "lower" or "upper". Only
        used when diagonal is True. The default is "lower".
    cmap : str, optional
        Matplotlib colormap name to use for the heatmap. The default is "nipy_spectral".
    xlabel : str, optional
        Label for the x-axis. The default is "".
    ylabel : str, optional
        Label for the y-axis. The default is "".
    title : str, optional
        Title for the plot. The default is "".
    subtitle : str, optional
        Subtitle for the plot. The default is "".
    show_tick_labels : bool, optional
        Whether to show the tick labels on the x and y axes. These would be the feature names. The default is False.

    Returns
    -------
    corr_fig : matplotlib.pyplot.figure
        Figure object containing a Seaborn heatmap.

    Raises
    ------
    ValueError
        If diagonal is True and triangle is neither "lower" nor "upper".
    """

    if diagonal:
        # Build a boolean triangle mask (including the diagonal) from a
        # matrix of ones rather than from the correlation values themselves:
        # masking on the values would leave cells with a correlation of
        # exactly 0 unmasked in the hidden triangle.
        if triangle == "lower":
            # Mask out the upper right triangle half of the matrix
            mask = np.triu(np.ones_like(correlation_matrix_df, dtype=bool))
        elif triangle == "upper":
            # Mask out the lower left triangle half of the matrix
            mask = np.tril(np.ones_like(correlation_matrix_df, dtype=bool))
        else:
            raise ValueError("If diagonal is True, triangle must be either 'lower' or 'upper'.")
    else:
        # The entire correlation matrix will be visible in the plot
        mask = None

    # Set a default title if one is not provided
    if not title:
        title = "Correlation Heatmap"

    # Set up figure and axes for the plot
    corr_fig, corr_ax = plt.subplots()

    # Plot the correlation matrix; vmin/vmax pin the colour scale to the
    # full [-1, 1] correlation range so colours are comparable across plots
    corr_ax = sns.heatmap(correlation_matrix_df,
                          mask = mask,
                          cmap=cmap,
                          vmin=-1.0,
                          vmax=1.0)

    if not show_tick_labels:
        # Remove the individual feature names from the axes
        corr_ax.set_xticklabels(labels=[])
        corr_ax.set_yticklabels(labels=[])

    # Set axis labels
    corr_ax.set_xlabel(xlabel)
    corr_ax.set_ylabel(ylabel)

    # Set title and subtitle
    # Suptitle is the super title, which will be above the title
    plt.title(subtitle, fontsize=12)
    plt.suptitle(title, fontsize=14)

    return corr_fig
Empty file added src/readii/data/labelling.py
Empty file.
43 changes: 43 additions & 0 deletions src/readii/data/process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from pandas import DataFrame
from typing import Optional

def dropUpToFeature(dataframe:DataFrame,
                    feature_name:str,
                    keep_feature_name_column:bool = False
                    ):
    """ Function to drop all columns up to and possibly including the specified feature.

    Parameters
    ----------
    dataframe : DataFrame
        Dataframe to drop columns from.
    feature_name : str
        Name of the feature to drop up to.
    keep_feature_name_column : bool, optional
        Whether to keep the specified feature name column in the dataframe or drop it. The default is False.

    Returns
    -------
    dataframe : DataFrame
        Dataframe with all columns up to (and, unless keep_feature_name_column
        is True, including) the specified feature dropped. If the feature is
        not found, the original dataframe is returned unchanged.
    """
    # Locate the feature column; only a missing column is an expected,
    # recoverable failure, so nothing broader than KeyError is caught here.
    try:
        feature_index = dataframe.columns.get_loc(feature_name)
    except KeyError:
        print(f"Feature {feature_name} was not found as a column in dataframe. No columns dropped.")
        return dataframe

    if keep_feature_name_column:
        # Drop the column names up to but not including the specified feature
        column_names = dataframe.columns.to_list()[:feature_index]
    else:
        # Drop the column names up to and including the specified feature
        column_names = dataframe.columns.to_list()[:feature_index+1]

    return dataframe.drop(columns=column_names)
Comment on lines +41 to +43
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Consider removing general except clause or provide specific handling

Catching a general exception and returning None can obscure unexpected errors and make debugging difficult.

Apply this diff to remove the general except clause or handle specific exceptions:

         except KeyError:
             print(f"Feature {feature_name} was not found as a column in dataframe. No columns dropped.")
             return dataframe
-        except Exception as e:
-            print(f"An error occurred: {e}")
-            return None

Alternatively, specify the exceptions you expect:

         except KeyError:
             print(f"Feature {feature_name} was not found as a column in dataframe. No columns dropped.")
             return dataframe
         except SomeSpecificException as e:
             print(f"An error occurred: {e}")
             return None

Committable suggestion skipped: line range outside the PR's diff.

Empty file added src/readii/io/__init__.py
Empty file.
Empty file.
70 changes: 70 additions & 0 deletions src/readii/io/loaders/features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import os
import pandas as pd

from typing import Optional, Dict

from readii.io.loaders.general import loadFileToDataFrame


def loadFeatureFilesFromImageTypes(extracted_feature_dir:str,
                                   image_types:Optional[list] = None,
                                   drop_labels:bool = True,
                                   labels_to_drop:Optional[list] = None) -> Dict[str,pd.DataFrame]:
    """Function to load in all the extracted imaging feature sets from a directory and return them as a dictionary of dataframes.

    Parameters
    ----------
    extracted_feature_dir : str
        Path to the directory containing the extracted feature csv files
    image_types : list, optional
        List of image types to load in. The default is ['original'].
    drop_labels : bool, optional
        Whether to drop the labels from the dataframes. Use when loading labelled data from data_setup_for_modeling.ipynb. The default is True.
    labels_to_drop : list, optional
        List of labels to drop from the dataframes. The default is ["patient_ID","survival_time_in_years","survival_event_binary"] based on code
        in data_setup_for_modeling.ipynb.

    Returns
    -------
    feature_sets : dict
        Dictionary of dataframes containing the extracted radiomics features.
    """
    # Initialize mutable defaults inside the function: list defaults in the
    # signature are evaluated once and shared across all calls.
    if image_types is None:
        image_types = ['original']
    if labels_to_drop is None:
        labels_to_drop = ["patient_ID", "survival_time_in_years", "survival_event_binary"]

    # Initialize dictionary to store the feature sets
    feature_sets = {}

    feature_file_list = os.listdir(extracted_feature_dir)

    # Loop through all the files in the directory
    for image_type in image_types:
        # Extract the image type feature csv file from the feature directory.
        # This should match exactly one file; the first match is used.
        matching_files = [file for file in feature_file_list if (image_type in file) and (file.endswith(".csv"))]
        if not matching_files:
            print(f"No {image_type} feature csv files found in {extracted_feature_dir}")
            # Skip to the next image type
            continue
        image_type_feature_file = matching_files[0]
        # Remove the image type file from the list of feature files so it is
        # not matched again by a later image type
        feature_file_list.remove(image_type_feature_file)

        # Get the full path to the feature file
        feature_file_path = os.path.join(extracted_feature_dir, image_type_feature_file)

        # Load the feature data into a pandas dataframe
        raw_feature_data = loadFileToDataFrame(feature_file_path)

        # Drop the labels from the dataframe if specified
        if drop_labels:
            try:
                # Data is now only extracted features
                raw_feature_data.drop(labels_to_drop, axis=1, inplace=True)
            except KeyError:
                # Only a missing label column is expected and recoverable here
                print(f"{feature_file_path} does not have the labels {labels_to_drop} to drop.")
                # Skip to the next image type
                continue

        # Save the dataframe to the feature_sets dictionary
        feature_sets[image_type] = raw_feature_data

    return feature_sets
70 changes: 70 additions & 0 deletions src/readii/io/loaders/general.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import os
import pandas as pd
import yaml

from typing import Optional, Dict, Union


def loadImageDatasetConfig(dataset_name:str,
                           config_dir_path:str) -> Optional[dict]:
    """Load the configuration file for a given dataset. Expects the configuration file to be named <dataset_name>.yaml.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset to load the configuration file for.
    config_dir_path : str
        Path to the directory containing the configuration files.

    Returns
    -------
    dict or None
        Dictionary containing the configuration settings for the dataset,
        or None if the configuration file does not exist.

    Examples
    --------
    >>> config = loadImageDatasetConfig("NSCLC_Radiogenomics", "config/")
    """
    # Make full path to config file
    config_file_path = os.path.join(config_dir_path, f"{dataset_name}.yaml")

    # Check if config file exists
    if not os.path.exists(config_file_path):
        print(f"Config file {config_file_path} does not exist.")
        return None

    # Load the config file; the context manager guarantees the file handle
    # is closed even if parsing raises.
    with open(config_file_path, "r") as config_file:
        config = yaml.safe_load(config_file)
    return config



def loadFileToDataFrame(file_path:str) -> Optional[pd.DataFrame]:
    """Load data from a csv or xlsx file into a pandas dataframe.

    Parameters
    ----------
    file_path (str): Path to the data file. Must end in .csv or .xlsx.

    Returns
    -------
    pd.DataFrame: Dataframe containing the data from the file, or None if the
    file format is unsupported or the file could not be read.
    """
    # Get the file extension
    _, file_extension = os.path.splitext(file_path)

    # Reject unsupported formats up front. The original code raised a
    # ValueError inside its own try block and immediately caught it with
    # `except Exception`, making the raise dead code; the printed message
    # and None return are preserved here.
    if file_extension not in ('.csv', '.xlsx'):
        print("An error occurred: Unsupported file format. Please provide a .csv or .xlsx file.")
        return None

    try:
        # Check if the file is an Excel file
        if file_extension == '.xlsx':
            return pd.read_excel(file_path)
        # Otherwise the file is a CSV file
        return pd.read_csv(file_path)
    except Exception as e:
        # Best-effort loader: report read/parse failures and return None,
        # matching this module's convention of not raising to the caller.
        print(f"An error occurred: {e}")
        return None
Empty file added src/readii/io/loaders/images.py
Empty file.
Empty file.
6 changes: 5 additions & 1 deletion src/readii/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,11 @@ def loadRTSTRUCTSITK(

# Set up segmentation loader
logger.debug(f"Making mask using ROI names: {roiNames}")
makeMask = StructureSetToSegmentation(roi_names=roiNames)

# Initialize med-imagetools loader to convert RTSTRUCT point cloud to a segmentation
# Set continous to False to ensure indices are integers and not floats
makeMask = StructureSetToSegmentation(roi_names=roiNames,
continuous=False)

try:
# Get the individual ROI masks
Expand Down
Loading
Loading