mahmoodlab
diff --git a/‎pyproject.toml
Lines changed: 2 additions & 2 deletions b/‎pyproject.toml
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/hest/HESTData.py
Lines changed: 342 additions & 180 deletions b/‎src/hest/HESTData.py
Lines changed: 342 additions & 180 deletions
diff --git a/‎src/hest/LazyShapes.py
Lines changed: 63 additions & 0 deletions b/‎src/hest/LazyShapes.py
Lines changed: 63 additions & 0 deletions
diff --git a/‎src/hest/__init__.py
Lines changed: 3 additions & 1 deletion b/‎src/hest/__init__.py
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/hest/autoalign.py
Lines changed: 6 additions & 4 deletions b/‎src/hest/autoalign.py
Lines changed: 6 additions & 4 deletions
diff --git a/‎src/hest/io/seg_readers.py
Lines changed: 212 additions & 0 deletions b/‎src/hest/io/seg_readers.py
Lines changed: 212 additions & 0 deletions
@@ -13,7 +13,6 @@ dependencies = [
     "ultralytics >= 8.2.4",
     "pyvips >= 2.2.3",
     "scanpy >= 1.10.1",
-    "kwimage >= 0.9.25",
     "imagecodecs >= 2024.1.1",
     "loguru >= 0.7.2",
     "timm >= 0.9.16",
@@ -27,7 +26,8 @@ dependencies = [
     "spatialdata >= 0.1.2",
     "dask >= 2024.2.1",
     "spatial_image >= 0.3.0",
-    "datasets"
+    "datasets",
+    "mygene"
 ]
 
 requires-python = ">=3.9"
 
@@ -0,0 +1,63 @@
+import geopandas as gpd
+import pandas as pd
+from shapely import Polygon
+
+from hest.io.seg_readers import read_gdf
+from hest.utils import verify_paths
+
+
+class LazyShapes:
+    """Set of shapes stored in a file and loaded into memory only on first use.
+
+    The file is parsed with `read_gdf` the first time `shapes` is accessed
+    (or `compute()` is called); the resulting GeoDataFrame is then cached
+    on the instance.
+
+    Args:
+        path (str): path to the file holding the shapes, passed to `verify_paths` at construction
+        name (str): name given to this set of shapes
+        coordinate_system (str): label of the coordinate system the shapes are expressed in
+    """
+
+    path: str = None
+
+    def __init__(self, path: str, name: str, coordinate_system: str):
+        verify_paths([path])
+        self.path = path
+        self.name = name
+        self.coordinate_system = coordinate_system
+        # stays None until compute() has read the file
+        self._shapes = None
+
+    def compute(self) -> None:
+        """Read the shapes from `self.path` unless they are already loaded."""
+        if self._shapes is None:
+            self._shapes = read_gdf(self.path)
+
+    @property
+    def shapes(self) -> gpd.GeoDataFrame:
+        """Shapes as a GeoDataFrame, read from disk on first access."""
+        if self._shapes is None:
+            self.compute()
+
+        return self._shapes
+
+    def __repr__(self) -> str:
+        # the original also evaluated super().__repr__() but never used the result
+        loaded_rep = 'loaded' if self._shapes is not None else 'not loaded'
+        return f"""name: {self.name}, coord-system: {self.coordinate_system}, <{loaded_rep}>"""
+    
+
+def convert_old_to_gpd(contours_holes, contours_tissue) -> gpd.GeoDataFrame:
+    """Convert legacy tissue/hole contour lists into a single GeoDataFrame.
+
+    Each tissue contour becomes one polygon row, immediately followed by one
+    row per hole listed at the same position in `contours_holes`.
+
+    Args:
+        contours_holes: for each tissue, the collection of its hole contours
+        contours_tissue: tissue contours, same length as `contours_holes`
+
+    Returns:
+        gpd.GeoDataFrame: columns `tissue_id` (position of the parent tissue),
+            `hole` (True for hole rows) and the polygon geometry
+    """
+    assert len(contours_holes) == len(contours_tissue)
+
+    geometries = []
+    parent_ids = []
+    kinds = []
+    # contours are indexed as [:, 0, :] -- presumably OpenCV-style (n, 1, 2) arrays; TODO confirm
+    for tissue_id, (tissue, holes) in enumerate(zip(contours_tissue, contours_holes)):
+        geometries.append(Polygon(tissue[:, 0, :]))
+        parent_ids.append(tissue_id)
+        kinds.append('tissue')
+        for hole in holes:
+            geometries.append(Polygon(hole[:, 0, :]))
+            parent_ids.append(tissue_id)
+            kinds.append('hole')
+
+    df = pd.DataFrame(parent_ids, columns=['tissue_id'])
+    df['hole'] = kinds
+    df['hole'] = df['hole'].eq('hole')
+
+    return gpd.GeoDataFrame(df, geometry=geometries)
+        
@@ -4,6 +4,7 @@
 from .autoalign import autoalign_visium
 from .readers import *
 from .HESTData import HESTData, read_HESTData, load_hest
+from .segmentation.cell_segmenters import segment_cellvit
 
 __all__ = [
     'tiff_save',
@@ -18,5 +19,6 @@
     'STReader', 
     'autoalign_visium',
     'write_10X_h5',
-    'HESTData'
+    'HESTData',
+    'segment_cellvit'
 ]
@@ -5,10 +5,7 @@
 import cv2
 import matplotlib.collections as mc
 import matplotlib.patches as patches
-import matplotlib.pyplot as plt
 import numpy as np
-from kwimage.im_cv2 import imresize
-from ultralytics import YOLO
 
 from hest.utils import get_path_relative
 
@@ -89,7 +86,7 @@ def _spots_to_file(path, dict):
 def _resize_to_target(img):
+    # Rescale `img` so that the largest entry of `img.shape` becomes about
+    # TARGET_PIXEL_EDGE; returns the resized image and the scale factor applied.
     TARGET_PIXEL_EDGE = 1000
     downscale_factor = TARGET_PIXEL_EDGE / np.max(img.shape)
-    downscaled_fullres = imresize(img, downscale_factor)
+    # cv2.resize takes the target size as (width, height), hence shape[1] before shape[0]
+    # (assumes img is laid out rows-first, i.e. (height, width[, channels]) -- TODO confirm)
+    downscaled_fullres = cv2.resize(img, (round(img.shape[1] * downscale_factor), round(img.shape[0] * downscale_factor)))
     return downscaled_fullres, downscale_factor
 
 
@@ -101,6 +98,9 @@ def _alignment_plot_to_file(boxes_to_match,
                             aligned_fiducials,
                             img,
                             save_path):
+
+    import matplotlib.pyplot as plt
+
     fig, ax = plt.subplots()
 
     i = 0
@@ -198,6 +198,8 @@ def autoalign_visium(fullres_img: np.ndarray, save_dir: str=None, name='') -> Di
     Returns:
         Dict: spot alignment as a dictionary
     """ 
+    from ultralytics import YOLO
+    
     path_model = get_path_relative(__file__, '../../models/visium_yolov8_v1.pt')
     model = YOLO(path_model)
 
 
@@ -0,0 +1,212 @@
+import json
+import warnings
+from abc import ABC, abstractmethod
+
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+from matplotlib import pyplot as plt
+from shapely.geometry.polygon import Point, Polygon
+from tqdm import tqdm
+
+
+def _process(x, extra_props, index_key, class_name):
+    """Convert one GeoJSON feature into a GeoDataFrame with one row per geometry.
+
+    Args:
+        x (dict): feature with a `geometry` of type MultiPoint or MultiPolygon and
+            a `properties` entry containing `classification.name`
+        extra_props (bool): if truthy, every property other than `objectType`,
+            `classification` and `index_key` is copied as a constant column
+        index_key (str): optional property holding a list of positions inside the
+            feature; it is expanded into a boolean column of the same name
+        class_name (str): name of the column receiving the classification name,
+            'class' when empty or None
+
+    Returns:
+        gpd.GeoDataFrame: one row per point/polygon of the feature
+
+    Raises:
+        ValueError: if the geometry type is neither MultiPoint nor MultiPolygon
+    """
+    geometry = x['geometry']
+    geom_type = geometry['type']
+    if geom_type == 'MultiPoint':
+        coords = [Point(c) for c in geometry['coordinates']]
+    elif geom_type == 'MultiPolygon':
+        # only the first ring of each polygon is read
+        coords = [Polygon(c[0]) for c in geometry['coordinates']]
+    else:
+        # f-string: the offending type was previously printed as a literal '{geom_type}'
+        raise ValueError(f"Doesn't recognize type {geom_type}, must be either MultiPoint or MultiPolygon")
+
+    name = x['properties']['classification']['name']
+
+    gdf = gpd.GeoDataFrame(geometry=coords)
+
+    class_index = 'class' if not class_name else class_name
+    gdf[class_index] = [name] * len(gdf)
+
+    if index_key is not None:
+        indices = x['properties'][index_key]
+        values = np.zeros(len(geometry['coordinates']), dtype=bool)
+        values[indices] = True
+        gdf[index_key] = values
+
+    if extra_props:
+        # index_key is skipped so the boolean column built above is not overwritten
+        # by the raw list of indices (write_geojson excludes that key as well)
+        skipped = {'objectType', 'classification'}
+        if index_key is not None:
+            skipped.add(index_key)
+        for prop, val in x['properties'].items():
+            if prop in skipped:
+                continue
+            gdf[prop] = [val] * len(gdf)
+
+    return gdf
+
+
+def _read_geojson(path, class_name=None, extra_props=False, index_key=None) -> gpd.GeoDataFrame:
+    """Read a JSON file holding a list of features into one GeoDataFrame.
+
+    Every feature is converted with `_process` and the results are
+    concatenated with a fresh index.
+
+    Args:
+        path: path to the JSON file
+        class_name, extra_props, index_key: forwarded to `_process`
+
+    Returns:
+        gpd.GeoDataFrame: rows of all features, in file order
+    """
+    with open(path) as f:
+        features = json.load(f)
+
+    frames = [
+        _process(feature, extra_props, index_key, class_name)
+        for feature in tqdm(features)
+    ]
+    merged = pd.concat(frames, ignore_index=True)
+    return gpd.GeoDataFrame(merged)
+
+
+class GDFReader(ABC):
+    """Base class of the readers that load a file into a GeoDataFrame.
+
+    Deriving from ABC makes `@abstractmethod` effective: a subclass that does
+    not implement `read_gdf` can no longer be instantiated.
+    """
+
+    @abstractmethod
+    def read_gdf(self, path) -> gpd.GeoDataFrame:
+        """Read the file at `path` and return its content as a GeoDataFrame."""
+        pass
+    
+
+class XeniumParquetCellReader(GDFReader):
+    """Reader for parquet files storing one polygon vertex per row.
+
+    The file must provide the columns `cell_id`, `vertex_x` and `vertex_y`.
+    """
+
+    def read_gdf(self, path) -> gpd.GeoDataFrame:
+        """Group the vertices by `cell_id` and build one polygon per cell.
+
+        Returns:
+            gpd.GeoDataFrame: a `cell_id` column plus the polygon geometry
+        """
+        vertices = pd.read_parquet(path)
+
+        vertices['xy'] = list(zip(vertices['vertex_x'], vertices['vertex_y']))
+        vertices = vertices.drop(columns=['vertex_x', 'vertex_y'])
+
+        # the vertices of a cell are turned into a polygon in their row order
+        cells = vertices.groupby('cell_id').agg({'xy': Polygon}).reset_index()
+
+        return gpd.GeoDataFrame(cells, geometry=cells['xy']).drop(columns=['xy'])
+
+
+class GDFParquetCellReader(GDFReader):
+    """Reader for parquet files that geopandas can load directly."""
+    
+    def read_gdf(self, path) -> gpd.GeoDataFrame:
+        """Return the content of `path` as loaded by `gpd.read_parquet`."""
+        return gpd.read_parquet(path)
+
+
+class GeojsonCellReader(GDFReader):
+    """Reader for cell shapes stored in a .geojson file."""
+    
+    def read_gdf(self, path) -> gpd.GeoDataFrame:
+        """Read `path` with `_read_geojson` (default options) and number the cells."""
+        gdf = _read_geojson(path)
+        # cell ids are simply the row positions 0..len(gdf)-1
+        gdf['cell_id'] = np.arange(len(gdf))
+            
+        return gdf
+    
+
+class TissueContourReader(GDFReader):
+    """Reader for tissue contours stored in a .geojson file."""
+
+    def read_gdf(self, path) -> gpd.GeoDataFrame:
+        """Read `path` with `_read_geojson`.
+
+        The classification name of each feature is stored in the `tissue_id`
+        column and the `hole` property is expanded into a boolean `hole` column.
+        """
+        
+        gdf = _read_geojson(path, class_name='tissue_id', index_key='hole')
+            
+        return gdf
+    
+
+def write_geojson(gdf: gpd.GeoDataFrame, path: str, category_key: str, extra_prop=False, uniform_prop=True, index_key: str=None) -> None:
+    """Write a GeoDataFrame of points or polygons as a JSON list of features.
+
+    One feature (MultiPoint or MultiPolygon) is written per unique value of
+    `category_key`; the geometry kind is decided from the first geometry only.
+    For polygons only the exterior ring is written.
+
+    Args:
+        gdf (gpd.GeoDataFrame): shapes to export, must contain at least one row
+        path (str): destination file
+        category_key (str): column used to group the shapes into features
+        extra_prop (bool): if True, every column other than `category_key`,
+            'geometry' and `index_key` is exported as a feature property, using
+            the value of the first row of the group
+        uniform_prop (bool): when exporting extra properties, warn if a column
+            holds more than one distinct value inside a group
+        index_key (str): optional column; the positions (inside the group) of the
+            rows where it is True are exported as a list under that name
+
+    Raises:
+        ValueError: if the first geometry is neither a Point nor a Polygon
+        IndexError: if `gdf` is empty
+    """
+    first_geom = gdf.geometry.iloc[0]
+    is_point = isinstance(first_geom, Point)
+    is_polygon = isinstance(first_geom, Polygon)
+    if is_point:
+        geometry = 'MultiPoint'
+    elif is_polygon:
+        geometry = 'MultiPolygon'
+    else:
+        raise ValueError(f"gdf.geometry[0] must be of type Point or Polygon, got {type(first_geom)}")
+
+    groups = np.unique(gdf[category_key])
+    colors = generate_colors(groups)
+
+    # columns exported as extra properties; identical for every group
+    extra_cols = []
+    if extra_prop:
+        col_exclude = [category_key, 'geometry']
+        if index_key is not None:
+            col_exclude.append(index_key)
+        extra_cols = [c for c in gdf.columns if c not in col_exclude]
+
+    cells = []
+    for group in tqdm(groups):
+
+        group_gdf = gdf[gdf[category_key] == group]
+        shapes = group_gdf.geometry
+
+        properties = {
+            "objectType": "annotation",
+            "classification": {
+                "name": str(group),
+                "color": colors[group]
+            }
+        }
+
+        for col in extra_cols:
+            if uniform_prop:
+                unique = np.unique(group_gdf[col])
+                if len(unique) != 1:
+                    warnings.warn(f"extra property {col} is not uniform for group {group}, found {unique}")
+            val = group_gdf[col].iloc[0]
+            # numpy scalars (int64, float64, bool_...) are rejected by json.dump
+            if isinstance(val, np.generic):
+                val = val.item()
+            properties[col] = val
+
+        if index_key is not None:
+            mask = (group_gdf[index_key] == True).values
+            properties[index_key] = np.flatnonzero(mask).tolist()
+
+        if is_point:
+            shapes = [[point.x, point.y] for point in shapes]
+        elif is_polygon:
+            shapes = [[[[x, y] for x, y in polygon.exterior.coords]] for polygon in shapes]
+        cell = {
+            'type': 'Feature',
+            'id': (str(id(path)) + '-id-' + str(group)).replace('.', '-'),
+            'geometry': {
+                'type': geometry,
+                'coordinates': shapes
+            },
+            "properties": properties
+        }
+        cells.append(cell)
+
+    with open(path, 'w') as f:
+        json.dump(cells, f, indent=4)
+            
+    
+    
+def generate_colors(names):
+    """Assign one color of the 'hsv' colormap to each name.
+
+    Args:
+        names: indexable, sized collection of hashable names
+
+    Returns:
+        dict: name -> [r, g, b] list of ints obtained as int(255 * channel)
+    """
+    cmap = plt.get_cmap('hsv', len(names))
+    return {
+        names[idx]: [int(255 * channel) for channel in cmap(idx)[:3]]
+        for idx in range(len(names))
+    }
+
+
+def read_parquet_schema_df(path: str) -> pd.DataFrame:
+    """Return the schema of a local parquet file as a Pandas dataframe.
+
+    Only the schema is read through pyarrow. The returned dataframe has
+    one row per field and the columns: column, pa_dtype
+    """
+    import pyarrow.parquet
+
+    # Ref: https://stackoverflow.com/a/64288036/
+    pa_schema = pyarrow.parquet.read_schema(path, memory_map=True)
+    rows = [
+        {"column": name, "pa_dtype": str(pa_dtype)}
+        for name, pa_dtype in zip(pa_schema.names, pa_schema.types)
+    ]
+    # reindex guarantees both columns exist even when the schema has no field
+    return pd.DataFrame(rows).reindex(columns=["column", "pa_dtype"], fill_value=pd.NA)
+    
+    
+def cell_reader_factory(path) -> GDFReader:
+    """Pick the reader matching a cell segmentation file.
+
+    Args:
+        path (str): path ending with .geojson or .parquet
+
+    Returns:
+        GDFReader: GeojsonCellReader for .geojson; for .parquet,
+            GDFParquetCellReader when the schema has a `geometry` column,
+            XeniumParquetCellReader otherwise
+
+    Raises:
+        ValueError: if the extension is neither .geojson nor .parquet
+    """
+    if path.endswith('.geojson'):
+        return GeojsonCellReader()
+
+    if not path.endswith('.parquet'):
+        ext = path.split('.')[-1]
+        raise ValueError(f'Unknown file extension {ext} for a cell segmentation file, needs to be .geojson or .parquet')
+
+    # only the parquet schema is inspected to choose between the two parquet layouts
+    column_names = read_parquet_schema_df(path)['column'].values
+    if 'geometry' in column_names:
+        return GDFParquetCellReader()
+    return XeniumParquetCellReader()
+    
+    
+def read_gdf(path) -> gpd.GeoDataFrame:
+    """Read `path` with the reader selected by `cell_reader_factory`."""
+    return cell_reader_factory(path).read_gdf(path)