From 8ad36058c6e3c082da965eeae3256f0ac1f4ed66 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Fri, 23 May 2025 10:44:49 +0200
Subject: [PATCH 01/19] Update public API

---
 README.md              | 31 +++++++++++++++++++++++--------
 src/sssom/__init__.py  |  3 ++-
 src/sssom/parsers.py   | 25 ++++++++++++++++++++++---
 src/sssom/util.py      |  6 ------
 tests/test_collapse.py | 10 +++++++++-
 5 files changed, 56 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 30c22a57..93837377 100644
--- a/README.md
+++ b/README.md
@@ -21,17 +21,32 @@
 
-SSSOM (Simple Standard for Sharing Ontology Mappings) is a TSV and RDF/OWL standard for ontology mappings
+A Python library and command line interface (CLI) for working with
+[SSSOM (Simple Standard for Sharing Ontology Mappings)](https://github.com/mapping-commons/sssom).
 
-```
-WARNING:
-   The export formats (json, rdf) of sssom-py are not yet finalised!
-   Please expect changes in future releases!
-```
+## Getting Started
+
+A SSSOM TSV can be parsed with
+
+```python
+import sssom
 
-See https://github.com/OBOFoundry/SSSOM
+# other SSSOM files can be found on https://mapping-commons.github.io
+url = "https://raw.githubusercontent.com/mapping-commons/mh_mapping_initiative/master/mappings/mp_hp_eye_impc.sssom.tsv"
+
+# TSV can be parsed into a mapping set dataframe object,
+# which includes a pandas DataFrame, a curies.Converter,
+# and metadata
+msdf = sssom.parse_tsv(url)
+
+# SSSOM comes with several "write" functions
+sssom.write_json(msdf, "test.json")
+sssom.write_owl(msdf, "test.owl")
+sssom.write_rdf(msdf, "test.ttl")
+```
 
-This is a python library and command line toolkit for working with SSSOM. It also defines a schema for SSSOM.
+> [!WARNING]
+> The export formats (json, rdf) of sssom-py are not yet finalised! Expect changes in future releases.
 
 ## Documentation
 
diff --git a/src/sssom/__init__.py b/src/sssom/__init__.py
index 08f4fafc..ec1d032f 100644
--- a/src/sssom/__init__.py
+++ b/src/sssom/__init__.py
@@ -19,8 +19,9 @@
     dataframe_to_ptable,
     filter_redundant_rows,
     group_mappings,
-    parse,
     reconcile_prefix_and_data,
 )
 from .constants import generate_mapping_set_id, get_default_metadata  # noqa:401
+from .parsers import parse_tsv  # noqa:401
+from .writers import write_json, write_owl, write_rdf, write_tsv  # noqa:401
diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index b95c6573..ae5c1be3 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -287,9 +287,25 @@ def parse_sssom_table(
     file_path: Union[str, Path, TextIO],
     prefix_map: ConverterHint = None,
     meta: Optional[MetadataType] = None,
-    **kwargs,
+    *,
+    strict: bool = False,
 ) -> MappingSetDataFrame:
-    """Parse a TSV to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
+    """Parse a SSSOM TSV.
+
+    :param file_path:
+        A file path, URL, or I/O object that contains SSSOM encoded in TSV
+    :param prefix_map:
+        A prefix map or :class:`curies.Converter` used to validate prefixes,
+        CURIEs, and IRIs appearing in the SSSOM TSV
+    :param meta:
+        Additional document-level metadata for the SSSOM TSV document that is not
+        contained within the document itself. For example, this may come from a
+        companion SSSOM YAML file.
+    :param strict:
+        If true, will fail parsing for undefined prefixes, CURIEs, or IRIs
+    :returns:
+        A parsed dataframe wrapper object
+    """
     if isinstance(file_path, Path) or isinstance(file_path, str):
         raise_for_bad_path(file_path)
     stream: io.StringIO = _open_input(file_path)
@@ -301,7 +317,7 @@ def parse_sssom_table(
     is_valid_built_in_prefixes = _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map)
     is_valid_metadata = _is_irregular_metadata([sssom_metadata, meta])
 
-    if kwargs.get("strict"):
+    if strict:
         _fail_in_strict_parsing_mode(is_valid_built_in_prefixes, is_valid_metadata)
 
     # The priority order for combining prefix maps are:
@@ -334,6 +350,9 @@ def parse_sssom_table(
     return msdf
 
 
+parse_tsv = parse_sssom_table
+
+
 def parse_sssom_rdf(
     file_path: str,
     prefix_map: ConverterHint = None,
diff --git a/src/sssom/util.py b/src/sssom/util.py
index 699e1ed0..7c5dee58 100644
--- a/src/sssom/util.py
+++ b/src/sssom/util.py
@@ -399,12 +399,6 @@ class MappingSetDiff:
     """
 
 
-def parse(filename: Union[str, Path]) -> pd.DataFrame:
-    """Parse a TSV to a pandas frame."""
-    logging.info(f"Parsing {filename}")
-    return pd.read_csv(filename, sep="\t", comment="#")
-
-
 def collapse(df: pd.DataFrame) -> pd.DataFrame:
     """Collapse rows with same S/P/O and combines confidence."""
     df2 = df.groupby([SUBJECT_ID, PREDICATE_ID, OBJECT_ID])[CONFIDENCE].apply(max).reset_index()
diff --git a/tests/test_collapse.py b/tests/test_collapse.py
index a366270c..9cde66d2 100644
--- a/tests/test_collapse.py
+++ b/tests/test_collapse.py
@@ -1,6 +1,10 @@
 """Test various grouping functionalities."""
 
 import unittest
+from pathlib import Path
+from typing import Union
+
+import pandas as pd
 
 from sssom.parsers import parse_sssom_table
 from sssom.util import (
@@ -9,12 +13,16 @@
     dataframe_to_ptable,
     filter_redundant_rows,
     group_mappings,
-    parse,
     reconcile_prefix_and_data,
 )
 from tests.constants import data_dir
 
 
+def parse(filename: Union[str, Path]) -> pd.DataFrame:
+    """Parse a TSV to a pandas frame."""
+    return pd.read_csv(filename, sep="\t", comment="#")
+
+
 class TestCollapse(unittest.TestCase):
     """Test various grouping functionalities."""
 
From 4cb86c95a7f10d8cf87185c8485cd3dcf8fe097a Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Fri, 23 May 2025 10:44:55 +0200
Subject: [PATCH 02/19] Update writers.py

---
 src/sssom/writers.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/sssom/writers.py b/src/sssom/writers.py
index 01917a30..4d9be045 100644
--- a/src/sssom/writers.py
+++ b/src/sssom/writers.py
@@ -51,8 +51,8 @@ def write_table(
     msdf: MappingSetDataFrame,
     file: TextIO,
     embedded_mode: bool = True,
-    serialisation="tsv",
-    sort=False,
+    serialisation: str = "tsv",
+    sort: bool = False,
 ) -> None:
     """Write a mapping set dataframe to the file as a table."""
     sep = _get_separator(serialisation)
@@ -79,6 +79,11 @@ def write_table(
             yaml.safe_dump(meta, y)
 
 
+def write_tsv(msdf: MappingSetDataFrame, path: str | Path | TextIO, sort: bool = False) -> None:
+    """Write a mapping set to a TSV file."""
+    raise NotImplementedError
+
+
 def write_rdf(
     msdf: MappingSetDataFrame,
     file: TextIO,
From 47061a7da20150d9dd43942f36ddd1b71770a8e9 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Fri, 23 May 2025 10:57:32 +0200
Subject: [PATCH 03/19] Enable passing paths to writers

---
 src/sssom/writers.py  | 61 +++++++++++++++++++++++++++++++------------
 tests/test_writers.py | 23 ++++++----------
 2 files changed, 53 insertions(+), 31 deletions(-)

diff --git a/src/sssom/writers.py b/src/sssom/writers.py
index 4d9be045..3e8714e4 100644
--- a/src/sssom/writers.py
+++ b/src/sssom/writers.py
@@ -49,7 +49,7 @@ def write_table(
     msdf: MappingSetDataFrame,
-    file: TextIO,
+    file: str | Path | TextIO,
     embedded_mode: bool = True,
     serialisation: str = "tsv",
     sort: bool = False,
 ) -> None:
     """Write a mapping set dataframe to the file as a table."""
@@ -68,13 +68,21 @@ def write_table(
         lines = [f"# {line}" for line in lines if line != ""]
         s = msdf.df.to_csv(sep=sep, index=False).rstrip("\n")
         lines = lines + [s]
-        for line in lines:
-            print(line, file=file)
+        if isinstance(file, str | Path):
+            with open(file, "w") as fh:
+                for line in lines:
+                    print(line, file=fh)
+        else:
+            for line in lines:
+                print(line, file=file)
     else:
+        if isinstance(file, str | Path):
+            yml_filepath = Path(file).with_suffix(".yaml")
+        else:
+            yml_filepath = file.name.replace("tsv", "yaml")
+
         # Export MSDF as tsv
         msdf.df.to_csv(file, sep=sep, index=False)
-        # Export Metadata as yaml
-        yml_filepath = file.name.replace("tsv", "yaml")
         with open(yml_filepath, "w") as y:
             yaml.safe_dump(meta, y)
 
@@ -86,7 +94,7 @@ def write_tsv(msdf: MappingSetDataFrame, path: str | Path | TextIO, sort: bool = False) -> None:
     """Write a mapping set to a TSV file."""
     raise NotImplementedError
 
 
 def write_rdf(
     msdf: MappingSetDataFrame,
-    file: TextIO,
+    file: str | Path | TextIO,
     serialisation: Optional[str] = None,
 ) -> None:
     """Write a mapping set dataframe to the file as RDF."""
@@ -102,17 +110,26 @@ def write_rdf(
     check_all_prefixes_in_curie_map(msdf)
     graph = to_rdf_graph(msdf=msdf)
     t = graph.serialize(format=serialisation, encoding="utf-8")
-    print(t.decode(), file=file)
+    if isinstance(file, str | Path):
+        with open(file, "w") as fh:
+            print(t.decode(), file=fh)
+    else:
+        print(t.decode(), file=file)
 
 
-def write_json(msdf: MappingSetDataFrame, output: TextIO, serialisation="json") -> None:
+def write_json(
+    msdf: MappingSetDataFrame, output: str | Path | TextIO, serialisation="json"
+) -> None:
     """Write a mapping set dataframe to the file as JSON.
 
+    :param msdf: A mapping set dataframe
+    :param output: A path or write-supported file object to write JSON to
     :param serialisation: The JSON format to use. Supported formats are:
-        - fhir_json: Outputs JSON in FHIR ConceptMap format (https://fhir-ru.github.io/conceptmap.html)
+
+        - ``fhir_json``: Outputs JSON in FHIR ConceptMap format (https://fhir-ru.github.io/conceptmap.html)
           https://mapping-commons.github.io/sssom-py/sssom.html#sssom.writers.to_fhir_json
-        - json: Outputs to SSSOM JSON https://mapping-commons.github.io/sssom-py/sssom.html#sssom.writers.to_json
-        - ontoportal_json: Outputs JSON in Ontoportal format (https://ontoportal.org/)
+        - ``json``: Outputs to SSSOM JSON https://mapping-commons.github.io/sssom-py/sssom.html#sssom.writers.to_json
+        - ``ontoportal_json``: Outputs JSON in Ontoportal format (https://ontoportal.org/)
           https://mapping-commons.github.io/sssom-py/sssom.html#sssom.writers.to_ontoportal_json
     """
     func_map: Dict[str, Callable] = {
@@ -126,11 +143,18 @@ def write_json(msdf: MappingSetDataFrame, output: TextIO, serialisation="json")
         )
     func: Callable = func_map[serialisation]
     data = func(msdf)
-    json.dump(data, output, indent=2)
+
+    if isinstance(output, str | Path):
+        with open(output, "w") as file:
+            json.dump(data, file, indent=2)
+    else:
+        json.dump(data, output, indent=2)
 
 
 @deprecated(deprecated_in="0.4.7", details="Use write_json() instead")
-def write_fhir_json(msdf: MappingSetDataFrame, output: TextIO, serialisation="fhir_json") -> None:
+def write_fhir_json(
+    msdf: MappingSetDataFrame, output: str | Path | TextIO, serialisation="fhir_json"
+) -> None:
     """Write a mapping set dataframe to the file as FHIR ConceptMap JSON."""
     if serialisation != "fhir_json":
         raise ValueError(
@@ -141,7 +165,7 @@ def write_fhir_json(msdf: MappingSetDataFrame, output: TextIO, serialisation="fh
 
 @deprecated(deprecated_in="0.4.7", details="Use write_json() instead")
 def write_ontoportal_json(
-    msdf: MappingSetDataFrame, output: TextIO, serialisation: str = "ontoportal_json"
+    msdf: MappingSetDataFrame, output: str | Path | TextIO, serialisation: str = "ontoportal_json"
 ) -> None:
     """Write a mapping set dataframe to the file as the ontoportal mapping JSON model."""
     if serialisation != "ontoportal_json":
@@ -153,7 +177,7 @@
 
 def write_owl(
     msdf: MappingSetDataFrame,
-    file: TextIO,
+    file: str | Path | TextIO,
     serialisation=SSSOM_DEFAULT_RDF_SERIALISATION,
 ) -> None:
     """Write a mapping set dataframe to the file as OWL."""
@@ -166,7 +190,12 @@ def write_owl(
 
     graph = to_owl_graph(msdf)
     t = graph.serialize(format=serialisation, encoding="utf-8")
-    print(t.decode(), file=file)
+
+    if isinstance(file, str | Path):
+        with open(file, "w") as fh:
+            print(t.decode(), file=fh)
+    else:
+        print(t.decode(), file=file)
 
 
 # Converters
diff --git a/tests/test_writers.py b/tests/test_writers.py
index ad27e565..22fab7f5 100644
--- a/tests/test_writers.py
+++ b/tests/test_writers.py
@@ -43,20 +43,18 @@ def setUp(self) -> None:
     def test_write_sssom_dataframe(self):
         """Test writing as a dataframe."""
         tmp_path = os.path.join(test_out_dir, "test_write_sssom_dataframe.tsv")
-        with open(tmp_path, "w") as tmp_file:
-            write_table(self.msdf, tmp_file)
+        write_table(self.msdf, tmp_path)
         msdf = parse_sssom_table(tmp_path)
         self.assertEqual(
             len(msdf.df),
             self.mapping_count,
-            f"{tmp_file} has the wrong number of mappings.",
+            f"{tmp_path} has the wrong number of mappings.",
         )
 
     def test_write_sssom_rdf(self):
         """Test writing as RDF."""
         path_1 = os.path.join(test_out_dir, "test_write_sssom_rdf.rdf")
-        with open(path_1, "w") as file:
-            write_rdf(self.msdf, file)
+        write_rdf(self.msdf, path_1)
         msdf = parse_sssom_rdf(path_1, self.msdf.prefix_map)
         self.assertEqual(
             len(msdf.df),
@@ -66,14 +64,12 @@ def test_write_sssom_rdf(self):
 
         # TODO this test doesn't make sense
         path_2 = os.path.join(test_out_dir, "test_write_sssom_rdf.rdf.tsv")
-        with open(path_2, "w") as file:
-            write_table(self.msdf, file)
+        write_table(self.msdf, path_2)
 
     def test_write_sssom_json(self):
         """Test writing as JSON."""
         path = os.path.join(test_out_dir, "test_write_sssom_json.json")
-        with open(path, "w") as file:
-            write_json(self.msdf, file)
+        write_json(self.msdf, path)
         msdf = parse_sssom_json(path)
         self.assertEqual(
             len(msdf.df),
@@ -136,8 +132,7 @@ def test_write_sssom_fhir(self):
         mapping_set_id: str = metadata["mapping_set_id"]
 
         # Write
-        with open(path, "w") as file:
-            write_json(self.msdf, file, "fhir_json")
+        write_json(self.msdf, path, "fhir_json")
         # Read
         # todo: after implementing reader/importer, change this to `msdf = parse_sssom_fhir_json()`
         with open(path, "r") as file:
@@ -174,14 +169,12 @@ def test_write_sssom_fhir(self):
     def test_write_sssom_owl(self):
         """Test writing as OWL."""
         tmp_file = os.path.join(test_out_dir, "test_write_sssom_owl.owl")
-        with open(tmp_file, "w") as file:
-            write_owl(self.msdf, file)
+        write_owl(self.msdf, tmp_file)
 
     def test_write_sssom_ontoportal_json(self):
         """Test writing as ontoportal JSON."""
         path = os.path.join(test_out_dir, "test_write_sssom_ontoportal_json.json")
-        with open(path, "w") as file:
-            write_json(self.msdf, file, "ontoportal_json")
+        write_json(self.msdf, path, "ontoportal_json")
 
         with open(path, "r") as file:
             d = json.load(file)
From 272f2f113d5863c6bad01f8b05f99c287398743c Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Fri, 23 May 2025 11:04:35 +0200
Subject: [PATCH 04/19] Refactor

---
 src/sssom/writers.py  | 45 +++++++++++++++++++------------------------
 tests/test_writers.py |  2 +-
 2 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/src/sssom/writers.py b/src/sssom/writers.py
index 3e8714e4..64742fd7 100644
--- a/src/sssom/writers.py
+++ b/src/sssom/writers.py
@@ -2,8 +2,9 @@
 
 import json
 import logging as _logging
+from contextlib import contextmanager
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, TextIO, Tuple, Union
+from typing import Any, Callable, Dict, Generator, List, Optional, TextIO, Tuple, Union
 
 import pandas as pd
 import yaml
@@ -47,6 +48,15 @@
 
 MSDFWriter = Callable[[MappingSetDataFrame, TextIO], None]
 
 
+@contextmanager
+def _open_text_writer(xx: str | Path | TextIO) -> Generator[TextIO, None, None]:
+    if isinstance(xx, str | Path):
+        with open(xx, "w") as file:
+            yield file
+    else:
+        yield xx
+
+
 def write_table(
     msdf: MappingSetDataFrame,
     file: str | Path | TextIO,
     embedded_mode: bool = True,
     serialisation: str = "tsv",
     sort: bool = False,
@@ -68,18 +78,14 @@ def write_table(
         lines = [f"# {line}" for line in lines if line != ""]
         s = msdf.df.to_csv(sep=sep, index=False).rstrip("\n")
         lines = lines + [s]
-        if isinstance(file, str | Path):
-            with open(file, "w") as fh:
-                for line in lines:
-                    print(line, file=fh)
-        else:
-            for line in lines:
-                print(line, file=file)
+        with _open_text_writer(file) as fh:
+            for line in lines:
+                print(line, file=fh)
     else:
         if isinstance(file, str | Path):
             yml_filepath = Path(file).with_suffix(".yaml")
         else:
-            yml_filepath = file.name.replace("tsv", "yaml")
+            yml_filepath = Path(file.name.replace("tsv", "yaml"))
 
         # Export MSDF as tsv
         msdf.df.to_csv(file, sep=sep, index=False)
@@ -110,11 +116,8 @@ def write_rdf(
     check_all_prefixes_in_curie_map(msdf)
     graph = to_rdf_graph(msdf=msdf)
     t = graph.serialize(format=serialisation, encoding="utf-8")
-    if isinstance(file, str | Path):
-        with open(file, "w") as fh:
-            print(t.decode(), file=fh)
-    else:
-        print(t.decode(), file=file)
+    with _open_text_writer(file) as fh:
+        print(t.decode(), file=fh)
 
 
 def write_json(
@@ -143,12 +146,8 @@ def write_json(
         )
     func: Callable = func_map[serialisation]
     data = func(msdf)
-
-    if isinstance(output, str | Path):
-        with open(output, "w") as file:
-            json.dump(data, file, indent=2)
-    else:
-        json.dump(data, output, indent=2)
+    with _open_text_writer(output) as fh:
+        json.dump(data, fh, indent=2)
 
 
 @deprecated(deprecated_in="0.4.7", details="Use write_json() instead")
@@ -188,11 +187,8 @@ def write_owl(
     graph = to_owl_graph(msdf)
     t = graph.serialize(format=serialisation, encoding="utf-8")
-
-    if isinstance(file, str | Path):
-        with open(file, "w") as fh:
-            print(t.decode(), file=fh)
-    else:
-        print(t.decode(), file=file)
+    with _open_text_writer(file) as fh:
+        print(t.decode(), file=fh)
 
 
 # Converters
diff --git a/tests/test_writers.py b/tests/test_writers.py
index 22fab7f5..8a6022e0 100644
--- a/tests/test_writers.py
+++ b/tests/test_writers.py
@@ -123,7 +123,7 @@ def test_update_sssom_context_with_prefixmap(self):
         self.assertNotIn("snomed", context["@context"])
         self.assertIn("mapping_set_id", context["@context"])
 
-    def test_write_sssom_fhir(self):
+    def test_write_sssom_fhir(self) -> None:
         """Test writing as FHIR ConceptMap JSON."""
         # Vars
         path = os.path.join(test_out_dir, "test_write_sssom_fhir.json")
From f7b8380b265d093356b88efb853b01f3bff81d19 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Fri, 23 May 2025 11:11:15 +0200
Subject: [PATCH 05/19] Update typing

---
 src/sssom/constants.py |  6 +++++-
 src/sssom/parsers.py   | 18 +++++++++++-------
 src/sssom/writers.py   | 26 ++++++++++++--------------
 3 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/src/sssom/constants.py b/src/sssom/constants.py
index b2800ac3..d407f0ef 100644
--- a/src/sssom/constants.py
+++ b/src/sssom/constants.py
@@ -4,7 +4,7 @@
 import uuid
 from enum import Enum
 from functools import cached_property, lru_cache
-from typing import Any, Dict, List, Literal, Set
+from typing import Any, Dict, List, Literal, Set, TextIO, Union
 
 import importlib_resources
 import yaml
@@ -316,3 +316,7 @@ def get_default_metadata() -> MetadataType:
         "mapping_set_id": generate_mapping_set_id(),
         "license": DEFAULT_LICENSE,
     }
+
+
+#: A hint for functions that can take a path or an IO
+PathOrIO = Union[str, pathlib.Path, TextIO]
diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index ae5c1be3..6855a12d 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -54,6 +54,7 @@
     SUBJECT_SOURCE,
     SUBJECT_SOURCE_ID,
     MetadataType,
+    PathOrIO,
     _get_sssom_schema_object,
     get_default_metadata,
 )
@@ -76,7 +77,7 @@
 # Parsers (from file)
 
 
-def _open_input(input: Union[str, Path, TextIO]) -> io.StringIO:
+def _open_input(input: PathOrIO) -> io.StringIO:
     """Transform a URL, a filepath (from pathlib), or a string (with file contents) to a StringIO object.
 
     :param input: A string representing a URL, a filepath, or file contents,
         or a Path object representing a filepath.
     :return: A StringIO object containing the input data.
     """
@@ -285,7 +285,7 @@
 def parse_sssom_table(
-    file_path: Union[str, Path, TextIO],
+    file_path: PathOrIO,
     prefix_map: ConverterHint = None,
     meta: Optional[MetadataType] = None,
     *,
     strict: bool = False,
@@ -357,7 +357,7 @@
 def parse_sssom_rdf(
-    file_path: str,
+    file_path: Union[str, Path],
     prefix_map: ConverterHint = None,
     meta: Optional[MetadataType] = None,
     serialisation=SSSOM_DEFAULT_RDF_SERIALISATION,
@@ -395,7 +395,10 @@
 def parse_sssom_json(
-    file_path: str, prefix_map: ConverterHint = None, meta: Optional[MetadataType] = None, **kwargs
+    file_path: Union[str, Path],
+    prefix_map: ConverterHint = None,
+    meta: Optional[MetadataType] = None,
+    **kwargs,
 ) -> MappingSetDataFrame:
     """Parse a TSV to a :class:`MappingSetDocument` to a :class:`MappingSetDataFrame`."""
     raise_for_bad_path(file_path)
@@ -432,7 +435,7 @@
 def parse_obographs_json(
-    file_path: str,
+    file_path: Union[str, Path],
     prefix_map: ConverterHint = None,
     meta: Optional[MetadataType] = None,
     mapping_predicates: Optional[List[str]] = None,
@@ -516,7 +519,7 @@ def _get_mapping_dict(
 
 def parse_alignment_xml(
-    file_path: str,
+    file_path: Union[str, Path],
     prefix_map: ConverterHint = None,
     meta: Optional[MetadataType] = None,
     mapping_predicates: Optional[List[str]] = None,
@@ -526,7 +529,7 @@ def parse_alignment_xml(
     converter, meta = _get_prefix_map_and_metadata(prefix_map=prefix_map, meta=meta)
 
     logging.info("Loading from alignment API")
-    xmldoc = minidom.parse(file_path)
+    xmldoc = minidom.parse(Path(file_path).resolve().as_posix())
     msdf = from_alignment_minidom(
         xmldoc,
         prefix_map=converter,
diff --git a/src/sssom/writers.py b/src/sssom/writers.py
index 64742fd7..2417dacb 100644
--- a/src/sssom/writers.py
+++ b/src/sssom/writers.py
@@ -19,7 +19,7 @@
 
 from sssom.validators import check_all_prefixes_in_curie_map
 
-from .constants import CURIE_MAP, SCHEMA_YAML, SSSOM_URI_PREFIX
+from .constants import CURIE_MAP, SCHEMA_YAML, SSSOM_URI_PREFIX, PathOrIO
 from .context import _load_sssom_context
 from .parsers import to_mapping_set_document
 from .util import (
@@ -49,7 +49,7 @@
 
 @contextmanager
-def _open_text_writer(xx: str | Path | TextIO) -> Generator[TextIO, None, None]:
+def _open_text_writer(xx: PathOrIO) -> Generator[TextIO, None, None]:
     if isinstance(xx, str | Path):
         with open(xx, "w") as file:
             yield file
     else:
@@ -59,7 +59,7 @@
 
 def write_table(
     msdf: MappingSetDataFrame,
-    file: str | Path | TextIO,
+    file: PathOrIO,
     embedded_mode: bool = True,
     serialisation: str = "tsv",
     sort: bool = False,
@@ -93,14 +93,16 @@
             yaml.safe_dump(meta, y)
 
 
-def write_tsv(msdf: MappingSetDataFrame, path: str | Path | TextIO, sort: bool = False) -> None:
+def write_tsv(
+    msdf: MappingSetDataFrame, path: PathOrIO, embedded_mode: bool = True, sort: bool = False
+) -> None:
     """Write a mapping set to a TSV file."""
-    raise NotImplementedError
+    write_table(msdf, path, serialisation="tsv", embedded_mode=embedded_mode, sort=sort)
 
 
 def write_rdf(
     msdf: MappingSetDataFrame,
-    file: str | Path | TextIO,
+    file: PathOrIO,
     serialisation: Optional[str] = None,
 ) -> None:
     """Write a mapping set dataframe to the file as RDF."""
@@ -120,9 +122,7 @@ def write_rdf(
         print(t.decode(), file=fh)
 
 
-def write_json(
-    msdf: MappingSetDataFrame, output: str | Path | TextIO, serialisation="json"
-) -> None:
+def write_json(msdf: MappingSetDataFrame, output: PathOrIO, serialisation="json") -> None:
     """Write a mapping set dataframe to the file as JSON.
 
     :param msdf: A mapping set dataframe
@@ -151,9 +151,7 @@
 
 @deprecated(deprecated_in="0.4.7", details="Use write_json() instead")
-def write_fhir_json(
-    msdf: MappingSetDataFrame, output: str | Path | TextIO, serialisation="fhir_json"
-) -> None:
+def write_fhir_json(msdf: MappingSetDataFrame, output: PathOrIO, serialisation="fhir_json") -> None:
     """Write a mapping set dataframe to the file as FHIR ConceptMap JSON."""
     if serialisation != "fhir_json":
         raise ValueError(
@@ -164,7 +162,7 @@
 
 @deprecated(deprecated_in="0.4.7", details="Use write_json() instead")
 def write_ontoportal_json(
-    msdf: MappingSetDataFrame, output: str | Path | TextIO, serialisation: str = "ontoportal_json"
+    msdf: MappingSetDataFrame, output: PathOrIO, serialisation: str = "ontoportal_json"
 ) -> None:
     """Write a mapping set dataframe to the file as the ontoportal mapping JSON model."""
     if serialisation != "ontoportal_json":
@@ -176,7 +174,7 @@
 
 def write_owl(
     msdf: MappingSetDataFrame,
-    file: str | Path | TextIO,
+    file: PathOrIO,
     serialisation=SSSOM_DEFAULT_RDF_SERIALISATION,
 ) -> None:
     """Write a mapping set dataframe to the file as OWL."""
From 5f81d40c6fb8b307b73abbfac6b4ad307af3f482 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Fri, 23 May 2025 11:13:29 +0200
Subject: [PATCH 06/19] Update writers.py

---
 src/sssom/writers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/sssom/writers.py b/src/sssom/writers.py
index 2417dacb..f1a453ad 100644
--- a/src/sssom/writers.py
+++ b/src/sssom/writers.py
@@ -50,7 +50,7 @@
 
 @contextmanager
 def _open_text_writer(xx: PathOrIO) -> Generator[TextIO, None, None]:
-    if isinstance(xx, str | Path):
+    if isinstance(xx, (str, Path)):
         with open(xx, "w") as file:
             yield file
     else:
@@ -82,7 +82,7 @@ def write_table(
         for line in lines:
             print(line, file=fh)
     else:
-        if isinstance(file, str | Path):
+        if isinstance(file, (str, Path)):
             yml_filepath = Path(file).with_suffix(".yaml")
         else:
             yml_filepath = Path(file.name.replace("tsv", "yaml"))
From 21266ff66f540e79ebcceefe54d3b795172a8248 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Fri, 23 May 2025 11:17:41 +0200
Subject: [PATCH 07/19] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 93837377..859c67f5 100644
--- a/README.md
+++ b/README.md
@@ -40,6 +40,7 @@ url = "https://raw.githubusercontent.com/mapping-commons/mh_mapping_initiative/m
 msdf = sssom.parse_tsv(url)
 
 # SSSOM comes with several "write" functions
+sssom.write_tsv(msdf, "test.tsv")
 sssom.write_json(msdf, "test.json")
 sssom.write_owl(msdf, "test.owl")
 sssom.write_rdf(msdf, "test.ttl")
From 72f6ab6bb0aa579d07f5689a146db0e4edfa16b0 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Fri, 23 May 2025 11:19:51 +0200
Subject: [PATCH 08/19] Update parsers.py

---
 src/sssom/parsers.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index 6855a12d..daf7be9f 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -290,6 +290,7 @@ def parse_sssom_table(
     meta: Optional[MetadataType] = None,
     *,
     strict: bool = False,
+    **kwargs: Any,
 ) -> MappingSetDataFrame:
     """Parse a SSSOM TSV.
@@ -302,8 +303,12 @@ def parse_sssom_table(
         contained within the document itself. For example, this may come from a
         companion SSSOM YAML file.
     :param strict:
         If true, will fail parsing for undefined prefixes, CURIEs, or IRIs
+    :param kwargs:
+        Additional keyword arguments (unhandled)
     :returns:
         A parsed dataframe wrapper object
     """
+    if kwargs:
+        logging.warning("unhandled keyword arguments passed: %s", kwargs)
     if isinstance(file_path, Path) or isinstance(file_path, str):
         raise_for_bad_path(file_path)
     stream: io.StringIO = _open_input(file_path)
From 5d936d7ba01970e6b1cdb9fc081ce738752017bf Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Sat, 24 May 2025 20:08:22 +0200
Subject: [PATCH 09/19] Add explicit CSV and TSV parsers

---
 src/sssom/__init__.py |  2 +-
 src/sssom/parsers.py  | 59 +++++++++++++++++++++++++------------------
 src/sssom/util.py     | 33 +++++++++++-------------
 src/sssom/writers.py  |  2 +-
 tests/test_utils.py   | 24 ++++++++++++++++++
 5 files changed, 75 insertions(+), 45 deletions(-)

diff --git a/src/sssom/__init__.py b/src/sssom/__init__.py
index ec1d032f..0fdbe39b 100644
--- a/src/sssom/__init__.py
+++ b/src/sssom/__init__.py
@@ -23,5 +23,5 @@
 )
 
 from .constants import generate_mapping_set_id, get_default_metadata  # noqa:401
-from .parsers import parse_tsv  # noqa:401
+from .parsers import parse_csv, parse_sssom_table, parse_tsv  # noqa:401
 from .writers import write_json, write_owl, write_rdf, write_tsv  # noqa:401
diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index daf7be9f..bf67439d 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -144,7 +144,7 @@
     return table_component, metadata_component
 
 
-def _read_pandas_and_metadata(input: io.StringIO, sep: str = None):
+def _read_pandas_and_metadata(input: io.StringIO, sep: str | None = None):
     """Read a tabular data file by wrapping func:`pd.read_csv` to handles comment lines correctly.
 
     :param input: The file to read. If no separator is given, this file should be named.
     :param sep: File separator for pandas
     :return: A pandas dataframe
@@ -155,7 +155,6 @@ def _read_pandas_and_metadata(input: io.StringIO, sep: str = None):
 
     try:
         df = pd.read_csv(table_stream, sep=sep, dtype=str, engine="python")
-        df.fillna("", inplace=True)
     except EmptyDataError as e:
         logging.warning(f"Seems like the dataframe is empty: {e}")
         df = pd.DataFrame(
@@ -167,28 +166,24 @@ def _read_pandas_and_metadata(input: io.StringIO, sep: str = None):
                 MAPPING_JUSTIFICATION,
             ]
         )
+    else:
+        df.fillna("", inplace=True)
 
-    if isinstance(df, pd.DataFrame):
-        sssom_metadata = _read_metadata_from_table(metadata_stream)
-        return df, sssom_metadata
-
-    return None, None
+    sssom_metadata = _read_metadata_from_table(metadata_stream)
+    return df, sssom_metadata
 
 
-def _get_seperator_symbol_from_file_path(file):
-    r"""
-    Take as an input a filepath and return the seperate symbol used, for example, by pandas.
-
-    :param file: the file path
-    :return: the seperator symbols as a string, e.g. '\t'
-    """
-    if isinstance(file, Path) or isinstance(file, str):
-        extension = get_file_extension(file)
-        if extension == "tsv":
-            return "\t"
-        elif extension == "csv":
-            return ","
-        logging.warning(f"Could not guess file extension for {file}")
-    return None
+def _infer_separator(file: PathOrIO) -> str | None:
+    r"""Infer the CSV separator from a file path or IO object.
+
+    :param file: the file path
+    :return: the separator symbols as a string, e.g. '\t'
+    """
+    extension = get_file_extension(file)
+    if extension == "tsv":
+        return "\t"
+    elif extension == "csv":
+        return ","
+    return None
@@ -285,8 +285,9 @@ def parse_sssom_table(
     meta: Optional[MetadataType] = None,
     *,
     strict: bool = False,
+    sep: str | None = None,
     **kwargs: Any,
 ) -> MappingSetDataFrame:
-    """Parse a SSSOM TSV.
+    """Parse a SSSOM CSV or TSV file.
 
     :param file_path:
         A file path, URL, or I/O object that contains SSSOM encoded in TSV
@@ -301,6 +302,8 @@ def parse_sssom_table(
     :param strict:
         If true, will fail parsing for undefined prefixes, CURIEs, or IRIs
+    :param sep:
+        The separator. If not given, inferred from file name
     :param kwargs:
         Additional keyword arguments (unhandled)
     :returns:
         A parsed dataframe wrapper object
@@ -313,10 +316,10 @@ def parse_sssom_table(
     if isinstance(file_path, Path) or isinstance(file_path, str):
         raise_for_bad_path(file_path)
     stream: io.StringIO = _open_input(file_path)
-    sep_new = _get_seperator_symbol_from_file_path(file_path)
-    df, sssom_metadata = _read_pandas_and_metadata(stream, sep_new)
+    if sep is None:
+        sep = _infer_separator(file_path)
+    df, sssom_metadata = _read_pandas_and_metadata(stream, sep)
 
     if meta is None:
         meta = {}
@@ -355,7 +358,16 @@
     return msdf
 
 
-parse_tsv = parse_sssom_table
+def parse_csv(*args, **kwargs) -> MappingSetDataFrame:
+    """Parse a SSSOM CSV file, forwarding arguments to :func:`parse_sssom_table`."""
+    kwargs["sep"] = ","
+    return parse_sssom_table(*args, **kwargs)
+
+
+def parse_tsv(*args, **kwargs) -> MappingSetDataFrame:
+    """Parse a SSSOM TSV file, forwarding arguments to :func:`parse_sssom_table`."""
+    kwargs["sep"] = "\t"
+    return parse_sssom_table(*args, **kwargs)
 
 
 def parse_sssom_rdf(
@@ -836,6 +848,7 @@
 PARSING_FUNCTIONS: typing.Mapping[str, Callable] = {
+    "csv": parse_sssom_table,
     "tsv": parse_sssom_table,
     "obographs-json": parse_obographs_json,
     "alignment-api-xml": parse_alignment_xml,
@@ -849,14 +862,14 @@ def get_parsing_function(input_format: Optional[str], filename: str) -> Callable
 
     :param input_format: File format
     :param filename: Filename
-    :raises Exception: Unknown file format
+    :raises ValueError: Unknown file format
     :return: Appropriate 'read' function
     """
     if input_format is None:
-        input_format = get_file_extension(filename)
+        input_format = get_file_extension(filename) or "tsv"
     func = PARSING_FUNCTIONS.get(input_format)
     if func is None:
-        raise Exception(f"Unknown input format: {input_format}")
+        raise ValueError(f"Unknown input format: {input_format}")
     return func
diff --git a/src/sssom/util.py b/src/sssom/util.py
index 7c5dee58..f502c8df 100644
--- a/src/sssom/util.py
+++ b/src/sssom/util.py
@@ -58,6 +58,7 @@
     SUBJECT_SOURCE,
     UNKNOWN_IRI,
     MetadataType,
+    PathOrIO,
     _get_sssom_schema_object,
     get_default_metadata,
 )
@@ -993,29 +994,25 @@ def inject_metadata_into_df(msdf: MappingSetDataFrame) -> MappingSetDataFrame:
     return msdf
 
 
-def get_file_extension(file: Union[str, Path, TextIO]) -> str:
+def get_file_extension(file: PathOrIO) -> str | None:
     """Get file extension.
     :param file: File path
     :return: format of the file passed, default tsv
     """
-    if isinstance(file, Path):
-        if file.suffix:
-            return file.suffix.strip(punctuation)
-        else:
-            logging.warning(
-                f"Cannot guess format from {file}, despite appearing to be a Path-like object."
-            )
-    elif isinstance(file, str):
-        filename = file
-        parts = filename.split(".")
-        if len(parts) > 0:
-            f_format = parts[-1]
-            return f_format.strip(punctuation)
-        else:
-            logging.warning(f"Cannot guess format from {filename}")
-    logging.info("Cannot guess format extension for this file, assuming TSV.")
-    return "tsv"
+    if isinstance(file, str):
+        file = Path(file)
+    elif isinstance(file, TextIO):
+        file = Path(file.name)
+
+    filename = file.name.removesuffix(".gz")
+    if filename.endswith(".tsv"):
+        return "tsv"
+    elif filename.endswith(".csv"):
+        return "csv"
+    else:
+        logging.debug("cannot guess format for %s", filename)
+        return None
diff --git a/src/sssom/writers.py b/src/sssom/writers.py
index f1a453ad..a6a10b9a 100644
--- a/src/sssom/writers.py
+++ b/src/sssom/writers.py
@@ -572,7 +572,7 @@ def get_writer_function(
     :return: Type of writer function
     """
     if output_format is None:
-        output_format = get_file_extension(output)
+        output_format = get_file_extension(output) or "tsv"
     if output_format not in WRITER_FUNCTIONS:
         raise ValueError(f"Unknown output format: {output_format}")
     func, tag = WRITER_FUNCTIONS[output_format]
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 42499e69..fac96b63 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,6 +1,8 @@
 """Test for merging MappingSetDataFrames."""
 
+import tempfile
 import unittest
+from pathlib import Path
 
 import numpy as np
 import pandas as pd
@@ -26,13 +28,16 @@
     filter_out_prefixes,
     filter_prefixes,
     get_dict_from_mapping,
+    get_file_extension,
     get_prefixes_used_in_table,
     inject_metadata_into_df,
     invert_mappings,
 )
 from tests.constants import data_dir
 
+HERE = Path(__file__).parent.resolve()
+
 
 class TestIO(unittest.TestCase):
     """A test case for merging msdfs."""
@@ -498,3 +503,22 @@
         )
 
         # self.assertIn("SCTID", converter.prefix_map)
+
+    def test_get_file_extension(self) -> None:
+        """Test getting a file extension."""
+        for value, part in [
+            ("tsv", "test.tsv"),
+            ("tsv", "test.tsv.gz"),
+            ("csv", "test.csv"),
+            ("csv", "test.csv.gz"),
+            # Don't infer an extension for something else
+            (None, "test.xxx"),
+        ]:
+            path = HERE.joinpath(part)
+            with self.subTest(path=path, mode="path"):
+                self.assertEqual(value, get_file_extension(path))
+            with self.subTest(path=path, mode="str"):
+                self.assertEqual(value, get_file_extension(path.as_posix()))
+            with self.subTest(path=path, mode="file"), tempfile.TemporaryDirectory() as d:
+                with Path(d).joinpath(part).open("w") as file:
+                    self.assertEqual(value, get_file_extension(file))
From 065ba1fa1104acdb99c45b32676ee9697ba71f46 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Sat, 24 May 2025 20:13:43 +0200
Subject: [PATCH 10/19] Get safer with more typing

---
 src/sssom/parsers.py | 12 +++++++-----
 src/sssom/util.py    |  7 +++++--
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index bf67439d..e87ae527 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -63,6 +63,7 @@
 from .sssom_document import MappingSetDocument
 from .util import (
     SSSOM_DEFAULT_RDF_SERIALISATION,
+    ExtensionLiteral,
     MappingSetDataFrame,
     get_file_extension,
     is_multivalued_slot,
@@ -173,6 +174,9 @@
     return df, sssom_metadata
 
 
+EXTENSION_TO_SEP: dict[ExtensionLiteral, str] = {"tsv": "\t", "csv": ","}
+
+
 def _infer_separator(file: PathOrIO) -> str | None:
     r"""Infer the CSV separator from a file path or IO object.
 
     :param file: the file path
     :return: the separator symbols as a string, e.g. '\t'
     """
     extension = get_file_extension(file)
-    if extension == "tsv":
-        return "\t"
-    elif extension == "csv":
-        return ","
-    return None
+    if extension is None:
+        return None
+    return EXTENSION_TO_SEP[extension]
diff --git a/src/sssom/util.py b/src/sssom/util.py
index f502c8df..8458be11 100644
--- a/src/sssom/util.py
+++ b/src/sssom/util.py
@@ -10,7 +10,7 @@
 from functools import partial, reduce
 from pathlib import Path
 from string import punctuation
-from typing import Any, DefaultDict, Dict, List, Optional, Set, TextIO, Tuple, Union
+from typing import Any, DefaultDict, Dict, List, Literal, Optional, Set, TextIO, Tuple, Union
 
 import curies
 import numpy as np
@@ -994,7 +994,10 @@ def inject_metadata_into_df(msdf: MappingSetDataFrame) -> MappingSetDataFrame:
     return msdf
 
 
-def get_file_extension(file: PathOrIO) -> str | None:
+ExtensionLiteral = Literal["tsv", "csv"]
+
+
+def get_file_extension(file: PathOrIO) -> ExtensionLiteral | None:
     """Get file extension.
 
     :param file: File path
     :return: format of the file passed, default tsv
     """
From faf61db6ac6177ca025e8d46791c64696d7624be Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Sat, 24 May 2025 20:15:52 +0200
Subject: [PATCH 11/19] Update test_collapse.py

---
 tests/test_collapse.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/tests/test_collapse.py b/tests/test_collapse.py
index 9cde66d2..61e31b03 100644
--- a/tests/test_collapse.py
+++ b/tests/test_collapse.py
@@ -1,8 +1,6 @@
 """Test various grouping functionalities."""
 
 import unittest
-from pathlib import Path
-from typing import Union
 
 import pandas as pd
 
 from sssom.parsers import parse_sssom_table
@@ -18,17 +16,12 @@
 from tests.constants import data_dir
 
 
-def parse(filename: Union[str, Path]) -> pd.DataFrame:
-    """Parse a TSV to a pandas frame."""
-    return pd.read_csv(filename, sep="\t", comment="#")
-
-
 class TestCollapse(unittest.TestCase):
     """Test various grouping functionalities."""
 
     def setUp(self) -> None:
         """Set up the test case."""
-        self.df = parse(data_dir / "basic.tsv")
+        self.df = pd.read_csv(data_dir / "basic.tsv", sep="\t", comment="#")
 
     def test_row_count(self):
         """Test the dataframe has the correct number of rows."""
@@ -71,7 +64,7 @@ def test_diff(self):
         self.assertTrue(c.startswith("COMMON_TO_BOTH"))
         # output = sqldf("select * from diff_df where comment != ''")
 
-        df2 = parse(data_dir / "basic2.tsv")
+        df2 = pd.read_csv(data_dir / "basic2.tsv", sep="\t", comment="#")
         diff = compare_dataframes(self.df, df2)
         self.assertEqual(15, len(diff.unique_tuples1))
         self.assertEqual(3, len(diff.unique_tuples2))
From af6a7cb94d45e66c77b229ed335c0b07e4c476d5 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Sat, 24 May 2025 20:16:14 +0200
Subject: [PATCH 12/19] Update util.py

---
 src/sssom/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sssom/util.py b/src/sssom/util.py
index 8458be11..8baedfe7 100644
--- a/src/sssom/util.py
+++ b/src/sssom/util.py
@@ -997,7 +997,7 @@
 ExtensionLiteral = Literal["tsv", "csv"]
 
 
-def get_file_extension(file: PathOrIO) -> ExtensionLiteral | None:
+def get_file_extension(file: PathOrIO) -> Optional[ExtensionLiteral]:
     """Get file extension.
 
     :param file: File path
     :return: format of the file passed, default tsv
     """
From 8065dfe1f589d21073caba030b0ec4ed0dcba780 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Sat, 24 May 2025 20:16:40 +0200
Subject: [PATCH 13/19] Update parsers.py

---
 src/sssom/parsers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index e87ae527..dd2779e8 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -177,7 +177,7 @@
 EXTENSION_TO_SEP: dict[ExtensionLiteral, str] = {"tsv": "\t", "csv": ","}
 
 
-def _infer_separator(file: PathOrIO) -> str | None:
+def _infer_separator(file: PathOrIO) -> Optional[str]:
     r"""Infer the CSV separator from a file path or IO object.
 
     :param file: the file path
     :return: the separator symbols as a string, e.g. '\t'
     """
From 7fee8d210b03bcb7ed4471b960e6abe1dcf1471e Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Sat, 24 May 2025 20:20:16 +0200
Subject: [PATCH 14/19] Update util.py

---
 src/sssom/util.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/sssom/util.py b/src/sssom/util.py
index 8baedfe7..64691014 100644
--- a/src/sssom/util.py
+++ b/src/sssom/util.py
@@ -9,7 +9,6 @@
 from dataclasses import dataclass, field
 from functools import partial, reduce
 from pathlib import Path
-from string import punctuation
 from typing import Any, DefaultDict, Dict, List, Literal, Optional, Set, TextIO, Tuple, Union
 
 import curies
From eb415893fe36a8e23e4790733ffd3d97c119f Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Sat, 24 May 2025 20:29:05 +0200
Subject: [PATCH 15/19] Update parsers.py

---
 src/sssom/parsers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index dd2779e8..0e90e90b 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -145,7 +145,7 @@
     return table_component, metadata_component
 
 
-def _read_pandas_and_metadata(input: io.StringIO, sep: str | None = None):
+def _read_pandas_and_metadata(input: io.StringIO, sep: Optional[str] = None):
     """Read a tabular data file by wrapping func:`pd.read_csv` to handles comment lines correctly.
 
     :param input: The file to read. If no separator is given, this file should be named.
     :param sep: File separator for pandas
     :return: A pandas dataframe
     """
@@ -287,7 +287,7 @@ def parse_sssom_table(
     meta: Optional[MetadataType] = None,
     *,
     strict: bool = False,
-    sep: str | None = None,
+    sep: Optional[str] = None,
     **kwargs: Any,
 ) -> MappingSetDataFrame:
     """Parse a SSSOM CSV or TSV file.
From 521e94687d8caf27b635854a3d73051b6ecb671e Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Sat, 24 May 2025 20:55:44 +0200
Subject: [PATCH 16/19] Refactor file opening

---
 src/sssom/parsers.py  | 77 +++++++++++++++++++++++--------------------
 src/sssom/util.py     | 11 ++++---
 tests/test_parsers.py |  8 ++---
 3 files changed, 51 insertions(+), 45 deletions(-)

diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index 0e90e90b..6a44a0dc 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -5,6 +5,7 @@
 import itertools as itt
 import json
 import logging as _logging
+import os.path
 import re
 import typing
 from collections import ChainMap, Counter
@@ -74,46 +75,51 @@
 logging = _logging.getLogger(__name__)
 
+
 # * *******************************************************
 # Parsers (from file)
 
 
-def _open_input(input: PathOrIO) -> io.StringIO:
+def _open_input(p: PathOrIO) -> TextIO:
     """Transform a URL, a filepath (from pathlib), or a string (with file contents) to a StringIO object.
 
-    :param input: A string representing a URL, a filepath, or file contents,
-        or a Path object representing a filepath.
+    :param p:
+        A string representing a URL, a filepath, or file contents, or a Path object representing a filepath.
     :return: A StringIO object containing the input data.
     """
-    # If the import already is a StrinIO, return it
-    if isinstance(input, io.StringIO):
-        return input
-    elif isinstance(input, Path):
-        input = str(input)
-
-    if isinstance(input, str):
-        if input.startswith("http://") or input.startswith("https://"):
-            # It's a URL
-            data = requests.get(input, timeout=30).content
-            return io.StringIO(data.decode("utf-8"))
-        elif "\n" in input or "\r" in input:
-            # It's string data
-            return io.StringIO(input)
-        elif input.endswith(".gz"):
-            with gzip.open(input, "rt") as file:
-                file_content = file.read()
-            return io.StringIO(file_content)
-        else:
-            # It's a local file path
-            with open(input, "r") as file:
-                file_content = file.read()
-            return io.StringIO(file_content)
-
-    raise IOError(f"Could not determine the type of input {input}")
+    # if we passed an IO object, return it back directly
+    if not isinstance(p, (str, Path)):
+        return p
+
+    if isinstance(p, str) and (p.startswith("http://") or p.startswith("https://")):
+        # It's a URL
+        data = requests.get(p, timeout=30).content
+        return io.StringIO(data.decode("utf-8"))
+
+    # squash a path to a string so we don't have to duplicate logic below
+    if isinstance(p, Path):
+        p = p.as_posix()
+
+    if "\n" in p or "\r" in p:
+        # It's string data
+        return io.StringIO(p)
+
+    if not os.path.exists(p):
+        raise FileNotFoundError(f"file does not exist: {p}")
+
+    if p.endswith(".gz"):
+        with gzip.open(p, "rt") as file:
+            file_content = file.read()
+        return io.StringIO(file_content)
+    else:
+        # It's a local file path
+        with open(p, "r") as file:
+            file_content = file.read()
+        return io.StringIO(file_content)
 
 
-def _separate_metadata_and_table_from_stream(s: io.StringIO):
-    s.seek(0)
+def _separate_metadata_and_table_from_stream(stream: TextIO):
+    stream.seek(0)
 
     # Create a new StringIO object for filtered data
     table_component = io.StringIO()
@@ -122,7 +128,7 @@ def _separate_metadata_and_table_from_stream(s: io.StringIO):
     header_section = True
 
     # Filter out lines starting with '#'
-    for line in s:
+    for line in stream:
         if not line.startswith("#"):
             table_component.write(line)
             if header_section:
@@ -145,14 +151,14 @@ def _separate_metadata_and_table_from_stream(s: io.StringIO):
     return table_component, metadata_component
 
 
-def _read_pandas_and_metadata(input: io.StringIO, sep: Optional[str] = None):
+def _read_pandas_and_metadata(stream: TextIO, sep: Optional[str] = None):
     """Read a tabular data file by wrapping func:`pd.read_csv` to handles comment lines correctly.
 
-    :param input: The file to read. If no separator is given, this file should be named.
+    :param stream: The file to read. If no separator is given, this file should be named.
     :param sep: File separator for pandas
     :return: A pandas dataframe
     """
-    table_stream, metadata_stream = _separate_metadata_and_table_from_stream(input)
+    table_stream, metadata_stream = _separate_metadata_and_table_from_stream(stream)
 
     try:
         df = pd.read_csv(table_stream, sep=sep, dtype=str, engine="python")
@@ -316,7 +322,7 @@ def parse_sssom_table(
         logging.warning("unhandled keyword arguments passed: %s", kwargs)
     if isinstance(file_path, Path) or isinstance(file_path, str):
         raise_for_bad_path(file_path)
-    stream: io.StringIO = _open_input(file_path)
+    stream = _open_input(file_path)
     if sep is None:
         sep = _infer_separator(file_path)
     df, sssom_metadata = _read_pandas_and_metadata(stream, sep)
diff --git a/src/sssom/util.py b/src/sssom/util.py
index 64691014..7221d300 100644
--- a/src/sssom/util.py
+++ b/src/sssom/util.py
@@ -9,7 +9,7 @@
 from dataclasses import dataclass, field
 from functools import partial, reduce
 from pathlib import Path
-from typing import Any, DefaultDict, Dict, List, Literal, Optional, Set, TextIO, Tuple, Union
+from typing import Any, DefaultDict, Dict, List, Literal, Optional, Set, Tuple, Union
 
 import curies
@@ -1002,10 +1002,13 @@ def get_file_extension(file: PathOrIO) -> Optional[ExtensionLiteral]:
     :param file: File path
     :return: format of the file passed, default tsv
     """
-    if isinstance(file, str):
-        file = Path(file)
-    elif isinstance(file, TextIO):
+    if not isinstance(file, (str, Path)):
+        if not hasattr(file, "name"):
+            logging.debug("cannot guess format for object without name: %s", file)
+            return None
         file = Path(file.name)
+    elif isinstance(file, str):
+        file = Path(file)
 
     filename = file.name.removesuffix(".gz")
     if filename.endswith(".tsv"):
         return "tsv"
diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index 3bb4605c..ce6c20d0 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -88,11 +88,9 @@ def test_parse_sssom_dataframe_from_file(self):
 
     def test_parse_sssom_dataframe_from_stringio(self):
         """Test parsing a TSV."""
-        input_path = f"{test_data_dir}/basic.tsv"
-        with open(input_path, "r") as file:
-            input_string = file.read()
-        stream = io.StringIO(input_string)
-        msdf = parse_sssom_table(stream)
+        input_path = test_data_dir.joinpath("basic.tsv")
+        with input_path.open() as file:
+            msdf = parse_sssom_table(file)
         output_path = os.path.join(test_out_dir, "test_parse_sssom_dataframe_stream.tsv")
         with open(output_path, "w") as file:
             write_table(msdf, file)
From 5b233efbe259a11bc7f2a1cfa610bdee2bd9fde1 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Sat, 24 May 2025 21:05:49 +0200
Subject: [PATCH 17/19] Cleanup interspersed logic

---
 src/sssom/parsers.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index 6a44a0dc..3e48af36 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -94,6 +94,7 @@ def _open_input(p: PathOrIO) -> TextIO:
     if isinstance(p, str) and (p.startswith("http://") or p.startswith("https://")):
         # It's a URL
         data = requests.get(p, timeout=30).content
+        # TODO handle gzipped remote content
         return io.StringIO(data.decode("utf-8"))
 
     # squash a path to a string so we don't have to duplicate logic below
@@ -151,13 +152,20 @@ def _separate_metadata_and_table_from_stream(stream: TextIO):
     return table_component, metadata_component
 
 
-def _read_pandas_and_metadata(stream: TextIO, sep: Optional[str] = None):
+def _read_pandas_and_metadata(file_path: PathOrIO, sep: Optional[str] = None):
     """Read a tabular data file by wrapping func:`pd.read_csv` to handles comment lines correctly.
 
-    :param stream: The file to read. If no separator is given, this file should be named.
+    :param file_path: The file path or stream to read
     :param sep: File separator for pandas
     :return: A pandas dataframe
     """
+    if sep is None:
+        sep = _infer_separator(file_path)
+
+    if isinstance(file_path, (str, Path)):
+        raise_for_bad_path(file_path)
+
+    stream = _open_input(file_path)
     table_stream, metadata_stream = _separate_metadata_and_table_from_stream(stream)
 
     try:
@@ -219,7 +227,6 @@ def _is_irregular_metadata(metadata_list: List[Dict]):
 
 def _check_redefined_builtin_prefixes(sssom_metadata, meta, prefix_map):
-
     # There are three ways in which prefixes can be communicated, so we will check all of them
     # This is a bit overly draconian, as in the end, only the highest priority one gets picked
     # But since this only constitues a (logging) warning, I think its worth reporting
@@ -317,12 +325,8 @@ def parse_sssom_table(
     if kwargs:
         logging.warning("unhandled keyword arguments passed: %s", kwargs)
-    if isinstance(file_path, Path) or isinstance(file_path, str):
-        raise_for_bad_path(file_path)
-    stream = _open_input(file_path)
-    if sep is None:
-        sep = _infer_separator(file_path)
-    df, sssom_metadata = _read_pandas_and_metadata(stream, sep)
+
+    df, sssom_metadata = _read_pandas_and_metadata(file_path, sep)
 
     if meta is None:
         meta = {}
From aabdb93dce39ed0d018b1989133490ff4ee32c57 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Sun, 15 Jun 2025 01:51:34 +0200
Subject: [PATCH 18/19] Update test_parsers.py

---
 tests/test_parsers.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index ce6c20d0..a387e5c2 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -90,7 +90,9 @@ def test_parse_sssom_dataframe_from_stringio(self):
         """Test parsing a TSV."""
         input_path = test_data_dir.joinpath("basic.tsv")
         with input_path.open() as file:
-            msdf = parse_sssom_table(file)
+            input_string = file.read()
+            stream = io.StringIO(input_string)
+            msdf = parse_sssom_table(stream)
         output_path = os.path.join(test_out_dir, "test_parse_sssom_dataframe_stream.tsv")
         with open(output_path, "w") as file:
             write_table(msdf, file)
From dd9a8b744b370c16e91e8253816b34f22ae8b955 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt
Date: Sun, 15 Jun 2025 01:56:21 +0200
Subject: [PATCH 19/19] Update parsers.py

---
 src/sssom/parsers.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/sssom/parsers.py b/src/sssom/parsers.py
index 3e48af36..94d65b81 100644
--- a/src/sssom/parsers.py
+++ b/src/sssom/parsers.py
@@ -700,18 +700,18 @@ def from_alignment_minidom(
                 _add_valid_mapping_to_list(mdict, mlist, flip_superclass_assertions=True)
 
         elif node_name == "xml":
-            if e.firstChild.nodeValue != "yes":
+            if e.firstChild.nodeValue != "yes":  # type:ignore[union-attr]
                 raise ValueError(
                     "Alignment format: xml element said, but not set to yes. Only XML is supported!"
                 )
         elif node_name == "onto1":
-            ms[SUBJECT_SOURCE_ID] = e.firstChild.nodeValue
+            ms[SUBJECT_SOURCE_ID] = e.firstChild.nodeValue  # type:ignore[union-attr]
         elif node_name == "onto2":
-            ms[OBJECT_SOURCE_ID] = e.firstChild.nodeValue
+            ms[OBJECT_SOURCE_ID] = e.firstChild.nodeValue  # type:ignore[union-attr]
         elif node_name == "uri1":
-            ms[SUBJECT_SOURCE] = e.firstChild.nodeValue
+            ms[SUBJECT_SOURCE] = e.firstChild.nodeValue  # type:ignore[union-attr]
         elif node_name == "uri2":
-            ms[OBJECT_SOURCE] = e.firstChild.nodeValue
+            ms[OBJECT_SOURCE] = e.firstChild.nodeValue  # type:ignore[union-attr]
 
     ms.mappings = mlist  # type: ignore
     mapping_set_document = MappingSetDocument(mapping_set=ms, converter=converter)
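
---

A minimal end-to-end sketch of the public API as it stands after this series. It uses only functions exported by the patches above (`parse_tsv` from PATCH 01/09, `write_tsv` from PATCH 02-05, `write_json` and `write_rdf` from PATCH 01); the input and output file names (`basic.tsv`, `out.*`) are illustrative placeholders, not files shipped with the repository:

```python
import sssom

# parse_tsv() forwards to parse_sssom_table() with sep="\t" (PATCH 09);
# strict=True fails parsing on undefined prefixes, CURIEs, or IRIs (PATCH 01)
msdf = sssom.parse_tsv("basic.tsv", strict=True)

# writers now accept a path (str or pathlib.Path) in addition to an open
# text IO object, via the _open_text_writer() helper (PATCHES 03-05)
sssom.write_tsv(msdf, "out.tsv")
sssom.write_json(msdf, "out.json")
sssom.write_owl(msdf, "out.owl")

# passing an open file object still works as before
with open("out.ttl", "w") as file:
    sssom.write_rdf(msdf, file)
```

Note that `write_tsv` writes the metadata block inline by default (`embedded_mode=True`); passing `embedded_mode=False` makes `write_table` emit a companion YAML file next to the TSV instead.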