From 94bf9723e434b54b110b873177ffcb8258d6287e Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Mon, 6 Jan 2025 10:35:01 -0500 Subject: [PATCH 01/16] Allow reading and writing when href startswith file:/// --- pystac/stac_io.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pystac/stac_io.py b/pystac/stac_io.py index abe5bc26a..4bfe7488a 100644 --- a/pystac/stac_io.py +++ b/pystac/stac_io.py @@ -303,6 +303,7 @@ def read_text_from_href(self, href: str) -> str: except HTTPError as e: raise Exception(f"Could not read uri {href}") from e else: + href = safe_urlparse(href).path with open(href, encoding="utf-8") as f: href_contents = f.read() return href_contents @@ -328,7 +329,7 @@ def write_text_to_href(self, href: str, txt: str) -> None: """ if _is_url(href): raise NotImplementedError("DefaultStacIO cannot write to urls") - href = os.fspath(href) + href = safe_urlparse(href).path dirname = os.path.dirname(href) if dirname != "" and not os.path.isdir(dirname): os.makedirs(dirname) @@ -391,7 +392,7 @@ def _report_duplicate_object_names( def _is_url(href: str) -> bool: parsed = safe_urlparse(href) - return parsed.scheme != "" + return parsed.scheme != "" and parsed.scheme != "file" if HAS_URLLIB3: From 3cc1715d7405a0e340daf1e836c84c55422549f7 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Mon, 6 Jan 2025 10:36:06 -0500 Subject: [PATCH 02/16] Add test of reading and writing to hrefs starting with file:/// --- tests/test_stac_io.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_stac_io.py b/tests/test_stac_io.py index b84ea2854..97b6c1fc3 100644 --- a/tests/test_stac_io.py +++ b/tests/test_stac_io.py @@ -24,6 +24,15 @@ def test_read_write_collection(self) -> None: pystac.write_file(collection, dest_href=dest_href) self.assertTrue(os.path.exists(dest_href), msg="File was not written.") + def test_read_write_collection_with_file_protocol(self) -> None: + collection = pystac.read_file( + "file://" + TestCases.get_path("data-files/collections/multi-extent.json") + ) + with tempfile.TemporaryDirectory() as tmp_dir: + dest_href = os.path.join(tmp_dir, "collection.json") + pystac.write_file(collection, dest_href="file://" + dest_href) + self.assertTrue(os.path.exists(dest_href), msg="File was not written.") + def test_read_item(self) -> None: item = pystac.read_file(TestCases.get_path("data-files/item/sample-item.json")) with tempfile.TemporaryDirectory() as tmp_dir: From b09cf63f263f2dd82f25f8e5906b28acd5265956 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Mon, 6 Jan 2025 11:59:00 -0500 Subject: [PATCH 03/16] Ensure that file:/// are interpretted as absolute urls --- pystac/stac_io.py | 2 +- pystac/utils.py | 12 +++++++++--- tests/test_utils.py | 14 ++++++++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/pystac/stac_io.py b/pystac/stac_io.py index 4bfe7488a..216249f3f 100644 --- a/pystac/stac_io.py +++ b/pystac/stac_io.py @@ -392,7 +392,7 @@ def _report_duplicate_object_names( def _is_url(href: str) -> bool: parsed = safe_urlparse(href) - return parsed.scheme != "" and parsed.scheme != "file" + return parsed.scheme not in ["", "file"] if HAS_URLLIB3: diff --git a/pystac/utils.py b/pystac/utils.py index 76c3f7100..cb8ae4ab5 100644 --- a/pystac/utils.py +++ b/pystac/utils.py @@ -246,7 +246,7 @@ def make_relative_href( ): return source_href - if parsed_start.scheme == "": + if parsed_start.scheme in ["", "file"]: return _make_relative_href_path(parsed_source, parsed_start, start_is_dir) else: return _make_relative_href_url(parsed_source, parsed_start, start_is_dir) @@ -311,6 +311,9 @@ def _make_absolute_href_path( make_posix_style(os.path.abspath(start_dir)), start_dir ) + if parsed_source.scheme or parsed_start.scheme: + abs_path = f"file://{abs_path}" + return abs_path @@ -346,7 +349,10 @@ def make_absolute_href( parsed_start = safe_urlparse(start_href) parsed_source = safe_urlparse(source_href) - if parsed_source.scheme != "" or parsed_start.scheme != "": + if parsed_source.scheme not in ["", "file"] or parsed_start.scheme not in [ + "", + "file", + ]: return _make_absolute_href_url(parsed_source, parsed_start, start_is_dir) else: return _make_absolute_href_path(parsed_source, parsed_start, start_is_dir) @@ -364,7 +370,7 @@ def is_absolute_href(href: str) -> bool: bool: ``True`` if the given HREF is absolute, ``False`` if it is relative. """ parsed = safe_urlparse(href) - return parsed.scheme != "" or os.path.isabs(parsed.path) + return parsed.scheme not in ["", "file"] or os.path.isabs(parsed.path) def datetime_to_str(dt: datetime, timespec: str = "auto") -> str: diff --git a/tests/test_utils.py b/tests/test_utils.py index 5e9f85cef..b0ab3a8e8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -33,6 +33,11 @@ def test_make_relative_href(self) -> None: ("/a/catalog.json", "/a/b/c/catalog.json", "../../catalog.json"), ("/a/b/c/d/", "/a/b/c/catalog.json", "./d/"), ("/a/b/c/d/.dotfile", "/a/b/c/d/catalog.json", "./.dotfile"), + ( + "file:///a/b/c/d/catalog.json", + "file:///a/b/c/catalog.json", + "./d/catalog.json", + ), ] for source_href, start_href, expected in test_cases: @@ -161,6 +166,13 @@ def test_make_absolute_href(self) -> None: "https://stacspec.org/a/b/item.json", ), ("http://localhost:8000", None, "http://localhost:8000"), + ("item.json", "file:///a/b/c/catalog.json", "file:///a/b/c/item.json"), + ( + "./z/item.json", + "file:///a/b/c/catalog.json", + "file:///a/b/c/z/item.json", + ), + ("file:///a/b/c/item.json", None, "file:///a/b/c/item.json"), ] for source_href, start_href, expected in test_cases: @@ -219,6 +231,8 @@ def test_is_absolute_href(self) -> None: ("item.json", False), ("./item.json", False), ("../item.json", False), + ("/home/someuser/item.json", True), + ("file:///home/someuser/item.json", True), ("http://stacspec.org/item.json", True), ] From 54fd73d5b1e6f7e19b64dd78b4a0b986acffc04f Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Tue, 7 Jan 2025 10:58:45 -0500 Subject: [PATCH 04/16] Try to fix windows --- pystac/utils.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pystac/utils.py b/pystac/utils.py index cb8ae4ab5..5a87a54b6 100644 --- a/pystac/utils.py +++ b/pystac/utils.py @@ -71,6 +71,15 @@ def safe_urlparse(href: str) -> URLParseResult: query=parsed.query, fragment=parsed.fragment, ) + if parsed.scheme == "file" and parsed.netloc: + return URLParseResult( + scheme=parsed.scheme, + netloc="", + path=f"{parsed.netloc}{parsed.path}", + params=parsed.params, + query=parsed.query, + fragment=parsed.fragment, + ) else: return parsed From 7173abdbab0f5dd55ea47ce38bb4564c5e40a4db Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Tue, 7 Jan 2025 11:32:57 -0500 Subject: [PATCH 05/16] Try to fix windows --- tests/test_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index b0ab3a8e8..f516f2457 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -177,7 +177,11 @@ def test_make_absolute_href(self) -> None: for source_href, start_href, expected in test_cases: actual = make_absolute_href(source_href, start_href) - _, actual = os.path.splitdrive(actual) + if expected.startswith("file://"): + _, actual = os.path.splitdrive(actual.replace("file://", "")) + actual = f"file://{actual}" + else: + _, actual = os.path.splitdrive(actual) self.assertEqual(actual, expected) def test_make_absolute_href_on_vsitar(self) -> None: From 2dae0579ffb3551e8ef083b33c9e090a687feed6 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Tue, 7 Jan 2025 11:45:10 -0500 Subject: [PATCH 06/16] Try to fix windows --- tests/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index f516f2457..854c9b55c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -178,8 +178,8 @@ def test_make_absolute_href(self) -> None: for source_href, start_href, expected in test_cases: actual = make_absolute_href(source_href, start_href) if expected.startswith("file://"): - _, actual = os.path.splitdrive(actual.replace("file://", "")) - actual = f"file://{actual}" + _, actual = os.path.splitdrive(actual.replace("file:///", "")) + actual = f"file:///{actual}" else: _, actual = os.path.splitdrive(actual) self.assertEqual(actual, expected) From d903d9eabf1aad1ccc8796531e437217d78b792b Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Tue, 7 Jan 2025 12:01:07 -0500 Subject: [PATCH 07/16] Add some print debugging --- pystac/utils.py | 3 +++ tests/test_utils.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pystac/utils.py b/pystac/utils.py index 5a87a54b6..9dff11b64 100644 --- a/pystac/utils.py +++ b/pystac/utils.py @@ -213,7 +213,10 @@ def _make_relative_href_path( # posixpath doesn't play well with windows drive letters, so we have to use # the os-specific path library for the relpath function. This means we can # only handle windows paths on windows machines. + print("start_dir: ", start_dir) + print("source_path:", source_path) relpath = make_posix_style(os.path.relpath(source_path, start_dir)) + print("relpath:", relpath) # Ensure we retain a trailing slash from the original source path if parsed_source.path.endswith("/"): diff --git a/tests/test_utils.py b/tests/test_utils.py index 854c9b55c..f516f2457 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -178,8 +178,8 @@ def test_make_absolute_href(self) -> None: for source_href, start_href, expected in test_cases: actual = make_absolute_href(source_href, start_href) if expected.startswith("file://"): - _, actual = os.path.splitdrive(actual.replace("file:///", "")) - actual = f"file:///{actual}" + _, actual = os.path.splitdrive(actual.replace("file://", "")) + actual = f"file://{actual}" else: _, actual = os.path.splitdrive(actual) self.assertEqual(actual, expected) From da0ecc05045d78ef31fc0064878be08cbbda4c21 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Tue, 7 Jan 2025 12:31:37 -0500 Subject: [PATCH 08/16] Add more print debugging --- pystac/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pystac/utils.py b/pystac/utils.py index 9dff11b64..42eb1fee3 100644 --- a/pystac/utils.py +++ b/pystac/utils.py @@ -51,6 +51,7 @@ def safe_urlparse(href: str) -> URLParseResult: urllib.parse.ParseResult : The named tuple representing the parsed HREF. """ parsed = urlparse(href) + print(parsed.scheme, parsed.netloc, parsed.path) if parsed.scheme != "" and ( href.lower().startswith(f"{parsed.scheme}:\\") or ( From db61a27ae0d742af5e7fac3c30bb3dffe554c8f5 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Tue, 7 Jan 2025 14:26:30 -0500 Subject: [PATCH 09/16] Add more print debugging --- pystac/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pystac/utils.py b/pystac/utils.py index 42eb1fee3..7774ead1f 100644 --- a/pystac/utils.py +++ b/pystac/utils.py @@ -50,6 +50,7 @@ def safe_urlparse(href: str) -> URLParseResult: Returns: urllib.parse.ParseResult : The named tuple representing the parsed HREF. """ + print("href:", href) parsed = urlparse(href) print(parsed.scheme, parsed.netloc, parsed.path) if parsed.scheme != "" and ( From d8640898f3f9519b7e8502d62d5d2b3b517ef32a Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Fri, 10 Jan 2025 10:48:19 -0500 Subject: [PATCH 10/16] Moved is_absolute to os dependent test --- tests/test_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index f516f2457..910e1bc8f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -235,8 +235,6 @@ def test_is_absolute_href(self) -> None: ("item.json", False), ("./item.json", False), ("../item.json", False), - ("/home/someuser/item.json", True), - ("file:///home/someuser/item.json", True), ("http://stacspec.org/item.json", True), ] @@ -252,6 +250,7 @@ def test_is_absolute_href_os_aware(self) -> None: test_cases = [ ("/item.json", not incl_drive_letter), ("/home/someuser/Downloads/item.json", not incl_drive_letter), + ("file:///home/someuser/Downloads/item.json", True), ("d:/item.json", is_windows), ("c:/files/more_files/item.json", is_windows), ] From f880a739d7ea916664aeb3e16eb47801fd7c5c90 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Fri, 10 Jan 2025 10:52:46 -0500 Subject: [PATCH 11/16] Fix os-dependent test --- tests/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 910e1bc8f..8f2a9f6ac 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -250,7 +250,7 @@ def test_is_absolute_href_os_aware(self) -> None: test_cases = [ ("/item.json", not incl_drive_letter), ("/home/someuser/Downloads/item.json", not incl_drive_letter), - ("file:///home/someuser/Downloads/item.json", True), + ("file:///home/someuser/Downloads/item.json", not incl_drive_letter), ("d:/item.json", is_windows), ("c:/files/more_files/item.json", is_windows), ] From 8c35501e1576f2cfbc7d647d122954d6e1d66021 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Fri, 10 Jan 2025 11:17:07 -0500 Subject: [PATCH 12/16] Strip initial slash and see if it passes --- pystac/stac_io.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pystac/stac_io.py b/pystac/stac_io.py index 216249f3f..d571445d2 100644 --- a/pystac/stac_io.py +++ b/pystac/stac_io.py @@ -304,6 +304,8 @@ def read_text_from_href(self, href: str) -> str: raise Exception(f"Could not read uri {href}") from e else: href = safe_urlparse(href).path + if href.startswith("/D:/"): + href = href[1:] with open(href, encoding="utf-8") as f: href_contents = f.read() return href_contents From d34dc2d36c34c2a2841a842049492ac42e50a114 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Fri, 10 Jan 2025 13:09:24 -0500 Subject: [PATCH 13/16] Add more print debugging --- pystac/link.py | 3 ++- pystac/stac_io.py | 2 -- pystac/utils.py | 11 ++++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pystac/link.py b/pystac/link.py index f88061090..b9949dbde 100644 --- a/pystac/link.py +++ b/pystac/link.py @@ -172,6 +172,7 @@ def get_href(self, transform_href: bool = True) -> str | None: href = self._target_object.get_self_href() else: href = self._target_href + print("starting href:", href) if ( transform_href @@ -191,7 +192,7 @@ def get_href(self, transform_href: bool = True) -> str | None: owner_href = self.owner.get_self_href() if owner_href is not None: href = make_relative_href(href, owner_href) - + print("new href:", href) return href @property diff --git a/pystac/stac_io.py b/pystac/stac_io.py index d571445d2..216249f3f 100644 --- a/pystac/stac_io.py +++ b/pystac/stac_io.py @@ -304,8 +304,6 @@ def read_text_from_href(self, href: str) -> str: raise Exception(f"Could not read uri {href}") from e else: href = safe_urlparse(href).path - if href.startswith("/D:/"): - href = href[1:] with open(href, encoding="utf-8") as f: href_contents = f.read() return href_contents diff --git a/pystac/utils.py b/pystac/utils.py index 7774ead1f..3fa2d0cee 100644 --- a/pystac/utils.py +++ b/pystac/utils.py @@ -60,7 +60,7 @@ def safe_urlparse(href: str) -> URLParseResult: and not href.lower().startswith(f"{parsed.scheme}://") ) ): - return URLParseResult( + output = URLParseResult( scheme="", netloc="", path="{}:{}".format( @@ -73,8 +73,10 @@ def safe_urlparse(href: str) -> URLParseResult: query=parsed.query, fragment=parsed.fragment, ) + print("non-file scheme gives:", output.path) + return output if parsed.scheme == "file" and parsed.netloc: - return URLParseResult( + output = URLParseResult( scheme=parsed.scheme, netloc="", path=f"{parsed.netloc}{parsed.path}", @@ -82,6 +84,8 @@ def safe_urlparse(href: str) -> URLParseResult: query=parsed.query, fragment=parsed.fragment, ) + print("file scheme gives:", output.path) + return output else: return parsed @@ -215,10 +219,7 @@ def _make_relative_href_path( # posixpath doesn't play well with windows drive letters, so we have to use # the os-specific path library for the relpath function. This means we can # only handle windows paths on windows machines. - print("start_dir: ", start_dir) - print("source_path:", source_path) relpath = make_posix_style(os.path.relpath(source_path, start_dir)) - print("relpath:", relpath) # Ensure we retain a trailing slash from the original source path if parsed_source.path.endswith("/"): From 334e314be807f7ff724748e5aad9e63b7dde6212 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Fri, 10 Jan 2025 13:34:35 -0500 Subject: [PATCH 14/16] Just strip off the leading slash --- pystac/link.py | 2 -- pystac/utils.py | 21 +++++++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/pystac/link.py b/pystac/link.py index b9949dbde..06b8b1de1 100644 --- a/pystac/link.py +++ b/pystac/link.py @@ -172,7 +172,6 @@ def get_href(self, transform_href: bool = True) -> str | None: href = self._target_object.get_self_href() else: href = self._target_href - print("starting href:", href) if ( transform_href @@ -192,7 +191,6 @@ def get_href(self, transform_href: bool = True) -> str | None: owner_href = self.owner.get_self_href() if owner_href is not None: href = make_relative_href(href, owner_href) - print("new href:", href) return href @property diff --git a/pystac/utils.py b/pystac/utils.py index 3fa2d0cee..2e551960c 100644 --- a/pystac/utils.py +++ b/pystac/utils.py @@ -50,9 +50,7 @@ def safe_urlparse(href: str) -> URLParseResult: Returns: urllib.parse.ParseResult : The named tuple representing the parsed HREF. """ - print("href:", href) parsed = urlparse(href) - print(parsed.scheme, parsed.netloc, parsed.path) if parsed.scheme != "" and ( href.lower().startswith(f"{parsed.scheme}:\\") or ( @@ -60,7 +58,7 @@ def safe_urlparse(href: str) -> URLParseResult: and not href.lower().startswith(f"{parsed.scheme}://") ) ): - output = URLParseResult( + return URLParseResult( scheme="", netloc="", path="{}:{}".format( @@ -73,19 +71,22 @@ def safe_urlparse(href: str) -> URLParseResult: query=parsed.query, fragment=parsed.fragment, ) - print("non-file scheme gives:", output.path) - return output - if parsed.scheme == "file" and parsed.netloc: - output = URLParseResult( + if parsed.scheme == "file": + # Windows drives sometimes get parsed as the netloc and sometimes + # as part of the parsed.path. + if parsed.netloc: + path = f"{parsed.netloc}{parsed.path}" + elif parsed.path.startswith("/") and os.name == "nt": + path = parsed.path[1:] + + return URLParseResult( scheme=parsed.scheme, netloc="", - path=f"{parsed.netloc}{parsed.path}", + path=path, params=parsed.params, query=parsed.query, fragment=parsed.fragment, ) - print("file scheme gives:", output.path) - return output else: return parsed From 64dce3ae90bd6c344b239b419b33a7ca99f5a6dd Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Fri, 10 Jan 2025 13:37:50 -0500 Subject: [PATCH 15/16] Only for windows --- pystac/link.py | 1 + pystac/utils.py | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pystac/link.py b/pystac/link.py index 06b8b1de1..f88061090 100644 --- a/pystac/link.py +++ b/pystac/link.py @@ -191,6 +191,7 @@ def get_href(self, transform_href: bool = True) -> str | None: owner_href = self.owner.get_self_href() if owner_href is not None: href = make_relative_href(href, owner_href) + return href @property diff --git a/pystac/utils.py b/pystac/utils.py index 2e551960c..764a1e0fc 100644 --- a/pystac/utils.py +++ b/pystac/utils.py @@ -71,12 +71,13 @@ def safe_urlparse(href: str) -> URLParseResult: query=parsed.query, fragment=parsed.fragment, ) - if parsed.scheme == "file": - # Windows drives sometimes get parsed as the netloc and sometimes - # as part of the parsed.path. + + # Windows drives sometimes get parsed as the netloc and sometimes + # as part of the parsed.path. + if parsed.scheme == "file" and os.name == "nt": if parsed.netloc: path = f"{parsed.netloc}{parsed.path}" - elif parsed.path.startswith("/") and os.name == "nt": + elif parsed.path.startswith("/"): path = parsed.path[1:] return URLParseResult( From a566f5c06fe26f5c2b0b73ba9f9d3d7f3415b5cc Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Fri, 10 Jan 2025 13:47:09 -0500 Subject: [PATCH 16/16] Fix windows handling --- pystac/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pystac/utils.py b/pystac/utils.py index 764a1e0fc..4d702c187 100644 --- a/pystac/utils.py +++ b/pystac/utils.py @@ -77,8 +77,10 @@ def safe_urlparse(href: str) -> URLParseResult: if parsed.scheme == "file" and os.name == "nt": if parsed.netloc: path = f"{parsed.netloc}{parsed.path}" - elif parsed.path.startswith("/"): + elif parsed.path.startswith("/") and ":" in parsed.path: path = parsed.path[1:] + else: + path = parsed.path return URLParseResult( scheme=parsed.scheme,