From efdc87cbd1ea22931aa645374ee5fd42d9fe3429 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 4 Feb 2025 12:32:28 +0000 Subject: [PATCH 01/29] Handle deuterated reduced formulae --- src/csd_optimade/mappers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/csd_optimade/mappers.py b/src/csd_optimade/mappers.py index 37248a8..fd9eb8b 100644 --- a/src/csd_optimade/mappers.py +++ b/src/csd_optimade/mappers.py @@ -33,6 +33,9 @@ def _reduce_csd_formula(formula: str) -> str: reducer = math.gcd(*formula_dct.values()) + if "D" in formula_dct: + formula_dct["H"] = formula_dct.get("H", 0) + formula_dct.pop("D") + formula_str: str = "" for e in sorted(formula_dct): formula_str += ( From 8d164c21a3b2ec425883e186a93ed383d25fabfc Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 4 Feb 2025 12:33:58 +0000 Subject: [PATCH 02/29] Handle missing or multi-component formulae --- src/csd_optimade/mappers.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/csd_optimade/mappers.py b/src/csd_optimade/mappers.py index fd9eb8b..56e2657 100644 --- a/src/csd_optimade/mappers.py +++ b/src/csd_optimade/mappers.py @@ -7,6 +7,8 @@ import warnings from typing import TYPE_CHECKING +from optimade.models.utils import anonymize_formula + if TYPE_CHECKING: import ccdc.crystal import ccdc.entry @@ -24,6 +26,12 @@ def _reduce_csd_formula(formula: str) -> str: import re + if "," in formula: + raise ValueError(f"Cannot reduce multi-component formula: {formula}") + + if not formula: + raise ValueError("Cannot reduce non-existent formula") + formula_dct = {} for e in formula.strip("(").strip(")n").split(" "): matches = re.match(r"([a-zA-Z]+)([0-9]*)", e) @@ -148,10 +156,12 @@ def _get_citations(entry) -> list[ReferenceResource]: inchi = None try: - reduced_formula = _reduce_csd_formula(asym_unit.formula) - except Exception: + reduced_formula = _reduce_csd_formula(entry.formula) + except ValueError: + reduced_formula = None + except 
RuntimeError: warnings.warn( - f"Unable to reduce formula for {entry.identifier}: {entry.formula}" + f"Unable to reduce formula for {entry.identifier}: {entry.formula} / {asym_unit.formula}" ) reduced_formula = None @@ -166,6 +176,9 @@ def _get_citations(entry) -> list[ReferenceResource]: "attributes": StructureResourceAttributes( immutable_id=entry.identifier, last_modified=now, + chemical_formula_anonymous=anonymize_formula(reduced_formula) + if reduced_formula + else None, chemical_formula_descriptive=entry.formula, chemical_formula_reduced=reduced_formula, elements=sorted(list(optimade_elements)), From ea94afbddabb470e8a9e5e683ae1560c0ce311e3 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 4 Feb 2025 12:34:14 +0000 Subject: [PATCH 03/29] Add `_csd_remark` as a searchable field --- src/csd_optimade/fields.py | 5 +++++ src/csd_optimade/mappers.py | 1 + 2 files changed, 6 insertions(+) diff --git a/src/csd_optimade/fields.py b/src/csd_optimade/fields.py index 48b4d56..53e9664 100644 --- a/src/csd_optimade/fields.py +++ b/src/csd_optimade/fields.py @@ -95,5 +95,10 @@ def generate_csd_provider_fields(): "type": "integer", "description": "The number of formula units in the asymmetric unit.", }, + { + "name": "_csd_remarks", + "type": "string", + "description": "Free-text remarks about the structure.", + }, ] } diff --git a/src/csd_optimade/mappers.py b/src/csd_optimade/mappers.py index 56e2657..e4515e5 100644 --- a/src/csd_optimade/mappers.py +++ b/src/csd_optimade/mappers.py @@ -221,6 +221,7 @@ def _get_citations(entry) -> list[ReferenceResource]: _csd_ccdc_number=entry.ccdc_number, _csd_deposition_date={"$date": dep_date}, _csd_disorder_details=entry.disorder_details, + _csd_remarks=entry.remarks if entry.remarks else None, ), } ) From 60b675f796f102fa852eddf6e046d7c9600d4473 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 4 Feb 2025 12:34:41 +0000 Subject: [PATCH 04/29] Remove nsites check and add simple tests for reduced formulae --- 
tests/test_mappers.py | 52 ++++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/tests/test_mappers.py b/tests/test_mappers.py index 3f85c43..03c8e5f 100644 --- a/tests/test_mappers.py +++ b/tests/test_mappers.py @@ -5,6 +5,8 @@ import pytest from optimade.adapters.structures.utils import cellpar_to_cell +from csd_optimade.mappers import _reduce_csd_formula + from .utils import generate_same_random_csd_entries if TYPE_CHECKING: @@ -21,9 +23,9 @@ def check_entry( warn_only: bool = False, ) -> bool: assert entry.identifier == resource.id, f"{entry.identifier} != {resource.id}" - total_num_atoms = entry.crystal.z_value * len( - entry.crystal.asymmetric_unit_molecule.atoms - ) + # total_num_atoms = entry.crystal.z_value * len( + # entry.crystal.asymmetric_unit_molecule.atoms + # ) if resource.attributes.lattice_vectors: a, b, c = entry.crystal.cell_lengths @@ -33,16 +35,16 @@ def check_entry( cell, resource.attributes.lattice_vectors, decimal=5 ) - try: - assert resource.attributes.nsites == total_num_atoms, ( - f"{resource.attributes.nsites=} != {total_num_atoms=} for {entry.identifier}" - ) - except AssertionError as exc: - if warn_only: - warnings.warn( - f"{exc} for {entry.identifier}", - RuntimeWarning, - ) + # try: + # assert resource.attributes.nsites == total_num_atoms, ( + # f"{resource.attributes.nsites=} != {total_num_atoms=} for {entry.identifier}" + # ) + # except AssertionError as exc: + # if warn_only: + # warnings.warn( + # f"{exc} for {entry.identifier}", + # RuntimeWarning, + # ) try: if entry.publications: assert resource.relationships.references is not None @@ -92,3 +94,27 @@ def test_random_entries(index: int, entry: "ccdc.entry.Entry", csd_available): assert check_entry(entry, optimade, included, warn_only=True), ( f"{entry.identifier} ({index}) failed" ) + + +def test_reduce_formula(): + zzzghe = "C18 H12 Br3 N1" + assert _reduce_csd_formula(zzzghe) == "Br3C18H12N" + + pivcih01 = "C11 H20 O3" + 
assert _reduce_csd_formula(pivcih01) == "C11H20O3" + + dumjif1 = "C54 H41 As2 O11 P1 Ru3,0.15(C1 H2 Cl2)" + with pytest.raises(ValueError, match="multi-component"): + _reduce_csd_formula(dumjif1) + + dipjer = "C20 H25 N2 S2 1+,C4 H3 O4 1-" + with pytest.raises(ValueError, match="multi-component"): + _reduce_csd_formula(dipjer) + + nubjax01 = "C36 H24 Br3 N3 O11 U2,H2 O1" + with pytest.raises(ValueError, match="multi-component"): + _reduce_csd_formula(nubjax01) + + jatfet01 = "C65 H45 Au2 N3 O1,C35 H40 N3 Pt1 1+,B1 F4 1-" + with pytest.raises(ValueError, match="multi-component"): + _reduce_csd_formula(jatfet01) From f820a75d1787fd846d8bf253afc5e6ced4a225b3 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 4 Feb 2025 12:34:53 +0000 Subject: [PATCH 05/29] Bump optimade-python-tools version to better handle disorder --- pyproject.toml | 4 +--- uv.lock | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 90ff300..7e6793d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ classifiers = [ ] requires-python = ">= 3.11, < 3.12" dependencies = [ - "optimade @ git+https://github.com/Materials-Consortia/optimade-python-tools.git@ml-evs/jsonl-relationships-links", + "optimade @ git+https://github.com/Materials-Consortia/optimade-python-tools.git@ml-evs/fix-disorder-validation", "optimade-maker @ git+https://github.com/materialscloud-org/optimade-maker.git@ml-evs/qol-server", "tqdm ~= 4.66", "pymongo >= 4, < 5", @@ -80,6 +80,4 @@ testpaths = "tests" addopts = "-rs" filterwarnings = [ "error", - "ignore:.*total_num_atoms.*:RuntimeWarning", - "ignore:.*unable to reduce formula.*:UserWarning" ] diff --git a/uv.lock b/uv.lock index e18b104..96069cc 100644 --- a/uv.lock +++ b/uv.lock @@ -166,7 +166,7 @@ ingest = [ requires-dist = [ { name = "csd-python-api", marker = "extra == 'ingest'", specifier = ">=3,<4" }, { name = "mypy", marker = "extra == 'dev'", specifier = "~=1.0" }, - { name = "optimade", git = 
"https://github.com/Materials-Consortia/optimade-python-tools.git?rev=ml-evs%2Fjsonl-relationships-links" }, + { name = "optimade", git = "https://github.com/Materials-Consortia/optimade-python-tools.git?rev=ml-evs%2Ffix-disorder-validation" }, { name = "optimade-maker", git = "https://github.com/materialscloud-org/optimade-maker.git?rev=ml-evs%2Fqol-server" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = "~=3.0" }, { name = "psutil", marker = "extra == 'ingest'", specifier = "~=6.1" }, @@ -607,8 +607,8 @@ wheels = [ [[package]] name = "optimade" -version = "1.1.9" -source = { git = "https://github.com/Materials-Consortia/optimade-python-tools.git?rev=ml-evs%2Fjsonl-relationships-links#cfc3e7af4077747707f239770762787289b502ff" } +version = "1.1.10" +source = { git = "https://github.com/Materials-Consortia/optimade-python-tools.git?rev=ml-evs%2Ffix-disorder-validation#47386b737e7585de35322d97209ffd0a369260bc" } dependencies = [ { name = "lark" }, { name = "pydantic", extra = ["email"] }, From 18e90dd8b572984d5ef62fae2559a111c018869e Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 4 Feb 2025 12:41:52 +0000 Subject: [PATCH 06/29] Add bigger subset test case --- tests/test_mappers.py | 15 +++++++++++++++ tests/utils.py | 15 +++++---------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/tests/test_mappers.py b/tests/test_mappers.py index 03c8e5f..8641da6 100644 --- a/tests/test_mappers.py +++ b/tests/test_mappers.py @@ -14,6 +14,7 @@ from optimade.models import Resource, StructureResource TEST_ENTRIES = generate_same_random_csd_entries() +TEST_ENTRIES_BIG = generate_same_random_csd_entries(num_entries=100_000) def check_entry( @@ -96,6 +97,20 @@ def test_random_entries(index: int, entry: "ccdc.entry.Entry", csd_available): ) +def test_random_entries_big(csd_available): + if not csd_available: + pytest.skip("CSD not available") + from csd_optimade.mappers import from_csd_entry_directly + + mapper = from_csd_entry_directly + + for 
index, entry in TEST_ENTRIES_BIG: + optimade, included = mapper(entry) + assert check_entry(entry, optimade, included, warn_only=True), ( + f"{entry.identifier} ({index}) failed" + ) + + def test_reduce_formula(): zzzghe = "C18 H12 Br3 N1" assert _reduce_csd_formula(zzzghe) == "Br3C18H12N" diff --git a/tests/utils.py b/tests/utils.py index 63bfd5e..b2b063b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -58,9 +58,8 @@ class MockCSDEntry: crystal: MockCSDCrystal = MockCSDCrystal() -def generate_same_random_csd_entries(csd_available=True): +def generate_same_random_csd_entries(csd_available=True, num_entries=1000): """Pick some random entries from the CSD, with a fixed seed.""" - num_entries: int = 1000 if not csd_available: warnings.warn("CSD not available") yield zip(range(num_entries), num_entries * [MockCSDEntry()]) @@ -70,24 +69,20 @@ def generate_same_random_csd_entries(csd_available=True): from ccdc.io import EntryReader - from csd_optimade.ingest import BAD_IDENTIFIERS - random.seed(0) entry_indices = set() - entries = [] max_n: int = int(1.29e6) + n_trials: int = 0 with EntryReader("CSD") as reader: - while len(entry_indices) < num_entries: + while n_trials < num_entries: i = random.randint(0, max_n) if i not in entry_indices: try: entry = reader[i] if entry: - if entry.identifier in BAD_IDENTIFIERS: - continue - entries.append((i, entry)) + yield (i, entry) + n_trials += 1 entry_indices.add(i) except Exception: continue - yield from entries From b0338bcdd39252ab89cbb7e098b130a78f3c7c8b Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 4 Feb 2025 12:50:17 +0000 Subject: [PATCH 07/29] Fix deuterated reduced formula --- src/csd_optimade/mappers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/csd_optimade/mappers.py b/src/csd_optimade/mappers.py index e4515e5..77290ba 100644 --- a/src/csd_optimade/mappers.py +++ b/src/csd_optimade/mappers.py @@ -39,11 +39,11 @@ def _reduce_csd_formula(formula: str) -> str: species, count = 
matches.groups() formula_dct[species] = int(count) if count else 1 - reducer = math.gcd(*formula_dct.values()) - if "D" in formula_dct: formula_dct["H"] = formula_dct.get("H", 0) + formula_dct.pop("D") + reducer = math.gcd(*formula_dct.values()) + formula_str: str = "" for e in sorted(formula_dct): formula_str += ( From f6892b535f8c3969e8c6dbb841c6a4615d61cef6 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 4 Feb 2025 13:09:30 +0000 Subject: [PATCH 08/29] Handle another formula edge case --- src/csd_optimade/mappers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csd_optimade/mappers.py b/src/csd_optimade/mappers.py index 77290ba..5d7317b 100644 --- a/src/csd_optimade/mappers.py +++ b/src/csd_optimade/mappers.py @@ -159,7 +159,7 @@ def _get_citations(entry) -> list[ReferenceResource]: reduced_formula = _reduce_csd_formula(entry.formula) except ValueError: reduced_formula = None - except RuntimeError: + except Exception: warnings.warn( f"Unable to reduce formula for {entry.identifier}: {entry.formula} / {asym_unit.formula}" ) From 34373d4696108d01158d51909ef8eb4b09b3f2b8 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 4 Feb 2025 13:10:33 +0000 Subject: [PATCH 09/29] Bump optimade-python-tools version to better handle disorder --- pyproject.toml | 2 +- uv.lock | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7e6793d..b8b98c8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ classifiers = [ ] requires-python = ">= 3.11, < 3.12" dependencies = [ - "optimade @ git+https://github.com/Materials-Consortia/optimade-python-tools.git@ml-evs/fix-disorder-validation", + "optimade @ git+https://github.com/Materials-Consortia/optimade-python-tools.git@main", "optimade-maker @ git+https://github.com/materialscloud-org/optimade-maker.git@ml-evs/qol-server", "tqdm ~= 4.66", "pymongo >= 4, < 5", diff --git a/uv.lock b/uv.lock index 96069cc..8cb8f29 100644 --- 
a/uv.lock +++ b/uv.lock @@ -166,7 +166,7 @@ ingest = [ requires-dist = [ { name = "csd-python-api", marker = "extra == 'ingest'", specifier = ">=3,<4" }, { name = "mypy", marker = "extra == 'dev'", specifier = "~=1.0" }, - { name = "optimade", git = "https://github.com/Materials-Consortia/optimade-python-tools.git?rev=ml-evs%2Ffix-disorder-validation" }, + { name = "optimade", git = "https://github.com/Materials-Consortia/optimade-python-tools.git?rev=main" }, { name = "optimade-maker", git = "https://github.com/materialscloud-org/optimade-maker.git?rev=ml-evs%2Fqol-server" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = "~=3.0" }, { name = "psutil", marker = "extra == 'ingest'", specifier = "~=6.1" }, @@ -608,7 +608,7 @@ wheels = [ [[package]] name = "optimade" version = "1.1.10" -source = { git = "https://github.com/Materials-Consortia/optimade-python-tools.git?rev=ml-evs%2Ffix-disorder-validation#47386b737e7585de35322d97209ffd0a369260bc" } +source = { git = "https://github.com/Materials-Consortia/optimade-python-tools.git?rev=main#19ab866231ec7eae82b7b69f1d48f421c58fd81d" } dependencies = [ { name = "lark" }, { name = "pydantic", extra = ["email"] }, From edba7c49cb6821104d1db0d31433df60d1d69d7a Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 4 Feb 2025 16:10:36 +0000 Subject: [PATCH 10/29] Fix type of z-prime field --- src/csd_optimade/fields.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csd_optimade/fields.py b/src/csd_optimade/fields.py index 53e9664..223e547 100644 --- a/src/csd_optimade/fields.py +++ b/src/csd_optimade/fields.py @@ -92,7 +92,7 @@ def generate_csd_provider_fields(): }, { "name": "_csd_z_prime", - "type": "integer", + "type": "float", "description": "The number of formula units in the asymmetric unit.", }, { From 5a8c92fb3e666830076c1fc6fa1d1d38d931fb3f Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 4 Feb 2025 16:47:24 +0000 Subject: [PATCH 11/29] Attempt to use packing 
automatically --- src/csd_optimade/mappers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/csd_optimade/mappers.py b/src/csd_optimade/mappers.py index 5d7317b..c68ef43 100644 --- a/src/csd_optimade/mappers.py +++ b/src/csd_optimade/mappers.py @@ -84,12 +84,13 @@ def from_csd_entry_directly( lattice_params: list[list[float | None]] = [[None, None, None], [None, None, None]] cell_volume: float | None = None if entry.has_3d_structure: + packed_mol = entry.crystal.packing() try: positions = [ [atom.coordinates.x, atom.coordinates.y, atom.coordinates.z] - for atom in asym_unit.atoms + for atom in packed_mol.atoms ] - # Handle case that asym_unit.atoms is [] + # Handle case that atoms is [] if not positions: positions = None except AttributeError: @@ -198,7 +199,7 @@ def _get_citations(entry) -> list[ReferenceResource]: if positions else None, cartesian_site_positions=positions, - species_at_sites=[atom.atomic_symbol for atom in asym_unit.atoms] + species_at_sites=[atom.atomic_symbol for atom in packed_mol.atoms] if positions else None, structure_features=["disorder"] if entry.has_disorder else [], From ec9974e7cf624cb2bcd62dbc3bc57dfba8503e9f Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Fri, 7 Feb 2025 14:23:18 +0000 Subject: [PATCH 12/29] Add example license info --- pyproject.toml | 2 +- src/csd_optimade/serve.py | 3 +++ uv.lock | 4 ++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b8b98c8..637f3c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ classifiers = [ ] requires-python = ">= 3.11, < 3.12" dependencies = [ - "optimade @ git+https://github.com/Materials-Consortia/optimade-python-tools.git@main", + "optimade @ git+https://github.com/Materials-Consortia/optimade-python-tools.git@ml-evs/license_support", "optimade-maker @ git+https://github.com/materialscloud-org/optimade-maker.git@ml-evs/qol-server", "tqdm ~= 4.66", "pymongo >= 4, < 5", diff --git 
a/src/csd_optimade/serve.py b/src/csd_optimade/serve.py index 9f41f46..15a8ca9 100644 --- a/src/csd_optimade/serve.py +++ b/src/csd_optimade/serve.py @@ -72,6 +72,8 @@ def cli(): if args.drop_first and test_client: test_client.drop_database(database_name) + override_kwargs["license"] = "https://www.ccdc.cam.ac.uk/licence-agreement" + optimake_server = OptimakeServer( jsonl_path, args.port, @@ -82,6 +84,7 @@ def cli(): "prefix": "csd", "name": "Cambridge Structural Database", "description": "A database of crystal structures curated by the Cambridge Crystallographic Data Centre.", + "homepage=": "https://www.ccdc.cam.ac.uk", }, **override_kwargs, ) diff --git a/uv.lock b/uv.lock index 8cb8f29..9df0969 100644 --- a/uv.lock +++ b/uv.lock @@ -166,7 +166,7 @@ ingest = [ requires-dist = [ { name = "csd-python-api", marker = "extra == 'ingest'", specifier = ">=3,<4" }, { name = "mypy", marker = "extra == 'dev'", specifier = "~=1.0" }, - { name = "optimade", git = "https://github.com/Materials-Consortia/optimade-python-tools.git?rev=main" }, + { name = "optimade", git = "https://github.com/Materials-Consortia/optimade-python-tools.git?rev=ml-evs%2Flicense_support" }, { name = "optimade-maker", git = "https://github.com/materialscloud-org/optimade-maker.git?rev=ml-evs%2Fqol-server" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = "~=3.0" }, { name = "psutil", marker = "extra == 'ingest'", specifier = "~=6.1" }, @@ -608,7 +608,7 @@ wheels = [ [[package]] name = "optimade" version = "1.1.10" -source = { git = "https://github.com/Materials-Consortia/optimade-python-tools.git?rev=main#19ab866231ec7eae82b7b69f1d48f421c58fd81d" } +source = { git = "https://github.com/Materials-Consortia/optimade-python-tools.git?rev=ml-evs%2Flicense_support#8c8ac5eb91065b082b850736a5c0262708210921" } dependencies = [ { name = "lark" }, { name = "pydantic", extra = ["email"] }, From ff2cef7d9e4e6f6e95671494448d8cb6360674b4 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Fri, 7 
Feb 2025 14:35:43 +0000 Subject: [PATCH 13/29] Add ability to exit the API after inserting, to allow asynchronous rebuild of the database --- src/csd_optimade/serve.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/csd_optimade/serve.py b/src/csd_optimade/serve.py index 15a8ca9..fd2158b 100644 --- a/src/csd_optimade/serve.py +++ b/src/csd_optimade/serve.py @@ -19,6 +19,11 @@ def cli(): action="store_true", help="Do not insert the JSONL file into the database.", ) + parser.add_argument( + "--exit-after-insert", + action="store_true", + help="Exit the API after inserting the JSONL file.", + ) parser.add_argument( "--drop-first", action="store_true", @@ -45,6 +50,9 @@ def cli(): if args.no_insert: override_kwargs["insert_from_jsonl"] = None + if args.exit_after_insert: + override_kwargs["exit_after_insert"] = True + # Allow user to specify a real MongoDB mongo_uri = args.mongo_uri if mongo_uri: From f3f17b3274b90e76371f6ca353e7b7e063d97aef Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Fri, 7 Feb 2025 14:59:18 +0000 Subject: [PATCH 14/29] Run async insertion pipeline in Dockerfile --- Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 38b736b..5ca2f5b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -185,7 +185,9 @@ fi gpg --batch --passphrase ${CSD_ACTIVATION_KEY} --decrypt /opt/csd-optimade/csd-optimade.jsonl.gz.gpg | gunzip > /opt/csd-optimade/csd-optimade.jsonl -exec uv run --no-sync csd-serve --drop-first /opt/csd-optimade/csd-optimade.jsonl +# Run the API twice: once to wipe and reinsert the data then exit, the second to run the API +exec uv run --no-sync csd-serve --port 5001 --exit-after-insert --drop-first /opt/csd-optimade/csd-optimade.jsonl && \ +exec uv run --no-sync csd-serve --no-insert /opt/csd-optimade/csd-optimade.jsonl EOF RUN chmod +x /entrypoint.sh From 7c292cb0215437f916c70a00d81b9a59a85416cf Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Fri, 7 Feb 2025 15:02:06 +0000 
Subject: [PATCH 15/29] Add ability to turn off insertion pipeline via `CSD_OPTIMADE_INSERT` env var --- Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 5ca2f5b..0234119 100644 --- a/Dockerfile +++ b/Dockerfile @@ -186,8 +186,12 @@ fi gpg --batch --passphrase ${CSD_ACTIVATION_KEY} --decrypt /opt/csd-optimade/csd-optimade.jsonl.gz.gpg | gunzip > /opt/csd-optimade/csd-optimade.jsonl # Run the API twice: once to wipe and reinsert the data then exit, the second to run the API -exec uv run --no-sync csd-serve --port 5001 --exit-after-insert --drop-first /opt/csd-optimade/csd-optimade.jsonl && \ +if [ -z "$CSD_OPTIMADE_INSERT" ]; then + exec uv run --no-sync csd-serve --port 5001 --exit-after-insert --drop-first /opt/csd-optimade/csd-optimade.jsonl && \ +fi + exec uv run --no-sync csd-serve --no-insert /opt/csd-optimade/csd-optimade.jsonl + EOF RUN chmod +x /entrypoint.sh From 5492bf7789a9439f771614a55263fcaadce2db30 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Fri, 7 Feb 2025 15:11:37 +0000 Subject: [PATCH 16/29] Update README with more deployment instructions --- README.md | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 431d4da..3336504 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,7 @@ Buildx. Once configured, you can build the container with ```shell -docker build --secret id=env,src=.env -t csd-optimade . +docker build --secret id=env,src=.env --target csd-optimade-server -t csd-optimade-server . 
``` This will install the CSD inside the container, run the ingestion pipeline and @@ -124,11 +124,45 @@ To launch the container (which will decrypt the file and start the OPTIMADE API locally): ```shell -docker run --env-file .env -p 5000:5000 csd-optimade +docker run --env-file .env -p 5000:5000 csd-optimade-server ``` -For development, you may prefer to use the bake definitions in -`docker-bake.hcl` to build and tag the relevant build stages. +If using a persistent database, future runs of the API can be controlled with +the `CSD_OPTIMADE_INSERT` environment variable. If `true`, the configured database will be +wiped and rebuilt from the JSONL file (see "Runtime configuration options" below). + +For development and deployment, you may prefer to use the bake definitions in +`docker-bake.hcl` to build and tag the relevant build stages: + +```shell +docker buildx bake csd-optimade-server +docker run --env-file .env -p 5000:5000 ghcr.io/datalab-industries/csd-optimade-server +``` + +### Runtime configuration options + +As noted above, the `CSD_ACTIVATION_KEY` used to build the container must be provided at runtime. + +The API container can also be configured with all the `OPTIMAKE_` prefixed environment variables. + +The most important ones are listed here: + +- `OPTIMAKE_MONGO_URI`: to use a persistent MongoDB backend, you can provide a `MONGO_URI` via: + + ```shell + OPTIMAKE_DATABASE_BACKEND=mongodb + OPTIMAKE_MONGO_URI=mongodb://mongodb_server:27017/optimade + ``` + +- `OPTIMAKE_BASE_URL`: to set the base URL of the API (used to generate pagination links), you can provide a `BASE_URL` via: + + ```shell + OPTIMAKE_BASE_URL=https://my-csd-deployment.com + ``` + +Finally, if using a persistent database, future runs of the API can be controlled with the `CSD_OPTIMADE_INSERT` environment variable. +If `true` (default), the configured database will be wiped and rebuilt from the JSONL file directly, and a separate process will run the API. +If `false`, only the API will be started, with no database rebuild. 
## Contributing and Getting Help From 4a4ec7bfbb9e3fb4f70eb70aec3215326fe1c9d5 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Fri, 7 Feb 2025 20:39:36 +0000 Subject: [PATCH 17/29] Refactoring to allow for `implicit_atoms` and completed elements lists from formulae directly --- src/csd_optimade/mappers.py | 154 ++++++++++++++++++++++-------------- 1 file changed, 93 insertions(+), 61 deletions(-) diff --git a/src/csd_optimade/mappers.py b/src/csd_optimade/mappers.py index c68ef43..ff59c91 100644 --- a/src/csd_optimade/mappers.py +++ b/src/csd_optimade/mappers.py @@ -13,6 +13,7 @@ import ccdc.crystal import ccdc.entry import ccdc.io + import ccdc.molecule from optimade.models import ( ReferenceResource, @@ -22,8 +23,56 @@ StructureResourceAttributes, ) +NOW = datetime.datetime.now() +NOW = NOW.replace(microsecond=0) -def _reduce_csd_formula(formula: str) -> str: + +def _get_citations(entry) -> list[ReferenceResource]: + """Return attached reference resources given the CSD API citation format.""" + citations = [] + for citation in entry.publications: + # Use the DOI as OPTIMADE identifier, if available, otherwise generate one + # from first author, year and random string (cannot detect duplicates) + _id = citation.doi + if _id is None: + first_author = citation.authors.split(", ")[0].split(".")[-1].split(" ")[-1] + _id = f"{first_author}{citation.year}-{''.join(random.choices(string.ascii_lowercase, k=6))}" + + citations.append( + ReferenceResource( + id=_id, + type="references", + attributes=ReferenceResourceAttributes( + last_modified=NOW, + authors=[ + {"name": author} for author in citation.authors.split(", ") + ], + year=str( + citation.year + ), # Potential specification bug that this value should be a string + journal=citation.journal.full_name, + volume=str(citation.volume), + pages=str(citation.first_page), + doi=citation.doi, + ), + ) + ) + return citations + + +def _reduce_csd_formula(formula: str) -> tuple[str, set[str]]: + """Given a CSD Python API 
formula string, return a reduced + OPTIMADE formula and the set of elements* present. + + * including "D" + + Parameters: + formula: The `Entry.formula` string from the CSD Python API. + + Returns: + A tuple of the reduced formula and the set of elements present. + + """ import re if "," in formula: @@ -39,6 +88,9 @@ def _reduce_csd_formula(formula: str) -> str: species, count = matches.groups() formula_dct[species] = int(count) if count else 1 + # Elements list should include "D" so that it can be post-filtered in species lists + elements = set(formula_dct.keys()) + if "D" in formula_dct: formula_dct["H"] = formula_dct.get("H", 0) + formula_dct.pop("D") @@ -53,7 +105,7 @@ def _reduce_csd_formula(formula: str) -> str: if not formula_str: raise RuntimeError(f"Unable to create formula for {formula}") - return formula_str + return formula_str, elements def from_csd_entry_directly( @@ -65,16 +117,6 @@ def from_csd_entry_directly( """ asym_unit = entry.crystal.asymmetric_unit_molecule - elements = {d.atomic_symbol for d in asym_unit.atoms} - - optimade_elements = elements.copy() - # Replace deuterium with H - if "D" in elements: - optimade_elements.remove("D") - optimade_elements.add("H") - - now = datetime.datetime.now() - now = now.replace(microsecond=0) dep_date: datetime.datetime | datetime.date | None = entry.deposition_date dep_date = ( datetime.datetime.fromisoformat(dep_date.isoformat()) if dep_date else None @@ -83,6 +125,7 @@ def from_csd_entry_directly( positions: list | None = None lattice_params: list[list[float | None]] = [[None, None, None], [None, None, None]] cell_volume: float | None = None + packed_mol: ccdc.molecule.Molecule | None = None if entry.has_3d_structure: packed_mol = entry.crystal.packing() try: @@ -110,39 +153,6 @@ def from_csd_entry_directly( ] cell_volume = entry.crystal.cell_volume - def _get_citations(entry) -> list[ReferenceResource]: - citations = [] - for citation in entry.publications: - # Use the DOI as OPTIMADE identifier, if 
available, otherwise generate one - # from first author, year and random string (cannot detect duplicates) - _id = citation.doi - if _id is None: - first_author = ( - citation.authors.split(", ")[0].split(".")[-1].split(" ")[-1] - ) - _id = f"{first_author}{citation.year}-{''.join(random.choices(string.ascii_lowercase, k=6))}" - - citations.append( - ReferenceResource( - id=_id, - type="references", - attributes=ReferenceResourceAttributes( - last_modified=now, - authors=[ - {"name": author} for author in citation.authors.split(", ") - ], - year=str( - citation.year - ), # Potential specification bug that this value should be a string - journal=citation.journal.full_name, - volume=str(citation.volume), - pages=str(citation.first_page), - doi=citation.doi, - ), - ) - ) - return citations - references: list[ReferenceResource] = _get_citations(entry) relationships: dict[str, dict] | None = None if references: @@ -156,16 +166,49 @@ def _get_citations(entry) -> list[ReferenceResource]: if not inchi.success: inchi = None + structure_features = [] try: - reduced_formula = _reduce_csd_formula(entry.formula) + reduced_formula, elements = _reduce_csd_formula(entry.formula) except ValueError: reduced_formula = None + elements = {d.atomic_symbol for d in asym_unit.atoms} + except Exception: warnings.warn( f"Unable to reduce formula for {entry.identifier}: {entry.formula} / {asym_unit.formula}" ) reduced_formula = None + optimade_elements = elements.copy() + # Replace deuterium with H + if "D" in elements: + optimade_elements.remove("D") + optimade_elements.add("H") + + optimade_species = [ + Species( + chemical_symbols=[e if e != "D" else "H"], + name=e, + concentration=[1.0], + ) + for e in elements + ] + + optimade_species_at_sites: list[str] | None = ( + [atom.atomic_symbol for atom in packed_mol.atoms] + if (positions and packed_mol) + else None + ) + + if entry.has_disorder: + structure_features += ["disorder"] + + if optimade_species_at_sites: + for s in 
optimade_species: + if s.name not in optimade_species_at_sites: + structure_features += ["implicit_atoms"] + break + resource = StructureResource( **{ "id": entry.identifier, @@ -176,7 +219,7 @@ def _get_citations(entry) -> list[ReferenceResource]: }, "attributes": StructureResourceAttributes( immutable_id=entry.identifier, - last_modified=now, + last_modified=NOW, chemical_formula_anonymous=anonymize_formula(reduced_formula) if reduced_formula else None, @@ -188,21 +231,10 @@ def _get_citations(entry) -> list[ReferenceResource]: nelements=len(optimade_elements), nsites=len(positions) if positions else None, # Make sure the "D" is remapped to "H" in the species list, but continue using it in the sites list - species=[ - Species( - chemical_symbols=[e if e != "D" else "H"], - name=e, - concentration=[1.0], - ) - for e in elements - ] - if positions - else None, + species=optimade_species if positions else None, + species_at_sites=optimade_species_at_sites, cartesian_site_positions=positions, - species_at_sites=[atom.atomic_symbol for atom in packed_mol.atoms] - if positions - else None, - structure_features=["disorder"] if entry.has_disorder else [], + structure_features=structure_features, # Add custom CSD-specific fields _csd_lattice_parameter_a=lattice_params[0][0], _csd_lattice_parameter_b=lattice_params[0][1], From a234e6b17bf24f7bd0a1e8a636c0f24814923d2f Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Sat, 8 Feb 2025 15:10:15 +0000 Subject: [PATCH 18/29] Add fat test for all entries with reuslts saved to disk --- tests/test_mappers.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/tests/test_mappers.py b/tests/test_mappers.py index 8641da6..a702b67 100644 --- a/tests/test_mappers.py +++ b/tests/test_mappers.py @@ -1,3 +1,5 @@ +import os +import traceback import warnings from typing import TYPE_CHECKING @@ -14,7 +16,7 @@ from optimade.models import Resource, StructureResource TEST_ENTRIES = 
generate_same_random_csd_entries() -TEST_ENTRIES_BIG = generate_same_random_csd_entries(num_entries=100_000) +TEST_ENTRIES_ALL = generate_same_random_csd_entries(num_entries=1_290_000) def check_entry( @@ -97,18 +99,29 @@ def test_random_entries(index: int, entry: "ccdc.entry.Entry", csd_available): ) -def test_random_entries_big(csd_available): +def test_random_entries_all(csd_available): if not csd_available: pytest.skip("CSD not available") + + if not os.getenv("CSD_TEST_ALL") == "1": + pytest.skip("Skipping all CSD entries test as `CSD_TEST_ALL` unset.") + from csd_optimade.mappers import from_csd_entry_directly mapper = from_csd_entry_directly - for index, entry in TEST_ENTRIES_BIG: - optimade, included = mapper(entry) - assert check_entry(entry, optimade, included, warn_only=True), ( - f"{entry.identifier} ({index}) failed" - ) + for index, entry in TEST_ENTRIES_ALL: + try: + optimade, included = mapper(entry) + assert check_entry(entry, optimade, included, warn_only=True), ( + f"{entry.identifier} ({index}) failed" + ) + print(".", end="") + except Exception as exc: + print(f"{entry.identifier} ({index}) failed") + traceback.print_exc() + with open("bad_entries.txt", "a") as f: + f.write(f"{entry.identifier} ({index}): {exc}\n") def test_reduce_formula(): From dabcec540ab8325b1b5fae8500c01aa9fcb16b95 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Sun, 9 Feb 2025 17:56:44 +0000 Subject: [PATCH 19/29] Do not decrypt unless inserting --- Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0234119..52cae48 100644 --- a/Dockerfile +++ b/Dockerfile @@ -183,10 +183,9 @@ if [ -z "$CSD_ACTIVATION_KEY" ]; then exit 1 fi -gpg --batch --passphrase ${CSD_ACTIVATION_KEY} --decrypt /opt/csd-optimade/csd-optimade.jsonl.gz.gpg | gunzip > /opt/csd-optimade/csd-optimade.jsonl - -# Run the API twice: once to wipe and reinsert the data then exit, the second to run the API if [ -z "$CSD_OPTIMADE_INSERT" ]; then + # 
Run the API twice: once to wipe and reinsert the data then exit, the second to run the API + gpg --batch --passphrase ${CSD_ACTIVATION_KEY} --decrypt /opt/csd-optimade/csd-optimade.jsonl.gz.gpg | gunzip > /opt/csd-optimade/csd-optimade.jsonl exec uv run --no-sync csd-serve --port 5001 --exit-after-insert --drop-first /opt/csd-optimade/csd-optimade.jsonl && \ fi From 2f74b6aeaa74955fc7237e761f6289db8d6549b7 Mon Sep 17 00:00:00 2001 From: mevans Date: Sun, 9 Feb 2025 18:01:18 +0000 Subject: [PATCH 20/29] Fix typo in Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 52cae48..43b34aa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -186,7 +186,7 @@ fi if [ -z "$CSD_OPTIMADE_INSERT" ]; then # Run the API twice: once to wipe and reinsert the data then exit, the second to run the API gpg --batch --passphrase ${CSD_ACTIVATION_KEY} --decrypt /opt/csd-optimade/csd-optimade.jsonl.gz.gpg | gunzip > /opt/csd-optimade/csd-optimade.jsonl - exec uv run --no-sync csd-serve --port 5001 --exit-after-insert --drop-first /opt/csd-optimade/csd-optimade.jsonl && \ + exec uv run --no-sync csd-serve --port 5001 --exit-after-insert --drop-first /opt/csd-optimade/csd-optimade.jsonl & fi exec uv run --no-sync csd-serve --no-insert /opt/csd-optimade/csd-optimade.jsonl From b9cc89888999b97be6350c75a0c2236fce3624d8 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Sun, 9 Feb 2025 18:07:43 +0000 Subject: [PATCH 21/29] Use latest optimade-python-tools pre-release --- pyproject.toml | 2 +- uv.lock | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 637f3c9..0adb7a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ classifiers = [ ] requires-python = ">= 3.11, < 3.12" dependencies = [ - "optimade @ git+https://github.com/Materials-Consortia/optimade-python-tools.git@ml-evs/license_support", + "optimade @ 
git+https://github.com/Materials-Consortia/optimade-python-tools.git", "optimade-maker @ git+https://github.com/materialscloud-org/optimade-maker.git@ml-evs/qol-server", "tqdm ~= 4.66", "pymongo >= 4, < 5", diff --git a/uv.lock b/uv.lock index 9df0969..0b9015f 100644 --- a/uv.lock +++ b/uv.lock @@ -166,7 +166,7 @@ ingest = [ requires-dist = [ { name = "csd-python-api", marker = "extra == 'ingest'", specifier = ">=3,<4" }, { name = "mypy", marker = "extra == 'dev'", specifier = "~=1.0" }, - { name = "optimade", git = "https://github.com/Materials-Consortia/optimade-python-tools.git?rev=ml-evs%2Flicense_support" }, + { name = "optimade", git = "https://github.com/Materials-Consortia/optimade-python-tools.git" }, { name = "optimade-maker", git = "https://github.com/materialscloud-org/optimade-maker.git?rev=ml-evs%2Fqol-server" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = "~=3.0" }, { name = "psutil", marker = "extra == 'ingest'", specifier = "~=6.1" }, @@ -607,8 +607,8 @@ wheels = [ [[package]] name = "optimade" -version = "1.1.10" -source = { git = "https://github.com/Materials-Consortia/optimade-python-tools.git?rev=ml-evs%2Flicense_support#8c8ac5eb91065b082b850736a5c0262708210921" } +version = "1.1.11" +source = { git = "https://github.com/Materials-Consortia/optimade-python-tools.git#2affb50d0593a8d9d4229970b034c0d317778c51" } dependencies = [ { name = "lark" }, { name = "pydantic", extra = ["email"] }, From cd8c6361c18ec2091470bdf114d78b3bfcfc9db2 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Sun, 9 Feb 2025 18:08:02 +0000 Subject: [PATCH 22/29] Skip bad identifiers in big test --- tests/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/utils.py b/tests/utils.py index b2b063b..58a68f2 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -69,6 +69,8 @@ def generate_same_random_csd_entries(csd_available=True, num_entries=1000): from ccdc.io import EntryReader + from csd_optimade.ingest import BAD_IDENTIFIERS + 
random.seed(0) entry_indices = set() max_n: int = int(1.29e6) @@ -81,6 +83,8 @@ def generate_same_random_csd_entries(csd_available=True, num_entries=1000): try: entry = reader[i] if entry: + if entry in BAD_IDENTIFIERS: + continue yield (i, entry) n_trials += 1 entry_indices.add(i) From 4617fc02602e119ad8d3973bf3ac3e2e2ac3c414 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 11 Feb 2025 14:54:11 +0000 Subject: [PATCH 23/29] Move metadata into fixed modules rather than serve --- src/csd_optimade/fields.py | 34 ++++++++++++++++++++++++++++++++++ src/csd_optimade/serve.py | 15 +++++++-------- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/csd_optimade/fields.py b/src/csd_optimade/fields.py index 223e547..605362a 100644 --- a/src/csd_optimade/fields.py +++ b/src/csd_optimade/fields.py @@ -1,3 +1,7 @@ +from optimade import __api_version__ +from optimade.models.baseinfo import BaseInfoAttributes, BaseInfoResource + + def generate_csd_provider_fields(): return { "structures": [ @@ -102,3 +106,33 @@ def generate_csd_provider_fields(): }, ] } + + +def generate_csd_provider_info(): + return { + "prefix": "csd", + "name": "Cambridge Structural Database", + "description": "A database of crystal structures curated by the Cambridge Crystallographic Data Centre.", + "homepage=": "https://www.ccdc.cam.ac.uk", + } + + +def generate_license_link(): + return "https://www.ccdc.cam.ac.uk/licence-agreement" + + +def generate_csd_info_endpoint() -> dict[str, BaseInfoResource]: + return { + "data": BaseInfoResource( + attributes=BaseInfoAttributes( + api_version=__api_version__, + available_api_versions=[], + formats=["json"], + available_endpoints=["info", "structures", "references"], + entry_types_by_format={"json": ["info", "structures", "references"]}, + is_index=False, + license={"href": generate_license_link()}, + available_licenses=None, + ) + ) + } diff --git a/src/csd_optimade/serve.py b/src/csd_optimade/serve.py index fd2158b..93f7765 100644 --- 
a/src/csd_optimade/serve.py +++ b/src/csd_optimade/serve.py @@ -5,7 +5,11 @@ from optimade_maker.serve import OptimakeServer -from csd_optimade.fields import generate_csd_provider_fields +from csd_optimade.fields import ( + generate_csd_provider_fields, + generate_csd_provider_info, + generate_license_link, +) def cli(): @@ -80,7 +84,7 @@ def cli(): if args.drop_first and test_client: test_client.drop_database(database_name) - override_kwargs["license"] = "https://www.ccdc.cam.ac.uk/licence-agreement" + override_kwargs["license"] = generate_license_link() optimake_server = OptimakeServer( jsonl_path, @@ -88,12 +92,7 @@ def cli(): mongo_uri=mongo_uri, database_backend="mongodb" if mongo_uri else "mongomock", provider_fields=generate_csd_provider_fields(), - provider={ - "prefix": "csd", - "name": "Cambridge Structural Database", - "description": "A database of crystal structures curated by the Cambridge Crystallographic Data Centre.", - "homepage=": "https://www.ccdc.cam.ac.uk", - }, + provider=generate_csd_provider_info(), **override_kwargs, ) optimake_server.start_api() From dd60d064e2ac647f0dc69353d924fe3aefe3fdd2 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 11 Feb 2025 14:55:02 +0000 Subject: [PATCH 24/29] Export info endpoints dynamically into JSONL --- src/csd_optimade/ingest.py | 40 +++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/src/csd_optimade/ingest.py b/src/csd_optimade/ingest.py index 2662ada..a2f236c 100644 --- a/src/csd_optimade/ingest.py +++ b/src/csd_optimade/ingest.py @@ -1,8 +1,17 @@ from __future__ import annotations +from optimade import __api_version__ + +from csd_optimade.fields import ( + generate_csd_info_endpoint, + generate_csd_provider_fields, + generate_csd_provider_info, +) + BAD_IDENTIFIERS = { "QIJZOB", # hangs infinitely during mapping "VOHZIB", # no 3D structure + "YIGKOP", } import glob @@ -189,13 +198,38 @@ def cli(): with open(tmp_jsonl_path) as tmp_jsonl: 
ids_by_type: dict[str, set] = {} with open(output_file, "w") as final_jsonl: - # Write headers + # Write headers and info endpoints final_jsonl.write( - json.dumps({"x-optimade": {"meta": {"api_version": "1.1.0"}}}) + "\n" + json.dumps({"x-optimade": {"meta": {"api_version": __api_version__}}}) + + "\n" + ) + + info = generate_csd_info_endpoint() + provider = generate_csd_provider_info() + final_jsonl.write( + json.dumps( + { + "data": info["data"].model_dump( + exclude_unset=True, exclude_none=False + ) + } + ) + + "\n" ) final_jsonl.write( _construct_entry_type_info( - "structures", properties=[], provider_prefix="" + "structures", + properties=generate_csd_provider_fields()["structures"], + provider_prefix=provider["prefix"], + ).model_dump_json() + + "\n" + ) + + final_jsonl.write( + _construct_entry_type_info( + "references", + properties=[], + provider_prefix=provider["prefix"], ).model_dump_json() + "\n" ) From 713da8d3781252372f0ad2e58423901bfeb82165 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 11 Feb 2025 14:55:10 +0000 Subject: [PATCH 25/29] Bump optimade-maker version --- uv.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/uv.lock b/uv.lock index 0b9015f..1998c10 100644 --- a/uv.lock +++ b/uv.lock @@ -629,7 +629,7 @@ server = [ [[package]] name = "optimade-maker" version = "0.3.0" -source = { git = "https://github.com/materialscloud-org/optimade-maker.git?rev=ml-evs%2Fqol-server#6f732e00f859b837f538e861e29d0e85e7e6cb66" } +source = { git = "https://github.com/materialscloud-org/optimade-maker.git?rev=ml-evs%2Fqol-server#0248484c899140503e770ae7ef5e46b63cffcafa" } dependencies = [ { name = "click" }, { name = "numpy" }, From 221733c88ca83d5321490c16da00b37eb9a40264 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 11 Feb 2025 15:10:28 +0000 Subject: [PATCH 26/29] Add more debug output to big test --- tests/test_mappers.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_mappers.py 
b/tests/test_mappers.py index a702b67..ad898e2 100644 --- a/tests/test_mappers.py +++ b/tests/test_mappers.py @@ -1,4 +1,5 @@ import os +import time import traceback import warnings from typing import TYPE_CHECKING @@ -112,10 +113,15 @@ def test_random_entries_all(csd_available): for index, entry in TEST_ENTRIES_ALL: try: + start = time.monotonic_ns() + print(entry.identifier, end=",") optimade, included = mapper(entry) + elapsed = time.monotonic_ns() - start assert check_entry(entry, optimade, included, warn_only=True), ( f"{entry.identifier} ({index}) failed" ) + if elapsed > 1e9: + print(f"{entry.identifier} ({index}) took {elapsed / 1e9:.1f}s") print(".", end="") except Exception as exc: print(f"{entry.identifier} ({index}) failed") From 2265908ca8b0984aa159f808e04f2ca7c75109ff Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 12 Feb 2025 08:39:35 +0000 Subject: [PATCH 27/29] Properly check `CSD_OPTIMADE_INSERT` variable in docker entrypoint --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 43b34aa..f7a1c02 100644 --- a/Dockerfile +++ b/Dockerfile @@ -183,7 +183,7 @@ if [ -z "$CSD_ACTIVATION_KEY" ]; then exit 1 fi -if [ -z "$CSD_OPTIMADE_INSERT" ]; then +if [ "$CSD_OPTIMADE_INSERT" = "1" ] || [ "$CSD_OPTIMADE_INSERT" = "true"]; then # Run the API twice: once to wipe and reinsert the data then exit, the second to run the API gpg --batch --passphrase ${CSD_ACTIVATION_KEY} --decrypt /opt/csd-optimade/csd-optimade.jsonl.gz.gpg | gunzip > /opt/csd-optimade/csd-optimade.jsonl exec uv run --no-sync csd-serve --port 5001 --exit-after-insert --drop-first /opt/csd-optimade/csd-optimade.jsonl & From 2d5a3b7e52f61de43c818ea3c97c1cdc62b8804e Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 12 Feb 2025 08:50:12 +0000 Subject: [PATCH 28/29] Tweak blocking behaviour in Dockerfile --- Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 
f7a1c02..7dd0dd2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -183,10 +183,10 @@ if [ -z "$CSD_ACTIVATION_KEY" ]; then exit 1 fi -if [ "$CSD_OPTIMADE_INSERT" = "1" ] || [ "$CSD_OPTIMADE_INSERT" = "true"]; then +if [ "$CSD_OPTIMADE_INSERT" = "1" ] || [ "$CSD_OPTIMADE_INSERT" = "true" ]; then # Run the API twice: once to wipe and reinsert the data then exit, the second to run the API - gpg --batch --passphrase ${CSD_ACTIVATION_KEY} --decrypt /opt/csd-optimade/csd-optimade.jsonl.gz.gpg | gunzip > /opt/csd-optimade/csd-optimade.jsonl - exec uv run --no-sync csd-serve --port 5001 --exit-after-insert --drop-first /opt/csd-optimade/csd-optimade.jsonl & + (gpg --batch --passphrase ${CSD_ACTIVATION_KEY} --decrypt /opt/csd-optimade/csd-optimade.jsonl.gz.gpg | gunzip > /opt/csd-optimade/csd-optimade.jsonl && + exec uv run --no-sync csd-serve --port 5001 --exit-after-insert --drop-first /opt/csd-optimade/csd-optimade.jsonl) & fi exec uv run --no-sync csd-serve --no-insert /opt/csd-optimade/csd-optimade.jsonl From 7f3ceb1f0d9ad2e43fbe0d074d1764ea1caeee55 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 12 Feb 2025 09:06:33 +0000 Subject: [PATCH 29/29] Tweak formula tests --- tests/test_mappers.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/test_mappers.py b/tests/test_mappers.py index ad898e2..8ba5354 100644 --- a/tests/test_mappers.py +++ b/tests/test_mappers.py @@ -132,10 +132,19 @@ def test_random_entries_all(csd_available): def test_reduce_formula(): zzzghe = "C18 H12 Br3 N1" - assert _reduce_csd_formula(zzzghe) == "Br3C18H12N" + formula, elements = _reduce_csd_formula(zzzghe) + assert formula == "Br3C18H12N" + assert elements == {"Br", "C", "H", "N"} + + zzzghe = "C18 D6 H6 Br3 N1" + formula, elements = _reduce_csd_formula(zzzghe) + assert formula == "Br3C18H12N" + assert elements == {"Br", "C", "H", "N", "D"} pivcih01 = "C11 H20 O3" - assert _reduce_csd_formula(pivcih01) == "C11H20O3" + formula, elements = 
_reduce_csd_formula(pivcih01) + assert formula == "C11H20O3" + assert elements == {"C", "H", "O"} dumjif1 = "C54 H41 As2 O11 P1 Ru3,0.15(C1 H2 Cl2)" with pytest.raises(ValueError, match="multi-component"):