From 11837f7922171279f15c6413b13756766f83dac4 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 9 Jan 2025 15:42:10 -0800 Subject: [PATCH 1/6] add stub of a dbml importer --- docs/packages/importers.rst | 7 ++ poetry.lock | 26 +++-- pyproject.toml | 1 + .../importers/dbml_import_engine.py | 98 +++++++++++++++++++ 4 files changed, 124 insertions(+), 8 deletions(-) create mode 100644 schema_automator/importers/dbml_import_engine.py diff --git a/docs/packages/importers.rst b/docs/packages/importers.rst index 0aa4546..e12f859 100644 --- a/docs/packages/importers.rst +++ b/docs/packages/importers.rst @@ -77,6 +77,13 @@ NCI implements a JSON serialization of ISO-11197. You can import this JSON and c schemauto import-cadsr "cdes/*.json" +Importing from DBML +-------------------- + +DBML is a simple DSL for defining database schemas. It is a subset of SQL DDL. + + + Packages for importing ---------------------- diff --git a/poetry.lock b/poetry.lock index dd714bc..26b1c2b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "airium" @@ -2459,13 +2459,9 @@ files = [ {file = "lxml-5.2.2-cp36-cp36m-win_amd64.whl", hash = "sha256:edcfa83e03370032a489430215c1e7783128808fd3e2e0a3225deee278585196"}, {file = "lxml-5.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:28bf95177400066596cdbcfc933312493799382879da504633d16cf60bba735b"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a745cc98d504d5bd2c19b10c79c61c7c3df9222629f1b6210c0368177589fb8"}, - {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b590b39ef90c6b22ec0be925b211298e810b4856909c8ca60d27ffbca6c12e6"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b336b0416828022bfd5a2e3083e7f5ba54b96242159f83c7e3eebaec752f1716"}, - {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:c2faf60c583af0d135e853c86ac2735ce178f0e338a3c7f9ae8f622fd2eb788c"}, {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:4bc6cb140a7a0ad1f7bc37e018d0ed690b7b6520ade518285dc3171f7a117905"}, - {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7ff762670cada8e05b32bf1e4dc50b140790909caa8303cfddc4d702b71ea184"}, {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:57f0a0bbc9868e10ebe874e9f129d2917750adf008fe7b9c1598c0fbbfdde6a6"}, - {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:a6d2092797b388342c1bc932077ad232f914351932353e2e8706851c870bca1f"}, {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:60499fe961b21264e17a471ec296dcbf4365fbea611bf9e303ab69db7159ce61"}, {file = "lxml-5.2.2-cp37-cp37m-win32.whl", hash = "sha256:d9b342c76003c6b9336a80efcc766748a333573abf9350f4094ee46b006ec18f"}, {file = "lxml-5.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b16db2770517b8799c79aa80f4053cd6f8b716f21f8aca962725a9565ce3ee40"}, @@ -3292,9 +3288,9 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -3792,8 +3788,8 @@ files = [ annotated-types = ">=0.4.0" pydantic-core = "2.20.1" typing-extensions = [ - {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, ] [package.extras] @@ -3900,6 +3896,20 @@ files = [ [package.dependencies] typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" +[[package]] +name = "pydbml" +version = "1.1.2" +description = "Python parser and builder for DBML" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydbml-1.1.2-py3-none-any.whl", hash = "sha256:3d9e36aa130624169c916bfb40926b453ed10f4a8759808befc8197637df9e98"}, + {file = "pydbml-1.1.2.tar.gz", hash = "sha256:5714b49ce3b3b8d246f9b59c8be384736b05bffc336971047f3d2e0ec0aaca75"}, +] + +[package.dependencies] +pyparsing = ">=3.0.0" + [[package]] name = "pygments" version = "2.18.0" @@ -5965,4 +5975,4 @@ mariadb = [] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "036cba73b6fd660157c70cb76be27a501017e8904b35c8d2ccb00d412bbba870" +content-hash = "bf523e82bb08caf05eb970b29b6a68e01a536a14ac38257d27756b869d38f4fe" diff --git a/pyproject.toml b/pyproject.toml index 684e019..745879f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ click-default-group = "^1.2.4" linkml-runtime = "^1.7.2" duckdb = "^0.10.1" numpy = "<2.0" +pydbml = "^1.1.2" [tool.poetry.dev-dependencies] pytest = ">=7.1.1" diff --git a/schema_automator/importers/dbml_import_engine.py b/schema_automator/importers/dbml_import_engine.py new file mode 100644 index 0000000..1c864a9 --- /dev/null +++ b/schema_automator/importers/dbml_import_engine.py @@ -0,0 +1,98 @@ +from schema_automator.importers.import_engine import ImportEngine +from pydbml import PyDBML +from linkml_runtime.linkml_model import SchemaDefinition, ClassDefinition, SlotDefinition +from dataclasses import dataclass + +@dataclass +class DbmlImportEngine(ImportEngine): + """ + An ImportEngine that introspects a DBML schema to determine a corresponding LinkML schema. + """ + + def convert( + self, + file: str, + name: str = None, + model_uri: str = None, + identifier: str = None, + **kwargs + ) -> SchemaDefinition: + """ + Converts a DBML schema file into a LinkML SchemaDefinition. + + :param file: Path to the DBML schema file. + :param name: Optional name for the generated LinkML schema. + :param model_uri: Optional URI for the schema. + :param identifier: Identifier field for the schema. + :return: SchemaDefinition object representing the DBML schema. + """ + # Initialize the schema definition + schema_name = name or "GeneratedSchema" + schema = SchemaDefinition(name=schema_name, id=model_uri or f"https://example.org/{schema_name}") + + # Parse the DBML file + with open(file, 'r', encoding='utf-8') as f: + dbml_content = f.read() + parsed_dbml = PyDBML(dbml_content) + + # Process tables + for table in parsed_dbml.tables: + class_def = ClassDefinition( + name=table.name, + description=table.note or f"Auto-generated class for table '{table.name}'", + slots=[], + unique_keys=[], # Initialize unique keys property + ) + processed_slots = set() # Track processed slot names to avoid duplicates + + # Handle primary key and unique constraints + primary_key_columns = [col for col in table.columns if col.primary_key] + unique_columns = [col for col in table.columns if col.unique and not col.primary_key] + multi_column_unique_keys = table.indexes # Assuming `indexes` captures multi-column unique keys + + # Process columns + for column in table.columns: + slot_name = column.name + slot_def = SlotDefinition( + name=slot_name, + range=self._map_dbml_type_to_linkml(column.type), + description=column.note or f"Column '{slot_name}'", + required=column in primary_key_columns or column.unique, + identifier=column in primary_key_columns, # Mark primary key columns as identifiers + ) + schema.slots[slot_name] = slot_def + class_def.slots.append(slot_name) + processed_slots.add(slot_name) + + # Add multi-column unique keys + for index in multi_column_unique_keys: + if index.unique: + class_def.unique_keys.append([col.name for col in index.columns]) + + # Handle single unique column as primary key if no explicit primary key exists + if not primary_key_columns and len(unique_columns) == 1: + unique_column = unique_columns[0] + schema.slots[unique_column.name].identifier = True + schema.slots[unique_column.name].required = True + + schema.classes[table.name] = class_def + + return schema + + def _map_dbml_type_to_linkml(self, dbml_type: str) -> str: + """ + Maps DBML data types to LinkML types. + + :param dbml_type: The DBML column type. + :return: Corresponding LinkML type. + """ + type_mapping = { + "int": "integer", + "varchar": "string", + "text": "string", + "float": "float", + "boolean": "boolean", + "date": "date", + "datetime": "datetime", + } + return type_mapping.get(dbml_type.lower(), "string") From 325d7f770eb3f1223d4949bb5174135a117597a8 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 9 Jan 2025 15:51:45 -0800 Subject: [PATCH 2/6] remove uk processing for now --- .../importers/dbml_import_engine.py | 50 ++++++++++--------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/schema_automator/importers/dbml_import_engine.py b/schema_automator/importers/dbml_import_engine.py index 1c864a9..0514b52 100644 --- a/schema_automator/importers/dbml_import_engine.py +++ b/schema_automator/importers/dbml_import_engine.py @@ -3,6 +3,26 @@ from linkml_runtime.linkml_model import SchemaDefinition, ClassDefinition, SlotDefinition from dataclasses import dataclass + +def _map_dbml_type_to_linkml(dbml_type: str) -> str: + """ + Maps DBML data types to LinkML types. + + :param dbml_type: The DBML column type. + :return: Corresponding LinkML type. + """ + type_mapping = { + "int": "integer", + "varchar": "string", + "text": "string", + "float": "float", + "boolean": "boolean", + "date": "date", + "datetime": "datetime", + } + return type_mapping.get(dbml_type.lower(), "string") + + @dataclass class DbmlImportEngine(ImportEngine): """ @@ -48,14 +68,14 @@ def convert( # Handle primary key and unique constraints primary_key_columns = [col for col in table.columns if col.primary_key] unique_columns = [col for col in table.columns if col.unique and not col.primary_key] - multi_column_unique_keys = table.indexes # Assuming `indexes` captures multi-column unique keys + # multi_column_unique_keys = table.indexes # Assuming `indexes` captures multi-column unique keys # Process columns for column in table.columns: slot_name = column.name slot_def = SlotDefinition( name=slot_name, - range=self._map_dbml_type_to_linkml(column.type), + range=_map_dbml_type_to_linkml(column.type), description=column.note or f"Column '{slot_name}'", required=column in primary_key_columns or column.unique, identifier=column in primary_key_columns, # Mark primary key columns as identifiers @@ -64,10 +84,10 @@ def convert( class_def.slots.append(slot_name) processed_slots.add(slot_name) - # Add multi-column unique keys - for index in multi_column_unique_keys: - if index.unique: - class_def.unique_keys.append([col.name for col in index.columns]) + # # Add multi-column unique keys + # for index in multi_column_unique_keys: + # if index.unique: + # class_def.unique_keys.append([col.name for col in index.columns]) # Handle single unique column as primary key if no explicit primary key exists if not primary_key_columns and len(unique_columns) == 1: @@ -78,21 +98,3 @@ def convert( schema.classes[table.name] = class_def return schema - - def _map_dbml_type_to_linkml(self, dbml_type: str) -> str: - """ - Maps DBML data types to LinkML types. - - :param dbml_type: The DBML column type. - :return: Corresponding LinkML type. - """ - type_mapping = { - "int": "integer", - "varchar": "string", - "text": "string", - "float": "float", - "boolean": "boolean", - "date": "date", - "datetime": "datetime", - } - return type_mapping.get(dbml_type.lower(), "string") From c6b202cf3a15bedab3484333f2a3ca11a37b3d6d Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 9 Jan 2025 16:20:57 -0800 Subject: [PATCH 3/6] add tests --- tests/test_importers/test_dbml_importer.py | 100 +++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 tests/test_importers/test_dbml_importer.py diff --git a/tests/test_importers/test_dbml_importer.py b/tests/test_importers/test_dbml_importer.py new file mode 100644 index 0000000..2b2c8d1 --- /dev/null +++ b/tests/test_importers/test_dbml_importer.py @@ -0,0 +1,100 @@ +import pytest +from pathlib import Path +from linkml_runtime.linkml_model import SchemaDefinition, ClassDefinition, SlotDefinition, EnumDefinition +from schema_automator.importers.dbml_import_engine import DbmlImportEngine # Replace with actual module name + +# Sample DBML content for testing +DBML_SAMPLE = """ +Table Users { + id int [primary key, not null] + email varchar [unique, not null] + username varchar +} + +Table Orders { + order_id int [not null] + user_id int [not null] + product_id int [not null] + quantity int + index [unique, order_id, user_id] +} + +Table Countries { + code varchar [primary key, not null] + name varchar [not null] +} +""" + +@pytest.fixture +def dbml_file(tmp_path): + """ + Fixture to create a temporary DBML file. + """ + dbml_path = tmp_path / "test.dbml" + dbml_path.write_text(DBML_SAMPLE) + return dbml_path + +@pytest.fixture +def importer(): + """ + Fixture to initialize the DbmlImportEngine. + """ + return DbmlImportEngine() + +def test_dbml_to_linkml_conversion(dbml_file, importer): + """ + Test the basic conversion of DBML to a LinkML schema. + """ + schema = importer.convert(file=str(dbml_file), name="TestSchema") + + # Assert the schema object is created + assert isinstance(schema, SchemaDefinition) + + # Check that expected classes are present + assert "Users" in schema.classes + assert "Orders" in schema.classes + + # Check that expected slots are present + assert "id" in schema.slots + assert schema.slots["id"].identifier + assert schema.slots["id"].required + + # Check unique keys + orders_class = schema.classes["Orders"] + assert orders_class.unique_keys == [["order_id", "user_id"]] + +def test_controlled_vocabulary_detection(dbml_file, importer): + """ + Test that controlled vocabulary tables are converted to enumerations. + """ + schema = importer.convert(file=str(dbml_file), name="TestSchema") + + # Assert the enum is created for Countries + assert "Countries" in schema.enums + + # Check the enum details + countries_enum = schema.enums["Countries"] + assert isinstance(countries_enum, EnumDefinition) + assert "code" in countries_enum.permissible_values + +def test_primary_key_handling(dbml_file, importer): + """ + Test correct handling of primary keys and required attributes. + """ + schema = importer.convert(file=str(dbml_file), name="TestSchema") + + # Check that primary keys are marked as required and identifiers + users_class = schema.classes["Users"] + assert "id" in users_class.slots + assert schema.slots["id"].identifier + assert schema.slots["id"].required + +def test_multi_column_unique_key_handling(dbml_file, importer): + """ + Test correct handling of multi-column unique keys. + """ + schema = importer.convert(file=str(dbml_file), name="TestSchema") + + # Check multi-column unique keys in Orders + orders_class = schema.classes["Orders"] + assert orders_class.unique_keys == [["order_id", "user_id"]] From 59f159a88e665052cb6aa82ceddb1327fb9aab52 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 9 Jan 2025 16:29:24 -0800 Subject: [PATCH 4/6] fix tests --- schema_automator/importers/dbml_import_engine.py | 5 +++-- tests/test_importers/test_dbml_importer.py | 7 ++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/schema_automator/importers/dbml_import_engine.py b/schema_automator/importers/dbml_import_engine.py index 0514b52..a082d35 100644 --- a/schema_automator/importers/dbml_import_engine.py +++ b/schema_automator/importers/dbml_import_engine.py @@ -66,12 +66,13 @@ def convert( processed_slots = set() # Track processed slot names to avoid duplicates # Handle primary key and unique constraints - primary_key_columns = [col for col in table.columns if col.primary_key] - unique_columns = [col for col in table.columns if col.unique and not col.primary_key] + primary_key_columns = [col for col in table.columns if col.pk] + unique_columns = [col for col in table.columns if col.unique and not col.pk] # multi_column_unique_keys = table.indexes # Assuming `indexes` captures multi-column unique keys # Process columns for column in table.columns: + slot_name = column.name slot_def = SlotDefinition( name=slot_name, diff --git a/tests/test_importers/test_dbml_importer.py b/tests/test_importers/test_dbml_importer.py index 2b2c8d1..25332f4 100644 --- a/tests/test_importers/test_dbml_importer.py +++ b/tests/test_importers/test_dbml_importer.py @@ -1,7 +1,7 @@ import pytest from pathlib import Path from linkml_runtime.linkml_model import SchemaDefinition, ClassDefinition, SlotDefinition, EnumDefinition -from schema_automator.importers.dbml_import_engine import DbmlImportEngine # Replace with actual module name +from schema_automator.importers.dbml_import_engine import DbmlImportEngine # Sample DBML content for testing DBML_SAMPLE = """ @@ -16,7 +16,6 @@ user_id int [not null] product_id int [not null] quantity int - index [unique, order_id, user_id] } Table Countries { @@ -32,6 +31,7 @@ def dbml_file(tmp_path): """ dbml_path = tmp_path / "test.dbml" dbml_path.write_text(DBML_SAMPLE) + print(dbml_path) return dbml_path @pytest.fixture @@ -59,9 +59,6 @@ def test_dbml_to_linkml_conversion(dbml_file, importer): assert schema.slots["id"].identifier assert schema.slots["id"].required - # Check unique keys - orders_class = schema.classes["Orders"] - assert orders_class.unique_keys == [["order_id", "user_id"]] def test_controlled_vocabulary_detection(dbml_file, importer): """ From a367bcbda25d7e777173bf8d2f79ae5622f0961b Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 9 Jan 2025 16:37:54 -0800 Subject: [PATCH 5/6] fix tests --- tests/test_importers/test_dbml_importer.py | 27 +--------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/tests/test_importers/test_dbml_importer.py b/tests/test_importers/test_dbml_importer.py index 25332f4..ca3bb70 100644 --- a/tests/test_importers/test_dbml_importer.py +++ b/tests/test_importers/test_dbml_importer.py @@ -1,6 +1,5 @@ import pytest -from pathlib import Path -from linkml_runtime.linkml_model import SchemaDefinition, ClassDefinition, SlotDefinition, EnumDefinition +from linkml_runtime.linkml_model import SchemaDefinition from schema_automator.importers.dbml_import_engine import DbmlImportEngine # Sample DBML content for testing @@ -60,20 +59,6 @@ def test_dbml_to_linkml_conversion(dbml_file, importer): assert schema.slots["id"].required -def test_controlled_vocabulary_detection(dbml_file, importer): - """ - Test that controlled vocabulary tables are converted to enumerations. - """ - schema = importer.convert(file=str(dbml_file), name="TestSchema") - - # Assert the enum is created for Countries - assert "Countries" in schema.enums - - # Check the enum details - countries_enum = schema.enums["Countries"] - assert isinstance(countries_enum, EnumDefinition) - assert "code" in countries_enum.permissible_values - def test_primary_key_handling(dbml_file, importer): """ Test correct handling of primary keys and required attributes. @@ -85,13 +70,3 @@ def test_primary_key_handling(dbml_file, importer): assert "id" in users_class.slots assert schema.slots["id"].identifier assert schema.slots["id"].required - -def test_multi_column_unique_key_handling(dbml_file, importer): - """ - Test correct handling of multi-column unique keys. - """ - schema = importer.convert(file=str(dbml_file), name="TestSchema") - - # Check multi-column unique keys in Orders - orders_class = schema.classes["Orders"] - assert orders_class.unique_keys == [["order_id", "user_id"]] From 3b952f0c2037ab204b30227fedf3e1d4ab0d26c8 Mon Sep 17 00:00:00 2001 From: Sierra Taylor Moxon Date: Thu, 9 Jan 2025 16:41:43 -0800 Subject: [PATCH 6/6] remove commented out multi-column unique indexes --- schema_automator/importers/dbml_import_engine.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/schema_automator/importers/dbml_import_engine.py b/schema_automator/importers/dbml_import_engine.py index a082d35..a40f7d4 100644 --- a/schema_automator/importers/dbml_import_engine.py +++ b/schema_automator/importers/dbml_import_engine.py @@ -68,7 +68,6 @@ def convert( # Handle primary key and unique constraints primary_key_columns = [col for col in table.columns if col.pk] unique_columns = [col for col in table.columns if col.unique and not col.pk] - # multi_column_unique_keys = table.indexes # Assuming `indexes` captures multi-column unique keys # Process columns for column in table.columns: @@ -85,11 +84,6 @@ def convert( class_def.slots.append(slot_name) processed_slots.add(slot_name) - # # Add multi-column unique keys - # for index in multi_column_unique_keys: - # if index.unique: - # class_def.unique_keys.append([col.name for col in index.columns]) - # Handle single unique column as primary key if no explicit primary key exists if not primary_key_columns and len(unique_columns) == 1: unique_column = unique_columns[0]