Skip to content

SEA: Normalise Columns returned by Metadata Queries #627

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 5 commits into
base: ext-links-sea
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions src/databricks/sql/backend/column_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""
Column name mappings between different backend protocols.

This module provides mappings between column names returned by different backends
to ensure a consistent interface for metadata operations.
"""

from enum import Enum


class MetadataOp(Enum):
    """Enum identifying a metadata operation type.

    The value of each member is used only for readability; members are
    compared by identity when selecting a column-name mapping in
    normalise_metadata_result.
    """

    CATALOGS = "catalogs"
    SCHEMAS = "schemas"
    TABLES = "tables"
    COLUMNS = "columns"


# Mappings from column names to standard column names
# Each dict maps a backend-specific column name (as returned in the result
# description) to the JDBC-standard column name expected by callers.
# Columns not present in a mapping are passed through unchanged.
# ref: CATALOG_COLUMNS in JDBC: https://github.com/databricks/databricks-jdbc/blob/e3d0d8dad683146a3afc3d501ddf0864ba086309/src/main/java/com/databricks/jdbc/common/MetadataResultConstants.java#L219
CATALOG_COLUMNS = {
    "catalog": "TABLE_CAT",
}

# ref: SCHEMA_COLUMNS in JDBC: https://github.com/databricks/databricks-jdbc/blob/e3d0d8dad683146a3afc3d501ddf0864ba086309/src/main/java/com/databricks/jdbc/common/MetadataResultConstants.java#L221
SCHEMA_COLUMNS = {
    "databaseName": "TABLE_SCHEM",
    "catalogName": "TABLE_CATALOG",
}

# ref: TABLE_COLUMNS in JDBC: https://github.com/databricks/databricks-jdbc/blob/e3d0d8dad683146a3afc3d501ddf0864ba086309/src/main/java/com/databricks/jdbc/common/MetadataResultConstants.java#L224
TABLE_COLUMNS = {
    "catalogName": "TABLE_CAT",
    "namespace": "TABLE_SCHEM",
    "tableName": "TABLE_NAME",
    "tableType": "TABLE_TYPE",
    "remarks": "REMARKS",
    "TYPE_CATALOG_COLUMN": "TYPE_CAT",
    "TYPE_SCHEMA_COLUMN": "TYPE_SCHEM",
    "TYPE_NAME": "TYPE_NAME",
    "SELF_REFERENCING_COLUMN_NAME": "SELF_REFERENCING_COL_NAME",
    "REF_GENERATION_COLUMN": "REF_GENERATION",
}

# ref: COLUMN_COLUMNS in JDBC: https://github.com/databricks/databricks-jdbc/blob/e3d0d8dad683146a3afc3d501ddf0864ba086309/src/main/java/com/databricks/jdbc/common/MetadataResultConstants.java#L192
# TYPE_NAME is not included because it is a duplicate target for columnType, and COLUMN_DEF is known to be returned by Thrift.
# TODO: check if TYPE_NAME is to be returned / also used by Thrift.
COLUMN_COLUMNS = {
    "catalogName": "TABLE_CAT",
    "namespace": "TABLE_SCHEM",
    "tableName": "TABLE_NAME",
    "col_name": "COLUMN_NAME",
    "dataType": "DATA_TYPE",
    "columnSize": "COLUMN_SIZE",
    "bufferLength": "BUFFER_LENGTH",
    "decimalDigits": "DECIMAL_DIGITS",
    "radix": "NUM_PREC_RADIX",
    "Nullable": "NULLABLE",
    "remarks": "REMARKS",
    "columnType": "COLUMN_DEF",
    "SQLDataType": "SQL_DATA_TYPE",
    "SQLDatetimeSub": "SQL_DATETIME_SUB",
    "CharOctetLength": "CHAR_OCTET_LENGTH",
    "ordinalPosition": "ORDINAL_POSITION",
    "isNullable": "IS_NULLABLE",
    "ScopeCatalog": "SCOPE_CATALOG",
    "ScopeSchema": "SCOPE_SCHEMA",
    "ScopeTable": "SCOPE_TABLE",
    "SourceDataType": "SOURCE_DATA_TYPE",
    "isAutoIncrement": "IS_AUTOINCREMENT",
    "isGenerated": "IS_GENERATEDCOLUMN",
}


def normalise_metadata_result(result_set, operation: MetadataOp):
    """
    Normalise column names in a result set based on the operation type.

    Renames backend-specific column names in ``result_set.description`` to
    their JDBC-standard equivalents; columns without a registered mapping
    are passed through unchanged. This function modifies the result set
    in place.

    Args:
        result_set: The result set object to normalise. Must expose a
            DB-API style ``description`` sequence whose entries are column
            tuples with the column name at index 0.
        operation: The metadata operation (from MetadataOp enum). An
            operation with no registered mapping leaves the result set
            untouched.
    """

    # Dispatch table keeps the operation -> mapping association in one
    # place instead of an if/elif chain.
    mapping = {
        MetadataOp.CATALOGS: CATALOG_COLUMNS,
        MetadataOp.SCHEMAS: SCHEMA_COLUMNS,
        MetadataOp.TABLES: TABLE_COLUMNS,
        MetadataOp.COLUMNS: COLUMN_COLUMNS,
    }.get(operation)

    if mapping is None:
        # Unknown operation: nothing to normalise.
        return

    # Rebuild the description, renaming only the first element (the column
    # name) of each mapped column tuple; every other field is preserved,
    # and unmapped entries are kept as-is.
    result_set.description = [
        (mapping[col_desc[0]],) + col_desc[1:] if col_desc[0] in mapping else col_desc
        for col_desc in result_set.description
    ]
14 changes: 14 additions & 0 deletions src/databricks/sql/backend/sea/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
from databricks.sql.backend.sea.utils.http_client import SeaHttpClient
from databricks.sql.types import SSLOptions

# Import the column mapping module
from databricks.sql.backend.column_mapping import normalise_metadata_result, MetadataOp

from databricks.sql.backend.sea.models import (
ExecuteStatementRequest,
GetStatementRequest,
Expand Down Expand Up @@ -681,6 +684,9 @@ def get_catalogs(
enforce_embedded_schema_correctness=False,
)
assert result is not None, "execute_command returned None in synchronous mode"

normalise_metadata_result(result, MetadataOp.CATALOGS)

return result

def get_schemas(
Expand Down Expand Up @@ -714,6 +720,9 @@ def get_schemas(
enforce_embedded_schema_correctness=False,
)
assert result is not None, "execute_command returned None in synchronous mode"

normalise_metadata_result(result, MetadataOp.SCHEMAS)

return result

def get_tables(
Expand Down Expand Up @@ -761,6 +770,8 @@ def get_tables(

result = ResultSetFilter.filter_tables_by_type(result, table_types)

normalise_metadata_result(result, MetadataOp.TABLES)

return result

def get_columns(
Expand Down Expand Up @@ -802,4 +813,7 @@ def get_columns(
enforce_embedded_schema_correctness=False,
)
assert result is not None, "execute_command returned None in synchronous mode"

normalise_metadata_result(result, MetadataOp.COLUMNS)

return result
194 changes: 194 additions & 0 deletions tests/unit/test_column_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
"""
Tests for the column mapping module.
"""

import pytest
from unittest.mock import MagicMock
from enum import Enum

from databricks.sql.backend.column_mapping import (
normalise_metadata_result,
MetadataOp,
)


class TestColumnMapping:
    """Tests for the column mapping module."""

    @staticmethod
    def _make_description(names):
        """Build a DB-API style description with one column tuple per name."""
        return [(name, "string", None, None, None, None, True) for name in names]

    @classmethod
    def _result_with(cls, names):
        """Return a mock result set whose description covers *names*."""
        result = MagicMock()
        result.description = cls._make_description(names)
        return result

    def _check(self, operation, input_names, expected_names):
        """Normalise a result built from input_names and verify column names."""
        result = self._result_with(input_names)
        normalise_metadata_result(result, operation)
        assert [col[0] for col in result.description] == expected_names

    def test_normalize_metadata_result_catalogs(self):
        """Catalog column names are normalised; unknown columns pass through."""
        self._check(
            MetadataOp.CATALOGS,
            ["catalog", "other_column"],
            ["TABLE_CAT", "other_column"],
        )

    def test_normalize_metadata_result_schemas(self):
        """Schema column names are normalised; unknown columns pass through."""
        self._check(
            MetadataOp.SCHEMAS,
            ["databaseName", "catalogName", "other_column"],
            ["TABLE_SCHEM", "TABLE_CATALOG", "other_column"],
        )

    def test_normalize_metadata_result_tables(self):
        """Table column names are normalised; unknown columns pass through."""
        self._check(
            MetadataOp.TABLES,
            [
                "catalogName",
                "namespace",
                "tableName",
                "tableType",
                "remarks",
                "TYPE_CATALOG_COLUMN",
                "TYPE_SCHEMA_COLUMN",
                "TYPE_NAME",
                "SELF_REFERENCING_COLUMN_NAME",
                "REF_GENERATION_COLUMN",
                "other_column",
            ],
            [
                "TABLE_CAT",
                "TABLE_SCHEM",
                "TABLE_NAME",
                "TABLE_TYPE",
                "REMARKS",
                "TYPE_CAT",
                "TYPE_SCHEM",
                "TYPE_NAME",
                "SELF_REFERENCING_COL_NAME",
                "REF_GENERATION",
                "other_column",
            ],
        )

    def test_normalize_metadata_result_columns(self):
        """Column metadata names are normalised; unknown columns pass through."""
        self._check(
            MetadataOp.COLUMNS,
            [
                "catalogName",
                "namespace",
                "tableName",
                "col_name",
                "dataType",
                "columnSize",
                "bufferLength",
                "decimalDigits",
                "radix",
                "Nullable",
                "remarks",
                "columnType",
                "SQLDataType",
                "SQLDatetimeSub",
                "CharOctetLength",
                "ordinalPosition",
                "isNullable",
                "ScopeCatalog",
                "ScopeSchema",
                "ScopeTable",
                "SourceDataType",
                "isAutoIncrement",
                "isGenerated",
                "other_column",
            ],
            [
                "TABLE_CAT",
                "TABLE_SCHEM",
                "TABLE_NAME",
                "COLUMN_NAME",
                "DATA_TYPE",
                "COLUMN_SIZE",
                "BUFFER_LENGTH",
                "DECIMAL_DIGITS",
                "NUM_PREC_RADIX",
                "NULLABLE",
                "REMARKS",
                "COLUMN_DEF",
                "SQL_DATA_TYPE",
                "SQL_DATETIME_SUB",
                "CHAR_OCTET_LENGTH",
                "ORDINAL_POSITION",
                "IS_NULLABLE",
                "SCOPE_CATALOG",
                "SCOPE_SCHEMA",
                "SCOPE_TABLE",
                "SOURCE_DATA_TYPE",
                "IS_AUTOINCREMENT",
                "IS_GENERATEDCOLUMN",
                "other_column",
            ],
        )

    def test_normalize_metadata_result_unknown_operation(self):
        """An operation without a registered mapping leaves the description untouched."""
        result = self._result_with(["column1", "column2"])
        before = result.description.copy()

        # An enum member from an unrelated enum must not match any mapping.
        class LocalOp(Enum):
            UNKNOWN = "unknown"

        normalise_metadata_result(result, LocalOp.UNKNOWN)

        assert result.description == before

    def test_normalize_metadata_result_preserves_other_fields(self):
        """Renaming touches only the column name; all other tuple fields survive."""
        original = (
            "catalog",
            "string",
            "display_size",
            "internal_size",
            "precision",
            "scale",
            True,
        )
        result = MagicMock()
        result.description = [original]

        normalise_metadata_result(result, MetadataOp.CATALOGS)

        # Only index 0 changes; indices 1..6 are carried over verbatim.
        assert result.description[0] == ("TABLE_CAT",) + original[1:]
Loading
Loading