Skip to content

Commit d884a20

Browse files
feat: add llm-transpile command with Switch integration
Implement llm-transpile command for LLM-based code transpilation: - Add SwitchInstaller for Switch transpiler package management - Install Switch package and deploy to workspace - Create and manage Databricks jobs with job-level parameters - Configure Switch resources (catalog, schema, volume) - Add SwitchRunner for executing Switch transpilation jobs - Upload source files to workspace volume - Execute transpilation via Databricks job - Download results and handle job lifecycle - Add llm-transpile CLI command with Switch transpiler support - Add comprehensive unit and integration tests
1 parent d0c63c3 commit d884a20

File tree

11 files changed

+1066
-91
lines changed

11 files changed

+1066
-91
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ remorph_transpile/
2222
/linter/src/main/antlr4/library/gen/
2323
.databricks-login.json
2424
.mypy_cache
25+
.env

labs.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,18 @@ commands:
4949
total_files_processed\ttotal_queries_processed\tanalysis_error_count\tparsing_error_count\tvalidation_error_count\tgeneration_error_count\terror_log_file
5050
{{range .}}{{.total_files_processed}}\t{{.total_queries_processed}}\t{{.analysis_error_count}}\t{{.parsing_error_count}}\t{{.validation_error_count}}\t{{.generation_error_count}}\t{{.error_log_file}}
5151
{{end}}
52+
- name: llm-transpile
53+
description: Transpile source code to Databricks using LLM Transpiler (Switch)
54+
flags:
55+
- name: input-source
56+
description: Input Script Folder or File (local path)
57+
default: null
58+
- name: output-ws-folder
59+
description: Output folder path (Databricks Workspace path starting with /Workspace/)
60+
default: null
61+
- name: source-dialect
62+
description: Source dialect name (e.g., 'snowflake', 'teradata')
63+
default: null
5264
- name: reconcile
5365
description: Reconcile is a utility to streamline the reconciliation process between source data and target data residing on Databricks.
5466
- name: aggregates-reconcile

src/databricks/labs/lakebridge/cli.py

Lines changed: 231 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from databricks.labs.blueprint.cli import App
1818
from databricks.labs.blueprint.entrypoint import get_logger, is_in_debug
1919
from databricks.labs.blueprint.installation import RootJsonValue
20+
from databricks.labs.blueprint.installer import InstallState
2021
from databricks.labs.blueprint.tui import Prompts
2122

2223

@@ -35,9 +36,10 @@
3536
from databricks.labs.lakebridge.reconcile.recon_config import RECONCILE_OPERATION_NAME, AGG_RECONCILE_OPERATION_NAME
3637
from databricks.labs.lakebridge.transpiler.describe import TranspilersDescription
3738
from databricks.labs.lakebridge.transpiler.execute import transpile as do_transpile
38-
from databricks.labs.lakebridge.transpiler.lsp.lsp_engine import LSPEngine
39+
from databricks.labs.lakebridge.transpiler.lsp.lsp_engine import LSPConfig, LSPEngine
3940
from databricks.labs.lakebridge.transpiler.repository import TranspilerRepository
4041
from databricks.labs.lakebridge.transpiler.sqlglot.sqlglot_engine import SqlglotEngine
42+
from databricks.labs.lakebridge.transpiler.switch_runner import SwitchConfig, SwitchRunner
4143
from databricks.labs.lakebridge.transpiler.transpile_engine import TranspileEngine
4244

4345
from databricks.labs.lakebridge.transpiler.transpile_status import ErrorSeverity
@@ -534,6 +536,234 @@ def _override_workspace_client_config(ctx: ApplicationContext, overrides: dict[s
534536
ctx.connect_config.cluster_id = cluster_id
535537

536538

539+
@lakebridge.command
def llm_transpile(
    *,
    w: WorkspaceClient,
    input_source: str | None = None,
    output_ws_folder: str | None = None,
    source_dialect: str | None = None,
    transpiler_repository: TranspilerRepository = TranspilerRepository.user_home(),
) -> None:
    """Transpile source code to Databricks using LLM Transpiler (Switch)"""
    # Build the application context and tag outgoing requests with this command.
    ctx = ApplicationContext(w)
    ctx.add_user_agent_extra("cmd", "llm-transpile")
    logger.debug(f"User: {ctx.current_user}")

    # Register whatever was passed on the command line; check() fills in the
    # remaining values (prompting where needed) and returns the job parameters.
    config_checker = _LLMTranspileConfigChecker(
        ctx.transpile_config,
        ctx.prompts,
        ctx.install_state,
        transpiler_repository,
    )
    config_checker.use_input_source(input_source)
    config_checker.use_output_ws_folder(output_ws_folder)
    config_checker.use_source_dialect(source_dialect)
    job_params = config_checker.check()

    # Run the Switch job and emit its result as JSON on stdout.
    outcome = _llm_transpile(ctx, job_params)
    print(json.dumps(outcome))
562+
563+
564+
class _LLMTranspileConfigChecker:
    """Collect and validate the parameters of the 'llm-transpile' command.

    Values supplied on the command line are registered through the ``use_*``
    methods. Anything still missing when :meth:`check` runs is requested
    interactively through the supplied ``Prompts`` instance.
    """

    _transpile_config: TranspileConfig | None
    _prompts: Prompts
    _install_state: InstallState
    _transpiler_repository: TranspilerRepository
    _input_source: str | None = None
    _output_ws_folder: str | None = None
    _source_dialect: str | None = None

    def __init__(
        self,
        transpile_config: TranspileConfig | None,
        prompts: Prompts,
        install_state: InstallState,
        transpiler_repository: TranspilerRepository,
    ):
        self._transpile_config = transpile_config
        self._prompts = prompts
        self._install_state = install_state
        self._transpiler_repository = transpiler_repository

    # ------------------------------------------------------------------
    # Input source
    # ------------------------------------------------------------------

    @staticmethod
    def _validate_input_source_path(input_source: str, msg: str) -> None:
        """Validate the input source: it must be a non-blank path that exists.

        A blank value is rejected explicitly because ``Path("")`` normalises to
        the current directory, which always exists; without this guard an empty
        answer would silently select the working directory.
        """
        if not input_source.strip() or not Path(input_source).exists():
            raise_validation_exception(msg)

    def use_input_source(self, input_source: str | None) -> None:
        """Register a command-line input source, validating it first. ``None`` is ignored."""
        if input_source is None:
            return
        logger.debug(f"Setting input_source to: {input_source!r}")
        self._validate_input_source_path(input_source, f"Invalid path for '--input-source': {input_source}")
        self._input_source = input_source

    def _prompt_input_source(self) -> None:
        """Ask for the input source, offering the configured one (if any) as the default."""
        default_input = None
        if self._transpile_config and self._transpile_config.input_source:
            default_input = self._transpile_config.input_source

        if default_input:
            prompt_text = f"Enter input source path (press <enter> for default: {default_input})"
            answer = self._prompts.question(prompt_text).strip()
            chosen = answer if answer else default_input
        else:
            chosen = self._prompts.question("Enter input source path (directory or file)").strip()

        self._input_source = chosen
        logger.debug(f"Setting input_source to: {chosen!r}")
        self._validate_input_source_path(chosen, f"Invalid input source: {chosen}")

    def _check_input_source(self) -> None:
        """Prompt for the input source when none has been registered yet."""
        if self._input_source is None:
            self._prompt_input_source()

    # ------------------------------------------------------------------
    # Output workspace folder
    # ------------------------------------------------------------------

    def use_output_ws_folder(self, output_ws_folder: str | None) -> None:
        """Register a command-line output folder, validating it first. ``None`` is ignored."""
        if output_ws_folder is None:
            return
        logger.debug(f"Setting output_ws_folder to: {output_ws_folder!r}")
        self._validate_output_ws_folder_path(
            output_ws_folder, f"Invalid path for '--output-ws-folder': {output_ws_folder}"
        )
        self._output_ws_folder = output_ws_folder

    @staticmethod
    def _validate_output_ws_folder_path(output_ws_folder: str, msg: str) -> None:
        """Validate output folder is a Workspace path (it must start with ``/Workspace/``)."""
        if not output_ws_folder.startswith("/Workspace/"):
            raise_validation_exception(f"{msg}. Must start with /Workspace/")

    def _prompt_output_ws_folder(self) -> None:
        """Ask for the output folder and validate the answer."""
        prompted_output_ws_folder = self._prompts.question(
            "Enter output folder path (Databricks Workspace path starting with /Workspace/)"
        ).strip()
        logger.debug(f"Setting output_ws_folder to: {prompted_output_ws_folder!r}")
        self._validate_output_ws_folder_path(
            prompted_output_ws_folder, f"Invalid output folder: {prompted_output_ws_folder}"
        )
        self._output_ws_folder = prompted_output_ws_folder

    def _check_output_ws_folder(self) -> None:
        """Prompt for the output folder when none has been registered yet."""
        if self._output_ws_folder is None:
            self._prompt_output_ws_folder()

    # ------------------------------------------------------------------
    # Source dialect
    # ------------------------------------------------------------------

    def use_source_dialect(self, source_dialect: str | None) -> None:
        """Register a command-line source dialect; it is validated later, in :meth:`check`."""
        if source_dialect is not None:
            logger.debug(f"Setting source_dialect to: {source_dialect!r}")
            self._source_dialect = source_dialect

    def _require_switch_dialects(self) -> set[str]:
        """Return the Switch dialects, reporting a validation error with an install hint if there are none."""
        available_dialects = self._get_switch_dialects()
        if not available_dialects:
            raise_validation_exception(
                "No Switch dialects available. "
                "Install with: databricks labs lakebridge install-transpile --include-llm-transpiler"
            )
        return available_dialects

    def _prompt_source_dialect(self) -> None:
        """Prompt for source dialect from Switch dialects."""
        available_dialects = self._require_switch_dialects()
        logger.debug(f"Available dialects: {available_dialects!r}")
        self._source_dialect = self._prompts.choice("Select the source dialect:", sorted(available_dialects))

    def _check_source_dialect(self) -> None:
        """Validate and prompt for source dialect if not provided."""
        if self._source_dialect is None:
            # The prompt loads the dialect list itself; do not load it twice.
            self._prompt_source_dialect()
            return

        available_dialects = self._require_switch_dialects()
        if self._source_dialect not in available_dialects:
            supported = ", ".join(sorted(available_dialects))
            raise_validation_exception(f"Invalid source-dialect: '{self._source_dialect}'. Available: {supported}")

    # ------------------------------------------------------------------
    # Switch config.yml access
    # ------------------------------------------------------------------

    def _load_switch_config(self, action: str) -> LSPConfig | None:
        """Load the Switch transpiler config, or return ``None`` if it is missing or unreadable.

        ``action`` completes the warning "Failed to <action>: <error>" that is
        logged when loading raises ``OSError`` or ``ValueError``.
        """
        config_path = self._transpiler_repository.transpiler_config_path("Switch")
        if not config_path.exists():
            return None
        try:
            return LSPConfig.load(config_path)
        except (OSError, ValueError) as e:
            logger.warning(f"Failed to {action}: {e}")
            return None

    def _get_switch_dialects(self) -> set[str]:
        """Get Switch dialects from config.yml using LSPConfig (empty set when unavailable)."""
        lsp_config = self._load_switch_config("load Switch dialects")
        if lsp_config is None:
            return set()
        return set(lsp_config.remorph.dialects)

    def _get_switch_options_with_defaults(self) -> dict[str, str]:
        """Get default values for Switch options from config.yml.

        Options whose default is falsy or the placeholder ``"<none>"`` are left out.
        """
        lsp_config = self._load_switch_config("load Switch options")
        if lsp_config is None:
            return {}

        result = {
            option.flag: option.default
            for option in lsp_config.options_for_dialect("all")
            if option.default and option.default != "<none>"
        }
        logger.debug(f"Loaded {len(result)} Switch options with defaults from config.yml")
        return result

    def _validate_switch_options(self, options: dict[str, str]) -> None:
        """Validate options against config.yml choices.

        Only options that are present in ``options`` and declare a non-empty
        list of choices are checked; validation is skipped entirely when the
        Switch config cannot be loaded.
        """
        lsp_config = self._load_switch_config("validate Switch options")
        if lsp_config is None:
            return

        for option in lsp_config.options_for_dialect("all"):
            if option.flag not in options or not option.choices:
                continue
            value = options[option.flag]
            if value not in option.choices:
                raise_validation_exception(
                    f"Invalid value for '{option.flag}': {value!r}. Must be one of: {', '.join(option.choices)}"
                )

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    def check(self) -> dict:
        """Validate all parameters and return configuration dict.

        The returned dict has the keys ``input_source``, ``output_ws_folder``,
        ``source_dialect``, ``switch_options`` and ``wait_for_completion``.
        """
        logger.debug("Checking llm-transpile configuration")

        self._check_input_source()
        self._check_output_ws_folder()
        self._check_source_dialect()

        switch_options = self._get_switch_options_with_defaults()
        self._validate_switch_options(switch_options)

        # 'wait_for_completion' is lifted out of the option map and returned
        # as a boolean of its own.
        wait_for_completion = str(switch_options.pop("wait_for_completion", "false")).lower() == "true"

        return {
            "input_source": self._input_source,
            "output_ws_folder": self._output_ws_folder,
            "source_dialect": self._source_dialect,
            "switch_options": switch_options,
            "wait_for_completion": wait_for_completion,
        }
752+
753+
754+
def _llm_transpile(ctx: ApplicationContext, params: dict) -> RootJsonValue:
    """Run the Switch job for an LLM transpilation and return the runner's result.

    The catalog, schema, volume and job id are read from the installation
    state; every entry of ``params`` is forwarded to the runner as a keyword
    argument.
    """
    switch_config = SwitchConfig(ctx.install_state)
    switch_resources = switch_config.get_resources()
    switch_job_id = switch_config.get_job_id()

    switch_runner = SwitchRunner(ctx.workspace_client, ctx.installation)

    run_kwargs = {
        "catalog": switch_resources["catalog"],
        "schema": switch_resources["schema"],
        "volume": switch_resources["volume"],
        "job_id": switch_job_id,
    }
    return switch_runner.run(**run_kwargs, **params)
765+
766+
537767
@lakebridge.command
538768
def reconcile(*, w: WorkspaceClient) -> None:
539769
"""[EXPERIMENTAL] Reconciles source to Databricks datasets"""

0 commit comments

Comments
 (0)