Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions docs/lakebridge/docs/assessment/analyzer/index.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ Usage:
databricks labs lakebridge analyze [flags]

Flags:
--generate-json true|false (Optional) Generate JSON file alongside the Excel report
--generate-json true|false (Optional) Generate JSON alongside Excel (default: true); pass false to skip
-h, --help help for analyze
--report-file path (Optional) Local filesystem path of the analysis report file to write
--source-directory path (Optional) Local filesystem path of a directory containing sources to analyze
Expand All @@ -62,7 +62,7 @@ Below is an explanation of the settings needed for the analyzer:
- _Source directory_: The path of the folder containing the artifacts to analyze.
- _Report file_: The path of the Excel file into which the analyzer results will be written. (The parent directory of this file must already exist.)
- _Source technology_: The underlying technology platform of the files in the source directory that need to be analyzed.
- _Generate JSON_ (optional): When set to `true`, generates a JSON file alongside the Excel report. This is useful for programmatic processing and integration with other tools.
- _Generate JSON_ (optional): Defaults to `true` so a JSON file is written next to the Excel report (for programmatic use, migration estimators, and UI). Set to `false` if you only need the spreadsheet.

## Execution
Execute the command below to start the analyzer:
Expand All @@ -72,14 +72,15 @@ Execute the below command to start the analyzer:

_Note: Any settings that are not provided as arguments will trigger a prompt for the missing setting(s)._

To generate both Excel and JSON output:
By default, both Excel and JSON are produced. To skip JSON:

```bash
databricks labs lakebridge analyze [--source-directory <path>] [--report-file <path>] [--source-tech <string>] --generate-json true
databricks labs lakebridge analyze [--source-directory <path>] [--report-file <path>] [--source-tech <string>] --generate-json false
```

### Output Files
- **Excel Report** (`.xlsx`) - Always generated. Contains formatted analysis results with multiple worksheets for different aspects of the analysis.
- **JSON Report** (`.json`) - Generated when `--generate-json true` is specified. Contains the same analysis data in JSON format for programmatic access and integration with other tools.
- **JSON Report** (`.json`) - Generated by default alongside the Excel report. Contains the same analysis data in JSON format for programmatic access and integration with other tools. Omit JSON with `--generate-json false`.

The Excel report is written to the path given by `--report-file`; the JSON report is written next to it, using the same file name with a `.json` extension.

Expand Down
4 changes: 2 additions & 2 deletions labs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ commands:
- name: source-tech
description: (Optional) Name of the Source System Technology you want to analyze
- name: generate-json
description: (Optional) Generate JSON file alongside the Excel report (`true|false`)
default: "false"
description: (Optional) Emit JSON alongside the Excel report (`true|false`; default `true`). Pass `false` for Excel only.
default: "true"

- name: transpile
description: Transpile SQL/ETL sources to Databricks-compatible code
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def create(cls, is_debug: bool = False) -> "AnalyzerRunner":
return cls(Analyzer.analyze, is_debug)

def run(
self, source_dir: Path, results_file_path: Path, platform: str, generate_json: bool = False
self, source_dir: Path, results_file_path: Path, platform: str, generate_json: bool = True
) -> AnalyzerResult:
logger.debug(f"Starting analyzer execution for {platform}: {source_dir}")

Expand Down Expand Up @@ -125,7 +125,7 @@ def run_analyzer(
source: str | None = None,
report_file: str | None = None,
platform: str | None = None,
generate_json: bool = False,
generate_json: bool = True,
) -> AnalyzerResult:
source_dir = self._prompts.get_source_directory() if source is None else Path(source)
results_file_path = self._prompts.get_result_file_path() if report_file is None else Path(report_file)
Expand Down
7 changes: 5 additions & 2 deletions src/databricks/labs/lakebridge/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -847,9 +847,12 @@ def analyze(
source_directory: str | None = None,
report_file: str | None = None,
source_tech: str | None = None,
generate_json: bool = False,
generate_json: bool = True,
):
"""Run the Analyzer"""
"""Run the Analyzer.

JSON is emitted alongside Excel by default (for estimators and UI). Use --generate-json false to skip.
"""
ctx = ApplicationContext(w)
try:
result = ctx.analyzer.run_analyzer(source_directory, report_file, source_tech, generate_json)
Expand Down
62 changes: 56 additions & 6 deletions tests/unit/analyzer/test_analyzer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from pathlib import Path

import pytest
Expand All @@ -14,10 +15,12 @@


def _mock_analyze(
_directory: Path, result: Path, _platform: str, _is_debug: bool = False, _json_result: Path | None = None
_directory: Path, result: Path, _platform: str, _is_debug: bool = False, json_result: Path | None = None
) -> None:
# Nothing really needed here, except a result needs to be created.
"""Stand in for Bladespector: create Excel path; when JSON is requested, create that file too."""
result.touch()
if json_result is not None:
json_result.write_text(json.dumps({"mock": True}), encoding="utf-8")


@pytest.mark.parametrize(
Expand All @@ -28,7 +31,20 @@ def _mock_analyze(
),
ids=str,
)
def test_analyze_arguments_return(tmp_path: Path, report_file: Path) -> None:
@pytest.mark.parametrize(
"omit_generate_json_kw, expect_json_file",
(
(True, True), # default: omit kwarg → JSON on
(False, False), # explicit generate_json=False
),
ids=("default_json", "skip_json"),
)
def test_analyze_json_output(
tmp_path: Path,
report_file: Path,
omit_generate_json_kw: bool,
expect_json_file: bool,
) -> None:
path = tmp_path / "in"
file = tmp_path / report_file
mock_prompts = MockPrompts({})
Expand All @@ -37,9 +53,18 @@ def test_analyze_arguments_return(tmp_path: Path, report_file: Path) -> None:
expected_result = AnalyzerResult(source_directory=path, report_path=file, source_system=str("Synapse"))

analyzer = LakebridgeAnalyzer(AnalyzerPrompts(mock_prompts), runner)
result = analyzer.run_analyzer(source=str(path), report_file=str(file), platform="Synapse")
if omit_generate_json_kw:
result = analyzer.run_analyzer(source=str(path), report_file=str(file), platform="Synapse")
else:
result = analyzer.run_analyzer(source=str(path), report_file=str(file), platform="Synapse", generate_json=False)

assert result == expected_result
assert file.exists()
json_path = file.with_suffix(".json")
if expect_json_file:
assert json_path.exists(), "JSON should be produced by default alongside Excel"
else:
assert not json_path.exists(), "JSON should not be produced when generate_json is False"


def test_analyze_prompts_result(tmp_path: Path):
Expand Down Expand Up @@ -77,9 +102,34 @@ def test_analyze_prompt_relative_result_path(tmp_path: Path) -> None:
_test_analyze_prompt(mock_prompts, expected_result)


def _test_analyze_prompt(mock_prompts: MockPrompts, expected_result: AnalyzerResult) -> None:
def test_analyze_prompts_skip_json(tmp_path: Path) -> None:
    """Prompt-driven analyzer run with ``generate_json=False``.

    All three settings (technology, source directory, report file) are supplied
    through mocked prompt answers; the shared helper is then invoked with
    ``generate_json=False``.
    """
    # Prompt answer "0" selects the first entry of the case-insensitively sorted list.
    technologies = sorted(Analyzer.supported_source_technologies(), key=str.casefold)
    first_tech = next(iter(technologies))

    report_file = tmp_path / "report-no-json.xlsx"
    input_path = tmp_path / "in"

    answers = {
        "Select the source technology": "0",
        "Enter the path of the directory containing sources to analyze": str(input_path),
        "Enter the path of the report file for analyzer results": str(report_file),
    }
    expected_result = AnalyzerResult(
        source_directory=input_path,
        report_path=report_file,
        source_system=first_tech,
    )
    _test_analyze_prompt(MockPrompts(answers), expected_result, generate_json=False)


def _test_analyze_prompt(
    mock_prompts: MockPrompts, expected_result: AnalyzerResult, *, generate_json: bool | None = None
) -> None:
    """Run the analyzer from prompt answers only and verify its result and JSON output.

    Args:
        mock_prompts: Canned answers for the analyzer's interactive prompts.
        expected_result: The ``AnalyzerResult`` the run is expected to return; its
            ``report_path`` is also used to derive the expected JSON path.
        generate_json: ``None`` calls ``run_analyzer()`` without the keyword (exercising
            its default); any other value is forwarded as ``generate_json=...``.
    """
    runner = AnalyzerRunner(runnable=_mock_analyze, is_debug=True)
    analyzer = LakebridgeAnalyzer(AnalyzerPrompts(mock_prompts), runner)

    # Run the analyzer exactly once. An extra default-argument run before this branch
    # would write the JSON file and invalidate the generate_json=False assertion below.
    if generate_json is None:
        result = analyzer.run_analyzer()
    else:
        result = analyzer.run_analyzer(generate_json=generate_json)

    assert result == expected_result
    json_path = expected_result.report_path.with_suffix(".json")
    if generate_json is False:
        assert not json_path.exists(), "JSON should not be produced when generate_json is False"
    else:
        assert json_path.exists(), "JSON should be produced by default alongside Excel"
56 changes: 54 additions & 2 deletions tests/unit/test_cli_analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ def test_analyze_arguments(
source_tech="Informatica - PC",
)

assert report_path.with_suffix(".json").exists(), "JSON report should be generated by default"

with report_path.open("rb") as f:
header = f.read(4)
# Excel files are .zip files, so we can check they have the zip header.
Expand All @@ -62,15 +64,18 @@ def test_analyze_arguments_wrong_tech(
}
)

report_path = tmp_path / "sample.xlsx"
with patch.object(ApplicationContext, "prompts", mock_prompts):
input_path = test_resources / "functional" / "informatica"
cli.analyze(
w=mock_workspace_client,
source_directory=str(input_path),
report_file=str(tmp_path / "sample.xlsx"),
report_file=str(report_path),
source_tech="Informatica",
)

assert report_path.with_suffix(".json").exists(), "JSON should be generated by default"


def test_analyze_generate_json(
mock_workspace_client: WorkspaceClient,
Expand All @@ -86,7 +91,6 @@ def test_analyze_generate_json(
source_directory=str(input_path),
report_file=str(report_path),
source_tech="Snowflake",
generate_json=True,
)

assert report_path.exists(), "Excel report was not created"
Expand All @@ -96,6 +100,27 @@ def test_analyze_generate_json(
assert isinstance(data, dict), "JSON report is not a valid JSON object"


def test_analyze_skip_json(
    mock_workspace_client: WorkspaceClient,
    test_resources: Path,
    tmp_path: Path,
) -> None:
    """CLI ``analyze`` with ``generate_json=False`` writes the Excel report but no JSON file."""
    excel_report = tmp_path / "report-skip-json.xlsx"
    json_report = tmp_path / "report-skip-json.json"
    snowflake_sources = test_resources / "functional" / "snowflake" / "integration"

    cli.analyze(
        w=mock_workspace_client,
        source_directory=str(snowflake_sources),
        report_file=str(excel_report),
        source_tech="Snowflake",
        generate_json=False,
    )

    assert excel_report.exists(), "Excel report was not created"
    assert not json_report.exists(), "JSON report should not be created when generate_json is false"


def test_analyze_prompts(mock_workspace_client: WorkspaceClient, test_resources: Path, tmp_path: Path) -> None:

supported_tech = sorted(Analyzer.supported_source_technologies(), key=str.casefold)
Expand All @@ -113,3 +138,30 @@ def test_analyze_prompts(mock_workspace_client: WorkspaceClient, test_resources:
)
with patch.object(ApplicationContext, "prompts", mock_prompts):
cli.analyze(w=mock_workspace_client)

assert report_path.exists(), "Excel report was not created"
assert report_path.with_suffix(".json").exists(), "JSON should be generated by default when using prompts"


def test_analyze_prompts_skip_json(
    mock_workspace_client: WorkspaceClient, test_resources: Path, tmp_path: Path
) -> None:
    """CLI ``analyze`` driven by prompts with ``generate_json=False`` writes Excel only.

    The source technology, source directory and report file are all answered via
    mocked prompts; only ``generate_json`` is passed as an argument.
    """
    supported_tech = sorted(Analyzer.supported_source_technologies(), key=str.casefold)
    # list.index raises ValueError if the technology is missing, instead of silently
    # falling back to an arbitrary menu position that would analyze with the wrong tech.
    tech_enum = supported_tech.index("Informatica - PC")

    source_dir = test_resources / "functional" / "informatica"
    report_path = tmp_path / "results-no-json.xlsx"
    expected_json = report_path.with_suffix(".json")

    mock_prompts = MockPrompts(
        {
            "Select the source technology": str(tech_enum),
            "Enter the path of the directory containing sources to analyze": str(source_dir),
            "Enter the path of the report file for analyzer results": str(report_path),
        }
    )
    with patch.object(ApplicationContext, "prompts", mock_prompts):
        cli.analyze(w=mock_workspace_client, generate_json=False)

    assert report_path.exists(), "Excel report was not created"
    assert not expected_json.exists(), "JSON should not be created when generate_json is false"
Loading