diff --git a/docs/lakebridge/docs/assessment/analyzer/index.mdx b/docs/lakebridge/docs/assessment/analyzer/index.mdx index 4b660365b6..0f31eedc0a 100644 --- a/docs/lakebridge/docs/assessment/analyzer/index.mdx +++ b/docs/lakebridge/docs/assessment/analyzer/index.mdx @@ -37,7 +37,7 @@ Usage: databricks labs lakebridge analyze [flags] Flags: - --generate-json true|false (Optional) Generate JSON file alongside the Excel report + --generate-json true|false (Optional) Generate JSON alongside Excel (default: true); pass false to skip -h, --help help for analyze --report-file path (Optional) Local filesystem path of the analysis report file to write --source-directory path (Optional) Local filesystem path of a directory containing sources to analyze @@ -62,7 +62,7 @@ Below is an explanation of the settings needed for the analyzer: - _Source directory_: The path of the folder containing the artifacts to analyze. - _Report file_: The path of the Excel file into which the analyzer results will be written. (The parent directory of this file must already exist.) - _Source technology_: The underlying technology platform of the files in the source directory that need to be analyzed. - - _Generate JSON_ (optional): When set to `true`, generates a JSON file alongside the Excel report. This is useful for programmatic processing and integration with other tools. + - _Generate JSON_ (optional): Defaults to `true` so a JSON file is written next to the Excel report (for programmatic use, migration estimators, and UI). Set to `false` if you only need the spreadsheet. ## Execution Execute the below command to start the analyzer: @@ -72,14 +72,15 @@ Execute the below command to start the analyzer: _Note: Any settings that are not provided as arguments will trigger a prompt for the missing setting(s)._ -To generate both Excel and JSON output: +By default, both Excel and JSON are produced. To skip JSON: + ```bash - databricks labs lakebridge analyze [--source-directory ] [--report-file ] [--source-tech ] --generate-json true + databricks labs lakebridge analyze [--source-directory ] [--report-file ] [--source-tech ] --generate-json false ``` ### Output Files - **Excel Report** (`.xlsx`) - Always generated. Contains formatted analysis results with multiple worksheets for different aspects of the analysis. -- **JSON Report** (`.json`) - Generated when `--generate-json true` is specified. Contains the same analysis data in JSON format for programmatic access and integration with other tools. +- **JSON Report** (`.json`) - Generated by default alongside the Excel report. Contains the same analysis data in JSON format for programmatic access and integration with other tools. Omit JSON with `--generate-json false`. Both files will be written to the location specified by `--report-file` with their respective extensions. diff --git a/labs.yml b/labs.yml index 862cd068b3..4dd0afd5fe 100644 --- a/labs.yml +++ b/labs.yml @@ -18,8 +18,8 @@ commands: - name: source-tech description: (Optional) Name of the Source System Technology you want to analyze - name: generate-json - description: (Optional) Generate JSON file alongside the Excel report (`true|false`) - default: "false" + description: (Optional) Emit JSON alongside the Excel report (`true|false`; default `true`). Pass `false` for Excel only. + default: "true" - name: transpile description: Transpile SQL/ETL sources to Databricks-compatible code diff --git a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py index b1418357a5..391d11747b 100644 --- a/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py +++ b/src/databricks/labs/lakebridge/analyzer/lakebridge_analyzer.py @@ -63,7 +63,7 @@ def create(cls, is_debug: bool = False) -> "AnalyzerRunner": return cls(Analyzer.analyze, is_debug) def run( - self, source_dir: Path, results_file_path: Path, platform: str, generate_json: bool = False + self, source_dir: Path, results_file_path: Path, platform: str, generate_json: bool = True ) -> AnalyzerResult: logger.debug(f"Starting analyzer execution for {platform}: {source_dir}") @@ -125,7 +125,7 @@ def run_analyzer( source: str | None = None, report_file: str | None = None, platform: str | None = None, - generate_json: bool = False, + generate_json: bool = True, ) -> AnalyzerResult: source_dir = self._prompts.get_source_directory() if source is None else Path(source) results_file_path = self._prompts.get_result_file_path() if report_file is None else Path(report_file) diff --git a/src/databricks/labs/lakebridge/cli.py b/src/databricks/labs/lakebridge/cli.py index 872e7e6163..be3ed234c8 100644 --- a/src/databricks/labs/lakebridge/cli.py +++ b/src/databricks/labs/lakebridge/cli.py @@ -847,9 +847,12 @@ def analyze( source_directory: str | None = None, report_file: str | None = None, source_tech: str | None = None, - generate_json: bool = False, + generate_json: bool = True, ): - """Run the Analyzer""" + """Run the Analyzer. + + JSON is emitted alongside Excel by default (for estimators and UI). Use --generate-json false to skip. + """ ctx = ApplicationContext(w) try: result = ctx.analyzer.run_analyzer(source_directory, report_file, source_tech, generate_json) diff --git a/tests/unit/analyzer/test_analyzer.py b/tests/unit/analyzer/test_analyzer.py index 1ff1d85132..87ac915ee3 100644 --- a/tests/unit/analyzer/test_analyzer.py +++ b/tests/unit/analyzer/test_analyzer.py @@ -1,3 +1,4 @@ +import json from pathlib import Path import pytest @@ -14,10 +15,12 @@ def _mock_analyze( - _directory: Path, result: Path, _platform: str, _is_debug: bool = False, _json_result: Path | None = None + _directory: Path, result: Path, _platform: str, _is_debug: bool = False, json_result: Path | None = None ) -> None: - # Nothing really needed here, except a result needs to be created. + """Stand in for Bladespector: create Excel path; when JSON is requested, create that file too.""" result.touch() + if json_result is not None: + json_result.write_text(json.dumps({"mock": True}), encoding="utf-8") @pytest.mark.parametrize( @@ -28,7 +31,20 @@ def _mock_analyze( ), ids=str, ) -def test_analyze_arguments_return(tmp_path: Path, report_file: Path) -> None: +@pytest.mark.parametrize( + "omit_generate_json_kw, expect_json_file", + ( + (True, True), # default: omit kwarg → JSON on + (False, False), # explicit generate_json=False + ), + ids=("default_json", "skip_json"), +) +def test_analyze_json_output( + tmp_path: Path, + report_file: Path, + omit_generate_json_kw: bool, + expect_json_file: bool, +) -> None: path = tmp_path / "in" file = tmp_path / report_file mock_prompts = MockPrompts({}) @@ -37,9 +53,18 @@ def test_analyze_arguments_return(tmp_path: Path, report_file: Path) -> None: expected_result = AnalyzerResult(source_directory=path, report_path=file, source_system=str("Synapse")) analyzer = LakebridgeAnalyzer(AnalyzerPrompts(mock_prompts), runner) - result = analyzer.run_analyzer(source=str(path), report_file=str(file), platform="Synapse") + if omit_generate_json_kw: + result = analyzer.run_analyzer(source=str(path), report_file=str(file), platform="Synapse") + else: + result = analyzer.run_analyzer(source=str(path), report_file=str(file), platform="Synapse", generate_json=False) assert result == expected_result + assert file.exists() + json_path = file.with_suffix(".json") + if expect_json_file: + assert json_path.exists(), "JSON should be produced by default alongside Excel" + else: + assert not json_path.exists(), "JSON should not be produced when generate_json is False" def test_analyze_prompts_result(tmp_path: Path): @@ -77,9 +102,34 @@ def test_analyze_prompt_relative_result_path(tmp_path: Path) -> None: _test_analyze_prompt(mock_prompts, expected_result) -def _test_analyze_prompt(mock_prompts: MockPrompts, expected_result: AnalyzerResult) -> None: +def test_analyze_prompts_skip_json(tmp_path: Path) -> None: + first_tech = next(iter(sorted(Analyzer.supported_source_technologies(), key=str.casefold))) + input_path = tmp_path / "in" + report_file = tmp_path / "report-no-json.xlsx" + mock_prompts = MockPrompts( + { + "Select the source technology": "0", + "Enter the path of the directory containing sources to analyze": str(input_path), + "Enter the path of the report file for analyzer results": str(report_file), + } + ) + expected_result = AnalyzerResult(source_directory=input_path, report_path=report_file, source_system=first_tech) + _test_analyze_prompt(mock_prompts, expected_result, generate_json=False) + + +def _test_analyze_prompt( + mock_prompts: MockPrompts, expected_result: AnalyzerResult, *, generate_json: bool | None = None +) -> None: runner = AnalyzerRunner(runnable=_mock_analyze, is_debug=True) analyzer = LakebridgeAnalyzer(AnalyzerPrompts(mock_prompts), runner) - result = analyzer.run_analyzer() + if generate_json is None: + result = analyzer.run_analyzer() + else: + result = analyzer.run_analyzer(generate_json=generate_json) assert result == expected_result + json_path = expected_result.report_path.with_suffix(".json") + if generate_json is False: + assert not json_path.exists(), "JSON should not be produced when generate_json is False" + else: + assert json_path.exists(), "JSON should be produced by default alongside Excel" diff --git a/tests/unit/test_cli_analyze.py b/tests/unit/test_cli_analyze.py index 7048c0cff5..e3b5934917 100644 --- a/tests/unit/test_cli_analyze.py +++ b/tests/unit/test_cli_analyze.py @@ -41,6 +41,8 @@ def test_analyze_arguments( source_tech="Informatica - PC", ) + assert report_path.with_suffix(".json").exists(), "JSON report should be generated by default" + with report_path.open("rb") as f: header = f.read(4) # Excel files are .zip files, so we can check they have the zip header. @@ -62,15 +64,18 @@ def test_analyze_arguments_wrong_tech( } ) + report_path = tmp_path / "sample.xlsx" with patch.object(ApplicationContext, "prompts", mock_prompts): input_path = test_resources / "functional" / "informatica" cli.analyze( w=mock_workspace_client, source_directory=str(input_path), - report_file=str(tmp_path / "sample.xlsx"), + report_file=str(report_path), source_tech="Informatica", ) + assert report_path.with_suffix(".json").exists(), "JSON should be generated by default" + def test_analyze_generate_json( mock_workspace_client: WorkspaceClient, @@ -86,7 +91,6 @@ def test_analyze_generate_json( source_directory=str(input_path), report_file=str(report_path), source_tech="Snowflake", - generate_json=True, ) assert report_path.exists(), "Excel report was not created" @@ -96,6 +100,27 @@ def test_analyze_generate_json( assert isinstance(data, dict), "JSON report is not a valid JSON object" +def test_analyze_skip_json( + mock_workspace_client: WorkspaceClient, + test_resources: Path, + tmp_path: Path, +) -> None: + input_path = test_resources / "functional" / "snowflake" / "integration" + report_path = tmp_path / "report-skip-json.xlsx" + expected_json = tmp_path / "report-skip-json.json" + + cli.analyze( + w=mock_workspace_client, + source_directory=str(input_path), + report_file=str(report_path), + source_tech="Snowflake", + generate_json=False, + ) + + assert report_path.exists(), "Excel report was not created" + assert not expected_json.exists(), "JSON report should not be created when generate_json is false" + + def test_analyze_prompts(mock_workspace_client: WorkspaceClient, test_resources: Path, tmp_path: Path) -> None: supported_tech = sorted(Analyzer.supported_source_technologies(), key=str.casefold) @@ -113,3 +138,30 @@ def test_analyze_prompts(mock_workspace_client: WorkspaceClient, test_resources: ) with patch.object(ApplicationContext, "prompts", mock_prompts): cli.analyze(w=mock_workspace_client) + + assert report_path.exists(), "Excel report was not created" + assert report_path.with_suffix(".json").exists(), "JSON should be generated by default when using prompts" + + +def test_analyze_prompts_skip_json( + mock_workspace_client: WorkspaceClient, test_resources: Path, tmp_path: Path +) -> None: + supported_tech = sorted(Analyzer.supported_source_technologies(), key=str.casefold) + tech_enum = next((i for i, tech in enumerate(supported_tech) if tech == "Informatica - PC"), 12) + + source_dir = test_resources / "functional" / "informatica" + report_path = tmp_path / "results-no-json.xlsx" + expected_json = report_path.with_suffix(".json") + + mock_prompts = MockPrompts( + { + "Select the source technology": str(tech_enum), + "Enter the path of the directory containing sources to analyze": str(source_dir), + "Enter the path of the report file for analyzer results": str(report_path), + } + ) + with patch.object(ApplicationContext, "prompts", mock_prompts): + cli.analyze(w=mock_workspace_client, generate_json=False) + + assert report_path.exists(), "Excel report was not created" + assert not expected_json.exists(), "JSON should not be created when generate_json is false"