Skip to content

Commit 6fa5301

Browse files
committed
Add Notebook-scoped packages for command submits or notebook job run
Signed-off-by: Federico Manuel Gomez Peter <federico.gomez@payclip.com>
1 parent 25caa2a commit 6fa5301

File tree

5 files changed

+518
-17
lines changed

5 files changed

+518
-17
lines changed

dbt/adapters/databricks/python_models/python_config.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,14 @@ class Config:
2424
extra = "allow"
2525

2626

27+
class PythonPackagesConfig(BaseModel):
    """Pydantic model for python packages configuration.

    Groups the package list with the two settings that accompany it: whether the
    packages are notebook-scoped and which package index URL, if any, is used.
    """

    # Package requirement strings.
    packages: list[str]
    # True when the packages are to be handled as notebook-scoped.
    notebook_scoped: bool
    # Package index URL; None when no index is configured.
    index_url: Optional[str] = None
33+
34+
2735
class PythonModelConfig(BaseModel):
2836
"""
2937
Pydantic model for a Python model configuration.
@@ -42,6 +50,7 @@ class PythonModelConfig(BaseModel):
4250
cluster_id: Optional[str] = None
4351
http_path: Optional[str] = None
4452
create_notebook: bool = False
53+
notebook_scoped_libraries: bool = False
4554
environment_key: Optional[str] = None
4655
environment_dependencies: list[str] = Field(default_factory=list)
4756

@@ -69,6 +78,14 @@ def validate_notebook_permissions(cls, v: list[dict[str, str]]) -> list[dict[str
6978
)
7079
return v
7180

81+
@property
def python_packages_config(self) -> PythonPackagesConfig:
    """Return the package-related settings as a ``PythonPackagesConfig``.

    A new object is built on each access from ``packages``, ``index_url`` and
    ``notebook_scoped_libraries``.
    """
    settings = {
        "packages": self.packages,
        "index_url": self.index_url,
        "notebook_scoped": self.notebook_scoped_libraries,
    }
    return PythonPackagesConfig(**settings)
88+
7289

7390
class ParsedPythonModel(BaseModel):
7491
"""Pydantic model for a Python model parsed from a dbt manifest"""

dbt/adapters/databricks/python_models/python_submissions.py

Lines changed: 77 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,20 +9,50 @@
99
from dbt.adapters.databricks.api_client import CommandExecution, DatabricksApiClient, WorkflowJobApi
1010
from dbt.adapters.databricks.credentials import DatabricksCredentials
1111
from dbt.adapters.databricks.logging import logger
12-
from dbt.adapters.databricks.python_models.python_config import ParsedPythonModel
12+
from dbt.adapters.databricks.python_models.python_config import (
13+
ParsedPythonModel,
14+
PythonPackagesConfig,
15+
)
1316
from dbt.adapters.databricks.python_models.run_tracking import PythonRunTracker
1417

1518
DEFAULT_TIMEOUT = 60 * 60 * 24
19+
NOTEBOOK_SEPARATOR = "\n\n# COMMAND ----------\n\n"
1620

1721

1822
class PythonSubmitter(ABC):
    """Interface for submitting Python models to run on Databricks.

    Concrete submitters implement ``submit``. The base class keeps the package
    configuration and offers a helper that prepends notebook-scoped package
    installation commands to the compiled code.
    """

    def __init__(self, packages_config: PythonPackagesConfig) -> None:
        """Store the package configuration used when preparing code.

        Args:
            packages_config: Packages to install, the optional index URL, and
                whether installation is notebook-scoped.
        """
        self.packages_config = packages_config

    @abstractmethod
    def submit(self, compiled_code: str) -> None:
        """Submit the compiled code to Databricks."""
        pass

    def _prepare_code_with_notebook_scoped_packages(
        self, compiled_code: str, separator: str = NOTEBOOK_SEPARATOR
    ) -> str:
        """Prepend notebook-scoped package installation commands to the compiled code.

        Args:
            compiled_code: The compiled model code.
            separator: Text placed between the install command, the restart
                command and the compiled code.

        Returns:
            ``compiled_code`` unchanged when there are no packages or when
            notebook-scoped installation is disabled. Otherwise the
            ``%pip install`` command, the Python restart command and the
            compiled code, joined by ``separator``.
        """
        config = self.packages_config
        if not config.packages or not config.notebook_scoped:
            return compiled_code

        # Assemble the command from its parts so that a missing index URL does
        # not leave a stray double space between "install" and "-q".
        pip_args = ["%pip", "install"]
        if config.index_url:
            pip_args += ["--index-url", config.index_url]
        pip_args.append("-q")
        # NOTE(review): requirement strings are inserted verbatim; specifiers
        # containing characters such as ">" or "[" may need quoting — confirm
        # against how %pip parses its arguments.
        pip_args.extend(config.packages)
        pip_install_cmd = " ".join(pip_args)

        # NOTE(review): the logged command contains index_url verbatim; if that
        # URL can embed credentials, consider redacting it — confirm with callers.
        logger.debug(f"Adding notebook-scoped package installation: {pip_install_cmd}")

        # Add extra restart python command for Databricks runtimes 13.0 and above
        restart_cmd = "dbutils.library.restartPython()"

        # Install command first, then the restart, then the model code.
        return separator.join((pip_install_cmd, restart_cmd, compiled_code))
55+
2656

2757
class BaseDatabricksHelper(PythonJobHelper):
2858
"""Base helper for python models on Databricks."""
@@ -63,16 +93,24 @@ class PythonCommandSubmitter(PythonSubmitter):
6393
"""Submitter for Python models using the Command API."""
6494

6595
def __init__(
66-
self, api_client: DatabricksApiClient, tracker: PythonRunTracker, cluster_id: str
96+
self,
97+
api_client: DatabricksApiClient,
98+
tracker: PythonRunTracker,
99+
cluster_id: str,
100+
parsed_model: ParsedPythonModel,
67101
) -> None:
68102
self.api_client = api_client
69103
self.tracker = tracker
70104
self.cluster_id = cluster_id
105+
super().__init__(parsed_model.config.python_packages_config)
71106

72107
@override
73108
def submit(self, compiled_code: str) -> None:
74109
logger.debug("Submitting Python model using the Command API.")
75110

111+
# Prepare code with notebook-scoped package installation if needed
112+
compiled_code = self._prepare_code_with_notebook_scoped_packages(compiled_code)
113+
76114
context_id = self.api_client.command_contexts.create(self.cluster_id)
77115
command_exec: Optional[CommandExecution] = None
78116
try:
@@ -252,16 +290,24 @@ def get_library_config(
252290
packages: list[str],
253291
index_url: Optional[str],
254292
additional_libraries: list[dict[str, Any]],
293+
notebook_scoped_libraries: bool = False,
255294
) -> dict[str, Any]:
256-
"""Update the job configuration with the required libraries."""
295+
"""
296+
Update the job configuration with the required libraries.
297+
298+
If notebook_scoped_libraries is True, packages are not included in the library config
299+
as they will be installed via %pip install in the notebook itself.
300+
"""
257301

258302
libraries = []
259303

260-
for package in packages:
261-
if index_url:
262-
libraries.append({"pypi": {"package": package, "repo": index_url}})
263-
else:
264-
libraries.append({"pypi": {"package": package}})
304+
# Only add packages to cluster-level libraries if not using notebook-scoped
305+
if not notebook_scoped_libraries:
306+
for package in packages:
307+
if index_url:
308+
libraries.append({"pypi": {"package": package, "repo": index_url}})
309+
else:
310+
libraries.append({"pypi": {"package": package}})
265311

266312
for library in additional_libraries:
267313
libraries.append(library)
@@ -286,7 +332,10 @@ def __init__(
286332
packages = parsed_model.config.packages
287333
index_url = parsed_model.config.index_url
288334
additional_libraries = parsed_model.config.additional_libs
289-
library_config = get_library_config(packages, index_url, additional_libraries)
335+
notebook_scoped_libraries = parsed_model.config.notebook_scoped_libraries
336+
library_config = get_library_config(
337+
packages, index_url, additional_libraries, notebook_scoped_libraries
338+
)
290339
self.cluster_spec = {**cluster_spec, **library_config}
291340
self.job_grants = parsed_model.config.python_job_config.grants
292341
self.additional_job_settings = parsed_model.config.python_job_config.dict()
@@ -335,11 +384,14 @@ def __init__(
335384
tracker: PythonRunTracker,
336385
uploader: PythonNotebookUploader,
337386
config_compiler: PythonJobConfigCompiler,
387+
parsed_model: ParsedPythonModel,
338388
) -> None:
339389
self.api_client = api_client
340390
self.tracker = tracker
341391
self.uploader = uploader
342392
self.config_compiler = config_compiler
393+
self.parsed_model = parsed_model
394+
super().__init__(parsed_model.config.python_packages_config)
343395

344396
@staticmethod
345397
def create(
@@ -356,12 +408,17 @@ def create(
356408
parsed_model,
357409
cluster_spec,
358410
)
359-
return PythonNotebookSubmitter(api_client, tracker, notebook_uploader, config_compiler)
411+
return PythonNotebookSubmitter(
412+
api_client, tracker, notebook_uploader, config_compiler, parsed_model
413+
)
360414

361415
@override
362416
def submit(self, compiled_code: str) -> None:
363417
logger.debug("Submitting Python model using the Job Run API.")
364418

419+
# Prepare code with notebook-scoped package installation if needed
420+
compiled_code = self._prepare_code_with_notebook_scoped_packages(compiled_code)
421+
365422
file_path = self.uploader.upload(compiled_code)
366423
job_config = self.config_compiler.compile(file_path)
367424

@@ -444,7 +501,12 @@ def build_submitter(self) -> PythonSubmitter:
444501
{"existing_cluster_id": self.cluster_id},
445502
)
446503
else:
447-
return PythonCommandSubmitter(self.api_client, self.tracker, self.cluster_id or "")
504+
return PythonCommandSubmitter(
505+
self.api_client,
506+
self.tracker,
507+
self.cluster_id or "",
508+
self.parsed_model,
509+
)
448510

449511
@override
450512
def validate_config(self) -> None:
@@ -572,6 +634,7 @@ def __init__(
572634
workflow_creater: PythonWorkflowCreator,
573635
job_grants: dict[str, list[dict[str, str]]],
574636
acls: list[dict[str, str]],
637+
parsed_model: ParsedPythonModel,
575638
) -> None:
576639
self.api_client = api_client
577640
self.tracker = tracker
@@ -581,6 +644,7 @@ def __init__(
581644
self.workflow_creater = workflow_creater
582645
self.job_grants = job_grants
583646
self.acls = acls
647+
super().__init__(parsed_model.config.python_packages_config)
584648

585649
@staticmethod
586650
def create(
@@ -599,6 +663,7 @@ def create(
599663
workflow_creater,
600664
parsed_model.config.python_job_config.grants,
601665
parsed_model.config.access_control_list,
666+
parsed_model,
602667
)
603668

604669
@override
@@ -611,6 +676,7 @@ def submit(self, compiled_code: str) -> None:
611676
logger.debug(
612677
f"[Workflow Debug] Compiled code preview: {compiled_code[:preview_len]}..."
613678
)
679+
compiled_code = self._prepare_code_with_notebook_scoped_packages(compiled_code)
614680

615681
file_path = self.uploader.upload(compiled_code)
616682
logger.debug(f"[Workflow Debug] Uploaded notebook to: {file_path}")

tests/unit/python/test_python_config.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,22 @@ def test_parsed_model__valid_model_config(self):
8585
assert config.http_path == "http_path"
8686
assert config.create_notebook is True
8787

88+
def test_parsed_model__valid_python_packages_config(self):
    """The packages config mirrors the model's package-related settings."""
    raw_config = {
        "packages": ["package"],
        "index_url": "index_url",
        "notebook_scoped_libraries": True,
    }
    model = ParsedPythonModel(alias="test", config=raw_config)

    packages_config = model.config.python_packages_config
    assert packages_config.packages == ["package"]
    assert packages_config.index_url == "index_url"
    assert packages_config.notebook_scoped is True
103+
88104
def test_parsed_model__extra_model_config(self):
89105
parsed_model = {
90106
"alias": "test",

tests/unit/python/test_python_job_support.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ def uploader(self, client, parsed_model, identifier):
4040
parsed_model.catalog = "catalog"
4141
parsed_model.schema_ = "schema"
4242
parsed_model.identifier = identifier
43+
parsed_model.config.notebook_scoped_libraries = False
44+
parsed_model.config.packages = []
4345
return PythonNotebookUploader(client, parsed_model)
4446

4547
def test_upload__golden_path(self, uploader, client, compiled_code, workdir, identifier):
@@ -220,6 +222,29 @@ def test_get_library_config__packages_libraries(self):
220222
]
221223
}
222224

225+
def test_get_library_config__notebook_scoped_packages_excluded(self):
    """Notebook-scoped packages are left out of the library config."""
    package_names = ["package1", "package2"]
    result = python_submissions.get_library_config(
        package_names, None, [], notebook_scoped_libraries=True
    )
    assert result == {"libraries": []}
230+
231+
def test_get_library_config__notebook_scoped_with_additional_libs(self):
    """Additional libraries are kept even when packages are notebook-scoped."""
    package_names = ["package1", "package2"]
    extra_libraries = [{"jar": "s3://mybucket/myjar.jar"}]
    result = python_submissions.get_library_config(
        package_names, None, extra_libraries, notebook_scoped_libraries=True
    )
    assert result == {"libraries": [{"jar": "s3://mybucket/myjar.jar"}]}
239+
240+
def test_get_library_config__notebook_scoped_false_includes_packages(self):
    """With notebook scoping disabled, each package becomes a pypi library."""
    package_names = ["package1", "package2"]
    result = python_submissions.get_library_config(
        package_names, None, [], notebook_scoped_libraries=False
    )
    expected_libraries = [
        {"pypi": {"package": "package1"}},
        {"pypi": {"package": "package2"}},
    ]
    assert result == {"libraries": expected_libraries}
247+
223248

224249
class TestPythonJobConfigCompiler:
225250
@pytest.fixture
@@ -232,6 +257,7 @@ def run_name(self, parsed_model):
232257
parsed_model.run_name = run_name
233258
parsed_model.config.packages = []
234259
parsed_model.config.additional_libs = []
260+
parsed_model.config.notebook_scoped_libraries = False
235261
return run_name
236262

237263
@pytest.fixture
@@ -384,3 +410,38 @@ def test_compile__user_environments_override_auto_generated(
384410
assert details.additional_job_config["environments"][0]["spec"]["dependencies"] == [
385411
"requests"
386412
]
413+
414+
def test_compile__notebook_scoped_libraries_excludes_packages(
    self, client, permission_builder, parsed_model, run_name
):
    """Notebook-scoped packages do not appear in the compiled job's libraries."""
    parsed_model.config.packages = ["pandas", "numpy"]
    parsed_model.config.index_url = None
    parsed_model.config.notebook_scoped_libraries = True
    parsed_model.config.environment_key = None
    parsed_model.config.python_job_config.dict.return_value = {}
    permission_builder.build_job_permissions.return_value = []

    job_compiler = PythonJobConfigCompiler(client, permission_builder, parsed_model, {})
    compiled = job_compiler.compile("path")

    # Libraries should be empty since packages are notebook-scoped
    assert compiled.job_spec["libraries"] == []
429+
430+
def test_compile__notebook_scoped_false_includes_packages(
    self, client, permission_builder, parsed_model, run_name
):
    """With notebook scoping disabled, packages are compiled into job libraries."""
    parsed_model.config.packages = ["pandas", "numpy"]
    parsed_model.config.index_url = None
    parsed_model.config.notebook_scoped_libraries = False
    parsed_model.config.environment_key = None
    parsed_model.config.python_job_config.dict.return_value = {}
    permission_builder.build_job_permissions.return_value = []

    job_compiler = PythonJobConfigCompiler(client, permission_builder, parsed_model, {})
    compiled = job_compiler.compile("path")

    # Libraries should include packages
    expected_libraries = [
        {"pypi": {"package": "pandas"}},
        {"pypi": {"package": "numpy"}},
    ]
    assert compiled.job_spec["libraries"] == expected_libraries

0 commit comments

Comments
 (0)