-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathpython_poetry_project.py
More file actions
129 lines (110 loc) · 4.21 KB
/
Copy pathpython_poetry_project.py
File metadata and controls
129 lines (110 loc) · 4.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os
import posixpath
import subprocess
import sys
from pathlib import Path
from typing import List, Optional
from urllib.parse import urlparse

import boto3

from emr_cli.deployments import SparkParams
from emr_cli.deployments.emr_serverless import DeploymentPackage
from emr_cli.utils import (
    PrettyUploader,
    console_log,
    copy_template,
    validate_build_target,
)
class PythonPoetryProject(DeploymentPackage):
    """Deployment package for a Poetry-managed PySpark project.

    The methods read ``self.dist_dir``, ``self.entry_point_path`` and
    ``self.s3_uri_base``; these are not set in this class and are presumably
    provided by ``DeploymentPackage`` (confirm against the base class).
    """

    def initialize(self, target_dir: Optional[str] = None):
        """
        Initializes a poetry-based pyspark project in the provided directory.

        Copies the "pyspark" and "poetry" templates into ``target_dir``.

        Args:
            target_dir: Directory to initialize. Defaults to the current
                working directory at the time of the call.
        """
        # Resolve the default at call time: a default of ``os.getcwd()`` in the
        # signature would be frozen to the directory the module was imported in.
        if target_dir is None:
            target_dir = os.getcwd()

        console_log(f"Initializing project in {target_dir}")
        copy_template("pyspark", target_dir)
        copy_template("poetry", target_dir)
        console_log("Project initialized.")

    def build(self):
        """
        Packages the project's assets into ``self.dist_dir`` via a Docker build.

        Exits the process with status 1 when no ``poetry.lock`` file exists in
        the current working directory.
        """
        if not Path("poetry.lock").exists():
            print("Error: No poetry.lock present, please setup your poetry project.")
            sys.exit(1)

        console_log(f"Packaging assets into {self.dist_dir}/")
        # TODO: Add an option for --force-local-build
        self._run_docker_build(self.dist_dir)

    def _run_local_build(self, output_dir: str = "dist"):
        """
        Runs ``poetry bundle venv`` locally, excluding the ``dev`` group.

        Args:
            output_dir: Currently unused; the bundle is always written to
                ``poeticemrbundle``. Kept for signature compatibility.

        Raises:
            subprocess.CalledProcessError: If the poetry command fails.
        """
        subprocess.run(
            ["poetry", "bundle", "venv", "poeticemrbundle", "--without", "dev"],
            check=True,
        )

    def _run_docker_build(self, output_dir: str):
        """
        Builds the ``export-poetry`` Docker target and exports it to ``output_dir``.

        Args:
            output_dir: Destination passed to ``docker build --output``.

        Raises:
            subprocess.CalledProcessError: If the docker command fails.
        """
        validate_build_target("export-poetry")
        subprocess.run(
            [
                "docker",
                "build",
                "--target",
                "export-poetry",
                "--output",
                output_dir,
                "--file",
                self._dockerfile_path(),
                ".",
            ],
            check=True,
            # Inherit the caller's environment and switch BuildKit on for the
            # child process only.
            env=dict(os.environ, DOCKER_BUILDKIT="1"),
        )

    def _dockerfile_path(self) -> str:
        """
        Returns the Dockerfile to build with.

        A ``Dockerfile`` in the current working directory takes precedence;
        otherwise the one under ``../templates/pyspark`` relative to this
        module is used.
        """
        if Path("Dockerfile").is_file():
            return "Dockerfile"

        templates = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "..", "templates", "pyspark")
        )
        return os.path.join(templates, "Dockerfile")

    def deploy(self, s3_code_uri: str, profile: Optional[str] = None) -> str:
        """
        Copies local code to S3 and returns the path to the uploaded entrypoint

        Args:
            s3_code_uri: Destination URI of the form ``s3://bucket/prefix``.
            profile: Optional AWS profile name used to create the boto3
                session; the default session is used when empty or None.

        Returns:
            The ``s3://`` URI of the uploaded entry point file.
        """
        if profile:
            aws_session = boto3.session.Session(profile_name=profile)
        else:
            aws_session = boto3.session.Session()
        s3_client = aws_session.client("s3")

        bucket, prefix = self._parse_bucket_uri(s3_code_uri)
        filename = os.path.basename(self.entry_point_path)
        deps_name = "pyspark_deps.tar.gz"

        # S3 keys always use "/" regardless of the local platform, so they are
        # joined with posixpath rather than os.path. An empty prefix yields the
        # bare file name.
        entry_key = posixpath.join(prefix, filename)
        deps_key = posixpath.join(prefix, deps_name)

        console_log(f"Deploying {filename} and dependencies to {s3_code_uri}")

        uploader = PrettyUploader(
            s3_client,
            bucket,
            {
                self.entry_point_path: entry_key,
                # The dependency archive is a local file: os.path is correct here.
                os.path.join(self.dist_dir, deps_name): deps_key,
            },
        )
        uploader.run()

        # Build the URI from the key that was actually uploaded so an empty
        # prefix does not produce "s3://bucket//file".
        return f"s3://{bucket}/{entry_key}"

    def spark_submit_parameters(self) -> SparkParams:
        """
        Returns the Spark parameters that point each EMR deployment type at
        the Python interpreter inside the ``pyspark_deps.tar.gz`` archive.

        The archive is referenced under ``self.s3_uri_base`` and unpacked by
        Spark under the ``environment`` alias.
        """
        # URI join, not a filesystem join: always "/"-separated.
        tar_path = posixpath.join(self.s3_uri_base, "pyspark_deps.tar.gz")
        python_bin = "./environment/bin/python"

        return SparkParams(
            common_params={
                "spark.archives": f"{tar_path}#environment",
            },
            emr_serverless_params={
                "spark.emr-serverless.driverEnv.PYSPARK_DRIVER_PYTHON": python_bin,
                "spark.emr-serverless.driverEnv.PYSPARK_PYTHON": python_bin,
                "spark.executorEnv.PYSPARK_PYTHON": python_bin,
            },
            emr_ec2_params={
                "spark.executorEnv.PYSPARK_PYTHON": python_bin,
                "spark.yarn.appMasterEnv.PYSPARK_PYTHON": python_bin,
            },
            emr_eks_params={
                "spark.pyspark.python": python_bin,
            },
        )

    def _parse_bucket_uri(self, uri: str) -> List[str]:
        """
        Splits a URI into ``[bucket, prefix]``.

        The bucket is the URI's network location; the prefix is its path with
        leading and trailing slashes removed (empty when there is no path).
        """
        result = urlparse(uri, allow_fragments=False)
        return [result.netloc, result.path.strip("/")]