-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathpython_project.py
More file actions
126 lines (109 loc) · 4.1 KB
/
Copy pathpython_project.py
File metadata and controls
126 lines (109 loc) · 4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import os
import posixpath
import subprocess
import sys
from pathlib import Path
from shutil import copy
from typing import Optional

import boto3

from emr_cli.deployments import SparkParams
from emr_cli.deployments.emr_serverless import DeploymentPackage
from emr_cli.utils import (
    PrettyUploader,
    console_log,
    copy_template,
    parse_bucket_uri,
    validate_build_target,
)
class PythonProject(DeploymentPackage):
    """
    Deployment package for a PySpark project.

    Scaffolds a project from the bundled ``pyspark`` template, packages its
    dependencies with a Docker build, uploads the entry point and the
    dependency archive to S3, and supplies the Spark parameters that point
    jobs at the uploaded archive.

    Reads ``self.entry_point_path`` and ``self.dist_dir``, which are not
    assigned in this class (presumably set by ``DeploymentPackage`` --
    confirm against the base class).
    """

    # Docker build stage that exports the packaged Python environment.
    _BUILD_TARGET = "export-python"
    # File name of the dependency archive, both in dist_dir and on S3.
    _DEPS_ARCHIVE = "pyspark_deps.tar.gz"
    # Interpreter path inside the archive once unpacked as "environment".
    _PYTHON_BIN = "./environment/bin/python"

    def initialize(self, target_dir: Optional[str] = None):
        """
        Initializes a pyspark project in the provided directory.

        Copies the bundled "pyspark" template into ``target_dir``.

        Args:
            target_dir: Destination directory. Defaults to the current
                working directory, resolved when the method is called.
        """
        # Resolve the default at call time: a `= os.getcwd()` default would
        # be frozen at import time and go stale after any os.chdir().
        if target_dir is None:
            target_dir = os.getcwd()

        console_log(f"Initializing project in {target_dir}")
        copy_template("pyspark", target_dir)
        console_log("Project initialized.")

    def copy_single_file(
        self, relative_file_path: str, target_dir: Optional[str] = None
    ):
        """
        Copies a single file from the template directory to the target directory.

        Args:
            relative_file_path: Path of the file relative to the bundled
                ``templates/pyspark`` directory.
            target_dir: Destination passed to ``shutil.copy``. Defaults to
                the current working directory, resolved at call time.
        """
        if target_dir is None:
            target_dir = os.getcwd()

        template_path = (
            Path(__file__).parent.parent / "templates" / "pyspark" / relative_file_path
        )
        copy(template_path, Path(target_dir))

    def build(self):
        """
        For now, uses a pre-existing Docker file and setuptools.

        Requires both ``Dockerfile`` and ``pyproject.toml`` in the current
        working directory; if either is missing, prints an error and exits
        the process with status 1. Otherwise runs the Docker build with
        ``self.dist_dir`` as the output directory.
        """
        if not Path("Dockerfile").exists():
            print(
                "Error: No Dockerfile present, use 'emr-cli init --dockerfile' to generate one"  # noqa: E501
            )
            sys.exit(1)

        if not Path("pyproject.toml").exists():
            print("Error: No pyproject.toml present, please set one up before building")
            sys.exit(1)

        console_log(f"Packaging assets into {self.dist_dir}/")
        self._run_docker_build(self.dist_dir)

    def _run_docker_build(self, output_dir: str):
        """
        Runs ``docker build`` for the export stage, writing into ``output_dir``.

        Args:
            output_dir: Directory handed to ``docker build --output``.

        Raises:
            subprocess.CalledProcessError: If docker exits with a non-zero
                status (``check=True``).
        """
        # NOTE(review): presumably verifies the Dockerfile defines this
        # stage -- confirm against emr_cli.utils.validate_build_target.
        validate_build_target(self._BUILD_TARGET)
        subprocess.run(
            [
                "docker",
                "build",
                "--target",
                self._BUILD_TARGET,
                "--output",
                output_dir,
                ".",
            ],
            check=True,
            # Run with the caller's environment plus BuildKit switched on.
            env=dict(os.environ, DOCKER_BUILDKIT="1"),
        )

    def deploy(self, s3_code_uri: str, profile: Optional[str] = None) -> str:
        """
        Copies local code to S3 and returns the path to the uploaded entrypoint.

        Args:
            s3_code_uri: Base S3 URI to upload under; also stored on
                ``self.s3_uri_base`` for ``spark_submit_parameters``.
            profile: Optional AWS profile name. A falsy value uses boto3's
                default session configuration.

        Returns:
            The ``s3://`` URI of the uploaded entry point, built from the
            same object key that is handed to the uploader.
        """
        self.s3_uri_base = s3_code_uri

        # `profile or None` keeps an empty string behaving like "no profile".
        aws_session = boto3.session.Session(profile_name=profile or None)
        s3_client = aws_session.client("s3")

        bucket, prefix = parse_bucket_uri(self.s3_uri_base)
        filename = os.path.basename(self.entry_point_path)

        # S3 object keys always use "/" regardless of the host OS, so build
        # them with posixpath rather than os.path.
        entry_point_key = posixpath.join(prefix, filename)
        deps_key = posixpath.join(prefix, self._DEPS_ARCHIVE)
        # The archive's local path is a filesystem path, so os.path is right.
        local_deps_path = os.path.join(self.dist_dir, self._DEPS_ARCHIVE)

        console_log(f"Deploying {filename} and dependencies to {self.s3_uri_base}")

        uploader = PrettyUploader(
            s3_client,
            bucket,
            {
                self.entry_point_path: entry_point_key,
                local_deps_path: deps_key,
            },
        )
        uploader.run()

        # Derive the URI from the uploaded key so the two cannot disagree
        # (e.g. an empty prefix used to produce "s3://bucket//file").
        return f"s3://{bucket}/{entry_point_key}"

    def spark_submit_parameters(self) -> SparkParams:
        """
        Builds the Spark parameters that point jobs at the uploaded archive.

        Reads ``self.s3_uri_base``, which ``deploy`` assigns.

        Returns:
            A ``SparkParams`` whose common parameters register the archive
            under the alias ``environment`` and whose per-platform
            parameters select ``./environment/bin/python`` as the interpreter.
        """
        # An S3 URI, not a local path: join with "/" on every platform.
        tar_path = posixpath.join(self.s3_uri_base, self._DEPS_ARCHIVE)
        python_bin = self._PYTHON_BIN

        return SparkParams(
            common_params={
                "spark.archives": f"{tar_path}#environment",
            },
            emr_serverless_params={
                "spark.emr-serverless.driverEnv.PYSPARK_DRIVER_PYTHON": python_bin,
                "spark.emr-serverless.driverEnv.PYSPARK_PYTHON": python_bin,
                "spark.executorEnv.PYSPARK_PYTHON": python_bin,
            },
            emr_ec2_params={
                "spark.executorEnv.PYSPARK_PYTHON": python_bin,
                "spark.yarn.appMasterEnv.PYSPARK_PYTHON": python_bin,
            },
            emr_eks_params={
                "spark.pyspark.python": python_bin,
            },
        )