-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdocker-compose.yml
More file actions
284 lines (274 loc) · 12.3 KB
/
docker-compose.yml
File metadata and controls
284 lines (274 loc) · 12.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
# Note: docker-compose automagically pulls the current directory's .env as environment variables to use here
# TODO: When upgrading to Spark 4.0, replace the .ivy2 with .ivy2.5.2
services:
# Simple container for development and testing.
# It is kept alive with `sleep infinity` and has the project directory
# bind-mounted at /project.
brus-backend-common:
  profiles:
    - brus-backend-common
    - spark
    - minio
  container_name: brus-backend-common
  image: brus-backend-common
  build:
    context: ./
    args:
      PYTHON_VERSION: ${PYTHON_VERSION}
      JAVA_VERSION: ${JAVA_VERSION}
      SPARK_VERSION: ${SPARK_VERSION}
      HADOOP_VERSION: ${HADOOP_VERSION}
      SCALA_VERSION: ${SCALA_VERSION}
      DELTA_VERSION: ${DELTA_VERSION}
      DOWNLOAD_JARS: ${DOWNLOAD_JARS}
  restart: on-failure:3  # at most 3 restart attempts, then it stops restarting
  command: /bin/sh -c "sleep infinity"
  ports:
    - "9998:9998"
  environment:
    # Credentials default to the same values the minio service is started with.
    AWS_ACCESS_KEY: ${MINIO_ROOT_USER:-minio_user}
    AWS_SECRET_KEY: ${MINIO_ROOT_PASSWORD:-minio_secret}
  volumes:
    - type: bind
      source: .
      target: /project
      read_only: false
    # Ivy cache shared with the host.
    # `pwd` (lower case) is not exported by shells, so fall back to a path
    # relative to this compose file when it is not defined; otherwise the
    # source would collapse to "/.ivy2" on the host.
    - type: bind
      source: ${pwd:-.}/.ivy2
      target: /root/.ivy2
      read_only: false
# MinIO server used as the local S3-compatible object store.
# Inside the container the API listens on 10001 and the console on 10002
# (see the `minio server` flags below); only the host-side ports are
# configurable through MINIO_PORT / MINIO_CONSOLE_PORT.
minio:
  profiles:  # must pass one of these with --profile to docker compose
    - s3
    - spark
  image: minio/minio:RELEASE.2025-09-07T16-13-09Z
  container_name: minio
  volumes:
    - .:/dockermount
    - type: bind
      source: ${MINIO_DATA_DIR:-../data/s3}
      target: /data
  ports:
    - "${MINIO_PORT:-10001}:10001"
    - "${MINIO_CONSOLE_PORT:-10002}:10002"
  environment:
    MINIO_ROOT_USER: ${MINIO_ROOT_USER:-minio_user}
    MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-minio_secret}
  # Exec-form list with a literal block: the script reaches the shell exactly
  # as written, so the shell comment and the inner double quotes cannot
  # interfere with YAML folding or Compose's command-line splitting.
  entrypoint:
    - /bin/sh
    - -c
    - |
      # Create the bucket for MinIO used for Spark
      # NOTE(review): the path is relative to the container's working
      # directory - confirm it resolves under /data as intended.
      mkdir -p data/data/files
      minio server --address ":10001" --console-address ":10002" /data
  healthcheck:
    # The check runs INSIDE the container, so it must target the port the
    # server listens on (10001) on localhost, not the host-published
    # MINIO_PORT / MINIO_HOST values, which may be overridden.
    # NOTE(review): confirm `curl` is present in this image tag.
    test: ["CMD", "curl", "-f", "http://localhost:10001/minio/health/live"]
    interval: 30s
    timeout: 20s
    retries: 3
# driver
# Runs the Spark standalone Master class, listening on SPARK_MASTER_PORT with
# its web UI on SPARK_MASTER_WEBUI_PORT.
spark-master:
  profiles:
    - spark  # must pass --profile spark to docker compose for this to come up
  # When an image by this name is not found in the local repo, and it is
  # forced to build, it will use this as the tag.
  image: spark-base
  # Build context path needs to be relative to project root, from where
  # docker compose will be run.
  build:
    context: .
    dockerfile: Dockerfile.spark
    args:
      PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
      PYTHON_VERSION: ${PYTHON_VERSION}
      JAVA_VERSION: ${JAVA_VERSION}
      SPARK_VERSION: ${SPARK_VERSION}
      HADOOP_VERSION: ${HADOOP_VERSION}
  container_name: spark-master
  environment:
    SPARK_MASTER_HOST: ${SPARK_MASTER_HOST:-spark-master}
    SPARK_MASTER_PORT: ${SPARK_MASTER_PORT:-7077}
    SPARK_MASTER_WEBUI_PORT: ${SPARK_MASTER_WEBUI_PORT:-4040}
  # `$$` is Compose's escape for a literal `$`: these variables are expanded
  # by the shell inside the container (from `environment:` above), not by
  # Compose. The folded block joins the lines into one command line.
  command:
    - /bin/sh
    - -c
    - >-
      $${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.master.Master
      --port $${SPARK_MASTER_PORT}
      --webui-port $${SPARK_MASTER_WEBUI_PORT}
  ports:
    # The master listens on the configured ports inside the container, so the
    # container side of each mapping must follow the same variable; a
    # hard-coded 7077/4040 would publish a dead port whenever it is overridden.
    - "${SPARK_MASTER_PORT:-7077}:${SPARK_MASTER_PORT:-7077}"
    - "${SPARK_MASTER_WEBUI_PORT:-4040}:${SPARK_MASTER_WEBUI_PORT:-4040}"
  volumes:
    - type: bind
      source: .
      target: /project
      read_only: false
# Runs the Spark standalone Worker class and registers with the master at
# spark://SPARK_MASTER_HOST:SPARK_MASTER_PORT.
spark-worker:
  profiles:
    - spark  # must pass --profile spark to docker compose for this to come up
  # When an image by this name is not found in the local repo, and it is
  # forced to build, it will use this as the tag.
  image: spark-base
  # Build context path needs to be relative to project root, from where
  # docker compose will be run.
  build:
    context: .
    dockerfile: Dockerfile.spark
    args:
      PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
      PYTHON_VERSION: ${PYTHON_VERSION}
      JAVA_VERSION: ${JAVA_VERSION}
      SPARK_VERSION: ${SPARK_VERSION}
      HADOOP_VERSION: ${HADOOP_VERSION}
  container_name: spark-worker
  depends_on:
    - spark-master
  environment:
    SPARK_MASTER_HOST: ${SPARK_MASTER_HOST:-spark-master}
    SPARK_MASTER_PORT: ${SPARK_MASTER_PORT:-7077}
    SPARK_WORKER_WEBUI_PORT: ${SPARK_WORKER_WEBUI_PORT:-4041}
    PYTHONPATH: /project
  # `$$` is Compose's escape for a literal `$`: these variables are expanded
  # by the shell inside the container (from `environment:` above), not by
  # Compose. The folded block joins the lines into one command line.
  command:
    - /bin/sh
    - -c
    - >-
      $${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.worker.Worker
      --webui-port $${SPARK_WORKER_WEBUI_PORT}
      spark://$${SPARK_MASTER_HOST}:$${SPARK_MASTER_PORT}
  ports:
    # The worker UI listens on SPARK_WORKER_WEBUI_PORT inside the container,
    # so the container side of the mapping must follow the same variable.
    - "${SPARK_WORKER_WEBUI_PORT:-4041}:${SPARK_WORKER_WEBUI_PORT:-4041}"
  volumes:
    - type: bind
      source: .
      target: /project
      read_only: false
# Runs the Spark HistoryServer class from the shared spark-base image.
spark-history-server:
  container_name: spark-history-server
  profiles:
    # must pass --profile spark to docker compose for this to come up
    - spark
  # When an image by this name is not found in the local repo, and it is
  # forced to build, it will use this as the tag.
  image: spark-base
  # Build context path needs to be relative to project root, from where
  # docker compose will be run.
  build:
    context: .
    dockerfile: Dockerfile.spark
    args:
      PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
      PYTHON_VERSION: ${PYTHON_VERSION}
      JAVA_VERSION: ${JAVA_VERSION}
      SPARK_VERSION: ${SPARK_VERSION}
      HADOOP_VERSION: ${HADOOP_VERSION}
  environment:
    SPARK_HISTORY_SERVER_PORT: ${SPARK_HISTORY_SERVER_PORT:-18080}
  # `$$` is Compose's escape for a literal `$`; SPARK_HOME is expanded by the
  # shell inside the container.
  command:
    - /bin/sh
    - -c
    - $${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.history.HistoryServer
  ports:
    # Only the host-side port is configurable; the container side stays 18080.
    - "${SPARK_HISTORY_SERVER_PORT:-18080}:18080"
  volumes:
    - type: bind
      source: .
      target: /project
      read_only: false
# Example of running spark-submit container:
# NOTE: double check package dependency versions here with those used in unit tests (conftest_spark.py), as these docs could have gotten stale
# (1) Review config values in brus_backend_common/config.py and override any as needed in a .env file or -e environment variable
# (2) Deploy minio in docker container (see README.md)
# (3) Deploy the postgres DB docker container, if your script connects to a DB.
# - If so, also export a JDBC_URL environment variable to pass in to the spark-submit container so it can find its connection
# (4) If reading or writing to S3, make sure the bucket given by the value of config setting CONFIG.AWS_S3_BUCKET exists
# - e.g. create via the MinIO console UI at http://localhost:10002 (the default MINIO_CONSOLE_PORT; 10001 is the S3 API port)
# - or use MinIO client CLI: mc mb local/data
# (5) Run the spark-submit container, citing the dependent packages.
# NOTES:
# - postgresql is needed as a JDBC driver, if connecting to a Postgres DB
# - delta-core is needed to read/write in Delta Lake format
# - hadoop-aws is needed for the S3AFileSystem.java, used to write data to S3,
# - and should use the same hadoop version in your local setup
# - NOTE that specifying hadoop-aws should, on its own, pull in the required version of the aws-java-sdk
# - spark-hive is needed to use a hive metastore_db of schemas and tables
# - the Docker image at this time only installs spark and hadoop standalone, which does not seem to include all the needed Hive jars
#
# make docker-compose-run profiles="--profile spark" args="--rm -e MINIO_HOST=minio -e JDBC_URL -e COMPONENT_NAME='My Spark Prototype Script' spark-submit \
# --packages org.postgresql:postgresql:42.2.23,io.delta:delta-core_2.12:1.2.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.spark:spark-hive_2.12:3.2.1 \
# /brus_backend_common/path_to_your_spark_prototype_script.py"
# One-off container that runs ./bin/spark-submit against the standalone
# master. The default command only prints --help; pass the real arguments via
# `docker compose run`.
spark-submit:
  profiles:
    - spark  # must pass --profile spark to docker compose for this to come up
  # When an image by this name is not found in the local repo, and it is
  # forced to build, it will use this as the tag.
  image: spark-base
  # Build context path needs to be relative to project root, from where
  # docker compose will be run.
  build:
    context: .
    dockerfile: Dockerfile.spark
    args:
      PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
      PYTHON_VERSION: ${PYTHON_VERSION}
      JAVA_VERSION: ${JAVA_VERSION}
      SPARK_VERSION: ${SPARK_VERSION}
      HADOOP_VERSION: ${HADOOP_VERSION}
  container_name: spark-submit
  depends_on:
    - spark-master
    - spark-worker
    - spark-history-server
    - minio
  environment:
    IS_LOCAL: ${IS_LOCAL}
    MINIO_HOST: ${MINIO_HOST}
    SPARK_MASTER_HOST: ${SPARK_MASTER_HOST:-spark-master}
    SPARK_MASTER_PORT: ${SPARK_MASTER_PORT:-7077}
    # i.e. the target where the host warehouse dir is bound in the volume
    # config below. This env var needs to be picked up as the value of the
    # spark.sql.warehouse.dir spark conf setting when SparkSessions are
    # created inside of a spark-submitted job.
    SPARK_SQL_WAREHOUSE_DIR: /spark-warehouse
    # i.e. a metastore_db sub dir of the target where the host warehouse dir
    # is bound in the volume config below. This env var needs to be picked up
    # as the path part of the spark.hadoop.javax.jdo.option.ConnectionURL
    # spark conf setting when SparkSessions are created inside of a
    # spark-submitted job.
    HIVE_METASTORE_DERBY_DB_DIR: /spark-warehouse/metastore_db
    PYTHONPATH: "/project"
  # NOTE: the entrypoint is not run through a shell in the container, so
  # container-side env vars CANNOT be expanded in it; a `$$VAR` would be
  # passed through literally. Using a single `$` makes Compose interpolate
  # the value from the SHELL ENV / .env at the time docker compose is run,
  # before the entrypoint is used.
  # The values for the same vars under `environment:` above are not used
  # here; they are merely passed into the container. KEEP the two references
  # to these vars and their defaults consistent!
  # To see the result, run docker compose config
  # (i.e. make docker-compose-config in this project's Makefile).
  entrypoint: ./bin/spark-submit --master spark://${SPARK_MASTER_HOST:-spark-master}:${SPARK_MASTER_PORT:-7077}
  command: --help
  volumes:
    - type: bind
      source: .
      target: /project
      read_only: false
    # NOTE: The hive metastore_db Derby database folder is expected to be
    # configured to show up as a subfolder of the spark-warehouse dir.
    # `pwd` (lower case) is not exported by shells, so fall back to a path
    # relative to this compose file when it is not defined.
    - type: bind
      source: ${SPARK_SQL_WAREHOUSE_DIR:-${pwd:-.}/spark-warehouse}
      target: /spark-warehouse
    # Mount the JAR dependencies local repo on host into the container to
    # take advantage of caching/reuse, i.e. download the dependencies only
    # once and reuse them on subsequent docker compose run calls.
    - type: bind
      source: ${pwd:-.}/.ivy2
      target: /root/.ivy2
      read_only: false
# CI container: runs format check, lint, unit tests and integration tests
# against the project mounted at /project.
brus-backend-common-ci:
  profiles:
    - test
    - spark
    # must pass --profile ci to docker compose for this to come up, or use
    # docker compose run
    - ci
  # When an image by this name is not found in the local repo, and it is
  # forced to build, it will use this as the tag.
  # NOTE(review): the brus-backend-common service builds the same tag and
  # additionally passes DOWNLOAD_JARS - confirm the difference is intended.
  image: brus-backend-common
  build:
    context: ./
    args:
      PYTHON_VERSION: ${PYTHON_VERSION}
      JAVA_VERSION: ${JAVA_VERSION}
      SPARK_VERSION: ${SPARK_VERSION}
      HADOOP_VERSION: ${HADOOP_VERSION}
      SCALA_VERSION: ${SCALA_VERSION}
      DELTA_VERSION: ${DELTA_VERSION}
  container_name: brus-backend-common-ci
  volumes:
    - .:/dockermount
    # Required to interact with host's docker daemon from within this
    # running container.
    - /var/run/docker.sock:/var/run/docker.sock
    - type: bind
      source: .
      target: /project
      read_only: false
    # Ivy cache shared with the host.
    # `pwd` (lower case) is not exported by shells, so fall back to a path
    # relative to this compose file when it is not defined.
    - type: bind
      source: ${pwd:-.}/.ivy2
      target: /root/.ivy2
      read_only: false
  environment:
    AWS_ACCESS_KEY: ${MINIO_ROOT_USER:-minio_user}
    AWS_SECRET_KEY: ${MINIO_ROOT_PASSWORD:-minio_secret}
  command:
    - sh
    - -c
    - |
      # Every step still runs even if an earlier one fails, but failures are
      # remembered so the container exits non-zero. Previously the exit status
      # was only that of the last pytest run, hiding earlier failures.
      # ($$ is Compose's escape for a literal $.)
      status=0
      printf "==============\nChecking code format:\n"
      black --check --diff . || status=1
      printf -- "-------\nChecking code syntax:\n"
      if flake8; then echo "Successfully passed"; else status=1; fi
      printf -- "-------\nRunning unit tests:\n"
      pytest --durations 50 --ignore-glob='**/tests/integration/*' --cov=brus_backend_common --cov-report= -rsx || status=1
      printf -- "-------\nRunning integration tests:\n"
      pytest --durations 50 --override-ini=python_files='**/tests/integration/*' --cov=brus_backend_common --cov-append --cov-report term --cov-report xml:coverage.xml -rsx || status=1
      exit "$$status"
# TODO: AWSGlue container?