# docker-compose.yml for brus-backend-common (qat branch of fedspendingtransparency/brus-backend-common)
# Note: docker-compose automagically pulls the current directory's .env as environment variables to use here
# TODO: When upgrading to Spark 4.0, replace the .ivy2 with .ivy2.5.2

services:
  # Simple container for development and testing.
  # It builds the project image and then idles (`sleep infinity`) so that commands can be
  # executed inside the running container on demand.
  brus-backend-common:
    profiles:
      - brus-backend-common
      - spark
      - minio
    container_name: brus-backend-common
    image: brus-backend-common
    build:
      context: ./
      # Build args are interpolated from the shell environment or the .env file
      args:
        PYTHON_VERSION: ${PYTHON_VERSION}
        JAVA_VERSION: ${JAVA_VERSION}
        SPARK_VERSION: ${SPARK_VERSION}
        HADOOP_VERSION: ${HADOOP_VERSION}
        SCALA_VERSION: ${SCALA_VERSION}
        DELTA_VERSION: ${DELTA_VERSION}
        DOWNLOAD_JARS: ${DOWNLOAD_JARS}
    restart: on-failure:3 # 3 max attempt, and then it will stop restarting
    command: /bin/sh -c "sleep infinity"
    ports:
      - "9998:9998"
    environment:
      # Same variables and defaults as the MinIO root credentials of the minio service
      AWS_ACCESS_KEY: ${MINIO_ROOT_USER:-minio_user}
      AWS_SECRET_KEY: ${MINIO_ROOT_PASSWORD:-minio_secret}

    volumes:
      # Project source tree, mounted read-write
      - type: bind
        source: .
        target: /project
        read_only: false
      # Ivy dependency cache shared with the host.
      # `pwd` (lowercase) is not a variable the shell sets by itself; when it is not defined
      # in the environment or .env file, fall back to `.` (resolved relative to this compose
      # file) instead of letting the source collapse to `/.ivy2` at the filesystem root.
      - type: bind
        source: ${pwd:-.}/.ivy2
        target: /root/.ivy2
        read_only: false

  # Local S3-compatible object store (MinIO).
  minio:
    profiles: # must pass one of these with --profile to docker compose
      - s3
      - spark
    image: minio/minio:RELEASE.2025-09-07T16-13-09Z
    container_name: minio
    volumes:
      - .:/dockermount
      - type: bind
        source: ${MINIO_DATA_DIR:-../data/s3}
        target: /data
    ports:
      # Only the host side is configurable; inside the container the server always listens
      # on 10001 (API) and 10002 (console), as set in the entrypoint below.
      - "${MINIO_PORT:-10001}:10001"
      - "${MINIO_CONSOLE_PORT:-10002}:10002"
    environment:
      MINIO_ROOT_USER: ${MINIO_ROOT_USER:-minio_user}
      MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD:-minio_secret}
    entrypoint: >
      /bin/sh -c "
        # Create the bucket for MinIO used for Spark
        mkdir -p data/data/files
        minio server --address ":10001" --console-address ":10002" /data
      "
    healthcheck:
      # The healthcheck command runs INSIDE the container, so it must target the address
      # the server binds to there (localhost:10001), not the host-side MINIO_HOST/MINIO_PORT
      # values, which may be overridden to something the container is not listening on.
      # NOTE(review): assumes `curl` is available in this image - confirm.
      test: [ "CMD", "curl", "-f", "http://localhost:10001/minio/health/live" ]
      interval: 30s
      timeout: 20s
      retries: 3

  # driver
  # Spark standalone master. The spark-worker and spark-submit services in this file
  # address it as spark://<SPARK_MASTER_HOST>:<SPARK_MASTER_PORT>.
  spark-master:
    profiles:
      - spark  # must pass --profile spark to docker compose for this to come up
    image: spark-base  # when an image by this name is not found in the local repo, and it is forced to build, it will use this as the tag
    # build context path needs to be relative to project root, from where docker compose will be run
    build:
      context: .
      dockerfile: Dockerfile.spark
      args:
        PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
        PYTHON_VERSION: ${PYTHON_VERSION}
        JAVA_VERSION: ${JAVA_VERSION}
        SPARK_VERSION: ${SPARK_VERSION}
        HADOOP_VERSION: ${HADOOP_VERSION}
    container_name: spark-master
    environment:
      SPARK_MASTER_HOST: ${SPARK_MASTER_HOST:-spark-master}
      SPARK_MASTER_PORT: ${SPARK_MASTER_PORT:-7077}
      SPARK_MASTER_WEBUI_PORT: ${SPARK_MASTER_WEBUI_PORT:-4040}
    # $$ escapes compose interpolation, so these variables are expanded by the shell inside
    # the container, from the environment: values above.
    command: >
      /bin/sh -c "
        $${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.master.Master \
        --port $${SPARK_MASTER_PORT} \
        --webui-port $${SPARK_MASTER_WEBUI_PORT}"
    ports:
      # The master binds to SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT inside the container
      # (see command), so the container side of each mapping follows the same variable.
      # A fixed 7077/4040 target would publish a port nothing listens on as soon as either
      # variable is overridden.
      - "${SPARK_MASTER_PORT:-7077}:${SPARK_MASTER_PORT:-7077}"
      - "${SPARK_MASTER_WEBUI_PORT:-4040}:${SPARK_MASTER_WEBUI_PORT:-4040}"
    volumes:
      # Project source tree, mounted read-write
      - type: bind
        source: .
        target: /project
        read_only: false

  # Spark standalone worker; registers with the spark-master service on startup.
  spark-worker:
    profiles:
      - spark  # must pass --profile spark to docker compose for this to come up
    image: spark-base  # when an image by this name is not found in the local repo, and it is forced to build, it will use this as the tag
    # build context path needs to be relative to project root, from where docker compose will be run
    build:
      context: .
      dockerfile: Dockerfile.spark
      args:
        PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
        PYTHON_VERSION: ${PYTHON_VERSION}
        JAVA_VERSION: ${JAVA_VERSION}
        SPARK_VERSION: ${SPARK_VERSION}
        HADOOP_VERSION: ${HADOOP_VERSION}
    container_name: spark-worker
    depends_on:
      - spark-master
    environment:
      SPARK_MASTER_HOST: ${SPARK_MASTER_HOST:-spark-master}
      SPARK_MASTER_PORT: ${SPARK_MASTER_PORT:-7077}
      SPARK_WORKER_WEBUI_PORT: ${SPARK_WORKER_WEBUI_PORT:-4041}
      PYTHONPATH: /project
    # $$ escapes compose interpolation, so these variables are expanded by the shell inside
    # the container, from the environment: values above.
    command: /bin/sh -c "$${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.worker.Worker --webui-port $${SPARK_WORKER_WEBUI_PORT} spark://$${SPARK_MASTER_HOST}:$${SPARK_MASTER_PORT}"
    ports:
      # The worker web UI binds to SPARK_WORKER_WEBUI_PORT inside the container (see command),
      # so the container side of the mapping follows the same variable rather than a fixed 4041.
      - "${SPARK_WORKER_WEBUI_PORT:-4041}:${SPARK_WORKER_WEBUI_PORT:-4041}"
    volumes:
      # Project source tree, mounted read-write
      - type: bind
        source: .
        target: /project
        read_only: false

  # Spark history server, started via spark-class from the shared spark-base image.
  spark-history-server:
    container_name: spark-history-server
    # Tag given to the image if it is not found locally and has to be built
    image: spark-base
    # The build context path is relative to the project root, where docker compose is run
    build:
      context: .
      dockerfile: Dockerfile.spark
      args:
        PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
        PYTHON_VERSION: ${PYTHON_VERSION}
        JAVA_VERSION: ${JAVA_VERSION}
        SPARK_VERSION: ${SPARK_VERSION}
        HADOOP_VERSION: ${HADOOP_VERSION}
    # Only started when --profile spark is passed to docker compose
    profiles:
      - spark
    environment:
      # NOTE(review): the command below does not pass this value to the history server;
      # presumably it is consumed by configuration inside the image - confirm.
      SPARK_HISTORY_SERVER_PORT: ${SPARK_HISTORY_SERVER_PORT:-18080}
    # $$ escapes compose interpolation; SPARK_HOME is expanded by the shell in the container
    command:
      - /bin/sh
      - -c
      - '$${SPARK_HOME}/bin/spark-class org.apache.spark.deploy.history.HistoryServer'
    ports:
      # Configurable host port, fixed container port 18080
      - "${SPARK_HISTORY_SERVER_PORT:-18080}:18080"
    volumes:
      # Project source tree, mounted read-write
      - type: bind
        source: .
        target: /project
        read_only: false

  # Example of running spark-submit container:
  #  NOTE: double check package dependency versions here with those used in unit tests (conftest_spark.py), as these docs could have gotten stale
  #  (1) Review config values in brus_backend_common/config.py and override any as needed in a .env file or -e environment variable
  #  (2) Deploy minio in docker container (see README.md)
  #  (3) Deploy the postgres DB docker container, if your script connects to a DB.
  #      - If so, also export a JDBC_URL environment variable to pass in to the spark-submit container so it can find its connection
  #  (4) If reading or writing to S3, make sure the bucket given by the value of config setting CONFIG.AWS_S3_BUCKET exists
  #      - e.g. create via UI at http://localhost:10001
  #      - or use MinIO client CLI: mc mb local/data
  #  (5) Run the spark-submit container, citing the dependent packages:
  #    (NOTEs:
  #      - postgresql is needed as a JDBC driver, if connecting to a Postgres DB
  #      - delta-core is needed to read/write in Delta Lake format
  #      - hadoop-aws is needed for the S3AFileSystem.java, used to write data to S3,
  #        - and should use the same hadoop version in your local setup
  #        - NOTE that specifying hadoop-aws should pull in on its own the required version of the aws-java-sdk
  #      - spark-hive is needed to use a hive metastore_db of schemas and tables
  #        - the Docker image at this time only installs spark and hadoop standalone, which does not seem to include all the needed Hive jars
  #
  #    make docker-compose-run profiles="--profile spark" args="--rm -e MINIO_HOST=minio -e JDBC_URL -e COMPONENT_NAME='My Spark Prototype Script' spark-submit \
  #      --packages org.postgresql:postgresql:42.2.23,io.delta:delta-core_2.12:1.2.1,org.apache.hadoop:hadoop-aws:3.3.1,org.apache.spark:spark-hive_2.12:3.2.1 \
  #      /brus_backend_common/path_to_your_spark_prototype_script.py"
  # One-off container that runs spark-submit against the spark-master service.
  # With no arguments it runs `spark-submit --help`; arguments given to
  # `docker compose run` replace the default command.
  spark-submit:
    profiles:
      - spark  # must pass --profile spark to docker compose for this to come up
    image: spark-base  # when an image by this name is not found in the local repo, and it is forced to build, it will use this as the tag
    # build context path needs to be relative to project root, from where docker compose will be run
    build:
      context: .
      dockerfile: Dockerfile.spark
      args:
        PROJECT_LOG_DIR: ${PROJECT_LOG_DIR}
        PYTHON_VERSION: ${PYTHON_VERSION}
        JAVA_VERSION: ${JAVA_VERSION}
        SPARK_VERSION: ${SPARK_VERSION}
        HADOOP_VERSION: ${HADOOP_VERSION}
    container_name: spark-submit
    depends_on:
      - spark-master
      - spark-worker
      - spark-history-server
      - minio
    environment:
      # IS_LOCAL and MINIO_HOST have no defaults here; they are taken from the shell/.env as-is
      IS_LOCAL: ${IS_LOCAL}
      MINIO_HOST: ${MINIO_HOST}
      SPARK_MASTER_HOST: ${SPARK_MASTER_HOST:-spark-master}
      SPARK_MASTER_PORT: ${SPARK_MASTER_PORT:-7077}
      # i.e. target where host warehouse dir is bound in below volume config.
      #   This env var needs to be picked up as the config for the spark.sql.warehouse.dir spark conf setting when SparkSessions are created inside of a spark-submitted job
      SPARK_SQL_WAREHOUSE_DIR: /spark-warehouse
      # i.e. a metastore_db sub dir of the target where host warehouse dir is bound in below volume config.
      #   This env var needs to be picked up as the path part of the config for the spark.hadoop.javax.jdo.option.ConnectionURL spark conf setting when SparkSessions are created inside of a spark-submitted job
      HIVE_METASTORE_DERBY_DB_DIR: /spark-warehouse/metastore_db
      PYTHONPATH: "/project"
    # NOTE: entrypoint CANNOT interpolate env vars when processed. They are passed through literally.
    # So in using 1 $ rather than 2 $$, the var is evaluated based on the current SHELL ENV when docker compose is run,
    # and interpolated before accessed as the entrypoint.
    # While this service has values for these interpolated vars in the environment: element, those are not used here,
    # but merely passed into the container. KEEP the two references to these vars and their defaults consistent!
    # To see what it will be, you can run docker compose config (i.e. make docker-compose-config in this project's Makefile)
    entrypoint: ./bin/spark-submit --master spark://${SPARK_MASTER_HOST:-spark-master}:${SPARK_MASTER_PORT:-7077}
    command: --help
    volumes:
      # Project source tree, mounted read-write
      - type: bind
        source: .
        target: /project
        read_only: false
      # NOTE: The hive metastore_db Derby database folder is expected to be configured to show up as a subfolder of the spark-warehouse dir
      # `pwd` (lowercase) is not a variable the shell sets by itself; when it is not defined
      # in the environment or .env file, fall back to `.` (resolved relative to this compose
      # file) instead of letting the path collapse to `/spark-warehouse` on the host root.
      - type: bind
        source: ${SPARK_SQL_WAREHOUSE_DIR:-${pwd:-.}/spark-warehouse}
        target: /spark-warehouse
      # Mount the JAR dependencies local repo on host into container to take advantage of caching/reuse
      # i.e., to download the dependencies only once and reuse on subsequent docker compose run calls
      # (same `pwd` fallback as above, so an undefined `pwd` does not resolve to `/.ivy2`)
      - type: bind
        source: ${pwd:-.}/.ivy2
        target: /root/.ivy2
        read_only: false

  # CI container: runs the format check, lint, unit tests and integration tests in sequence
  # inside the project image.
  brus-backend-common-ci:
    profiles:
      - test
      - spark
      - ci  # must pass --profile ci to docker compose for this to come up, or use docker compose run
    image: brus-backend-common  # when an image by this name is not found in the local repo, and it is forced to build, it will use this as the tag
    build:
      context: ./
      args:
        PYTHON_VERSION: ${PYTHON_VERSION}
        JAVA_VERSION: ${JAVA_VERSION}
        SPARK_VERSION: ${SPARK_VERSION}
        HADOOP_VERSION: ${HADOOP_VERSION}
        SCALA_VERSION: ${SCALA_VERSION}
        DELTA_VERSION: ${DELTA_VERSION}
    container_name: brus-backend-common-ci
    volumes:
      - .:/dockermount
      # Required to interact with host's docker daemon from within this running container,
      - /var/run/docker.sock:/var/run/docker.sock
      # Project source tree, mounted read-write
      - type: bind
        source: .
        target: /project
        read_only: false
      # Ivy dependency cache shared with the host.
      # `pwd` (lowercase) is not a variable the shell sets by itself; when it is not defined
      # in the environment or .env file, fall back to `.` (resolved relative to this compose
      # file) instead of letting the source collapse to `/.ivy2` at the filesystem root.
      - type: bind
        source: ${pwd:-.}/.ivy2
        target: /root/.ivy2
        read_only: false
    environment:
      # Same variables and defaults as the MinIO root credentials of the minio service
      AWS_ACCESS_KEY: ${MINIO_ROOT_USER:-minio_user}
      AWS_SECRET_KEY: ${MINIO_ROOT_PASSWORD:-minio_secret}
    command:
      - sh
      - -c
      - |
        # Run every step even if an earlier one fails, but remember any failure so the
        # container exits non-zero. Without this, only the last pytest's exit status
        # would be reported and format/lint/unit-test failures would pass silently.
        status=0
        printf "==============\nChecking code format:\n"
        black --check --diff . || status=1
        printf -- "-------\nChecking code syntax:\n"
        if flake8; then echo "Successfully passed"; else status=1; fi
        printf -- "-------\nRunning unit tests:\n"
        pytest --durations 50 --ignore-glob='**/tests/integration/*' --cov=brus_backend_common --cov-report= -rsx || status=1
        printf -- "-------\nRunning integration tests:\n"
        pytest --durations 50 --override-ini=python_files='**/tests/integration/*' --cov=brus_backend_common --cov-append --cov-report term --cov-report xml:coverage.xml -rsx || status=1
        # $$ escapes compose interpolation so the shell sees $status
        exit $$status


  # TODO: AWSGlue container?