Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 21 additions & 22 deletions module4-analytics-engineering/redshift/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,25 +1,30 @@
# Builder stage
FROM python:3.12-slim AS builder
FROM ghcr.io/astral-sh/uv:python3.13-trixie-slim AS builder

COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
WORKDIR /app

ENV UV_COMPILE_BYTECODE=1
COPY uv.lock pyproject.toml README.md ./

RUN --mount=type=bind,source=uv.lock,target=uv.lock \
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
--mount=type=bind,source=README.md,target=README.md \
uv pip install -r pyproject.toml --system
# Install the project and its dependencies into the system interpreter.
# The BuildKit cache mount keeps uv's cache on the build host (it is never part
# of an image layer), so `--no-cache` is intentionally NOT passed: that flag
# makes uv ignore the cache directory and would render the mount useless.
# UV_LINK_MODE=copy avoids hardlink failures/warnings, since the mounted cache
# lives on a different filesystem than site-packages.
RUN --mount=type=cache,target=/root/.cache/uv \
    UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy \
    uv pip install --system .

ADD analyses /dbt/analyses
ADD macros /dbt/macros
ADD models /dbt/models
ADD seeds /dbt/seeds
ADD tests /dbt/tests
ADD dbt_project.yml package-lock.yml packages.yml /dbt/
ADD profiles.tmpl.yml /dbt/profiles.yml
# Copy the dbt project into the image under /dbt.
# COPY is used instead of ADD: these are plain local files and directories, so
# none of ADD's extra behaviours (tar auto-extraction, URL fetching) is wanted.
COPY analyses /dbt/analyses
COPY macros /dbt/macros
COPY models /dbt/models
COPY seeds /dbt/seeds
COPY tests /dbt/tests
COPY dbt_project.yml package-lock.yml packages.yml /dbt/
# The profile template is renamed to profiles.yml on copy.
COPY profiles.tmpl.yml /dbt/profiles.yml

# Runner stage
FROM python:3.12-slim AS runner
FROM python:3.13-slim AS runner

WORKDIR /dbt/

COPY --from=builder /usr/local/lib/python3.13/site-packages /usr/local/lib/python3.13/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin
COPY --from=builder /dbt /dbt

ENV DBT_PROFILES_DIR=/dbt/
ENV DBT_REDSHIFT_HOST=
Expand All @@ -28,10 +33,4 @@ ENV DBT_REDSHIFT_SCHEMA=
ENV AWS_ACCESS_KEY_ID=
ENV AWS_SECRET_ACCESS_KEY=

COPY --from=builder /usr/local/lib/python3.12 /usr/local/lib/python3.12
COPY --from=builder /usr/local/bin/* /usr/local/bin/
COPY --from=builder /dbt /dbt

WORKDIR /dbt/

ENTRYPOINT ["sh", "-c", "dbt deps && dbt seed --target prod && dbt run --target prod"]
ENTRYPOINT ["sh", "-c", "dbt deps && dbt build --target prod && cp target/run_results.json target/run_results_backup.json && dbt docs generate --target prod && cp target/run_results_backup.json target/run_results.json"]
13 changes: 4 additions & 9 deletions module4-analytics-engineering/redshift/README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# dbt and Redshift for Analytics

![Python](https://img.shields.io/badge/Python-3.12-4B8BBE.svg?style=flat&logo=python&logoColor=FFD43B&labelColor=306998)
[![Redshift](https://img.shields.io/badge/Redshift_Serverless-2766A7?style=flat&logo=Amazon%20RedShift&logoColor=white&labelColor=2766A7)](https://aws.amazon.com/pt/redshift/redshift-serverless/)
[![dbt](https://img.shields.io/badge/dbt--redshift-1.9-262A38?style=flat&logo=dbt&logoColor=FF6849&labelColor=262A38)](https://docs.getdbt.com/reference/warehouse-setups/redshift-setup)
![Python](https://img.shields.io/badge/Python-3.13_|_3.12-4B8BBE.svg?style=flat&logo=python&logoColor=FFD43B&labelColor=306998)
[![dbt](https://img.shields.io/badge/dbt--redshift-1.10-262A38?style=flat&labelColor=262A38&logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIGZpbGw9IiNmZjY5NGIiIHZpZXdCb3g9IjAgMCAyNCAyNCI+PHBhdGggZD0iTTE3LjkgOS4zOGE4IDggMCAwIDAtMy4wNC0zLjEybDEuNzcuODNhMTAgMTAgMCAwIDEgMy43NCAzbDMuMjMtNS45M2EyLjkgMi45IDAgMCAwLS4wNi0yLjk2IDIuNzMgMi43MyAwIDAgMC0zLjU2LS44N0wxNC4xIDMuNTRhNC40IDQuNCAwIDAgMS00LjE4IDBMNC4xOC40MWEyLjkgMi45IDAgMCAwLTIuOTYuMDYgMi43MyAyLjczIDAgMCAwLS44OCAzLjU3TDMuNTYgOS45YTQuNCA0LjQgMCAwIDEgMCA0LjE4TC40MiAxOS44M2EyLjkgMi45IDAgMCAwIC4wOSAzIDIuNzMgMi43MyAwIDAgMCAzLjU0Ljg0bDYuMDYtMy4zYTEwIDEwIDAgMCAxLTMtMy43NmwtLjg0LTEuNzdhOCA4IDAgMCAwIDMuMTIgMy4wNWwxMC41OCA1Ljc4YTIuNzMgMi43MyAwIDAgMCAzLjU1LS44NCAyLjkgMi45IDAgMCAwIC4wOC0zem0zLjM4LTcuNzRhMS4wOSAxLjA5IDAgMSAxIDAgMi4xOCAxLjA5IDEuMDkgMCAwIDEgMC0yLjE4TTIuNzQgMy44MmExLjA5IDEuMDkgMCAxIDEgMC0yLjE4IDEuMDkgMS4wOSAwIDAgMSAwIDIuMThtMCAxOC41NGExLjA5IDEuMDkgMCAxIDEgMC0yLjE4IDEuMDkgMS4wOSAwIDAgMSAwIDIuMThNMTMuMSAxMC45YTIuMTcgMi4xNyAwIDAgMC0yLjE4IDIuMTcgMi4yIDIuMiAwIDAgMCAuNyAxLjYgMi43MiAyLjcyIDAgMSAxIC43Ny01LjM4IDIuNyAyLjcgMCAwIDEgMi4zIDIuMzIgMi4yIDIuMiAwIDAgMC0xLjU5LS43MW04LjE4IDExLjQ1YTEuMDkgMS4wOSAwIDEgMSAwLTIuMTggMS4wOSAxLjA5IDAgMCAxIDAgMi4xOCIvPjwvc3ZnPgo=)](https://docs.getdbt.com/reference/warehouse-setups/redshift-setup)
[![uv](https://img.shields.io/badge/astral/uv-261230?style=flat&logo=uv&logoColor=DE5FE9&labelColor=261230)](https://docs.astral.sh/uv/getting-started/installation/)
[![Docker](https://img.shields.io/badge/Docker-329DEE?style=flat&logo=docker&logoColor=white&labelColor=329DEE)](https://docs.docker.com/get-docker/)

![License](https://img.shields.io/badge/license-CC--BY--SA--4.0-31393F?style=flat&logo=creativecommons&logoColor=black&labelColor=white)

Expand Down Expand Up @@ -62,12 +62,7 @@ export AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY
dbt deps
```

4.2. Run `dbt seed` to push/create the tables from the .csv seed files to the target schema
```shell
dbt seed
```

4.3. Run dbt run to trigger the dbt models to run
4.2. Run `dbt build` to load the seeds, run the models, and execute the tests in a single step
```shell
dbt build

Expand Down
6 changes: 3 additions & 3 deletions module4-analytics-engineering/redshift/dbt_project.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ clean-targets: # directories to be removed by `dbt clean`
# directory as views. These settings can be overridden in the individual model
# files using the `{{ config(...) }}` macro.
models:
dbt_redshift_analytics:
# This key must match the project's `name:`; model configs placed under any
# other key are not applied to this project's models.
# NOTE(review): the previous key was `dbt_redshift_analytics` and `name:` is not
# part of this change, so that name is restored here; confirm against `name:`.
dbt_redshift_analytics:
staging:
materialized: table
+materialized: view
core:
materialized: table
+materialized: table
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@
{% macro resolve_type(model_type='staging') -%}

{%- set target_env_var = 'DBT_REDSHIFT_TARGET_SCHEMA' -%}
{%- set stging_env_var = 'DBT_REDSHIFT_STAGING_SCHEMA' -%}
{%- set staging_env_var = 'DBT_REDSHIFT_STAGING_SCHEMA' -%}

{%- if model_type == 'core' -%} {{- env_var(target_env_var) -}}
{%- else -%} {{- env_var(stging_env_var, 'stg_' ~ env_var(target_env_var)) -}}
{%- else -%} {{- env_var(staging_env_var, 'stg_' ~ env_var(target_env_var)) -}}
{%- endif -%}

{%- endmacro %}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ case {{ payment_type_id }}
when 4 then 'Dispute'
when 5 then 'Unknown'
when 6 then 'Voided trip'
else 'Unknown'
end

{%- endmacro %}
Original file line number Diff line number Diff line change
Expand Up @@ -2,42 +2,40 @@
schema=resolve_schema_for('core')
) }}

with fhv_tripdata as (
select
dispatching_base_num,
affiliated_base_num,
pickup_datetime,
dropoff_datetime,
pickup_location_id,
dropoff_location_id,
shared_ride_flag
from
{{ ref('stg_fhv_tripdata') }}
),

lookup_zones as (
select *
from {{ ref('dim_zone_lookup' )}}
with lookup_zones as (
select location_id, borough, zone, service_zone
from {{ ref('dim_zone_lookup') }}
where borough != 'Unknown'
)

select
ft.dispatching_base_num as dispatching_base_num,
ft.affiliated_base_num as affiliated_base_num,
ft.pickup_location_id as pickup_location_id,
pickup.borough as pickup_borough,
pickup.zone as pickup_zone,
pickup.service_zone as pickup_service_zone,
ft.dropoff_location_id as dropoff_location_id,
dropoff.borough as dropoff_borough,
dropoff.zone as dropoff_zone,
dropoff.service_zone as dropoff_service_zone,
ft.shared_ride_flag as shared_ride_flag,
ft.pickup_datetime as pickup_datetime,
ft.dropoff_datetime as dropoff_datetime
from
fhv_tripdata ft
inner join
lookup_zones pickup on ft.pickup_location_id = pickup.location_id
inner join
lookup_zones dropoff on ft.dropoff_location_id = dropoff.location_id
-- Zone-lookup aliases are `pu_zone` (pickup) and `do_zone` (dropoff).
-- `do` on its own is a reserved word in Amazon Redshift and cannot be used as
-- an unquoted table alias.
ft.trip_id as trip_id,
ft.dispatching_base_num as dispatching_base_num,
ft.affiliated_base_num as affiliated_base_num,
ft.shared_ride_flag as shared_ride_flag,

ft.pickup_location_id as pickup_location_id,
pu_zone.borough as pickup_borough,
pu_zone.zone as pickup_zone,
pu_zone.service_zone as pickup_service_zone,

ft.dropoff_location_id as dropoff_location_id,
do_zone.borough as dropoff_borough,
do_zone.zone as dropoff_zone,
do_zone.service_zone as dropoff_service_zone,

-- Calendar parts are derived once here so downstream models can group on them.
ft.pickup_datetime as pickup_datetime,
extract(year from ft.pickup_datetime) as pickup_year,
extract(quarter from ft.pickup_datetime) as pickup_quarter,
extract(month from ft.pickup_datetime) as pickup_month,

ft.dropoff_datetime as dropoff_datetime,
extract(year from ft.dropoff_datetime) as dropoff_year,
extract(quarter from ft.dropoff_datetime) as dropoff_quarter,
extract(month from ft.dropoff_datetime) as dropoff_month
from
{{ ref('stg_fhv_tripdata') }} ft
-- Inner joins drop trips whose pickup or dropoff location has no row in lookup_zones.
inner join
lookup_zones pu_zone on ft.pickup_location_id = pu_zone.location_id
inner join
lookup_zones do_zone on ft.dropoff_location_id = do_zone.location_id
101 changes: 47 additions & 54 deletions module4-analytics-engineering/redshift/models/core/dim_taxi_trips.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,65 +2,58 @@
schema=resolve_schema_for('core')
) }}

with green_taxi_trips as (
select
gt.*,
'green' as service_type
from
{{ ref('stg_green_tripdata') }} gt
),

yellow_taxi_trips as (
select
yt.*,
'yellow' as service_type
from
{{ ref('stg_yellow_tripdata') }} yt
),

taxi_trips as (
select * from green_taxi_trips
union all
select * from yellow_taxi_trips
with taxi_trips as (
select *, 'green' as service_type from {{ ref('stg_green_tripdata') }}
union all
select *, 'yellow' as service_type from {{ ref('stg_yellow_tripdata') }}
),

lookup_zones as (
select *
from {{ ref('dim_zone_lookup' )}}
select location_id, borough, zone
from {{ ref('dim_zone_lookup') }}
where borough != 'Unknown'
)

select
tt.trip_id as trip_id,
tt.vendor_id as vendor_id,
tt.service_type as service_type,
tt.ratecode_id as ratecode_id,
tt.pickup_location_id as pickup_location_id,
pickup.borough as pickup_borough,
pickup.zone as pickup_zone,
tt.dropoff_location_id as dropoff_location_id,
dropoff.borough as dropoff_borough,
dropoff.zone as dropoff_zone,
tt.pickup_datetime as pickup_datetime,
tt.dropoff_datetime as dropoff_datetime,
tt.store_and_fwd_flag as store_and_fwd_flag,
tt.passenger_count as passenger_count,
tt.trip_distance as trip_distance,
tt.trip_type as trip_type,
tt.fare_amount as fare_amount,
tt.extra as extra,
tt.mta_tax as mta_tax,
tt.tip_amount as tip_amount,
tt.tolls_amount as tolls_amount,
tt.ehail_fee as ehail_fee,
tt.improvement_surcharge as improvement_surcharge,
tt.congestion_surcharge as congestion_surcharge,
tt.total_amount as total_amount,
tt.payment_type as payment_type,
tt.payment_type_desc as payment_type_description
from
-- Zone-lookup aliases are `pu_zone` (pickup) and `do_zone` (dropoff).
-- `do` on its own is a reserved word in Amazon Redshift and cannot be used as
-- an unquoted table alias.
tt.trip_id as trip_id,
tt.vendor_id as vendor_id,
tt.service_type as service_type,
tt.ratecode_id as ratecode_id,

tt.pickup_location_id as pickup_location_id,
pu_zone.borough as pickup_borough,
pu_zone.zone as pickup_zone,
tt.pickup_datetime as pickup_datetime,
extract(year from tt.pickup_datetime) as pickup_year,
extract(quarter from tt.pickup_datetime) as pickup_quarter,
extract(month from tt.pickup_datetime) as pickup_month,

tt.dropoff_location_id as dropoff_location_id,
do_zone.borough as dropoff_borough,
do_zone.zone as dropoff_zone,
tt.dropoff_datetime as dropoff_datetime,
extract(year from tt.dropoff_datetime) as dropoff_year,
extract(quarter from tt.dropoff_datetime) as dropoff_quarter,
extract(month from tt.dropoff_datetime) as dropoff_month,

tt.store_and_fwd_flag as store_and_fwd_flag,
tt.passenger_count as passenger_count,
tt.trip_distance as trip_distance,
tt.trip_type as trip_type,
tt.fare_amount as fare_amount,
tt.extra as extra,
tt.mta_tax as mta_tax,
tt.tip_amount as tip_amount,
tt.tolls_amount as tolls_amount,
tt.ehail_fee as ehail_fee,
tt.improvement_surcharge as improvement_surcharge,
tt.total_amount as total_amount,
tt.congestion_surcharge as congestion_surcharge,
tt.payment_type as payment_type,
tt.payment_type_desc as payment_type_description
from
taxi_trips tt
inner join
lookup_zones pickup on tt.pickup_location_id = pickup.location_id
inner join
lookup_zones dropoff on tt.dropoff_location_id = dropoff.location_id
inner join
lookup_zones pu_zone on tt.pickup_location_id = pu_zone.location_id
inner join
lookup_zones do_zone on tt.dropoff_location_id = do_zone.location_id
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ select
Borough as borough,
Zone as zone,
service_zone as service_zone
from
from
{{ ref('taxi_zone_lookup') }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{{ config(
    schema=resolve_schema_for('core')
) }}

-- 90th-percentile FHV trip duration (in seconds) per pickup year, pickup month
-- and (pickup_zone, dropoff_zone) pair, with the pairs ranked by that p90
-- within each (year, month, pickup_zone).
with fhv_timedelta as (
    select
        pickup_year  as year,
        pickup_month as month,
        pickup_zone  as pickup_zone,
        dropoff_zone as dropoff_zone,
        -- Redshift syntax: the percentile is the function argument and the
        -- measured expression goes in WITHIN GROUP (ORDER BY ...). Trip
        -- duration uses datediff(second, start, end); BigQuery's
        -- timestamp_diff() does not exist in Redshift.
        percentile_cont(0.90)
            within group (order by datediff(second, pickup_datetime, dropoff_datetime))
            over (partition by pickup_year, pickup_month, pickup_zone, dropoff_zone) as timedelta_p90
    from
        {{ ref('dim_fhv_trips') }}
),

fhv_rnk_timedelta as (
    select
        year         as year,
        month        as month,
        pickup_zone  as pickup_zone,
        dropoff_zone as dropoff_zone,
        count(1)     as num_trips,
        -- timedelta_p90 is constant within each group (it was computed over the
        -- same partition), so max() only collapses it to a single value.
        max(timedelta_p90) as timedelta_p90,
        -- rnk = 1 is the dropoff zone with the longest p90 for that pickup zone.
        dense_rank() over (partition by year, month, pickup_zone order by max(timedelta_p90) desc) as rnk
    from
        fhv_timedelta
    group by
        year,
        month,
        pickup_zone,
        dropoff_zone
)

select
    year          as year,
    month         as month,
    pickup_zone   as pickup_zone,
    dropoff_zone  as dropoff_zone,
    timedelta_p90 as timedelta_p90,
    rnk           as rnk,
    num_trips     as num_trips
from
    fhv_rnk_timedelta
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,22 @@

select
-- Revenue Grouping
pickup_zone as zone,
service_type as service_type,
{{ date_trunc("month", "pickup_datetime") }} as order_year,
pickup_zone as zone,
service_type as service_type,
{{ date_trunc("month", "pickup_datetime") }} as revenue_month,
-- Revenue Calculations
round(sum(fare_amount), 2) as fare,
round(sum(extra), 2) as extra,
round(sum(mta_tax), 2) as mta_tax,
round(sum(tip_amount), 2) as tip_amount,
round(sum(tolls_amount), 2) as tolls_amount,
round(sum(ehail_fee), 2) as ehail_fee,
round(sum(improvement_surcharge), 2) as improvement_surcharge,
round(sum(congestion_surcharge), 2) as congestion_surcharge,
round(sum(total_amount), 2) as total_amount
from
round(sum(fare_amount), 2) as fare,
round(sum(extra), 2) as extra,
round(sum(mta_tax), 2) as mta_tax,
round(sum(tip_amount), 2) as tip_amount,
round(sum(tolls_amount), 2) as tolls_amount,
round(sum(ehail_fee), 2) as ehail_fee,
round(sum(improvement_surcharge), 2) as improvement_surcharge,
round(sum(congestion_surcharge), 2) as congestion_surcharge,
round(sum(total_amount), 2) as total_amount
from
{{ ref('dim_taxi_trips') }}
group by
pickup_zone,
group by
pickup_zone,
service_type,
{{ date_trunc("month", "pickup_datetime") }}
Loading