diff --git a/1- Databricks Lakehouse Platform/1.0 - Creating Clusters.py b/1- Databricks Lakehouse Platform/1.0 - Creating Clusters.py new file mode 100644 index 0000000..210da5a --- /dev/null +++ b/1- Databricks Lakehouse Platform/1.0 - Creating Clusters.py @@ -0,0 +1,75 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC +# MAGIC ## Creating Clusters + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Creating a Demo Cluster +# MAGIC +# MAGIC Create a cluster with the following configurations: +# MAGIC +# MAGIC | Setting | Instructions | +# MAGIC |--|--| +# MAGIC |Cluster name|**Demo Cluster**| +# MAGIC |Cluster mode|**Signle node**| +# MAGIC |Runtime version|Select the Databricks runtime version 13.3 LTS| +# MAGIC |Photon Acceleration| Uncheck the option | +# MAGIC |Node type|4 cores| +# MAGIC |Auto termination|30 minutes| +# MAGIC + +# COMMAND ---------- + +# MAGIC %md +# MAGIC 1- Navigate to the **Compute** tab in the left side bar. +# MAGIC +# MAGIC 2- Under **All-purpose compute** tab, click **Create compute**. +# MAGIC + +# COMMAND ---------- + +# MAGIC %md-sandbox +# MAGIC +# MAGIC
+# MAGIC +# MAGIC
+ +# COMMAND ---------- + +# MAGIC %md +# MAGIC 3- On top, click on the default name to change it. Name your cluster as **Demo Cluster** +# MAGIC +# MAGIC 4- Select **Single node** cluster +# MAGIC + +# COMMAND ---------- + +# MAGIC %md-sandbox +# MAGIC +# MAGIC
+# MAGIC +# MAGIC
+ +# COMMAND ---------- + +# MAGIC %md +# MAGIC 5- Select the Databricks runtime version 13.3 LTS (Long Term Support) +# MAGIC +# MAGIC 6- Uncheck the option for the **Use Photon Acceleration** +# MAGIC +# MAGIC 7- Select a Node type of 4 cores +# MAGIC +# MAGIC 8- Set the auto termination of the cluster to 30 minutes +# MAGIC +# MAGIC 9- Lastly, click **Create compute**. +# MAGIC + +# COMMAND ---------- + +# MAGIC %md-sandbox +# MAGIC +# MAGIC
+# MAGIC +# MAGIC
diff --git a/1- Databricks Lakehouse Platform/1.1 - Notebook Basics.py b/1- Databricks Lakehouse Platform/1.1 - Notebook Basics.py index 65c4d54..dc19c61 100644 --- a/1- Databricks Lakehouse Platform/1.1 - Notebook Basics.py +++ b/1- Databricks Lakehouse Platform/1.1 - Notebook Basics.py @@ -12,31 +12,31 @@ # MAGIC # Title 1 # MAGIC ## Title 2 # MAGIC ### Title 3 -# MAGIC +# MAGIC # MAGIC text with a **bold** and *italicized* in it. -# MAGIC +# MAGIC # MAGIC Ordered list # MAGIC 1. first # MAGIC 1. second # MAGIC 1. third -# MAGIC +# MAGIC # MAGIC Unordered list # MAGIC * coffee # MAGIC * tea -# MAGIC * tea -# MAGIC -# MAGIC +# MAGIC * milk +# MAGIC +# MAGIC # MAGIC Images: # MAGIC ![Associate-badge](https://www.databricks.com/wp-content/uploads/2022/04/associate-badge-eng.svg) -# MAGIC +# MAGIC # MAGIC And of course, tables: -# MAGIC +# MAGIC # MAGIC | user_id | user_name | # MAGIC |---------|-----------| # MAGIC | 1 | Adam | # MAGIC | 2 | Sarah | # MAGIC | 3 | John | -# MAGIC +# MAGIC # MAGIC Links (or Embedded HTML): Managing Notebooks documentation # COMMAND ---------- diff --git a/1- Databricks Lakehouse Platform/1.2 - Understanding Delta Tables.sql b/1- Databricks Lakehouse Platform/1.2 - Understanding Delta Tables.sql index e122db9..d20be47 100644 --- a/1- Databricks Lakehouse Platform/1.2 - Understanding Delta Tables.sql +++ b/1- Databricks Lakehouse Platform/1.2 - Understanding Delta Tables.sql @@ -1,32 +1,83 @@ -- Databricks notebook source +-- MAGIC %md +-- MAGIC ## Creating Delta Lake Tables + +-- COMMAND ---------- + +USE CATALOG hive_metastore + +-- COMMAND ---------- + CREATE TABLE employees (id INT, name STRING, salary DOUBLE); -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ## Catalog Explorer +-- MAGIC +-- MAGIC Check the created **employees** table in the **Catalog** explorer. + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC ## Inserting Data + +-- COMMAND ---------- + +-- NOTE: With latest Databricks Runtimes, inserting few records in single transaction is optimized into single data file. +-- For this demo, we will insert the records in multiple transactions in order to create 4 data files. + INSERT INTO employees VALUES (1, "Adam", 3500.0), - (2, "Sarah", 4020.5), + (2, "Sarah", 4020.5); + +INSERT INTO employees +VALUES (3, "John", 2999.3), - (4, "Thomas", 4000.3), - (5, "Anna", 2500.0), + (4, "Thomas", 4000.3); + +INSERT INTO employees +VALUES + (5, "Anna", 2500.0); + +INSERT INTO employees +VALUES (6, "Kim", 6200.3) +-- NOTE: When executing multiple SQL statements in the same cell, only the last statement's result will be displayed in the cell output. + -- COMMAND ---------- SELECT * FROM employees -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Exploring Table Metadata + +-- COMMAND ---------- + DESCRIBE DETAIL employees -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Exploring Table Directory + +-- COMMAND ---------- + -- MAGIC %fs ls 'dbfs:/user/hive/warehouse/employees' -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Updating Table + +-- COMMAND ---------- + UPDATE employees SET salary = salary + 100 WHERE name LIKE "A%" @@ -49,6 +100,11 @@ SELECT * FROM employees -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Exploring Table History + +-- COMMAND ---------- + DESCRIBE HISTORY employees -- COMMAND ---------- @@ -57,7 +113,7 @@ DESCRIBE HISTORY employees -- COMMAND ---------- --- MAGIC %fs head 'dbfs:/user/hive/warehouse/employees/_delta_log/00000000000000000002.json' +-- MAGIC %fs head 'dbfs:/user/hive/warehouse/employees/_delta_log/00000000000000000005.json' -- COMMAND ---------- diff --git a/1- Databricks Lakehouse Platform/1.3 - Advanced Delta Lake Features (1).sql b/1- Databricks Lakehouse Platform/1.3 - Advanced Delta Lake Features.sql similarity index 74% rename from 1- Databricks Lakehouse Platform/1.3 - Advanced Delta Lake Features (1).sql rename to 1- Databricks Lakehouse Platform/1.3 - Advanced Delta Lake Features.sql index 13fe336..30cb2e0 100644 --- a/1- Databricks Lakehouse Platform/1.3 - Advanced Delta Lake Features (1).sql +++ b/1- Databricks Lakehouse Platform/1.3 - Advanced Delta Lake Features.sql @@ -1,14 +1,24 @@ -- Databricks notebook source +-- MAGIC %md +-- MAGIC +-- MAGIC ## Delta Time Travel + +-- COMMAND ---------- + +USE CATALOG hive_metastore + +-- COMMAND ---------- + DESCRIBE HISTORY employees -- COMMAND ---------- SELECT * -FROM employees VERSION AS OF 1 +FROM employees VERSION AS OF 4 -- COMMAND ---------- -SELECT * FROM employees@v1 +SELECT * FROM employees@v4 -- COMMAND ---------- @@ -20,7 +30,7 @@ SELECT * FROM employees -- COMMAND ---------- -RESTORE TABLE employees TO VERSION AS OF 2 +RESTORE TABLE employees TO VERSION AS OF 5 -- COMMAND ---------- @@ -32,6 +42,12 @@ DESCRIBE HISTORY employees -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ## OPTIMIZE Command + +-- COMMAND ---------- + DESCRIBE DETAIL employees -- COMMAND ---------- @@ -53,6 +69,12 @@ DESCRIBE HISTORY employees -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ## VACUUM Command + +-- COMMAND ---------- + VACUUM employees -- COMMAND ---------- @@ -81,16 +103,18 @@ SELECT * FROM employees@v1 -- COMMAND ---------- -DROP TABLE employees +-- MAGIC %md +-- MAGIC +-- MAGIC ## Dropping Tables -- COMMAND ---------- -SELECT * FROM employees +DROP TABLE employees -- COMMAND ---------- --- MAGIC %fs ls 'dbfs:/user/hive/warehouse/employees' +SELECT * FROM employees -- COMMAND ---------- - +-- MAGIC %fs ls 'dbfs:/user/hive/warehouse/employees' diff --git a/1- Databricks Lakehouse Platform/1.4 - Databases and Tables on Databricks.sql b/1- Databricks Lakehouse Platform/1.4 - Databases and Tables on Databricks.sql index 3bdd7ad..0c4eb86 100644 --- a/1- Databricks Lakehouse Platform/1.4 - Databases and Tables on Databricks.sql +++ b/1- Databricks Lakehouse Platform/1.4 - Databases and Tables on Databricks.sql @@ -1,4 +1,11 @@ -- Databricks notebook source +-- MAGIC %md +-- MAGIC ## Managed Tables + +-- COMMAND ---------- + +USE CATALOG hive_metastore; + CREATE TABLE managed_default (width INT, length INT, height INT); @@ -11,6 +18,12 @@ DESCRIBE EXTENDED managed_default -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ## External Tables + +-- COMMAND ---------- + CREATE TABLE external_default (width INT, length INT, height INT) LOCATION 'dbfs:/mnt/demo/external_default'; @@ -24,6 +37,12 @@ DESCRIBE EXTENDED external_default -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ## Dropping Tables + +-- COMMAND ---------- + DROP TABLE managed_default -- COMMAND ---------- @@ -40,6 +59,11 @@ DROP TABLE external_default -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Creating Schemas + +-- COMMAND ---------- + CREATE SCHEMA new_default -- COMMAND ---------- @@ -80,7 +104,7 @@ DROP TABLE external_new_default; -- COMMAND ---------- --- MAGIC %fs ls 'dbfs:/user/hive/warehouse/managed_new_default' +-- MAGIC %fs ls 'dbfs:/user/hive/warehouse/new_default.db/managed_new_default' -- COMMAND ---------- @@ -88,6 +112,11 @@ DROP TABLE external_new_default; -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Creating Schemas in Custom Location + +-- COMMAND ---------- + CREATE SCHEMA custom LOCATION 'dbfs:/Shared/schemas/custom.db' @@ -134,7 +163,3 @@ DROP TABLE external_custom; -- COMMAND ---------- -- MAGIC %fs ls 'dbfs:/mnt/demo/external_custom' - --- COMMAND ---------- - - diff --git a/1- Databricks Lakehouse Platform/1.5A - Views.sql b/1- Databricks Lakehouse Platform/1.5A - Views.sql index 16d569e..165aceb 100644 --- a/1- Databricks Lakehouse Platform/1.5A - Views.sql +++ b/1- Databricks Lakehouse Platform/1.5A - Views.sql @@ -1,4 +1,11 @@ -- Databricks notebook source +-- MAGIC %md +-- MAGIC ## Preparing Sample Data + +-- COMMAND ---------- + +USE CATALOG hive_metastore; + CREATE TABLE IF NOT EXISTS smartphones (id INT, name STRING, brand STRING, year INT); @@ -20,6 +27,11 @@ SHOW TABLES -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Creating Stored Views + +-- COMMAND ---------- + CREATE VIEW view_apple_phones AS SELECT * FROM smartphones @@ -35,6 +47,12 @@ SHOW TABLES; -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ## Creating Temporary Views + +-- COMMAND ---------- + CREATE TEMP VIEW temp_view_phones_brands AS SELECT DISTINCT brand FROM smartphones; @@ -47,6 +65,12 @@ SHOW TABLES; -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ## Creating Global Temporary Views + +-- COMMAND ---------- + CREATE GLOBAL TEMP VIEW global_temp_view_latest_phones AS SELECT * FROM smartphones WHERE year > 2020 @@ -67,7 +91,3 @@ SHOW TABLES IN global_temp; -- COMMAND ---------- SHOW TABLES - --- COMMAND ---------- - - diff --git a/1- Databricks Lakehouse Platform/1.5B - Views (Session 2).sql b/1- Databricks Lakehouse Platform/1.5B - Views (Session 2).sql index bcb8b4e..e45e864 100644 --- a/1- Databricks Lakehouse Platform/1.5B - Views (Session 2).sql +++ b/1- Databricks Lakehouse Platform/1.5B - Views (Session 2).sql @@ -1,4 +1,8 @@ -- Databricks notebook source +USE CATALOG hive_metastore; + +-- COMMAND ---------- + SHOW TABLES; -- COMMAND ---------- @@ -11,6 +15,12 @@ SELECT * FROM global_temp.global_temp_view_latest_phones; -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ## Dropping Views + +-- COMMAND ---------- + DROP TABLE smartphones; DROP VIEW view_apple_phones; diff --git a/2- ELT with Spark SQL and Python/2.1 - Querying Files.sql b/2- ELT with Spark SQL and Python/2.1 - Querying Files.sql index 7d26b9d..dc26764 100644 --- a/2- ELT with Spark SQL and Python/2.1 - Querying Files.sql +++ b/2- ELT with Spark SQL and Python/2.1 - Querying Files.sql @@ -1,12 +1,17 @@ -- Databricks notebook source -- MAGIC %md-sandbox --- MAGIC +-- MAGIC -- MAGIC
--- MAGIC Databricks Learning +-- MAGIC Databricks Learning -- MAGIC
-- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Querying JSON + +-- COMMAND ---------- + -- MAGIC %run ../Includes/Copy-Datasets -- COMMAND ---------- @@ -39,14 +44,30 @@ SELECT count(*) FROM json.`${dataset.bookstore}/customers-json` -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Querying text Format + +-- COMMAND ---------- + SELECT * FROM text.`${dataset.bookstore}/customers-json` -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Querying binaryFile Format + +-- COMMAND ---------- + SELECT * FROM binaryFile.`${dataset.bookstore}/customers-json` -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ## Querying CSV + +-- COMMAND ---------- + SELECT * FROM csv.`${dataset.bookstore}/books-csv` -- COMMAND ---------- @@ -66,6 +87,12 @@ SELECT * FROM books_csv -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ## Limitations of Non-Delta Tables + +-- COMMAND ---------- + DESCRIBE EXTENDED books_csv -- COMMAND ---------- @@ -106,6 +133,11 @@ SELECT COUNT(*) FROM books_csv -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## CTAS Statements + +-- COMMAND ---------- + CREATE TABLE customers AS SELECT * FROM json.`${dataset.bookstore}/customers-json`; @@ -137,7 +169,3 @@ SELECT * FROM books -- COMMAND ---------- DESCRIBE EXTENDED books - --- COMMAND ---------- - - diff --git a/2- ELT with Spark SQL and Python/2.2 - Writing to Tables.sql b/2- ELT with Spark SQL and Python/2.2 - Writing to Tables.sql index 91f3a23..ed17646 100644 --- a/2- ELT with Spark SQL and Python/2.2 - Writing to Tables.sql +++ b/2- ELT with Spark SQL and Python/2.2 - Writing to Tables.sql @@ -1,8 +1,8 @@ -- Databricks notebook source -- MAGIC %md-sandbox --- MAGIC +-- MAGIC -- MAGIC
--- MAGIC Databricks Learning +-- MAGIC Databricks Learning -- MAGIC
-- COMMAND ---------- @@ -20,6 +20,11 @@ SELECT * FROM orders -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Overwriting Tables + +-- COMMAND ---------- + CREATE OR REPLACE TABLE orders AS SELECT * FROM parquet.`${dataset.bookstore}/orders` @@ -43,6 +48,11 @@ SELECT *, current_timestamp() FROM parquet.`${dataset.bookstore}/orders` -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Appending Data + +-- COMMAND ---------- + INSERT INTO orders SELECT * FROM parquet.`${dataset.bookstore}/orders-new` @@ -52,6 +62,11 @@ SELECT count(*) FROM orders -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Merging Data + +-- COMMAND ---------- + CREATE OR REPLACE TEMP VIEW customers_updates AS SELECT * FROM json.`${dataset.bookstore}/customers-json-new`; diff --git a/2- ELT with Spark SQL and Python/2.3 - Advanced Transformations.sql b/2- ELT with Spark SQL and Python/2.3 - Advanced Transformations.sql index 27a0baa..d611285 100644 --- a/2- ELT with Spark SQL and Python/2.3 - Advanced Transformations.sql +++ b/2- ELT with Spark SQL and Python/2.3 - Advanced Transformations.sql @@ -1,8 +1,8 @@ -- Databricks notebook source -- MAGIC %md-sandbox --- MAGIC +-- MAGIC -- MAGIC
--- MAGIC Databricks Learning +-- MAGIC Databricks Learning -- MAGIC
-- COMMAND ---------- @@ -11,6 +11,12 @@ -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ## Parsing JSON Data + +-- COMMAND ---------- + SELECT * FROM customers -- COMMAND ---------- @@ -65,11 +71,21 @@ FROM orders -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Explode Function + +-- COMMAND ---------- + SELECT order_id, customer_id, explode(books) AS book FROM orders -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Collecting Rows + +-- COMMAND ---------- + SELECT customer_id, collect_set(order_id) AS orders_set, collect_set(books.book_id) AS books_set @@ -78,6 +94,12 @@ GROUP BY customer_id -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ##Flatten Arrays + +-- COMMAND ---------- + SELECT customer_id, collect_set(books.book_id) As before_flatten, array_distinct(flatten(collect_set(books.book_id))) AS after_flatten @@ -86,6 +108,12 @@ GROUP BY customer_id -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ##Join Operations + +-- COMMAND ---------- + CREATE OR REPLACE VIEW orders_enriched AS SELECT * FROM ( @@ -98,6 +126,11 @@ SELECT * FROM orders_enriched -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Set Operations + +-- COMMAND ---------- + CREATE OR REPLACE TEMP VIEW orders_updates AS SELECT * FROM parquet.`${dataset.bookstore}/orders-new`; @@ -119,6 +152,11 @@ SELECT * FROM orders_updates -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Reshaping Data with Pivot + +-- COMMAND ---------- + CREATE OR REPLACE TABLE transactions AS SELECT * FROM ( @@ -135,36 +173,3 @@ SELECT * FROM ( ); SELECT * FROM transactions - --- COMMAND ---------- - -SELECT - order_id, - books, - FILTER (books, i -> i.quantity >= 2) AS many_books -FROM orders - --- COMMAND ---------- - -SELECT order_id, many_books -FROM ( - SELECT - order_id, - FILTER (books, i -> i.quantity >= 2) AS many_books - FROM orders) -WHERE size(many_books) > 0; - --- COMMAND ---------- - -SELECT - order_id, - books, - TRANSFORM ( - books, - k -> CAST(k.subtotal * 0.8 AS INT) - ) AS subtotal_after_discount -FROM orders; - --- COMMAND ---------- - - diff --git a/2- ELT with Spark SQL and Python/2.4 - Higher Order Functions and SQL UDFs.sql b/2- ELT with Spark SQL and Python/2.4 - Higher Order Functions and SQL UDFs.sql index 620fcf7..f0909d1 100644 --- a/2- ELT with Spark SQL and Python/2.4 - Higher Order Functions and SQL UDFs.sql +++ b/2- ELT with Spark SQL and Python/2.4 - Higher Order Functions and SQL UDFs.sql @@ -1,8 +1,8 @@ -- Databricks notebook source -- MAGIC %md-sandbox --- MAGIC +-- MAGIC -- MAGIC
--- MAGIC Databricks Learning +-- MAGIC Databricks Learning -- MAGIC
-- COMMAND ---------- @@ -15,6 +15,12 @@ SELECT * FROM orders -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ## Filtering Arrays + +-- COMMAND ---------- + SELECT order_id, books, @@ -33,6 +39,12 @@ WHERE size(multiple_copies) > 0; -- COMMAND ---------- +-- MAGIC %md +-- MAGIC +-- MAGIC ## Transforming Arrays + +-- COMMAND ---------- + SELECT order_id, books, @@ -44,6 +56,11 @@ FROM orders; -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## User Defined Functions (UDF) + +-- COMMAND ---------- + CREATE OR REPLACE FUNCTION get_url(email STRING) RETURNS STRING @@ -82,7 +99,3 @@ FROM customers DROP FUNCTION get_url; DROP FUNCTION site_type; - --- COMMAND ---------- - - diff --git a/3- Incremental Data Processing/3.1 - Structured Streaming.py b/3- Incremental Data Processing/3.1 - Structured Streaming.py index 91e8c07..b341907 100644 --- a/3- Incremental Data Processing/3.1 - Structured Streaming.py +++ b/3- Incremental Data Processing/3.1 - Structured Streaming.py @@ -1,8 +1,8 @@ # Databricks notebook source # MAGIC %md-sandbox -# MAGIC +# MAGIC # MAGIC
-# MAGIC Databricks Learning +# MAGIC Databricks Learning # MAGIC
# COMMAND ---------- @@ -11,6 +11,12 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC +# MAGIC ## Reading Stream + +# COMMAND ---------- + (spark.readStream .table("books") .createOrReplaceTempView("books_streaming_tmp_vw") @@ -18,11 +24,22 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC +# MAGIC ## Displaying Streaming Data + +# COMMAND ---------- + # MAGIC %sql # MAGIC SELECT * FROM books_streaming_tmp_vw # COMMAND ---------- +# MAGIC %md +# MAGIC ## Applying Transformations + +# COMMAND ---------- + # MAGIC %sql # MAGIC SELECT author, count(book_id) AS total_books # MAGIC FROM books_streaming_tmp_vw @@ -30,6 +47,12 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC +# MAGIC ## Unsupported Operations + +# COMMAND ---------- + # MAGIC %sql # MAGIC SELECT * # MAGIC FROM books_streaming_tmp_vw @@ -37,6 +60,12 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC +# MAGIC ## Persisting Streaming Data + +# COMMAND ---------- + # MAGIC %sql # MAGIC CREATE OR REPLACE TEMP VIEW author_counts_tmp_vw AS ( # MAGIC SELECT author, count(book_id) AS total_books @@ -62,6 +91,11 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC ## Adding New Data + +# COMMAND ---------- + # MAGIC %sql # MAGIC INSERT INTO books # MAGIC values ("B19", "Introduction to Modeling and Simulation", "Mark W. Spong", "Computer Science", 25), @@ -70,6 +104,11 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC ## Streaming in Batch Mode + +# COMMAND ---------- + # MAGIC %sql # MAGIC INSERT INTO books # MAGIC values ("B16", "Hands-On Deep Learning Algorithms with Python", "Sudharsan Ravichandiran", "Computer Science", 25), @@ -92,7 +131,3 @@ # MAGIC %sql # MAGIC SELECT * # MAGIC FROM author_counts - -# COMMAND ---------- - - diff --git a/3- Incremental Data Processing/3.2 - Auto Loader.py b/3- Incremental Data Processing/3.2 - Auto Loader.py index 1c6f1bb..c669291 100644 --- a/3- Incremental Data Processing/3.2 - Auto Loader.py +++ b/3- Incremental Data Processing/3.2 - Auto Loader.py @@ -1,8 +1,8 @@ # Databricks notebook source # MAGIC %md-sandbox -# MAGIC +# MAGIC # MAGIC
-# MAGIC Databricks Learning +# MAGIC Databricks Learning # MAGIC
# COMMAND ---------- @@ -11,11 +11,23 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC +# MAGIC ## Exploring The Source Directory + +# COMMAND ---------- + files = dbutils.fs.ls(f"{dataset_bookstore}/orders-raw") display(files) # COMMAND ---------- +# MAGIC %md +# MAGIC +# MAGIC ## Auto Loader + +# COMMAND ---------- + (spark.readStream .format("cloudFiles") .option("cloudFiles.format", "parquet") @@ -38,6 +50,12 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC +# MAGIC ## Landing New Files + +# COMMAND ---------- + load_new_data() # COMMAND ---------- @@ -52,18 +70,26 @@ # COMMAND ---------- -# MAGIC %sql -# MAGIC DESCRIBE HISTORY orders_updates +# MAGIC %md +# MAGIC +# MAGIC ## Exploring Table History # COMMAND ---------- # MAGIC %sql -# MAGIC DROP TABLE orders_updates +# MAGIC DESCRIBE HISTORY orders_updates # COMMAND ---------- -dbutils.fs.rm("dbfs:/mnt/demo/orders_checkpoint", True) +# MAGIC %md +# MAGIC +# MAGIC ## Cleaning Up # COMMAND ---------- +# MAGIC %sql +# MAGIC DROP TABLE orders_updates + +# COMMAND ---------- +dbutils.fs.rm("dbfs:/mnt/demo/orders_checkpoint", True) diff --git a/3- Incremental Data Processing/3.3 - Multi-Hop Architecture.py b/3- Incremental Data Processing/3.3 - Multi-Hop Architecture.py index 15777c4..772cd53 100644 --- a/3- Incremental Data Processing/3.3 - Multi-Hop Architecture.py +++ b/3- Incremental Data Processing/3.3 - Multi-Hop Architecture.py @@ -1,8 +1,8 @@ # Databricks notebook source # MAGIC %md-sandbox -# MAGIC +# MAGIC # MAGIC
-# MAGIC Databricks Learning +# MAGIC Databricks Learning # MAGIC
# COMMAND ---------- @@ -11,11 +11,23 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC +# MAGIC ## Exploring The Source dDirectory + +# COMMAND ---------- + files = dbutils.fs.ls(f"{dataset_bookstore}/orders-raw") display(files) # COMMAND ---------- +# MAGIC %md +# MAGIC +# MAGIC ## Auto Loader + +# COMMAND ---------- + (spark.readStream .format("cloudFiles") .option("cloudFiles.format", "parquet") @@ -25,6 +37,12 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC +# MAGIC ## Enriching Raw Data + +# COMMAND ---------- + # MAGIC %sql # MAGIC CREATE OR REPLACE TEMPORARY VIEW orders_tmp AS ( # MAGIC SELECT *, current_timestamp() arrival_time, input_file_name() source_file @@ -38,6 +56,11 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC ## Creating Bronze Table + +# COMMAND ---------- + (spark.table("orders_tmp") .writeStream .format("delta") @@ -56,6 +79,12 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC +# MAGIC #### Creating Static Lookup Table + +# COMMAND ---------- + (spark.read .format("json") .load(f"{dataset_bookstore}/customers-json") @@ -68,6 +97,11 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC ## Creating Silver Table + +# COMMAND ---------- + (spark.readStream .table("orders_bronze") .createOrReplaceTempView("orders_bronze_tmp")) @@ -108,6 +142,11 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC ## Creating Gold Table + +# COMMAND ---------- + (spark.readStream .table("orders_silver") .createOrReplaceTempView("orders_silver_tmp")) @@ -142,11 +181,13 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC +# MAGIC ## Stopping active streams + +# COMMAND ---------- + for s in spark.streams.active: print("Stopping stream: " + s.id) s.stop() s.awaitTermination() - -# COMMAND ---------- - - diff --git a/4- Production Pipelines/4.1 - Delta Live Tables.sql b/4- Production Pipelines/4.1 - Delta Live Tables.sql index 1968c3b..6db0b5e 100644 --- a/4- Production Pipelines/4.1 - Delta Live Tables.sql +++ b/4- Production Pipelines/4.1 - Delta Live Tables.sql @@ -1,19 +1,23 @@ -- Databricks notebook source -- MAGIC %md --- MAGIC --- MAGIC +-- MAGIC +-- MAGIC -- MAGIC # Delta Live Tables -- COMMAND ---------- -- MAGIC %md-sandbox --- MAGIC +-- MAGIC -- MAGIC
--- MAGIC Databricks Learning +-- MAGIC Databricks Learning -- MAGIC
-- COMMAND ---------- +SET datasets.path=dbfs:/mnt/demo-datasets/bookstore; + +-- COMMAND ---------- + -- MAGIC %md -- MAGIC ## Bronze Layer Tables @@ -26,7 +30,7 @@ CREATE OR REFRESH STREAMING LIVE TABLE orders_raw COMMENT "The raw books orders, ingested from orders-raw" -AS SELECT * FROM cloud_files("${datasets_path}/orders-json-raw", "json", +AS SELECT * FROM cloud_files("${datasets.path}/orders-json-raw", "json", map("cloudFiles.inferColumnTypes", "true")) -- COMMAND ---------- @@ -38,16 +42,16 @@ AS SELECT * FROM cloud_files("${datasets_path}/orders-json-raw", "json", CREATE OR REFRESH LIVE TABLE customers COMMENT "The customers lookup table, ingested from customers-json" -AS SELECT * FROM json.`${datasets_path}/customers-json` +AS SELECT * FROM json.`${datasets.path}/customers-json` -- COMMAND ---------- -- MAGIC %md --- MAGIC --- MAGIC --- MAGIC +-- MAGIC +-- MAGIC +-- MAGIC -- MAGIC ## Silver Layer Tables --- MAGIC +-- MAGIC -- MAGIC #### orders_cleaned -- COMMAND ---------- @@ -68,7 +72,7 @@ AS -- MAGIC %md -- MAGIC >> Constraint violation --- MAGIC +-- MAGIC -- MAGIC | **`ON VIOLATION`** | Behavior | -- MAGIC | --- | --- | -- MAGIC | **`DROP ROW`** | Discard records that violate constraints | @@ -78,8 +82,8 @@ AS -- COMMAND ---------- -- MAGIC %md --- MAGIC --- MAGIC +-- MAGIC +-- MAGIC -- MAGIC ## Gold Tables -- COMMAND ---------- diff --git a/4- Production Pipelines/4.1.2 - Books Pipeline.sql b/4- Production Pipelines/4.1.2 - Books Pipeline.sql index 79b0e38..a61af84 100644 --- a/4- Production Pipelines/4.1.2 - Books Pipeline.sql +++ b/4- Production Pipelines/4.1.2 - Books Pipeline.sql @@ -1,22 +1,26 @@ -- Databricks notebook source +SET datasets.path=dbfs:/mnt/demo-datasets/bookstore; + +-- COMMAND ---------- + -- MAGIC %md --- MAGIC --- MAGIC --- MAGIC +-- MAGIC +-- MAGIC +-- MAGIC -- MAGIC ## Bronze Layer Tables -- COMMAND ---------- CREATE OR REFRESH STREAMING LIVE TABLE books_bronze COMMENT "The raw books data, ingested from CDC feed" -AS SELECT * FROM cloud_files("${datasets_path}/books-cdc", "json") +AS SELECT * FROM cloud_files("${datasets.path}/books-cdc", "json") -- COMMAND ---------- -- MAGIC %md --- MAGIC --- MAGIC --- MAGIC +-- MAGIC +-- MAGIC +-- MAGIC -- MAGIC ## Silver Layer Tables -- COMMAND ---------- @@ -33,8 +37,8 @@ APPLY CHANGES INTO LIVE.books_silver -- COMMAND ---------- -- MAGIC %md --- MAGIC --- MAGIC +-- MAGIC +-- MAGIC -- MAGIC ## Gold Layer Tables -- COMMAND ---------- diff --git a/4- Production Pipelines/4.2 - Pipeline Results.py b/4- Production Pipelines/4.2 - Pipeline Results.py index b53935b..09b7144 100644 --- a/4- Production Pipelines/4.2 - Pipeline Results.py +++ b/4- Production Pipelines/4.2 - Pipeline Results.py @@ -20,9 +20,9 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC SELECT * FROM demo_bookstore_dlt_db.cn_daily_customer_books +# MAGIC SELECT * FROM hive_metastore.demo_bookstore_dlt_db.cn_daily_customer_books # COMMAND ---------- # MAGIC %sql -# MAGIC SELECT * FROM demo_bookstore_dlt_db.fr_daily_customer_books +# MAGIC SELECT * FROM hive_metastore.demo_bookstore_dlt_db.fr_daily_customer_books diff --git a/4- Production Pipelines/4.3 - Land New Data Task.py b/4- Production Pipelines/4.3 - Land New Data Task.py index be10bfb..404285e 100644 --- a/4- Production Pipelines/4.3 - Land New Data Task.py +++ b/4- Production Pipelines/4.3 - Land New Data Task.py @@ -4,3 +4,8 @@ # COMMAND ---------- load_new_json_data() + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC SELECT * from json.`${dataset.bookstore}/books-cdc/02.json` diff --git a/5- Data Governance/5.1 - Managing Permissions.sql b/5- Data Governance/5.1 - Managing Permissions.sql new file mode 100644 index 0000000..db643d4 --- /dev/null +++ b/5- Data Governance/5.1 - Managing Permissions.sql @@ -0,0 +1,28 @@ +CREATE DATABASE IF NOT EXISTS hive_metastore.hr_db +LOCATION 'dbfs:/mnt/demo/hr_db.db'; + +CREATE TABLE hive_metastore.hr_db.employees (id INT, name STRING, salary DOUBLE, city STRING); + +INSERT INTO hive_metastore.hr_db.employees +VALUES (1, "Anna", 2500, "Paris"), + (2, "Thomas", 3000, "London"), + (3, "Bilal", 3500, "Paris"), + (4, "Maya", 2000, "Paris"), + (5, "Sophie", 2500, "London"), + (6, "Adam", 3500, "London"), + (7, "Ali", 3000, "Paris"); + +CREATE VIEW hive_metastore.hr_db.paris_emplyees_vw +AS SELECT * FROM hive_metastore.hr_db.employees WHERE city = 'Paris'; + +------------------------------------------------------ + +GRANT SELECT, MODIFY, READ_METADATA, CREATE ON SCHEMA hive_metastore.hr_db TO hr_team; + +GRANT USAGE ON SCHEMA hive_metastore.hr_db TO hr_team; + +GRANT SELECT ON VIEW hive_metastore.hr_db.paris_emplyees_vw TO `adam@derar.cloud`; + +SHOW GRANTS ON SCHEMA hive_metastore.hr_db; + +SHOW GRANTS ON VIEW hive_metastore.hr_db.paris_emplyees_vw; diff --git a/Includes/Copy-Datasets.py b/Includes/Copy-Datasets.py index 44dcb8e..ec6ef9f 100644 --- a/Includes/Copy-Datasets.py +++ b/Includes/Copy-Datasets.py @@ -23,9 +23,12 @@ def download_dataset(source, target): # COMMAND ---------- -data_source_uri = "wasbs://course-resources@dalhussein.blob.core.windows.net/datasets/bookstore/v1/" +data_source_uri = "s3://dalhussein-courses/datasets/bookstore/v1/" dataset_bookstore = 'dbfs:/mnt/demo-datasets/bookstore' +data_catalog = 'hive_metastore' spark.conf.set(f"dataset.bookstore", dataset_bookstore) +spark.conf.set("fs.s3a.endpoint", "s3.eu-west-3.amazonaws.com") +spark.conf.set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider") # COMMAND ---------- @@ -39,6 +42,11 @@ def get_index(dir): # COMMAND ---------- +def set_current_catalog(catalog_name): + spark.sql(f"USE CATALOG {catalog_name}") + +# COMMAND ---------- + # Structured Streaming streaming_dir = f"{dataset_bookstore}/orders-streaming" raw_dir = f"{dataset_bookstore}/orders-raw" @@ -95,3 +103,4 @@ def load_new_json_data(all=False): # COMMAND ---------- download_dataset(data_source_uri, dataset_bookstore) +set_current_catalog(data_catalog) diff --git a/Includes/images/bookstore_schema.png b/Includes/images/bookstore_schema.png new file mode 100644 index 0000000..47c66c7 Binary files /dev/null and b/Includes/images/bookstore_schema.png differ diff --git a/Labs/1- Databricks Lakehouse Platform/1.0L - Creating Clusters.py b/Labs/1- Databricks Lakehouse Platform/1.0L - Creating Clusters.py new file mode 100644 index 0000000..1c9d89c --- /dev/null +++ b/Labs/1- Databricks Lakehouse Platform/1.0L - Creating Clusters.py @@ -0,0 +1,21 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC +# MAGIC ## Lab: Creating Clusters + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q1 - Creating a Demo Cluster +# MAGIC +# MAGIC Create a cluster with the following configurations: +# MAGIC +# MAGIC | Setting | Instructions | +# MAGIC |--|--| +# MAGIC |Cluster name|**Demo Cluster**| +# MAGIC |Cluster mode|**Signle node**| +# MAGIC |Runtime version|Select the Databricks runtime version 13.3 LTS| +# MAGIC |Photon Acceleration| Uncheck the option | +# MAGIC |Node type|4 cores| +# MAGIC |Auto termination|30 minutes| +# MAGIC diff --git a/Labs/1- Databricks Lakehouse Platform/1.1L - Notebook Basics.py b/Labs/1- Databricks Lakehouse Platform/1.1L - Notebook Basics.py new file mode 100644 index 0000000..0531ce7 --- /dev/null +++ b/Labs/1- Databricks Lakehouse Platform/1.1L - Notebook Basics.py @@ -0,0 +1,98 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC +# MAGIC ## Lab: Get started with Databricks Notebook +# MAGIC + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q1 - Renaming the Notebook +# MAGIC +# MAGIC Change the name of the current notebook to "1.1L - My first lab" + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q2 - Attaching a cluster +# MAGIC +# MAGIC Attach the cluster you created previously to this notebook + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q3 - Execute a Python code +# MAGIC +# MAGIC In the below cell, fill in the blank to print the result of adding the two variables x and y + +# COMMAND ---------- + +x = 5 +y = 10 +result = x + y + +-------------------- + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q4 - Execute a SQL cell +# MAGIC +# MAGIC Change the language in the below cell to execute the SQL statement + +# COMMAND ---------- + +_____________________ + +SELECT 5 + 10 + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q5 - Create a Markdown Cell +# MAGIC +# MAGIC 1. Insert a new cell below this one +# MAGIC 1. In the new cell, add Markdown with a header and bullet points as shown in the following image +# MAGIC +# MAGIC
+# MAGIC +# MAGIC
+ +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q6 - Using %run command +# MAGIC +# MAGIC 1. Create a new Python notebook named **helper** in the current directory (i.e., in the **labs/1- Databricks Lakehouse Platform** folder) +# MAGIC 1. In the **helper** notebook, declare a variable called **my_country** and assign it a value of your country name. For example: +# MAGIC > my_country = "France" +# MAGIC 1. In the following cell, execute a %run command to include the **helper** notebook into this current notebook +# MAGIC > **Hint**: use a dot (**.**) to refer to the current directory + +# COMMAND ---------- + +-------------------- + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Now, run the following cell to test that you are able to print the **my_country** variable + +# COMMAND ---------- + +print(my_country) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q7 - Functions definition +# MAGIC 1. In the **helper** notebook, define a Python function named **addition()** that print the sum of two numbers +# MAGIC 1. Execute again the above %run command to take the function definition into account +# MAGIC 1. Run the below cell to call the function + +# COMMAND ---------- + +num1 = 20 +num2 = 30 + +addition(num1, num2) diff --git a/Labs/1- Databricks Lakehouse Platform/1.2L - Delta Lake.sql b/Labs/1- Databricks Lakehouse Platform/1.2L - Delta Lake.sql new file mode 100644 index 0000000..2aae47a --- /dev/null +++ b/Labs/1- Databricks Lakehouse Platform/1.2L - Delta Lake.sql @@ -0,0 +1,97 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC +-- MAGIC ## Lab: Delta Lake + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Creating a Delta Table +-- MAGIC +-- MAGIC Run the cell below to create the **persons** Delta Table, and apply some operations on it. + +-- COMMAND ---------- + +USE CATALOG hive_metastore; + +CREATE OR REPLACE TABLE persons + (id INT, name STRING, age INT); + +INSERT INTO persons +VALUES + (1, "Tom", 18), + (2, "Kumar", 25); + +INSERT INTO persons +VALUES + (3, "Ali", 50), + (4, "Sandra", 35); + +INSERT INTO persons +VALUES + (5, "Eric", 28), + (6, "Salma", 42); + +UPDATE persons +SET age = age + 10 +WHERE id = 1; + +DELETE FROM persons +WHERE name = "Eric"; + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q1 - Querying Delta Lake table +-- MAGIC +-- MAGIC Query the data in the **persons** table using **SELECT** statement + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q2 - Checking table history +-- MAGIC +-- MAGIC Review the history of the table transactions using the **DESCRIBE HISTORY** command + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q3- Checking table metadata +-- MAGIC +-- MAGIC Review the basic metadata information of the table using the **DESCRIBE DETAIL** command + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q4- Exploring table directory +-- MAGIC +-- MAGIC Explore the table directory using the **%fs** magic command. +-- MAGIC +-- MAGIC >**Hint:** get the table location from the above metadata information + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q5- Exploring the transactions log +-- MAGIC +-- MAGIC Explore the **_delta_log** subfolder in the table directory + +-- COMMAND ---------- + +-------------------- diff --git a/Labs/1- Databricks Lakehouse Platform/1.3L - Databases and Tables on Databricks.sql b/Labs/1- Databricks Lakehouse Platform/1.3L - Databases and Tables on Databricks.sql new file mode 100644 index 0000000..883814a --- /dev/null +++ b/Labs/1- Databricks Lakehouse Platform/1.3L - Databases and Tables on Databricks.sql @@ -0,0 +1,190 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC ## Lab: Databases and Tables on Databricks + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Setting the default catalog +-- MAGIC +-- MAGIC Run the cell below to set the current catalog to **hive_metastore** + +-- COMMAND ---------- + +USE CATALOG hive_metastore; + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q1 - Creating managed table +-- MAGIC +-- MAGIC In the default database, create a managed table named **movies_managed** that has the following schema: +-- MAGIC +-- MAGIC +-- MAGIC | Column Name | Column Type | +-- MAGIC | --- | --- | +-- MAGIC | title | STRING | +-- MAGIC | category | STRING | +-- MAGIC | length | FLOAT | +-- MAGIC | release_date | DATE | + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Review the extended metadata information of the table, and verify that: +-- MAGIC 1. The table type is Managed +-- MAGIC 1. The table is located under the default hive directory + +-- COMMAND ---------- + +DESCRIBE EXTENDED movies_managed + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q2 - Creating external table +-- MAGIC +-- MAGIC In the default database, create an external Delta table named **actors_external**, and located under the directory: +-- MAGIC **dbfs:/mnt/demo/actors_external** +-- MAGIC +-- MAGIC The schema for the table: +-- MAGIC +-- MAGIC | Column Name | Column Type | +-- MAGIC | --- | --- | +-- MAGIC | actor_id | INT | +-- MAGIC | name | STRING | +-- MAGIC | nationality | STRING | + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q4- Checking table metadata +-- MAGIC +-- MAGIC Review the extended metadata information of the table, and verify that: +-- MAGIC 1. The table type is External +-- MAGIC 1. The table is located under the directory: **dbfs:/mnt/demo/actors_external** + +-- COMMAND ---------- + +DESCRIBE EXTENDED actors_external + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q3- Dropping manged table +-- MAGIC +-- MAGIC Drop the manged table **movies_managed** + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Check that the directory of the managed table has been deleted +-- MAGIC + +-- COMMAND ---------- + +-- MAGIC %fs ls 'dbfs:/user/hive/warehouse/movies_managed' + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q4- Drop external table +-- MAGIC +-- MAGIC Drop the external table **actors_external** + +-- COMMAND ---------- + +DROP TABLE actors_external + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Check that the directory of the external table has **Not** been deleted + +-- COMMAND ---------- + +-- MAGIC %fs ls 'dbfs:/mnt/demo/actors_external' + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q5- Creating new schema +-- MAGIC +-- MAGIC Create a new schema named **db_cinema** + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Review the extended metadata information of the database, and verify that the database is located under the default hive directory. +-- MAGIC +-- MAGIC Note that the database folder has the extenstion **.db** + +-- COMMAND ---------- + +DESCRIBE DATABASE EXTENDED db_cinema + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Use the new schema to create the below **movies** table + +-- COMMAND ---------- + +-------------------- + + +CREATE TABLE movies + (title STRING, category STRING, length INT, release_date DATE); + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q6- Creating new schema in custom location +-- MAGIC +-- MAGIC Create a new schema named **cinema_custom** in the directory: **dbfs:/Shared/schemas/cinema_custom.db** + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Use the new schema to create the below **movies** table + +-- COMMAND ---------- + +USE cinema_custom; + +CREATE TABLE movies + (title STRING, category STRING, length INT, release_date DATE); + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Finally, review the extended metadata information of the table **movies**, and verify that: +-- MAGIC +-- MAGIC 1. The table type is Managed +-- MAGIC 1. The table is located in the new database defined under the custom location + +-- COMMAND ---------- + +DESCRIBE EXTENDED movies diff --git a/Labs/2- ELT with Spark SQL and Python/2.1L - Querying Files.sql b/Labs/2- ELT with Spark SQL and Python/2.1L - Querying Files.sql new file mode 100644 index 0000000..49b5942 --- /dev/null +++ b/Labs/2- ELT with Spark SQL and Python/2.1L - Querying Files.sql @@ -0,0 +1,118 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC +-- MAGIC ## Lab: Querying Files + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Run the following cell to setup the lab environment + +-- COMMAND ---------- + +-- MAGIC %run ../Includes/Setup-Lab + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q1- Extracting data directly from Parquet files +-- MAGIC +-- MAGIC Use a SELECT statement to directly query the content of the Parquet files in the directory **${dataset.school}/enrollments** + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Use the above SELECT query in a CTAS statement to create the table **enrollments** + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Run the below cell to ensure data was written as expected in the **enrollments** table + +-- COMMAND ---------- + +SELECT * FROM enrollments + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q2- Registering Tables from JSON Files +-- MAGIC +-- MAGIC Use CTAS statement to create the table **students** from the json files in the directory: **${dataset.school}/students-json** +-- MAGIC + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Run the below cell to ensure data was written as expected in the **students** table + +-- COMMAND ---------- + +SELECT * FROM students + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q3- Registering Tables from CSV Files +-- MAGIC +-- MAGIC Create the temporary view **courses_tmp_vw** from the csv files in the directory: **${dataset.school}/courses-csv** +-- MAGIC +-- MAGIC Knowing that: +-- MAGIC * The delimiter is semicolon (**;**) +-- MAGIC * There is a header of column names in each file +-- MAGIC +-- MAGIC The schema for the view: +-- MAGIC +-- MAGIC | Column Name | Column Type | +-- MAGIC | --- | --- | +-- MAGIC | course_id | STRING | +-- MAGIC | title | STRING | +-- MAGIC | instructor | STRING | +-- MAGIC | category | STRING | +-- MAGIC | price | DOUBLE | + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Create the manged table **courses** from the temporary view **courses_tmp_vw** + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Run the below cell to ensure data was written as expected in the **courses** table + +-- COMMAND ---------- + +SELECT * FROM courses + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Finally, review the metadata information of the table **courses**, and verify that the table type is Managed + +-- COMMAND ---------- + +DESCRIBE EXTENDED courses diff --git a/Labs/2- ELT with Spark SQL and Python/2.2L - Advanced ETL.sql b/Labs/2- ELT with Spark SQL and Python/2.2L - Advanced ETL.sql new file mode 100644 index 0000000..186162f --- /dev/null +++ b/Labs/2- ELT with Spark SQL and Python/2.2L - Advanced ETL.sql @@ -0,0 +1,102 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC +-- MAGIC ## Lab: Advanced ETL + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Run the following cell to setup the lab environment + +-- COMMAND ---------- + +-- MAGIC %run ../Includes/Setup-Lab + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q1- Interacting with JSON data +-- MAGIC +-- MAGIC Review the nested data structures of the **profile** column in the **students** table created in the previous lab + +-- COMMAND ---------- + +SELECT email, profile +FROM students + +-- COMMAND ---------- + +DESCRIBE students + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Use the appropriate syntax to access the **last_name** and **city** information from the **profile** column + +-- COMMAND ---------- + +SELECT email, ______________ AS student_surname, ______________ AS student_city +FROM students + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q2- Higher Order Functions +-- MAGIC +-- MAGIC Review the array column **courses** in the **enrollments** table created in the previous lab + +-- COMMAND ---------- + +SELECT enroll_id, courses +FROM enrollments + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Filter this array column to keep only course elements having subtotal greater than 40 + +-- COMMAND ---------- + +SELECT + enroll_id, + courses, + ______________ AS large_totals +FROM enrollments + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q3- SQL UDFs + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Define a UDF function named **get_letter_grade** that takes one parameter named **gpa** of type DOUBLE. It returns the corresponding letter grade as indicated in the following table: +-- MAGIC +-- MAGIC +-- MAGIC +-- MAGIC | GPA (4.0 Scale) | Grade Letter | +-- MAGIC |---------|-----------| +-- MAGIC | 3.50 - 4.0 | A | +-- MAGIC | 2.75 - 3.44 | B | +-- MAGIC | 2.0 - 2.74 | C | +-- MAGIC | 0.0 - 1.99 | F | + +-- COMMAND ---------- + +-------------------- + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Let's apply the above UDF on the **students** table created in the previous lab +-- MAGIC +-- MAGIC Fill in the below query to call the defined UDF on the **gpa** column + +-- COMMAND ---------- + +SELECT student_id, gpa, _______________ as letter_grade +FROM students diff --git a/Labs/3- Incremental Data Processing/3.1L - Spark Structured Streaming.py b/Labs/3- Incremental Data Processing/3.1L - Spark Structured Streaming.py new file mode 100644 index 0000000..1f7ee52 --- /dev/null +++ b/Labs/3- Incremental Data Processing/3.1L - Spark Structured Streaming.py @@ -0,0 +1,113 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC +# MAGIC ## Lab: Spark Structured Streaming + +# COMMAND ---------- + +# MAGIC %md-sandbox +# MAGIC +# MAGIC
+# MAGIC School Schema +# MAGIC
+ +# COMMAND ---------- + +# MAGIC %md +# MAGIC Run the following cell to setup the lab environment + +# COMMAND ---------- + +# MAGIC %run ../Includes/Setup-Lab + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q1- Auto Loader +# MAGIC +# MAGIC Use Auto Loader to incrementally load enrollments json files from the directory **{dataset_school}/enrollments-json-raw** into a streaming view called **`enrollments_tmp_vw`** +# MAGIC + +# COMMAND ---------- + +dataset_source = f"{dataset_school}/enrollments-json-raw" +schema_location = "dbfs:/mnt/DE-Associate/checkpoints/school/enrollments_stats" + + +(spark + .readStream + .___________________ + .___________________ + .____________________ + .load(dataset_source) + .createOrReplaceTempView("enrollments_tmp_vw")) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q2 - Calculating aggregations on streaming data +# MAGIC +# MAGIC Using CTAS syntax, define a new streaming view against **`enrollments_tmp_vw`** to count the number of enrollments per **`student_id`**. Name the aggregated field: **`enrollments_counts`** + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC +# MAGIC CREATE OR REPLACE TEMPORARY VIEW enrollments_per_student_tmp_vw AS +# MAGIC SELECT ___________________ +# MAGIC + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q3 - Writing stream data +# MAGIC +# MAGIC Stream the aggregated data from the **`enrollments_per_student_tmp_vw`** view to a Delta table called **`enrollments_stats`**. + +# COMMAND ---------- + +checkpoint_path = "dbfs:/mnt/DE-Associate/checkpoints/school/enrollments_stats" + +query = (spark._________________ + ._________________ + ._________________ + ._________________ + ._________________ + ) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC +# MAGIC Run the below cell to ensure data was written as expected in the **enrollments_stats** table + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC SELECT * FROM enrollments_stats + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Run the below cell to land a new json file of enrollments data + +# COMMAND ---------- + +load_new_json_data() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Verify that the statistics have been updated in the table **enrollments_stats** + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC SELECT * FROM enrollments_stats + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q4 - Canceling streaming query +# MAGIC +# MAGIC Finally, cancel the above streaming query diff --git a/Labs/3- Incremental Data Processing/3.2L - Multi-Hop Architecture.py b/Labs/3- Incremental Data Processing/3.2L - Multi-Hop Architecture.py new file mode 100644 index 0000000..ef8c23b --- /dev/null +++ b/Labs/3- Incremental Data Processing/3.2L - Multi-Hop Architecture.py @@ -0,0 +1,175 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC +# MAGIC ## Lab: Multi-Hop Architecture + +# COMMAND ---------- + +# MAGIC %md-sandbox +# MAGIC +# MAGIC
+# MAGIC School Schema +# MAGIC
+ +# COMMAND ---------- + +# MAGIC %md +# MAGIC Run the following cell to setup the lab environment + +# COMMAND ---------- + +# MAGIC %run ../Includes/Setup-Lab + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q1- Declaring Bronze Table +# MAGIC +# MAGIC Use Auto Loader to incrementally load enrollments json files from the directory **{dataset_school}/enrollments-json-raw** to a Delta table called **`bronze`** +# MAGIC + +# COMMAND ---------- + +dataset_source = f"{dataset_school}/enrollments-json-raw" +bronze_checkpoint_path = "dbfs:/mnt/DE-Associate/checkpoints/school/bronze" +schema_location = bronze_checkpoint_path + +(spark.readStream + .___________________ + .___________________ + .___________________ + .load(dataset_source) + .writeStream + .___________________ + .___________________ + .table("bronze") +) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Create a streaming temporary view **bronze_tmp** from the bronze table in order to perform transformations using SQL. + +# COMMAND ---------- + +(spark + .readStream + .table("bronze") + .___________________("bronze_tmp")) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q2 - Data Cleansing & Enrichment +# MAGIC +# MAGIC Using CTAS syntax, define a new streaming view **`bronze_cleaned_tmp`** against **`bronze_tmp`** that does the following: +# MAGIC * Remove records with **quantity** of 0 item +# MAGIC * Add a column called **`processing_time`** containing the current timestamp using the **current_timestamp()** function + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC CREATE OR REPLACE TEMPORARY VIEW bronze_cleaned_tmp AS +# MAGIC SELECT ___________________ +# MAGIC + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q3 - Declaring Silver Table +# MAGIC +# MAGIC Stream the data from **`bronze_cleaned_tmp`** to a table called **`silver`**. + +# COMMAND ---------- + +silver_checkpoint_path = "dbfs:/mnt/DE-Associate/checkpoints/school/silver" + +(spark.table("bronze_cleaned_tmp") + .___________________ + .___________________ + .___________________ + .table("silver") +) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Let's create a streaming temporary view from the silver table in order to perform business-level aggregation using SQL + +# COMMAND ---------- + +(spark + .readStream + .table("silver") + .createOrReplaceTempView("silver_tmp")) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q4- Declaring Gold Table +# MAGIC +# MAGIC Using CTAS syntax, define a new streaming view **`enrollments_per_student_tmp_vw`** against **`silver_tmp`** to count the number of enrollments per **`student`**. Name the aggregated field: **`enrollments_count`** + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC CREATE OR REPLACE TEMPORARY VIEW enrollments_per_student_tmp_vw AS +# MAGIC SELECT ___________________ +# MAGIC + +# COMMAND ---------- + +# MAGIC %md +# MAGIC +# MAGIC Stream the aggregated data from the **`enrollments_per_student_tmp_vw`** view to a Delta table called **`gold_enrollments_stats`**. + +# COMMAND ---------- + +gold_checkpoint_path = "dbfs:/mnt/DE-Associate/checkpoints/school/gold_enrollments_stats" + +query = (spark.table("enrollments_per_student_tmp_vw") + .writeStream + .___________________ + .___________________ + .table("gold_enrollments_stats")) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Query the data in the **`gold_enrollments_stats`** table to ensure data was written as expected. + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC SELECT * FROM gold_enrollments_stats + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Run the below cell to land new a json file of enrollments data + +# COMMAND ---------- + +load_new_json_data() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Wait for the new data to be propagated, and then run the below query to verify that the statistics have been updated in the table **gold_enrollments_stats** + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC SELECT * FROM gold_enrollments_stats + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Finally, run the below cell for canceling the above streaming queries + +# COMMAND ---------- + +for s in spark.streams.active: + print("Stopping stream: " + s.id) + s.stop() + s.awaitTermination() diff --git a/Labs/4- Production Pipelines/4.1L - Delta Live Tables.sql b/Labs/4- Production Pipelines/4.1L - Delta Live Tables.sql new file mode 100644 index 0000000..58a845c --- /dev/null +++ b/Labs/4- Production Pipelines/4.1L - Delta Live Tables.sql @@ -0,0 +1,123 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC +-- MAGIC ## Lab: implementing a DLT pipeline +-- MAGIC +-- MAGIC > This notebook is **not intended** to be executed interactively, but rather to be deployed as a DLT pipeline from the **workflows** tab +-- MAGIC +-- MAGIC +-- MAGIC * Help: DLT syntax documentation. + +-- COMMAND ---------- + +-- MAGIC %md-sandbox +-- MAGIC +-- MAGIC
+-- MAGIC School Schema +-- MAGIC
+ +-- COMMAND ---------- + +SET datasets.path=dbfs:/mnt/DE-Associate/datasets/school; + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q1- Declaring Bronze Tables +-- MAGIC +-- MAGIC Declare a streaming live table, **`enrollments_bronze`**, that ingests JSON data incrementally using Auto Loader from the directory **"${datasets.path}/enrollments-json-raw"** + +-- COMMAND ---------- + +CREATE ____________________ +AS SELECT * FROM cloud_files("${datasets.path}/enrollments-json-raw", "json", + map("cloudFiles.inferColumnTypes", "true")) + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Declare a live table, **`students_bronze`**, that load data directly from JSON files in the directory **"${datasets.path}/students-json"** + +-- COMMAND ---------- + +CREATE ____________________ +AS SELECT * FROM json.`${datasets.path}/students-json` + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q2 - Declaring Silver Table +-- MAGIC +-- MAGIC Declare a streaming live table, **`enrollments_cleaned`**, that: +-- MAGIC +-- MAGIC 1. Enrich the **enrollments_bronze** data through an inner join with the **`students_bronze`** table on the common **`student_id`** field to obtain the student's country +-- MAGIC 1. Implement quality control by applying a constraint to drop records with a null **`email`** +-- MAGIC 1. The table will have the following schema: +-- MAGIC +-- MAGIC | Field | Type | +-- MAGIC | --- | --- | +-- MAGIC | **`enroll_id`** | **`STRING`** | +-- MAGIC | **`total`** | **`DOUBLE`** | +-- MAGIC | **`email`** | **`STRING`** | +-- MAGIC | **`country`** | **`STRING`** | +-- MAGIC + +-- COMMAND ---------- + +CREATE OR REFRESH STREAMING LIVE TABLE enrollments_cleaned + (CONSTRAINT ____________________ ON VIOLATION ____________________ ) +AS SELECT enroll_id, total, email, profile:address:country as country + FROM ____________________ n + INNER ____________________ s + ON n.student_id = s.student_id + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC ### Q3- Declaring Gold Table +-- MAGIC +-- MAGIC Declare a live table, **`course_sales_per_country`** against **`enrollments_cleaned`** that calculate per **`country`** the following: +-- MAGIC * **`enrollments_count`**: the number of enrollments +-- MAGIC * **`enrollments_amount`**: the sum of the total amount of enrollments +-- MAGIC +-- MAGIC Add a comment to the table: "Course Sales Per Country" + +-- COMMAND ---------- + +CREATE OR REFRESH LIVE TABLE course_sales_per_country + COMMENT ____________________ +AS SELECT ____________________ + + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC ### Q4- Deploying DLT pipeline +-- MAGIC +-- MAGIC From the **Workflows** button on the sidebar, under the **Delta Live Tables** tab, click **Create Pipeline** +-- MAGIC +-- MAGIC Configure the pipeline settings specified below: +-- MAGIC +-- MAGIC | Setting | Instructions | +-- MAGIC |--|--| +-- MAGIC | Pipeline name | School DLT | +-- MAGIC | Product edition | Choose **Advanced** | +-- MAGIC | Pipeline mode | Choose **Triggered** | +-- MAGIC | Source code | Use the navigator to select this current notebook (4.1L - Delta Live Tables) | +-- MAGIC | Storage location | dbfs:/mnt/DE-Associate/dlt/school | +-- MAGIC | Target schema | DE_Associate_School_DLT | +-- MAGIC | Cluster policy | Leave it **None**| +-- MAGIC | Cluster mode | Choose **Fixed size**| +-- MAGIC | Workers | Enter **0**| +-- MAGIC | Photon Acceleration | Leave it unchecked | +-- MAGIC | Advanced Configuration | Click **Add Configuration** and enter:
- Key: **datasets.path**
- Value: **dbfs:/mnt/DE-Associate/datasets/school** | +-- MAGIC | Channel | Choose **Current**| +-- MAGIC +-- MAGIC Finally, click **Create**. + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC ### Q5 - Run your Pipeline +-- MAGIC +-- MAGIC Select **Development** mode and Click **Start** to begin the update to your pipeline's tables diff --git a/Labs/4- Production Pipelines/4.2L - Jobs - Land New Data.py b/Labs/4- Production Pipelines/4.2L - Jobs - Land New Data.py new file mode 100644 index 0000000..e5f60c9 --- /dev/null +++ b/Labs/4- Production Pipelines/4.2L - Jobs - Land New Data.py @@ -0,0 +1,91 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC +# MAGIC ## Lab: Creating a multi-task job +# MAGIC +# MAGIC In this lab, we will create a job that has 2 tasks: +# MAGIC 1. The current notebook that lands a new batch of data in the lab dataset directory +# MAGIC 1. The Delta Live Table pipeline created in the previous lab to processes this data +# MAGIC +# MAGIC * Help: Databricks Jobs documentation. + +# COMMAND ---------- + +# MAGIC %md-sandbox +# MAGIC +# MAGIC
+# MAGIC +# MAGIC
+ +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q1- Configuring Task 1 - Land New Data +# MAGIC +# MAGIC +# MAGIC From the **Workflows** button on the sidebar, under the **Jobs** tab, click the **Create Job** button. +# MAGIC +# MAGIC
+# MAGIC +# MAGIC 1. Set the job name in the top-left of the screen to **School Job** +# MAGIC 1. Configure the first task as specified below: +# MAGIC | Setting | Value | +# MAGIC |--|--| +# MAGIC | Task name | Enter **Land New Data** | +# MAGIC | Type | Choose **Notebook** | +# MAGIC | Source | Choose **Workspace** | +# MAGIC | Path | Use the navigator to choose the current notebook (4.2L - Jobs - Land New Data) | +# MAGIC | Cluster | Select your cluster from the dropdown, under **Existing All Purpose Clusters** | +# MAGIC +# MAGIC +# MAGIC
+# MAGIC +# MAGIC 3. Click the **Create** button + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q2- Configuring Task 2 - DLT pipeline +# MAGIC +# MAGIC
+# MAGIC +# MAGIC 1. Click the add button (**+**) to add a new **Delta Live Tables pipeline** task +# MAGIC 1. Configure the task: +# MAGIC +# MAGIC | Setting | Value | +# MAGIC |--|--| +# MAGIC | Task name | Enter **DLT pipeline** | +# MAGIC | Type | Choose **Delta Live Tables pipeline** | +# MAGIC | Pipeline | Choose the DLT pipeline created in the previous lab | +# MAGIC | Depends on | Choose **Land New Data**, which is the previous task we defined above | +# MAGIC +# MAGIC
+# MAGIC +# MAGIC 3. Click the **Create task** button + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q3- Run the job +# MAGIC +# MAGIC Click the **Run now** button in the top right to run this job. From the **Runs** tab, check your job run + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q4- Review the finished job +# MAGIC +# MAGIC Once all tasks completed successfully, review the contents of each task to verify its result + +# COMMAND ---------- + +# MAGIC %md +# MAGIC > **Note**: The below cells are to be run as part of the **Task 1** to land new batch of data in the dataset directory + +# COMMAND ---------- + +# MAGIC %run ../Includes/Setup-Lab + +# COMMAND ---------- + +load_new_json_data() diff --git a/Labs/4- Production Pipelines/4.3L - Databricks SQL.sql b/Labs/4- Production Pipelines/4.3L - Databricks SQL.sql new file mode 100644 index 0000000..6a878b5 --- /dev/null +++ b/Labs/4- Production Pipelines/4.3L - Databricks SQL.sql @@ -0,0 +1,121 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC +-- MAGIC ## Lab: Design a Dashboard with DBSQL +-- MAGIC +-- MAGIC In this lab, we will design a dashboard in DBSQL that has 2 graphs: +-- MAGIC 1. Bar graph that shows the number of students per country +-- MAGIC 1. Line graph that shows the daily enrollments amount +-- MAGIC +-- MAGIC * Help: Databricks SQL documentation. + +-- COMMAND ---------- + +-- MAGIC %md-sandbox +-- MAGIC +-- MAGIC
+-- MAGIC +-- MAGIC
+ +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q1 - Working with queries in SQL Editor +-- MAGIC +-- MAGIC Run the the below query in **SQL Editor** in Databricks SQL, and then save it with the name **Student Counts** + +-- COMMAND ---------- + +SELECT profile:address:country as country, count(student_id) AS students_count +FROM hive_metastore.de_associate_school.students +GROUP BY profile:address:country +ORDER BY students_count DESC +LIMIT 10 + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q2 - Creating a Bar Graph Visualization +-- MAGIC +-- MAGIC Create a bar graph that shows the number of students per country +-- MAGIC +-- MAGIC
+-- MAGIC +-- MAGIC +-- MAGIC ##### Anwser: +-- MAGIC Steps: +-- MAGIC 1. Click the Add butoon (**+**) next to the results tab, and select **Visualization** from the dialog box +-- MAGIC 1. Select **`Bar`** for the **Visualization Type** +-- MAGIC 1. Set **`country`** for the **X Column** +-- MAGIC 1. Under **Y columns** click **Add column**, and set it to **`students_count`** +-- MAGIC 1. Click **Save** +-- MAGIC 1. Finally, set the title of the graph to **Student Counts Viz** + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q3 - Creating a New Dashboard +-- MAGIC +-- MAGIC Add the above graph to a new dashboard named **Students Statistics** +-- MAGIC +-- MAGIC ##### Anwser: +-- MAGIC +-- MAGIC Steps: +-- MAGIC 1. Click the three vertical dots button at the top of the graph and select **Add to Dashboard**. +-- MAGIC 1. Click the **Create new dashboard** option +-- MAGIC 1. Name your dashboard **Students Statistics** +-- MAGIC 1. Click **Save** +-- MAGIC 1. With the new dashboard selected as the target, click **OK** to add your visualization + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q4 - Creating a Line Plot Visualization +-- MAGIC +-- MAGIC 1. Run the the below query in a new query tab in the **SQL Editor**, and then save it with the name **Daily Sales** + +-- COMMAND ---------- + +SELECT cast(from_unixtime(enroll_timestamp, 'yyyy-MM-dd HH:mm:ss') AS date) enroll_timestamp, + sum(total) AS enrollments_amount +FROM hive_metastore.de_associate_school.enrollments n +INNER JOIN hive_metastore.de_associate_school.students s ON s.student_id = n.student_id +GROUP BY enroll_timestamp + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC 2. Create a Line Plot Visualization that shows the daily enrollments amount +-- MAGIC +-- MAGIC +-- MAGIC
+-- MAGIC +-- MAGIC +-- MAGIC +-- MAGIC ##### Anwser: +-- MAGIC +-- MAGIC Steps: +-- MAGIC 1. Click the **Add Visualization** button +-- MAGIC 1. Select **`Line`** for the **Visualization Type** +-- MAGIC 1. Set **`enroll_timestamp`** for the **X Column** +-- MAGIC 1. Under **Y columns** click **Add column**, and set it to **`enrollments_amount`** +-- MAGIC 1. Click **Save** +-- MAGIC 1. Finally, set the title of the graph to **Daily Sales Viz** +-- MAGIC 1. Click the three vertical dots button at the top of the graph and select **Add to Dashboard**. +-- MAGIC 1. Select the dashboard **Students Statistics** created above +-- MAGIC 1. Click **OK** to add your visualization + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q5 - Review your Dashboard +-- MAGIC +-- MAGIC Open your Dashboard and refresh its underlaying data +-- MAGIC +-- MAGIC ##### Anwser: +-- MAGIC +-- MAGIC Steps: +-- MAGIC 1. Click on the **Dashboards** button on left side bar +-- MAGIC 1. Find the dashboard **Students Statistics** created earlier. Click to open it +-- MAGIC 1. Click the **Refresh** button to update your dashboard diff --git a/Labs/Includes/Setup-Lab.py b/Labs/Includes/Setup-Lab.py new file mode 100644 index 0000000..d2c548e --- /dev/null +++ b/Labs/Includes/Setup-Lab.py @@ -0,0 +1,135 @@ +# Databricks notebook source +data_source_uri = "s3://dalhussein-courses/datasets/school/v1/" +dataset_school = 'dbfs:/mnt/DE-Associate/datasets/school' +checkpoint_path = 'dbfs:/mnt/DE-Associate/checkpoints/school' +dlt_path = 'dbfs:/mnt/DE-Associate/dlt/school' +db_name = 'DE_Associate_School' +dlt_db_name = 'DE_Associate_School_DLT' +spark.conf.set(f"dataset.school", dataset_school) + +# COMMAND ---------- + +def clean_up(): + print("Removing Checkpoints ...") + dbutils.fs.rm(checkpoint_path, True) + print("Removing DLT storage location ...") + dbutils.fs.rm(dlt_path, True) + print("Dropping Database ...") + spark.sql(f"DROP SCHEMA IF EXISTS {db_name} CASCADE") + print("Dropping DLT database ...") + spark.sql(f"DROP SCHEMA IF EXISTS {dlt_db_name} CASCADE") + print("Removing Dataset ...") + dbutils.fs.rm(dataset_school, True) + print("Done") + +# COMMAND ---------- + +try: + clean = int(dbutils.widgets.get("clean")) +except: + clean = 0 + +if clean: + clean_up() + +# COMMAND ---------- + +def path_exists(path): + try: + dbutils.fs.ls(path) + return True + except Exception as e: + if 'java.io.FileNotFoundException' in str(e): + return False + else: + raise + +# COMMAND ---------- + +def download_dataset(source, target): + files = dbutils.fs.ls(source) + + for f in files: + source_path = f"{source}/{f.name}" + target_path = f"{target}/{f.name}" + if not path_exists(target_path): + print(f"Copying {f.name} ...") + dbutils.fs.cp(source_path, target_path, True) + +# COMMAND ---------- + +def get_index(dir): + files = dbutils.fs.ls(dir) + index = 0 + if files: + file = max(files).name + index = int(file.rsplit('.', maxsplit=1)[0]) + return index+1 + +# COMMAND ---------- + +def set_current_schema(schema_name, catalog_name='hive_metastore'): + spark.sql(f"USE CATALOG {catalog_name}") + spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}") + spark.sql(f"USE {schema_name}") + print(f"Schema for the hands-on labs: {catalog_name}.{schema_name}") + +# COMMAND ---------- + +# Structured Streaming +streaming_dir = f"{dataset_school}/enrollments-streaming" +raw_dir = f"{dataset_school}/enrollments-raw" + +def load_file(current_index): + latest_file = f"{str(current_index).zfill(2)}.parquet" + print(f"Loading {latest_file} file to the school dataset") + dbutils.fs.cp(f"{streaming_dir}/{latest_file}", f"{raw_dir}/{latest_file}") + + +def load_new_data(all=False): + index = get_index(raw_dir) + if index >= 10: + print("No more data to load\n") + + elif all == True: + while index <= 10: + load_file(index) + index += 1 + else: + load_file(index) + index += 1 + +# COMMAND ---------- + +# DLT +streaming_enrollments_dir = f"{dataset_school}/enrollments-json-streaming" +streaming_courses_dir = f"{dataset_school}/courses-streaming" + +raw_enrollments_dir = f"{dataset_school}/enrollments-json-raw" +raw_courses_dir = f"{dataset_school}/courses-cdc" + +def load_json_file(current_index): + latest_file = f"{str(current_index).zfill(2)}.json" + print(f"Loading {latest_file} enrollments file to the school dataset") + dbutils.fs.cp(f"{streaming_enrollments_dir}/{latest_file}", f"{raw_enrollments_dir}/{latest_file}") + #print(f"Loading {latest_file} courses file to the school dataset") + #dbutils.fs.cp(f"{streaming_courses_dir}/{latest_file}", f"{raw_courses_dir}/{latest_file}") + + +def load_new_json_data(all=False): + index = get_index(raw_enrollments_dir) + if index >= 10: + print("No more data to load\n") + + elif all == True: + while index <= 10: + load_json_file(index) + index += 1 + else: + load_json_file(index) + index += 1 + +# COMMAND ---------- + +download_dataset(data_source_uri, dataset_school) +set_current_schema(db_name) diff --git a/Labs/Includes/images/bar_graph.png b/Labs/Includes/images/bar_graph.png new file mode 100644 index 0000000..013c7c2 Binary files /dev/null and b/Labs/Includes/images/bar_graph.png differ diff --git a/Labs/Includes/images/cluster_par1.png b/Labs/Includes/images/cluster_par1.png new file mode 100644 index 0000000..801f94e Binary files /dev/null and b/Labs/Includes/images/cluster_par1.png differ diff --git a/Labs/Includes/images/cluster_par2.png b/Labs/Includes/images/cluster_par2.png new file mode 100644 index 0000000..5a10f40 Binary files /dev/null and b/Labs/Includes/images/cluster_par2.png differ diff --git a/Labs/Includes/images/cluster_par3.png b/Labs/Includes/images/cluster_par3.png new file mode 100644 index 0000000..bf77236 Binary files /dev/null and b/Labs/Includes/images/cluster_par3.png differ diff --git a/Labs/Includes/images/dashboard.png b/Labs/Includes/images/dashboard.png new file mode 100644 index 0000000..e8a8d2c Binary files /dev/null and b/Labs/Includes/images/dashboard.png differ diff --git a/Labs/Includes/images/line_graph.png b/Labs/Includes/images/line_graph.png new file mode 100644 index 0000000..901d4f9 Binary files /dev/null and b/Labs/Includes/images/line_graph.png differ diff --git a/Labs/Includes/images/markdown.png b/Labs/Includes/images/markdown.png new file mode 100644 index 0000000..0c93afa Binary files /dev/null and b/Labs/Includes/images/markdown.png differ diff --git a/Labs/Includes/images/school_job.png b/Labs/Includes/images/school_job.png new file mode 100644 index 0000000..66179b2 Binary files /dev/null and b/Labs/Includes/images/school_job.png differ diff --git a/Labs/Includes/images/school_schema.png b/Labs/Includes/images/school_schema.png new file mode 100644 index 0000000..2945a2f Binary files /dev/null and b/Labs/Includes/images/school_schema.png differ diff --git a/Labs/Solutions/1- Databricks Lakehouse Platform/1.0L Solution - Creating Clusters.py b/Labs/Solutions/1- Databricks Lakehouse Platform/1.0L Solution - Creating Clusters.py new file mode 100644 index 0000000..ebcb29f --- /dev/null +++ b/Labs/Solutions/1- Databricks Lakehouse Platform/1.0L Solution - Creating Clusters.py @@ -0,0 +1,75 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC +# MAGIC ## Lab: Creating Clusters + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q1 - Creating a Demo Cluster +# MAGIC +# MAGIC Create a cluster with the following configurations: +# MAGIC +# MAGIC | Setting | Instructions | +# MAGIC |--|--| +# MAGIC |Cluster name|**Demo Cluster**| +# MAGIC |Cluster mode|**Signle node**| +# MAGIC |Runtime version|Select the Databricks runtime version 13.3 LTS| +# MAGIC |Photon Acceleration| Uncheck the option | +# MAGIC |Node type|4 cores| +# MAGIC |Auto termination|30 minutes| +# MAGIC + +# COMMAND ---------- + +# MAGIC %md +# MAGIC 1- Navigate to the **Compute** tab in the left side bar. +# MAGIC +# MAGIC 2- Under **All-purpose compute** tab, click **Create compute**. +# MAGIC + +# COMMAND ---------- + +# MAGIC %md-sandbox +# MAGIC +# MAGIC
+# MAGIC +# MAGIC
+ +# COMMAND ---------- + +# MAGIC %md +# MAGIC 3- On top, click on the default name to change it. Name your cluster as **Demo Cluster** +# MAGIC +# MAGIC 4- Select **Single node** cluster +# MAGIC + +# COMMAND ---------- + +# MAGIC %md-sandbox +# MAGIC +# MAGIC
+# MAGIC +# MAGIC
+ +# COMMAND ---------- + +# MAGIC %md +# MAGIC 5- Select the Databricks runtime version 13.3 LTS (Long Term Support) +# MAGIC +# MAGIC 6- Uncheck the option for the **Use Photon Acceleration** +# MAGIC +# MAGIC 7- Select a Node type of 4 cores +# MAGIC +# MAGIC 8- Set the auto termination of the cluster to 30 minutes +# MAGIC +# MAGIC 9- Lastly, click **Create compute**. +# MAGIC + +# COMMAND ---------- + +# MAGIC %md-sandbox +# MAGIC +# MAGIC
+# MAGIC +# MAGIC
diff --git a/Labs/Solutions/1- Databricks Lakehouse Platform/1.1L Solution - Notebook Basics.py b/Labs/Solutions/1- Databricks Lakehouse Platform/1.1L Solution - Notebook Basics.py new file mode 100644 index 0000000..82424c2 --- /dev/null +++ b/Labs/Solutions/1- Databricks Lakehouse Platform/1.1L Solution - Notebook Basics.py @@ -0,0 +1,112 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC +# MAGIC ## Lab Solution: Get started with Databricks Notebook +# MAGIC + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q1 - Renaming the Notebook +# MAGIC +# MAGIC Change the name of the current notebook to "1.1L - My first lab" +# MAGIC +# MAGIC **Anwser:** to change the name of the notebook, click on the name at the top of this page, then make changes to the name + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q2 - Attaching a cluster +# MAGIC +# MAGIC Attach the cluster you created previously to this notebook +# MAGIC +# MAGIC **Anwser:** to attach a cluster to this notebook, click the dropdown near the top-right corner of this page. Select your cluster. + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q3 - Execute a Python code +# MAGIC +# MAGIC In the below cell, fill in the blank to print the result of adding the two variables x and y + +# COMMAND ---------- + +x = 5 +y = 10 +result = x + y + +# Answer +print(result) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q4 - Execute a SQL cell +# MAGIC +# MAGIC Change the language in the below cell to execute the SQL statement + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC SELECT 5 + 10 + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q5 - Create a Markdown Cell +# MAGIC +# MAGIC 1. Insert a new cell below this one +# MAGIC 1. In the new cell, add Markdown with a header and bullet points as shown in the following image +# MAGIC +# MAGIC
+# MAGIC +# MAGIC
+ +# COMMAND ---------- + +# MAGIC %md +# MAGIC +# MAGIC # Animals +# MAGIC +# MAGIC * Cats +# MAGIC * Dogs +# MAGIC * Birds + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q6 - Using %run command +# MAGIC +# MAGIC 1. Create a new Python notebook named **helper** in the current directory (i.e., in the **labs/1- Databricks Lakehouse Platform** folder) +# MAGIC 1. In the **helper** notebook, declare a variable called **my_country** and assign it a value of your country name. For example: +# MAGIC > my_country = "France" +# MAGIC 1. In the following cell, execute a %run command to include the **helper** notebook into this current notebook +# MAGIC > **Hint**: use a dot (**.**) to refer to the current directory + +# COMMAND ---------- + +# MAGIC %run ./helper + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Now, run the following cell to test that you are able to print the **my_country** variable + +# COMMAND ---------- + +print(my_country) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q7 - Functions definition +# MAGIC 1. In the **helper** notebook, define a Python function named **addition()** that print the sum of two numbers +# MAGIC 1. Execute again the above %run command to take the function definition into account +# MAGIC 1. Run the below cell to call the function + +# COMMAND ---------- + +num1 = 20 +num2 = 30 + +addition(num1, num2) diff --git a/Labs/Solutions/1- Databricks Lakehouse Platform/1.2L Solution - Delta Lake.sql b/Labs/Solutions/1- Databricks Lakehouse Platform/1.2L Solution - Delta Lake.sql new file mode 100644 index 0000000..8278576 --- /dev/null +++ b/Labs/Solutions/1- Databricks Lakehouse Platform/1.2L Solution - Delta Lake.sql @@ -0,0 +1,100 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC +-- MAGIC ## Lab Solution: Delta Lake + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Creating a Delta Table +-- MAGIC +-- MAGIC Run the cell below to create the **persons** Delta Table, and apply some operations on it. + +-- COMMAND ---------- + +USE CATALOG hive_metastore; + +CREATE OR REPLACE TABLE persons + (id INT, name STRING, age INT); + +INSERT INTO persons +VALUES + (1, "Tom", 18), + (2, "Kumar", 25); + +INSERT INTO persons +VALUES + (3, "Ali", 50), + (4, "Sandra", 35); + +INSERT INTO persons +VALUES + (5, "Eric", 28), + (6, "Salma", 42); + +UPDATE persons +SET age = age + 10 +WHERE id = 1; + +DELETE FROM persons +WHERE name = "Eric"; + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q1 - Querying Delta Lake table +-- MAGIC +-- MAGIC Query the data in the **persons** table using **SELECT** statement + +-- COMMAND ---------- + +-- Answer +SELECT * FROM persons + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q2 - Checking table history +-- MAGIC +-- MAGIC Review the history of the table transactions using the **DESCRIBE HISTORY** command + +-- COMMAND ---------- + +-- Answer +DESCRIBE HISTORY persons + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q3- Checking table metadata +-- MAGIC +-- MAGIC Review the basic metadata information of the table using the **DESCRIBE DETAIL** command + +-- COMMAND ---------- + +--Answer +DESCRIBE DETAIL persons + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q4- Exploring table directory +-- MAGIC +-- MAGIC Explore the table directory using the **%fs** magic command. +-- MAGIC +-- MAGIC **Hint:** get the table location from the above metadata information + +-- COMMAND ---------- + +-- MAGIC %fs ls 'dbfs:/user/hive/warehouse/persons' + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q5- Exploring the transactions log +-- MAGIC +-- MAGIC Explore the **_delta_log** subfolder in the table directory + +-- COMMAND ---------- + +-- MAGIC %fs ls 'dbfs:/user/hive/warehouse/persons/_delta_log' diff --git a/Labs/Solutions/1- Databricks Lakehouse Platform/1.3L Solution - Databases and Tables on Databricks.sql b/Labs/Solutions/1- Databricks Lakehouse Platform/1.3L Solution - Databases and Tables on Databricks.sql new file mode 100644 index 0000000..5d69479 --- /dev/null +++ b/Labs/Solutions/1- Databricks Lakehouse Platform/1.3L Solution - Databases and Tables on Databricks.sql @@ -0,0 +1,199 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC ## Lab Solution: Databases and Tables on Databricks + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Setting the default catalog +-- MAGIC +-- MAGIC Run the cell below to set the current catalog to **hive_metastore** + +-- COMMAND ---------- + +USE CATALOG hive_metastore + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q1 - Creating managed table +-- MAGIC +-- MAGIC In the default database, create a managed table named **movies_managed** that has the following schema: +-- MAGIC +-- MAGIC +-- MAGIC | Column Name | Column Type | +-- MAGIC | --- | --- | +-- MAGIC | title | STRING | +-- MAGIC | category | STRING | +-- MAGIC | length | FLOAT | +-- MAGIC | release_date | DATE | + +-- COMMAND ---------- + +-- Answer +CREATE TABLE movies_managed + (title STRING, category STRING, length INT, release_date DATE); + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Review the extended metadata information of the table, and verify that: +-- MAGIC 1. The table type is Managed +-- MAGIC 1. The table is located under the default hive directory + +-- COMMAND ---------- + +DESCRIBE EXTENDED movies_managed + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q2 - Creating external table +-- MAGIC +-- MAGIC In the default database, create an external Delta table named **actors_external**, and located under the directory: +-- MAGIC **dbfs:/mnt/demo/actors_external** +-- MAGIC +-- MAGIC The schema for the table: +-- MAGIC +-- MAGIC | Column Name | Column Type | +-- MAGIC | --- | --- | +-- MAGIC | actor_id | INT | +-- MAGIC | name | STRING | +-- MAGIC | nationality | STRING | + +-- COMMAND ---------- + +-- Answer +CREATE OR REPLACE TABLE actors_external + (actor_id INT, name STRING, nationality STRING) +LOCATION 'dbfs:/mnt/demo/actors_external'; + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q4- Checking table metadata +-- MAGIC +-- MAGIC Review the extended metadata information of the table, and verify that: +-- MAGIC 1. The table type is External +-- MAGIC 1. The table is located under the directory: **dbfs:/mnt/demo/actors_external** + +-- COMMAND ---------- + +DESCRIBE EXTENDED actors_external + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q3- Dropping manged table +-- MAGIC +-- MAGIC Drop the manged table **movies_managed** + +-- COMMAND ---------- + +-- Answer +DROP TABLE movies_managed + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Check that the directory of the managed table has been deleted +-- MAGIC + +-- COMMAND ---------- + +-- MAGIC %fs ls 'dbfs:/user/hive/warehouse/movies_managed' + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q4- Drop external table +-- MAGIC +-- MAGIC Drop the external table **actors_external** + +-- COMMAND ---------- + +DROP TABLE actors_external + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Check that the directory of the external table has **Not** been deleted + +-- COMMAND ---------- + +-- MAGIC %fs ls 'dbfs:/mnt/demo/actors_external' + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q5- Creating new schema +-- MAGIC +-- MAGIC Create a new schema named **db_cinema** + +-- COMMAND ---------- + +-- Answer +CREATE SCHEMA db_cinema + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Review the extended metadata information of the database, and verify that the database is located under the default hive directory. +-- MAGIC +-- MAGIC Note that the database folder has the extenstion **.db** + +-- COMMAND ---------- + +DESCRIBE DATABASE EXTENDED db_cinema + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Use the new schema to create the below **movies** table + +-- COMMAND ---------- + +-- Answer +USE db_cinema; + +CREATE TABLE movies + (title STRING, category STRING, length INT, release_date DATE); + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q6- Creating new schema in custom location +-- MAGIC +-- MAGIC Create a new schema named **cinema_custom** in the directory: **dbfs:/Shared/schemas/cinema_custom.db** + +-- COMMAND ---------- + +-- Answer +CREATE SCHEMA cinema_custom +LOCATION 'dbfs:/Shared/schemas/cinema_custom.db' + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Use the new schema to create the below **movies** table + +-- COMMAND ---------- + +USE cinema_custom; + +CREATE TABLE movies + (title STRING, category STRING, length INT, release_date DATE); + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Finally, review the extended metadata information of the table **movies**, and verify that: +-- MAGIC +-- MAGIC 1. The table type is Managed +-- MAGIC 1. The table is located in the new database defined under the custom location + +-- COMMAND ---------- + +DESCRIBE EXTENDED movies diff --git a/Labs/Solutions/1- Databricks Lakehouse Platform/helper.py b/Labs/Solutions/1- Databricks Lakehouse Platform/helper.py new file mode 100644 index 0000000..00fb308 --- /dev/null +++ b/Labs/Solutions/1- Databricks Lakehouse Platform/helper.py @@ -0,0 +1,7 @@ +# Databricks notebook source +my_country = "France" + +# COMMAND ---------- + +def addition(a, b): + print(a + b) diff --git a/Labs/Solutions/2- ELT with Spark SQL and Python/2.1L Solution - Querying Files.sql b/Labs/Solutions/2- ELT with Spark SQL and Python/2.1L Solution - Querying Files.sql new file mode 100644 index 0000000..ce81033 --- /dev/null +++ b/Labs/Solutions/2- ELT with Spark SQL and Python/2.1L Solution - Querying Files.sql @@ -0,0 +1,134 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC +-- MAGIC ## Lab Solution: Querying Files + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Run the following cell to setup the lab environment + +-- COMMAND ---------- + +-- MAGIC %run ../Includes/Setup-Lab + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q1- Extracting data directly from Parquet files +-- MAGIC +-- MAGIC Use a SELECT statement to directly query the content of the Parquet files in the directory **${dataset.school}/enrollments** + +-- COMMAND ---------- + +-- ANSWER +SELECT * FROM parquet.`${dataset.school}/enrollments` + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Use the above SELECT query in a CTAS statement to create the table **enrollments** + +-- COMMAND ---------- + +-- ANSWER +CREATE TABLE enrollments AS +SELECT * FROM parquet.`${dataset.school}/enrollments` + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Run the below cell to ensure data was written as expected in the **enrollments** table + +-- COMMAND ---------- + +SELECT * FROM enrollments + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q2- Registering Tables from JSON Files +-- MAGIC +-- MAGIC Use CTAS statement to create the table **students** from the json files in the directory: **${dataset.school}/students-json** +-- MAGIC + +-- COMMAND ---------- + +-- ANSWER +CREATE TABLE students AS +SELECT * FROM json.`${dataset.school}/students-json`; + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Run the below cell to ensure data was written as expected in the **students** table + +-- COMMAND ---------- + +SELECT * FROM students + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q3- Registering Tables from CSV Files +-- MAGIC +-- MAGIC Create the temporary view **courses_tmp_vw** from the csv files in the directory: **${dataset.school}/courses-csv** +-- MAGIC +-- MAGIC Knowing that: +-- MAGIC * The delimiter is semicolon (**;**) +-- MAGIC * There is a header of column names in each file +-- MAGIC +-- MAGIC The schema for the view: +-- MAGIC +-- MAGIC | Column Name | Column Type | +-- MAGIC | --- | --- | +-- MAGIC | course_id | STRING | +-- MAGIC | title | STRING | +-- MAGIC | instructor | STRING | +-- MAGIC | category | STRING | +-- MAGIC | price | DOUBLE | + +-- COMMAND ---------- + +-- ANSWER +CREATE TEMP VIEW courses_tmp_vw + (course_id STRING, title STRING, instructor STRING, category STRING, price DOUBLE) +USING CSV +OPTIONS ( + path = "${dataset.school}/courses-csv", + header = "true", + delimiter = ";" +); + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Create the manged table **courses** from the temporary view **courses_tmp_vw** + +-- COMMAND ---------- + +-- ANSWER +CREATE TABLE courses AS + SELECT * FROM courses_tmp_vw; + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Query the data in the **courses** table to ensure data was written as expected. + +-- COMMAND ---------- + +-- ANSWER +SELECT * FROM courses + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Finally, review the metadata information of the table **courses**, and verify that the table type is Managed + +-- COMMAND ---------- + +DESCRIBE EXTENDED courses diff --git a/Labs/Solutions/2- ELT with Spark SQL and Python/2.2L Solution - Advanced ETL.sql b/Labs/Solutions/2- ELT with Spark SQL and Python/2.2L Solution - Advanced ETL.sql new file mode 100644 index 0000000..fa79f84 --- /dev/null +++ b/Labs/Solutions/2- ELT with Spark SQL and Python/2.2L Solution - Advanced ETL.sql @@ -0,0 +1,117 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC +-- MAGIC ## Lab Solution: Advanced ETL + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Run the following cell to setup the lab environment + +-- COMMAND ---------- + +-- MAGIC %run ../Includes/Setup-Lab + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q1- Interacting with JSON data +-- MAGIC +-- MAGIC Review the nested data structures of the **profile** column in the **students** table created in the previous lab + +-- COMMAND ---------- + +SELECT email, profile +FROM students + +-- COMMAND ---------- + +DESCRIBE students + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Use the appropriate syntax to access the **last_name** and **city** information from the **profile** column + +-- COMMAND ---------- + +-- ANSWER +SELECT email, profile:last_name AS student_surname, profile:address:city AS student_city +FROM students + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q2- Higher Order Functions +-- MAGIC +-- MAGIC Review the array column **courses** in the **enrollments** table created in the previous lab + +-- COMMAND ---------- + +SELECT enroll_id, courses +FROM enrollments + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Filter this array column to keep only course elements having subtotal greater than 40 + +-- COMMAND ---------- + +-- ANSWER +SELECT + enroll_id, + courses, + FILTER (courses, i -> i.subtotal > 40) AS large_totals +FROM enrollments + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q3- SQL UDFs + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Define a UDF function named **get_letter_grade** that takes one parameter named **gpa** of type DOUBLE. It returns the corresponding letter grade as indicated in the following table: +-- MAGIC +-- MAGIC +-- MAGIC +-- MAGIC | GPA (4.0 Scale) | Grade Letter | +-- MAGIC |---------|-----------| +-- MAGIC | 3.50 - 4.0 | A | +-- MAGIC | 2.75 - 3.44 | B | +-- MAGIC | 2.0 - 2.74 | C | +-- MAGIC | 0.0 - 1.99 | F | + +-- COMMAND ---------- + +-- ANSWER +CREATE FUNCTION get_letter_grade(gpa DOUBLE) +RETURNS STRING +RETURN CASE + WHEN gpa >= 3.5 THEN "A" + WHEN gpa >= 2.75 AND gpa < 3.5 THEN "B" + WHEN gpa >= 2 AND gpa < 2.75 THEN "C" + ELSE "F" + END + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC Let's apply the above UDF on the **students** table created in the previous lab +-- MAGIC +-- MAGIC Fill in the below query to call the defined UDF on the **gpa** column + +-- COMMAND ---------- + +-- ANSWER +SELECT student_id, gpa, get_letter_grade(gpa) as letter_grade +FROM students + +-- COMMAND ---------- + + diff --git a/Labs/Solutions/3- Incremental Data Processing/3.1L Solution - Spark Structured Streaming.py b/Labs/Solutions/3- Incremental Data Processing/3.1L Solution - Spark Structured Streaming.py new file mode 100644 index 0000000..ef51253 --- /dev/null +++ b/Labs/Solutions/3- Incremental Data Processing/3.1L Solution - Spark Structured Streaming.py @@ -0,0 +1,114 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC +# MAGIC ## Lab Solution: Spark Structured Streaming + +# COMMAND ---------- + +# MAGIC %md-sandbox +# MAGIC +# MAGIC
+# MAGIC School Schema +# MAGIC
+ +# COMMAND ---------- + +# MAGIC %md +# MAGIC Run the following cell to setup the lab environment + +# COMMAND ---------- + +# MAGIC %run ../Includes/Setup-Lab + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q1- Auto Loader +# MAGIC +# MAGIC Use Auto Loader to incrementally load enrollments json files from the directory **{dataset_school}/enrollments-json-raw** into a streaming view called **`enrollments_tmp_vw`** +# MAGIC + +# COMMAND ---------- + +dataset_source = f"{dataset_school}/enrollments-json-raw" +schema_location = "dbfs:/mnt/DE-Associate/checkpoints/school/enrollments_stats" + +# ANSWER +(spark.readStream + .format("cloudFiles") + .option("cloudFiles.format", "json") + .option("cloudFiles.schemaLocation", schema_location) + .load(dataset_source) + .createOrReplaceTempView("enrollments_tmp_vw")) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q2 - Calculating aggregations on streaming data +# MAGIC +# MAGIC Using CTAS syntax, define a new streaming view against **`enrollments_tmp_vw`** to count the number of enrollments per **`student_id`**. Name the aggregated field: **`enrollments_counts`** + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC -- ANSWER +# MAGIC CREATE OR REPLACE TEMPORARY VIEW enrollments_per_student_tmp_vw AS +# MAGIC SELECT +# MAGIC student_id, count(enroll_id) AS enrollments_count +# MAGIC FROM enrollments_tmp_vw +# MAGIC GROUP BY student_id + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q3 - Writing stream data +# MAGIC +# MAGIC Stream the aggregated data from the **`enrollments_per_student_tmp_vw`** view to a Delta table called **`enrollments_stats`**. + +# COMMAND ---------- + +checkpoint_path = "dbfs:/mnt/DE-Associate/checkpoints/school/enrollments_stats" + +# ANSWER +query = (spark.table("enrollments_per_student_tmp_vw") + .writeStream + .option("checkpointLocation", checkpoint_path) + .outputMode("complete") + .table("enrollments_stats") + ) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Query the data in the **`enrollments_stats`** table to ensure data was written as expected. + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC SELECT * FROM enrollments_stats + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Run the below cell to land a new json file of enrollments data + +# COMMAND ---------- + +load_new_json_data() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Verify that the statistics have been updated in the table **enrollments_stats** + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC SELECT * FROM enrollments_stats + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q4 - Canceling streaming query +# MAGIC +# MAGIC Finally, cancel the above streaming query diff --git a/Labs/Solutions/3- Incremental Data Processing/3.2L Solution - Multi-Hop Architecture.py b/Labs/Solutions/3- Incremental Data Processing/3.2L Solution - Multi-Hop Architecture.py new file mode 100644 index 0000000..4d222c1 --- /dev/null +++ b/Labs/Solutions/3- Incremental Data Processing/3.2L Solution - Multi-Hop Architecture.py @@ -0,0 +1,188 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC +# MAGIC ## Lab Solution: Multi-Hop Architecture + +# COMMAND ---------- + +# MAGIC %md-sandbox +# MAGIC +# MAGIC
+# MAGIC School Schema +# MAGIC
+ +# COMMAND ---------- + +# MAGIC %md +# MAGIC Run the following cell to setup the lab environment + +# COMMAND ---------- + +# MAGIC %run ../Includes/Setup-Lab + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q1- Declaring Bronze Table +# MAGIC +# MAGIC Use Auto Loader to incrementally load enrollments json files from the directory **{dataset_school}/enrollments-json-raw** to a Delta table called **`bronze`** +# MAGIC + +# COMMAND ---------- + +dataset_source = f"{dataset_school}/enrollments-json-raw" +bronze_checkpoint_path = "dbfs:/mnt/DE-Associate/checkpoints/school/bronze" +schema_location = bronze_checkpoint_path + +# ANSWER +(spark.readStream + .format("cloudFiles") + .option("cloudFiles.format", "json") + .option("cloudFiles.schemaLocation", schema_location) + .load(dataset_source) + .writeStream + .option("checkpointLocation", bronze_checkpoint_path) + .outputMode("append") + .table("bronze") +) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Let's create a streaming temporary view from the bronze table in order to perform transformations using SQL. + +# COMMAND ---------- + +(spark + .readStream + .table("bronze") + .createOrReplaceTempView("bronze_tmp")) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q2 - Data Cleansing & Enrichment +# MAGIC +# MAGIC Using CTAS syntax, define a new streaming view **`bronze_cleaned_tmp`** against **`bronze_tmp`** that does the following: +# MAGIC * Remove records with **quantity** of 0 item +# MAGIC * Add a column called **`processing_time`** containing the current timestamp using the **current_timestamp()** function + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC -- ANSWER +# MAGIC CREATE OR REPLACE TEMPORARY VIEW bronze_cleaned_tmp AS +# MAGIC SELECT +# MAGIC *, current_timestamp() processing_time +# MAGIC FROM bronze_tmp +# MAGIC WHERE quantity > 0 + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q3 - Declaring Silver Table +# MAGIC +# MAGIC Stream the data from **`bronze_cleaned_tmp`** to a table called **`silver`**. + +# COMMAND ---------- + +silver_checkpoint_path = "dbfs:/mnt/DE-Associate/checkpoints/school/silver" + +# ANSWER +(spark.table("bronze_cleaned_tmp") + .writeStream + .option("checkpointLocation", silver_checkpoint_path) + .outputMode("append") + .table("silver") +) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Let's create a streaming temporary view from the silver table in order to perform business-level aggregation using SQL + +# COMMAND ---------- + +(spark + .readStream + .table("silver") + .createOrReplaceTempView("silver_tmp")) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q4- Declaring Gold Table +# MAGIC +# MAGIC Using CTAS syntax, define a new streaming view **`enrollments_per_student_tmp_vw`** against **`silver_tmp`** to count the number of enrollments per **`student`**. Name the aggregated field: **`enrollments_count`** + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC -- ANSWER +# MAGIC CREATE OR REPLACE TEMPORARY VIEW enrollments_per_student_tmp_vw AS +# MAGIC SELECT +# MAGIC student_id, count(enroll_id) AS enrollments_count +# MAGIC FROM silver_tmp +# MAGIC GROUP BY student_id + +# COMMAND ---------- + +# MAGIC %md +# MAGIC +# MAGIC Stream the aggregated data from the **`enrollments_per_student_tmp_vw`** view to a Delta table called **`gold_enrollments_stats`**. + +# COMMAND ---------- + +gold_checkpoint_path = "dbfs:/mnt/DE-Associate/checkpoints/school/gold_enrollments_stats" + +# ANSWER +query = (spark.table("enrollments_per_student_tmp_vw") + .writeStream + .option("checkpointLocation", gold_checkpoint_path) + .outputMode("complete") + .table("gold_enrollments_stats")) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Query the data in the **`gold_enrollments_stats`** table to ensure data was written as expected. + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC SELECT * FROM gold_enrollments_stats + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Run the below cell to land new a json file of enrollments data + +# COMMAND ---------- + +load_new_json_data() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Wait for the new data to be propagated, and then run the below query to verify that the statistics have been updated in the table **gold_enrollments_stats** + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC SELECT * FROM gold_enrollments_stats + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Finally, run the below cell for canceling the above streaming queries + +# COMMAND ---------- + +for s in spark.streams.active: + print("Stopping stream: " + s.id) + s.stop() + s.awaitTermination() + +# COMMAND ---------- + + diff --git a/Labs/Solutions/4- Production Pipelines/4.1L Solution - Delta Live Tables.sql b/Labs/Solutions/4- Production Pipelines/4.1L Solution - Delta Live Tables.sql new file mode 100644 index 0000000..a7cd3e8 --- /dev/null +++ b/Labs/Solutions/4- Production Pipelines/4.1L Solution - Delta Live Tables.sql @@ -0,0 +1,130 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC +-- MAGIC ## Lab Solution: implementing a DLT pipeline +-- MAGIC +-- MAGIC > This notebook is **not intended** to be executed interactively, but rather to be deployed as a DLT pipeline from the **workflows** tab +-- MAGIC +-- MAGIC +-- MAGIC * Help: DLT syntax documentation. + +-- COMMAND ---------- + +-- MAGIC %md-sandbox +-- MAGIC +-- MAGIC
+-- MAGIC School Schema +-- MAGIC
+ +-- COMMAND ---------- + +SET datasets.path=dbfs:/mnt/DE-Associate/datasets/school; + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q1- Declaring Bronze Tables +-- MAGIC +-- MAGIC Declare a streaming live table, **`enrollments_bronze`**, that ingests JSON data incrementally using Auto Loader from the directory **"${datasets.path}/enrollments-json-raw"** + +-- COMMAND ---------- + +-- ANSWER +CREATE OR REFRESH STREAMING LIVE TABLE enrollments_bronze +AS SELECT * FROM cloud_files("${datasets.path}/enrollments-json-raw", "json", + map("cloudFiles.inferColumnTypes", "true")) + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC Declare a live table, **`students_bronze`**, that load data directly from JSON files in the directory **"${datasets.path}/students-json"** + +-- COMMAND ---------- + +-- ANSWER +CREATE OR REFRESH LIVE TABLE students_bronze +AS SELECT * FROM json.`${datasets.path}/students-json` + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q2 - Declaring Silver Table +-- MAGIC +-- MAGIC Declare a streaming live table, **`enrollments_cleaned`**, that: +-- MAGIC +-- MAGIC 1. Enrich the **enrollments_bronze** data through an inner join with the **`students_bronze`** table on the common **`student_id`** field to obtain the student's country +-- MAGIC 1. Implement quality control by applying a constraint to drop records with a null **`email`** +-- MAGIC 1. The table will have the following schema: +-- MAGIC +-- MAGIC | Field | Type | +-- MAGIC | --- | --- | +-- MAGIC | **`enroll_id`** | **`STRING`** | +-- MAGIC | **`total`** | **`DOUBLE`** | +-- MAGIC | **`email`** | **`STRING`** | +-- MAGIC | **`country`** | **`STRING`** | +-- MAGIC + +-- COMMAND ---------- + +-- ANSWER +CREATE OR REFRESH STREAMING LIVE TABLE enrollments_cleaned ( + CONSTRAINT valid_email EXPECT (email IS NOT NULL) ON VIOLATION DROP ROW +) +AS SELECT enroll_id, total, email, profile:address:country as country + FROM STREAM(LIVE.enrollments_bronze) n + LEFT JOIN LIVE.students_bronze s + ON n.student_id = s.student_id + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC ### Q3- Declaring Gold Table +-- MAGIC +-- MAGIC Declare a live table, **`course_sales_per_country`** against **`enrollments_cleaned`** that calculate per **`country`** the following: +-- MAGIC * **`enrollments_count`**: the number of enrollments +-- MAGIC * **`enrollments_amount`**: the sum of the total amount of enrollments +-- MAGIC +-- MAGIC Add a comment to the table: "Course Sales Per Country" + +-- COMMAND ---------- + +-- ANSWER +CREATE OR REFRESH LIVE TABLE course_sales_per_country +COMMENT "Course Sales Per Country" +AS + SELECT country, count(enroll_id) AS enrollments_count, sum(total) AS enrollments_amount + FROM LIVE.enrollments_cleaned + GROUP BY country + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC ### Q4- Deploying DLT pipeline +-- MAGIC +-- MAGIC From the **Workflows** button on the sidebar, under the **Delta Live Tables** tab, click **Create Pipeline** +-- MAGIC +-- MAGIC Configure the pipeline settings specified below: +-- MAGIC +-- MAGIC | Setting | Instructions | +-- MAGIC |--|--| +-- MAGIC | Pipeline name | School DLT | +-- MAGIC | Product edition | Choose **Advanced** | +-- MAGIC | Pipeline mode | Choose **Triggered** | +-- MAGIC | Source code | Use the navigator to select this current notebook (4.1L - Delta Live Tables) | +-- MAGIC | Storage location | dbfs:/mnt/DE-Associate/dlt/school | +-- MAGIC | Target schema | DE_Associate_School_DLT | +-- MAGIC | Cluster policy | Leave it **None**| +-- MAGIC | Cluster mode | Choose **Fixed size**| +-- MAGIC | Workers | Enter **0**| +-- MAGIC | Photon Acceleration | Leave it unchecked | +-- MAGIC | Advanced Configuration | Click **Add Configuration** and enter:
- Key: **datasets.path**
- Value: **dbfs:/mnt/DE-Associate/datasets/school** | +-- MAGIC | Channel | Choose **Current**| +-- MAGIC +-- MAGIC Finally, click **Create**. + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC ### Q5 - Run your Pipeline +-- MAGIC +-- MAGIC Select **Development** mode and Click **Start** to begin the update to your pipeline's tables diff --git a/Labs/Solutions/4- Production Pipelines/4.2L Solution - Jobs - Land New Data.py b/Labs/Solutions/4- Production Pipelines/4.2L Solution - Jobs - Land New Data.py new file mode 100644 index 0000000..4a3af21 --- /dev/null +++ b/Labs/Solutions/4- Production Pipelines/4.2L Solution - Jobs - Land New Data.py @@ -0,0 +1,91 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC +# MAGIC ## Lab Solution: Creating a multi-task job +# MAGIC +# MAGIC In this lab, we will create a job that has 2 tasks: +# MAGIC 1. The current notebook that lands a new batch of data in the lab dataset directory +# MAGIC 1. The Delta Live Table pipeline created in the previous lab to processes this data +# MAGIC +# MAGIC * Help: Databricks Jobs documentation. + +# COMMAND ---------- + +# MAGIC %md-sandbox +# MAGIC +# MAGIC
+# MAGIC +# MAGIC
+ +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q1- Configuring Task 1 - Land New Data +# MAGIC +# MAGIC +# MAGIC From the **Workflows** button on the sidebar, under the **Jobs** tab, click the **Create Job** button. +# MAGIC +# MAGIC
+# MAGIC +# MAGIC 1. Set the job name in the top-left of the screen to **School Job** +# MAGIC 1. Configure the first task as specified below: +# MAGIC | Setting | Value | +# MAGIC |--|--| +# MAGIC | Task name | Enter **Land New Data** | +# MAGIC | Type | Choose **Notebook** | +# MAGIC | Source | Choose **Workspace** | +# MAGIC | Path | Use the navigator to choose the current notebook (4.2L - Jobs - Land New Data) | +# MAGIC | Cluster | Select your cluster from the dropdown, under **Existing All Purpose Clusters** | +# MAGIC +# MAGIC +# MAGIC
+# MAGIC +# MAGIC 3. Click the **Create** button + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q2- Configuring Task 2 - DLT pipeline +# MAGIC +# MAGIC
+# MAGIC +# MAGIC 1. Click the add button (**+**) to add a new **Delta Live Tables pipeline** task +# MAGIC 1. Configure the task: +# MAGIC +# MAGIC | Setting | Value | +# MAGIC |--|--| +# MAGIC | Task name | Enter **DLT pipeline** | +# MAGIC | Type | Choose **Delta Live Tables pipeline** | +# MAGIC | Pipeline | Choose the DLT pipeline created in the previous lab | +# MAGIC | Depends on | Choose **Land New Data**, which is the previous task we defined above | +# MAGIC +# MAGIC
+# MAGIC +# MAGIC 3. Click the **Create task** button + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q3- Run the job +# MAGIC +# MAGIC Click the **Run now** button in the top right to run this job. From the **Runs** tab, check your job run + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Q4- Review the finished job +# MAGIC +# MAGIC Once all tasks completed successfully, review the contents of each task to verify its result + +# COMMAND ---------- + +# MAGIC %md +# MAGIC > **Note**: The below cells are to be run as part of the **Task 1** to land new batch of data in the dataset directory + +# COMMAND ---------- + +# MAGIC %run ../Includes/Setup-Lab + +# COMMAND ---------- + +load_new_json_data() diff --git a/Labs/Solutions/4- Production Pipelines/4.3L Solution - Databricks SQL.sql b/Labs/Solutions/4- Production Pipelines/4.3L Solution - Databricks SQL.sql new file mode 100644 index 0000000..c5b32dd --- /dev/null +++ b/Labs/Solutions/4- Production Pipelines/4.3L Solution - Databricks SQL.sql @@ -0,0 +1,121 @@ +-- Databricks notebook source +-- MAGIC %md +-- MAGIC +-- MAGIC ## Lab Solution: Design a Dashboard with DBSQL +-- MAGIC +-- MAGIC In this lab, we will design a dashboard in DBSQL that has 2 graphs: +-- MAGIC 1. Bar graph that shows the number of students per country +-- MAGIC 1. Line graph that shows the daily enrollments amount +-- MAGIC +-- MAGIC * Help: Databricks SQL documentation. + +-- COMMAND ---------- + +-- MAGIC %md-sandbox +-- MAGIC +-- MAGIC
+-- MAGIC +-- MAGIC
+ +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q1 - Working with queries in SQL Editor +-- MAGIC +-- MAGIC Run the the below query in **SQL Editor** in Databricks SQL, and then save it with the name **Student Counts** + +-- COMMAND ---------- + +SELECT profile:address:country as country, count(student_id) AS students_count +FROM hive_metastore.de_associate_school.students +GROUP BY profile:address:country +ORDER BY students_count DESC +LIMIT 10 + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q2 - Creating a Bar Graph Visualization +-- MAGIC +-- MAGIC Create a bar graph that shows the number of students per country +-- MAGIC +-- MAGIC
+-- MAGIC +-- MAGIC +-- MAGIC ##### Anwser: +-- MAGIC Steps: +-- MAGIC 1. Click the Add butoon (**+**) next to the results tab, and select **Visualization** from the dialog box +-- MAGIC 1. Select **`Bar`** for the **Visualization Type** +-- MAGIC 1. Set **`country`** for the **X Column** +-- MAGIC 1. Under **Y columns** click **Add column**, and set it to **`students_count`** +-- MAGIC 1. Click **Save** +-- MAGIC 1. Finally, set the title of the graph to **Student Counts Viz** + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q3 - Creating a New Dashboard +-- MAGIC +-- MAGIC Add the above graph to a new dashboard named **Students Statistics** +-- MAGIC +-- MAGIC ##### Anwser: +-- MAGIC +-- MAGIC Steps: +-- MAGIC 1. Click the three vertical dots button at the top of the graph and select **Add to Dashboard**. +-- MAGIC 1. Click the **Create new dashboard** option +-- MAGIC 1. Name your dashboard **Students Statistics** +-- MAGIC 1. Click **Save** +-- MAGIC 1. With the new dashboard selected as the target, click **OK** to add your visualization + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q4 - Creating a Line Plot Visualization +-- MAGIC +-- MAGIC 1. Run the the below query in a new query tab in the **SQL Editor**, and then save it with the name **Daily Sales** + +-- COMMAND ---------- + +SELECT cast(from_unixtime(enroll_timestamp, 'yyyy-MM-dd HH:mm:ss') AS date) enroll_timestamp, + sum(total) AS enrollments_amount +FROM hive_metastore.de_associate_school.enrollments n +INNER JOIN hive_metastore.de_associate_school.students s ON s.student_id = n.student_id +GROUP BY enroll_timestamp + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC +-- MAGIC 2. Create a Line Plot Visualization that shows the daily enrollments amount +-- MAGIC +-- MAGIC +-- MAGIC
+-- MAGIC +-- MAGIC +-- MAGIC +-- MAGIC ##### Anwser: +-- MAGIC +-- MAGIC Steps: +-- MAGIC 1. Click the **Add Visualization** button +-- MAGIC 1. Select **`Line`** for the **Visualization Type** +-- MAGIC 1. Set **`enroll_timestamp`** for the **X Column** +-- MAGIC 1. Under **Y columns** click **Add column**, and set it to **`enrollments_amount`** +-- MAGIC 1. Click **Save** +-- MAGIC 1. Finally, set the title of the graph to **Daily Sales Viz** +-- MAGIC 1. Click the three vertical dots button at the top of the graph and select **Add to Dashboard**. +-- MAGIC 1. Select the dashboard **Students Statistics** created above +-- MAGIC 1. Click **OK** to add your visualization + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC #### Q5 - Review your Dashboard +-- MAGIC +-- MAGIC Open your Dashboard and refresh its underlaying data +-- MAGIC +-- MAGIC ##### Anwser: +-- MAGIC +-- MAGIC Steps: +-- MAGIC 1. Click on the **Dashboards** button on left side bar +-- MAGIC 1. Find the dashboard **Students Statistics** created earlier. Click to open it +-- MAGIC 1. Click the **Refresh** button to update your dashboard diff --git a/Labs/Solutions/Includes/Setup-Lab.py b/Labs/Solutions/Includes/Setup-Lab.py new file mode 100644 index 0000000..d2c548e --- /dev/null +++ b/Labs/Solutions/Includes/Setup-Lab.py @@ -0,0 +1,135 @@ +# Databricks notebook source +data_source_uri = "s3://dalhussein-courses/datasets/school/v1/" +dataset_school = 'dbfs:/mnt/DE-Associate/datasets/school' +checkpoint_path = 'dbfs:/mnt/DE-Associate/checkpoints/school' +dlt_path = 'dbfs:/mnt/DE-Associate/dlt/school' +db_name = 'DE_Associate_School' +dlt_db_name = 'DE_Associate_School_DLT' +spark.conf.set(f"dataset.school", dataset_school) + +# COMMAND ---------- + +def clean_up(): + print("Removing Checkpoints ...") + dbutils.fs.rm(checkpoint_path, True) + print("Removing DLT storage location ...") + dbutils.fs.rm(dlt_path, True) + print("Dropping Database ...") + spark.sql(f"DROP SCHEMA IF EXISTS {db_name} CASCADE") + print("Dropping DLT database ...") + spark.sql(f"DROP SCHEMA IF EXISTS {dlt_db_name} CASCADE") + print("Removing Dataset ...") + dbutils.fs.rm(dataset_school, True) + print("Done") + +# COMMAND ---------- + +try: + clean = int(dbutils.widgets.get("clean")) +except: + clean = 0 + +if clean: + clean_up() + +# COMMAND ---------- + +def path_exists(path): + try: + dbutils.fs.ls(path) + return True + except Exception as e: + if 'java.io.FileNotFoundException' in str(e): + return False + else: + raise + +# COMMAND ---------- + +def download_dataset(source, target): + files = dbutils.fs.ls(source) + + for f in files: + source_path = f"{source}/{f.name}" + target_path = f"{target}/{f.name}" + if not path_exists(target_path): + print(f"Copying {f.name} ...") + dbutils.fs.cp(source_path, target_path, True) + +# COMMAND ---------- + +def get_index(dir): + files = dbutils.fs.ls(dir) + index = 0 + if files: + file = max(files).name + index = int(file.rsplit('.', maxsplit=1)[0]) + return index+1 + +# COMMAND ---------- + +def set_current_schema(schema_name, catalog_name='hive_metastore'): + spark.sql(f"USE CATALOG {catalog_name}") + spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}") + spark.sql(f"USE {schema_name}") + print(f"Schema for the hands-on labs: {catalog_name}.{schema_name}") + +# COMMAND ---------- + +# Structured Streaming +streaming_dir = f"{dataset_school}/enrollments-streaming" +raw_dir = f"{dataset_school}/enrollments-raw" + +def load_file(current_index): + latest_file = f"{str(current_index).zfill(2)}.parquet" + print(f"Loading {latest_file} file to the school dataset") + dbutils.fs.cp(f"{streaming_dir}/{latest_file}", f"{raw_dir}/{latest_file}") + + +def load_new_data(all=False): + index = get_index(raw_dir) + if index >= 10: + print("No more data to load\n") + + elif all == True: + while index <= 10: + load_file(index) + index += 1 + else: + load_file(index) + index += 1 + +# COMMAND ---------- + +# DLT +streaming_enrollments_dir = f"{dataset_school}/enrollments-json-streaming" +streaming_courses_dir = f"{dataset_school}/courses-streaming" + +raw_enrollments_dir = f"{dataset_school}/enrollments-json-raw" +raw_courses_dir = f"{dataset_school}/courses-cdc" + +def load_json_file(current_index): + latest_file = f"{str(current_index).zfill(2)}.json" + print(f"Loading {latest_file} enrollments file to the school dataset") + dbutils.fs.cp(f"{streaming_enrollments_dir}/{latest_file}", f"{raw_enrollments_dir}/{latest_file}") + #print(f"Loading {latest_file} courses file to the school dataset") + #dbutils.fs.cp(f"{streaming_courses_dir}/{latest_file}", f"{raw_courses_dir}/{latest_file}") + + +def load_new_json_data(all=False): + index = get_index(raw_enrollments_dir) + if index >= 10: + print("No more data to load\n") + + elif all == True: + while index <= 10: + load_json_file(index) + index += 1 + else: + load_json_file(index) + index += 1 + +# COMMAND ---------- + +download_dataset(data_source_uri, dataset_school) +set_current_schema(db_name) diff --git a/My Folder/Demo Notebook.ipynb b/My Folder/Demo Notebook.ipynb new file mode 100644 index 0000000..4eac9d4 --- /dev/null +++ b/My Folder/Demo Notebook.ipynb @@ -0,0 +1,63 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ddfe99fa-06de-4a92-a814-7ffb11d78ccc", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "print(\"Hello from Git Folder\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "55dc6e8d-5c21-408b-ab72-c3a0a4df9f29", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "Demo Notebook", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Notebook Basics.ipynb b/Notebook Basics.ipynb new file mode 100644 index 0000000..207f880 --- /dev/null +++ b/Notebook Basics.ipynb @@ -0,0 +1,63 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8aad9817-668e-4424-9cc2-f3e1e27f395a", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "print('hello databricks')" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "986274a5-ab4f-45b8-88f9-953d105af4eb", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "Notebook Basics", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/README.md b/README.md index 07e748f..c81d2d5 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,16 @@ # Databricks Certified Data Engineer Associate +Databricks Certified Data Engineer Associate - Preparation This repository contains the resources of the preparation course for Databricks Data Engineer Associate certification exam on Udemy: +
+https://www.udemy.com/course/databricks-certified-data-engineer-associate/?referralCode=F0FA48E9A0546C975F14. +
+
-https://www.udemy.com/course/databricks-certified-data-engineer-associate/?referralCode=F0FA48E9A0546C975F14.
+## Practice Exams -To import these resources into your Databricks workspace, clone this repository via Databricks Repos. +Practice Exams: Databricks Certified Data Engineer Associate +Practice exams for this certification are available in the following Udemy course: +
+https://www.udemy.com/course/practice-exams-databricks-certified-data-engineer-associate/?referralCode=9AA679C03D1F51B2C956.