|
1 | | -# Fabric notebook source |
2 | | - |
3 | | -# METADATA ******************** |
4 | | - |
5 | | -# META { |
6 | | -# META "kernel_info": { |
7 | | -# META "name": "synapse_pyspark" |
8 | | -# META }, |
9 | | -# META "dependencies": { |
10 | | -# META "lakehouse": { |
11 | | -# META "default_lakehouse": "14e465e7-8e7f-40b7-ac20-c4b92921259d", |
12 | | -# META "default_lakehouse_name": "PatternsLakehouse", |
13 | | -# META "default_lakehouse_workspace_id": "cdae3a65-8345-4c15-9065-ada4468dde51", |
14 | | -# META "known_lakehouses": [ |
15 | | -# META { |
16 | | -# META "id": "14e465e7-8e7f-40b7-ac20-c4b92921259d" |
17 | | -# META } |
18 | | -# META ] |
19 | | -# META } |
20 | | -# META } |
21 | | -# META } |
22 | | - |
23 | | -# CELL ******************** |
24 | | - |
25 | | -# Import required libraries for schema definition |
26 | | -from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType |
27 | | -from datetime import date, datetime |
28 | | - |
29 | | -# Define table names for the Healthcare industry domain |
30 | | -# These 3 tables form a relational model: patients and doctors linked through appointments |
31 | | -PATIENTS_TABLE = "patients" |
32 | | -DOCTORS_TABLE = "doctors" |
33 | | -APPOINTMENTS_TABLE = "appointments" |
34 | | - |
35 | | -# ------------------------------------------------------- |
36 | | -# Define schemas explicitly — reused for both table creation and data insertion |
37 | | -# to ensure type consistency and avoid merge field errors |
38 | | -# ------------------------------------------------------- |
39 | | - |
40 | | -patients_schema = StructType([ |
41 | | - StructField("patient_id", IntegerType(), False), |
42 | | - StructField("first_name", StringType(), False), |
43 | | - StructField("last_name", StringType(), False), |
44 | | - StructField("date_of_birth", TimestampType(), True), |
45 | | - StructField("gender", StringType(), True), |
46 | | - StructField("phone_number", StringType(), True), |
47 | | - StructField("email", StringType(), True) |
48 | | -]) |
49 | | - |
50 | | -doctors_schema = StructType([ |
51 | | - StructField("doctor_id", IntegerType(), False), |
52 | | - StructField("first_name", StringType(), False), |
53 | | - StructField("last_name", StringType(), False), |
54 | | - StructField("specialty", StringType(), True), |
55 | | - StructField("phone_number", StringType(), True), |
56 | | - StructField("email", StringType(), True) |
57 | | -]) |
58 | | - |
59 | | -appointments_schema = StructType([ |
60 | | - StructField("appointment_id", IntegerType(), False), |
61 | | - StructField("patient_id", IntegerType(), False), |
62 | | - StructField("doctor_id", IntegerType(), False), |
63 | | - StructField("appointment_date", TimestampType(), False), |
64 | | - StructField("reason", StringType(), True), |
65 | | - StructField("status", StringType(), True) |
66 | | -]) |
67 | | - |
68 | | -# ------------------------------------------------------- |
69 | | -# Insert data using saveAsTable which writes Delta files AND registers |
70 | | -# tables in the Lakehouse metastore. The default Lakehouse is attached |
71 | | -# via the notebook's META dependencies block, so simple table names resolve |
72 | | -# correctly. Per Fabric docs: df.write.mode("overwrite").format("delta").saveAsTable(name) |
73 | | -# ------------------------------------------------------- |
74 | | - |
75 | | -patients_data = [ |
76 | | - (1, "Alice", "Johnson", datetime(1985, 3, 15), "Female", "555-0101", "alice.johnson@email.com"), |
77 | | - (2, "Bob", "Smith", datetime(1990, 7, 22), "Male", "555-0102", "bob.smith@email.com"), |
78 | | - (3, "Carol", "Williams", datetime(1978, 11, 8), "Female", "555-0103", "carol.williams@email.com"), |
79 | | - (4, "David", "Brown", datetime(2001, 1, 30), "Male", "555-0104", "david.brown@email.com"), |
80 | | - (5, "Eva", "Davis", datetime(1995, 6, 12), "Female", "555-0105", "eva.davis@email.com") |
81 | | -] |
82 | | -patients_df = spark.createDataFrame(patients_data, patients_schema) |
83 | | -patients_df.write.format("delta").mode("overwrite").saveAsTable(PATIENTS_TABLE) |
84 | | -print(f"Inserted {patients_df.count()} rows into '{PATIENTS_TABLE}'.") |
85 | | - |
86 | | -doctors_data = [ |
87 | | - (1, "Sarah", "Mitchell", "Cardiology", "555-0201", "sarah.mitchell@hospital.com"), |
88 | | - (2, "James", "Anderson", "Neurology", "555-0202", "james.anderson@hospital.com"), |
89 | | - (3, "Emily", "Thompson", "Pediatrics", "555-0203", "emily.thompson@hospital.com") |
90 | | -] |
91 | | -doctors_df = spark.createDataFrame(doctors_data, doctors_schema) |
92 | | -doctors_df.write.format("delta").mode("overwrite").saveAsTable(DOCTORS_TABLE) |
93 | | -print(f"Inserted {doctors_df.count()} rows into '{DOCTORS_TABLE}'.") |
94 | | - |
95 | | -appointments_data = [ |
96 | | - (1, 1, 1, datetime(2026, 4, 10), "Annual checkup", "Completed"), |
97 | | - (2, 2, 2, datetime(2026, 4, 11), "Headache consultation", "Completed"), |
98 | | - (3, 3, 3, datetime(2026, 4, 12), "Child wellness visit", "Scheduled"), |
99 | | - (4, 4, 1, datetime(2026, 4, 15), "Chest pain follow-up", "Scheduled"), |
100 | | - (5, 5, 2, datetime(2026, 4, 18), "Neurological evaluation", "Scheduled"), |
101 | | - (6, 1, 3, datetime(2026, 4, 20), "Flu symptoms", "Scheduled") |
102 | | -] |
103 | | -appointments_df = spark.createDataFrame(appointments_data, appointments_schema) |
104 | | -appointments_df.write.format("delta").mode("overwrite").saveAsTable(APPOINTMENTS_TABLE) |
105 | | -print(f"Inserted {appointments_df.count()} rows into '{APPOINTMENTS_TABLE}'.") |
106 | | - |
107 | | -print("All tables created and data loaded successfully.") |
108 | | - |
109 | | - |
110 | | - |
111 | | -# METADATA ******************** |
112 | | - |
113 | | -# META { |
114 | | -# META "language": "python", |
115 | | -# META "language_group": "synapse_pyspark" |
116 | | -# META } |
| 1 | +# Fabric notebook source |
| 2 | + |
| 3 | +# METADATA ******************** |
| 4 | + |
| 5 | +# META { |
| 6 | +# META "kernel_info": { |
| 7 | +# META "name": "synapse_pyspark" |
| 8 | +# META }, |
| 9 | +# META "dependencies": { |
| 10 | +# META "lakehouse": { |
| 11 | +# META "default_lakehouse": "c185283c-9dd9-4e40-a17c-aa6303e3a2e9", |
| 12 | +# META "default_lakehouse_name": "PatternsLakehouse", |
| 13 | +# META "default_lakehouse_workspace_id": "d7270f11-feba-4990-baa6-d45e47f23737", |
| 14 | +# META "known_lakehouses": [ |
| 15 | +# META { |
| 16 | +# META "id": "c185283c-9dd9-4e40-a17c-aa6303e3a2e9" |
| 17 | +# META } |
| 18 | +# META ] |
| 19 | +# META } |
| 20 | +# META } |
| 21 | +# META } |
| 22 | + |
| 23 | +# CELL ******************** |
| 24 | + |
| 25 | +# Import required libraries for schema definition |
| 26 | +from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType |
| 27 | +from datetime import date, datetime |
| 28 | + |
# Lakehouse table names for the Healthcare sample domain.
# The model is relational: "appointments" rows reference both a patient
# and a doctor by id.
(
    PATIENTS_TABLE,
    DOCTORS_TABLE,
    APPOINTMENTS_TABLE,
) = ("patients", "doctors", "appointments")
| 34 | + |
# -------------------------------------------------------
# Explicit schemas, one per table. Each is handed to spark.createDataFrame
# when the sample rows are loaded, so column types are declared here rather
# than inferred from the Python tuples.
# -------------------------------------------------------

# Patients: id and both name parts are required; the rest are nullable.
# NOTE(review): date_of_birth is a timestamp column although DateType is
# imported at the top of the file — confirm the timestamp type is intended.
patients_schema = (
    StructType()
    .add("patient_id", IntegerType(), False)
    .add("first_name", StringType(), False)
    .add("last_name", StringType(), False)
    .add("date_of_birth", TimestampType(), True)
    .add("gender", StringType(), True)
    .add("phone_number", StringType(), True)
    .add("email", StringType(), True)
)

# Doctors: id and both name parts are required; the rest are nullable.
doctors_schema = (
    StructType()
    .add("doctor_id", IntegerType(), False)
    .add("first_name", StringType(), False)
    .add("last_name", StringType(), False)
    .add("specialty", StringType(), True)
    .add("phone_number", StringType(), True)
    .add("email", StringType(), True)
)

# Appointments: carries patient_id and doctor_id columns alongside its own
# id; only the free-text reason and status are nullable.
appointments_schema = (
    StructType()
    .add("appointment_id", IntegerType(), False)
    .add("patient_id", IntegerType(), False)
    .add("doctor_id", IntegerType(), False)
    .add("appointment_date", TimestampType(), False)
    .add("reason", StringType(), True)
    .add("status", StringType(), True)
)
| 67 | + |
# -------------------------------------------------------
# Load the sample rows with saveAsTable, which writes Delta files AND
# registers the table in the Lakehouse metastore. The default Lakehouse is
# declared in the notebook's META dependencies block, so bare table names
# are used. Per Fabric docs:
#   df.write.mode("overwrite").format("delta").saveAsTable(name)
# -------------------------------------------------------


def _load_table(table_name, rows, schema):
    """Overwrite the Delta table ``table_name`` with ``rows`` and report it.

    Args:
        table_name: Name passed to ``saveAsTable``.
        rows: List of tuples, one per record, ordered like the fields of
            ``schema``.
        schema: ``StructType`` used to build the DataFrame.

    Returns:
        The DataFrame that was written.
    """
    # `spark` is not defined in this file; presumably it is the session
    # object supplied by the notebook runtime.
    df = spark.createDataFrame(rows, schema)
    df.write.format("delta").mode("overwrite").saveAsTable(table_name)
    # The DataFrame is built directly from the in-memory list, so len(rows)
    # is its row count; calling df.count() here would launch another Spark
    # job only to print the same number.
    print(f"Inserted {len(rows)} rows into '{table_name}'.")
    return df


patients_data = [
    (1, "Alice", "Johnson", datetime(1985, 3, 15), "Female", "555-0101", "alice.johnson@email.com"),
    (2, "Bob", "Smith", datetime(1990, 7, 22), "Male", "555-0102", "bob.smith@email.com"),
    (3, "Carol", "Williams", datetime(1978, 11, 8), "Female", "555-0103", "carol.williams@email.com"),
    (4, "David", "Brown", datetime(2001, 1, 30), "Male", "555-0104", "david.brown@email.com"),
    (5, "Eva", "Davis", datetime(1995, 6, 12), "Female", "555-0105", "eva.davis@email.com"),
]
patients_df = _load_table(PATIENTS_TABLE, patients_data, patients_schema)

doctors_data = [
    (1, "Sarah", "Mitchell", "Cardiology", "555-0201", "sarah.mitchell@hospital.com"),
    (2, "James", "Anderson", "Neurology", "555-0202", "james.anderson@hospital.com"),
    (3, "Emily", "Thompson", "Pediatrics", "555-0203", "emily.thompson@hospital.com"),
]
doctors_df = _load_table(DOCTORS_TABLE, doctors_data, doctors_schema)

# Every patient_id (1-5) and doctor_id (1-3) below exists in the rows above.
appointments_data = [
    (1, 1, 1, datetime(2026, 4, 10), "Annual checkup", "Completed"),
    (2, 2, 2, datetime(2026, 4, 11), "Headache consultation", "Completed"),
    (3, 3, 3, datetime(2026, 4, 12), "Child wellness visit", "Scheduled"),
    (4, 4, 1, datetime(2026, 4, 15), "Chest pain follow-up", "Scheduled"),
    (5, 5, 2, datetime(2026, 4, 18), "Neurological evaluation", "Scheduled"),
    (6, 1, 3, datetime(2026, 4, 20), "Flu symptoms", "Scheduled"),
]
appointments_df = _load_table(APPOINTMENTS_TABLE, appointments_data, appointments_schema)

print("All tables created and data loaded successfully.")
| 108 | + |
| 109 | + |
| 110 | + |
| 111 | +# METADATA ******************** |
| 112 | + |
| 113 | +# META { |
| 114 | +# META "language": "python", |
| 115 | +# META "language_group": "synapse_pyspark" |
| 116 | +# META } |
0 commit comments