Merge pull request #1 from mag1cfrog/first-lesson

mag1cfrog · web-flow · commit fb306a82b53c · 2025-06-16T10:43:08.000-07:00
test develop on the first lesson
diff --git a/docs/Gemfile b/docs/Gemfile
@@ -1,13 +1,11 @@
 source "https://rubygems.org"
 
-gem "jekyll", "~> 4.4" # Or the version compatible with GitHub Pages if you know it
-gem "minima", "~> 2.5" # The theme specified in _config.yml
+gem "jekyll", "~> 4.4"
+gem "jekyll-theme-chirpy", "~> 7.3"
 
-# Add other Jekyll plugins you might use here, for example:
-# gem "jekyll-feed", "~> 0.12"
-# gem "jekyll-seo-tag", "~> 2.7"
-
-# If you're using GitHub Pages, it's often useful to include the github-pages gem
-# group :jekyll_plugins do
-#   gem "github-pages", group: :jekyll_plugins
-# end
+# Chirpy theme plugins
+gem "jekyll-paginate"
+gem "jekyll-redirect-from"
+gem "jekyll-archives"
+gem "jekyll-sitemap"
+gem "jekyll-include-cache"
diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock
@@ -73,12 +73,28 @@ GEM
       safe_yaml (~> 1.0)
       terminal-table (>= 1.8, < 4.0)
       webrick (~> 1.7)
+    jekyll-archives (2.3.0)
+      jekyll (>= 3.6, < 5.0)
     jekyll-feed (0.17.0)
       jekyll (>= 3.7, < 5.0)
+    jekyll-include-cache (0.2.1)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-paginate (1.1.0)
+    jekyll-redirect-from (0.16.0)
+      jekyll (>= 3.3, < 5.0)
     jekyll-sass-converter (3.1.0)
       sass-embedded (~> 1.75)
     jekyll-seo-tag (2.8.0)
       jekyll (>= 3.8, < 5.0)
+    jekyll-sitemap (1.4.0)
+      jekyll (>= 3.7, < 5.0)
+    jekyll-theme-chirpy (7.3.0)
+      jekyll (~> 4.3)
+      jekyll-archives (~> 2.2)
+      jekyll-include-cache (~> 0.2)
+      jekyll-paginate (~> 1.1)
+      jekyll-seo-tag (~> 2.8)
+      jekyll-sitemap (~> 1.4)
     jekyll-watch (2.2.1)
       listen (~> 3.0)
     json (2.12.2)
@@ -91,10 +107,6 @@ GEM
       rb-fsevent (~> 0.10, >= 0.10.3)
       rb-inotify (~> 0.9, >= 0.9.10)
     mercenary (0.4.0)
-    minima (2.5.2)
-      jekyll (>= 3.5, < 5.0)
-      jekyll-feed (~> 0.9)
-      jekyll-seo-tag (~> 2.1)
     pathutil (0.16.2)
       forwardable-extended (~> 2.6)
     public_suffix (6.0.2)
@@ -164,7 +176,13 @@ PLATFORMS
 
 DEPENDENCIES
   jekyll (~> 4.4)
-  minima (~> 2.5)
+  jekyll-archives
+  jekyll-feed
+  jekyll-include-cache
+  jekyll-paginate
+  jekyll-redirect-from
+  jekyll-sitemap
+  jekyll-theme-chirpy (~> 7.3)
 
 BUNDLED WITH
    2.6.9
diff --git a/docs/_config.yml b/docs/_config.yml
@@ -1,5 +1,99 @@
-title: "Spark Tuning Notes"
-description: "Reflective lessons learned tuning Apache Spark"
-baseurl: "/spark-tuning-notes"
+# Site settings
+lang: en
+title: Spark Tuning Notes
+tagline: Reflective lessons learned tuning Apache Spark
+description: >-
+  A collection of practical lessons learned while optimizing Apache Spark jobs,
+  covering real-world performance challenges and their solutions.
+
 url: "https://mag1cfrog.github.io"
-theme: minima
+baseurl: "/spark-tuning-notes"
+
+# Theme
+theme: jekyll-theme-chirpy
+
+# Social
+github:
+  username: mag1cfrog
+
+social:
+  name: Hanbo Wang
+  email: harrywong2017@gmail.com
+  links:
+    - https://github.com/mag1cfrog
+
+# Site Author
+author:
+  name: Hanbo Wang
+
+  github: mag1cfrog
+  bio: Data Engineer specializing in Apache Spark optimization
+
+# Build settings
+timezone: America/Los_Angeles
+
+avatar: "/assets/img/avatar.png"
+
+paginate: 10
+paginate_path: "/posts/page:num/"
+
+# Plugins
+plugins:
+  - jekyll-paginate
+  - jekyll-redirect-from
+  - jekyll-archives
+  - jekyll-sitemap
+  - jekyll-include-cache
+
+# Exclude from processing.
+# The following items will not be processed, by default.
+# Any item listed under the `exclude:` key here will be automatically added to
+# the internal list.
+#
+# Excluded items can be processed by explicitly listing the directories or
+# their entries' file path in the `include:` list.
+#
+exclude:
+  - .sass-cache/
+  - .jekyll-cache/
+  - gemfiles/
+  - Gemfile
+  - Gemfile.lock
+  - node_modules/
+  - vendor/bundle/
+  - vendor/cache/
+  - vendor/gems/
+  - vendor/ruby/
+
+future: true
+
+# Archives
+jekyll-archives:
+  enabled: [categories, tags]
+  layouts:
+    category: category
+    tag: tag
+  permalinks:
+    category: /categories/:name/
+    tag: /tags/:name/
+
+# Defaults
+defaults:
+  - scope:
+      path: ""
+      type: posts
+    values:
+      layout: post
+      comments: false
+      toc: true
+      permalink: /posts/:title/
+  - scope:
+      path: _drafts
+    values:
+      comments: false
+  - scope:
+      path: ""
+      type: tabs
+    values:
+      layout: page
+      permalink: /:title/
diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html
diff --git a/docs/_posts/2025-06-13-first-lesson.md b/docs/_posts/2025-06-13-first-lesson.md
@@ -1,16 +1,38 @@
 ---
-layout: default
 title: "First Lesson: Partition vs Broadcast"
-date: 2025-06-13
+date: 2025-06-13 12:00:00 -0800
+categories:
+  - spark
+  - optimization
+tags:
+  - performance
+excerpt: "How switching from partitioned joins to broadcast joins reduced shuffle writes from 8GB to 500MB"
 ---
 
-## First Lesson: Partition vs Broadcast
 
-When joining a large 10 GB DataFrame with a small 200 MB lookup table, I discovered that:
 
-- Using a **broadcast join** with `spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "300MB")` dropped shuffle write from 8 GB to 500 MB.
-- Conversely, relying on default partitioned joins caused redundant shuffles across executors.
+We have a Spark job on Databricks that joins an enormous table (player tracking records of coordinates per frame) with a dimension table (position number to player UID mapping), and then de-duplicates the result using a window function.
+
+## The Deduplication Challenge
+
+The core logic involved a window function to handle duplicate records (shown here as pseudo-SQL):
+
+```sql
+WITH ranked AS (
+  SELECT 
+    tracking.*,
+    lineup.fielder_id,
+    lineup.position_alpha,
+    ROW_NUMBER() OVER (
+      PARTITION BY game_id, pitch_uid, position_num, event_time
+      ORDER BY processed_year DESC, processed_month DESC, processed_day DESC
+    ) AS rn
+  FROM hawkeye_tracking tracking
+  JOIN hawkeye_lineup lineup ON (...)
+)
+SELECT * EXCEPT (rn)
+FROM ranked 
+WHERE rn = 1
+```
+
 
-<aside class="callout">
-💡 **Tip:** Always tune `spark.sql.shuffle.partitions = executors * cores_per_executor` after switching join strategies.
-</aside>
diff --git a/docs/assets/css/style.css b/docs/assets/css/style.css
@@ -5,11 +5,35 @@ body {
   margin: auto;
   padding: 1rem;
 }
-pre {
-  background: #f5f5f5;
-  padding: 0.5rem;
-  overflow-x: auto;
+
+/* Override the theme's pre styling with more specific selectors */
+.content pre[class*="language-"] {
+  background: #f5f5f5 !important;
+  border: 1px solid #e0e0e0 !important;
+  border-radius: 6px !important;
+  padding: 1em !important;
+  margin: 1.5em 0 !important;
+  line-height: 1.5 !important;
 }
+
+/* Also target the code element inside pre */
+.content pre[class*="language-"] code {
+  background: none !important;
+  padding: 0 !important;
+  color: #333333 !important;
+  font-size: 0.9em !important;
+}
+
+/* Fallback for any plain pre tags */
+.content pre:not([class]) {
+  background: #f5f5f5 !important;
+  border: 1px solid #e0e0e0 !important;
+  border-radius: 6px !important;
+  padding: 1em !important;
+  margin: 1.5em 0 !important;
+  line-height: 1.5 !important;
+}
+
 .callout {
   border-left: 4px solid #007acc;
   background: #f0f8ff;
diff --git a/docs/assets/img/avatar.png b/docs/assets/img/avatar.png
diff --git a/docs/index.html b/docs/index.html
@@ -0,0 +1,3 @@
+---
+layout: home
+---
diff --git a/docs/index.md b/docs/index.md