Skip to content

Commit fb306a8

Browse files
authored
Merge pull request #1 from mag1cfrog/first-lesson
test develop on the first lesson
2 parents 6d1dbbc + c38c0d2 commit fb306a8

9 files changed

Lines changed: 191 additions & 62 deletions

File tree

docs/Gemfile

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
source "https://rubygems.org"
22

3-
gem "jekyll", "~> 4.4" # Or the version compatible with GitHub Pages if you know it
4-
gem "minima", "~> 2.5" # The theme specified in _config.yml
3+
gem "jekyll", "~> 4.4"
4+
gem "jekyll-theme-chirpy", "~> 7.3"
55

6-
# Add other Jekyll plugins you might use here, for example:
7-
# gem "jekyll-feed", "~> 0.12"
8-
# gem "jekyll-seo-tag", "~> 2.7"
9-
10-
# If you're using GitHub Pages, it's often useful to include the github-pages gem
11-
# group :jekyll_plugins do
12-
# gem "github-pages", group: :jekyll_plugins
13-
# end
6+
# Chirpy theme plugins
7+
gem "jekyll-paginate"
8+
gem "jekyll-redirect-from"
9+
gem "jekyll-archives"
10+
gem "jekyll-sitemap"
11+
gem "jekyll-include-cache"

docs/Gemfile.lock

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,28 @@ GEM
7373
safe_yaml (~> 1.0)
7474
terminal-table (>= 1.8, < 4.0)
7575
webrick (~> 1.7)
76+
jekyll-archives (2.3.0)
77+
jekyll (>= 3.6, < 5.0)
7678
jekyll-feed (0.17.0)
7779
jekyll (>= 3.7, < 5.0)
80+
jekyll-include-cache (0.2.1)
81+
jekyll (>= 3.7, < 5.0)
82+
jekyll-paginate (1.1.0)
83+
jekyll-redirect-from (0.16.0)
84+
jekyll (>= 3.3, < 5.0)
7885
jekyll-sass-converter (3.1.0)
7986
sass-embedded (~> 1.75)
8087
jekyll-seo-tag (2.8.0)
8188
jekyll (>= 3.8, < 5.0)
89+
jekyll-sitemap (1.4.0)
90+
jekyll (>= 3.7, < 5.0)
91+
jekyll-theme-chirpy (7.3.0)
92+
jekyll (~> 4.3)
93+
jekyll-archives (~> 2.2)
94+
jekyll-include-cache (~> 0.2)
95+
jekyll-paginate (~> 1.1)
96+
jekyll-seo-tag (~> 2.8)
97+
jekyll-sitemap (~> 1.4)
8298
jekyll-watch (2.2.1)
8399
listen (~> 3.0)
84100
json (2.12.2)
@@ -91,10 +107,6 @@ GEM
91107
rb-fsevent (~> 0.10, >= 0.10.3)
92108
rb-inotify (~> 0.9, >= 0.9.10)
93109
mercenary (0.4.0)
94-
minima (2.5.2)
95-
jekyll (>= 3.5, < 5.0)
96-
jekyll-feed (~> 0.9)
97-
jekyll-seo-tag (~> 2.1)
98110
pathutil (0.16.2)
99111
forwardable-extended (~> 2.6)
100112
public_suffix (6.0.2)
@@ -164,7 +176,13 @@ PLATFORMS
164176

165177
DEPENDENCIES
166178
jekyll (~> 4.4)
167-
minima (~> 2.5)
179+
jekyll-archives
180+
jekyll-feed
181+
jekyll-include-cache
182+
jekyll-paginate
183+
jekyll-redirect-from
184+
jekyll-sitemap
185+
jekyll-theme-chirpy (~> 7.3)
168186

169187
BUNDLED WITH
170188
2.6.9

docs/_config.yml

Lines changed: 98 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,99 @@
1-
title: "Spark Tuning Notes"
2-
description: "Reflective lessons learned tuning Apache Spark"
3-
baseurl: "/spark-tuning-notes"
1+
# Site settings
2+
lang: en
3+
title: Spark Tuning Notes
4+
tagline: Reflective lessons learned tuning Apache Spark
5+
description: >-
6+
A collection of practical lessons learned while optimizing Apache Spark jobs,
7+
covering real-world performance challenges and their solutions.
8+
49
url: "https://mag1cfrog.github.io"
5-
theme: minima
10+
baseurl: "/spark-tuning-notes"
11+
12+
# Theme
13+
theme: jekyll-theme-chirpy
14+
15+
# Social
16+
github:
17+
username: mag1cfrog
18+
19+
social:
20+
name: Hanbo Wang
21+
email: harrywong2017@gmail.com
22+
links:
23+
- https://github.com/mag1cfrog
24+
25+
# Site Author
26+
author:
27+
name: Hanbo Wang
28+
29+
github: mag1cfrog
30+
bio: Data Engineer specializing in Apache Spark optimization
31+
32+
# Build settings
33+
timezone: America/Los_Angeles
34+
35+
avatar: "/assets/img/avatar.png"
36+
37+
paginate: 10
38+
paginate_path: "/posts/page:num/"
39+
40+
# Plugins
41+
plugins:
42+
- jekyll-paginate
43+
- jekyll-redirect-from
44+
- jekyll-archives
45+
- jekyll-sitemap
46+
- jekyll-include-cache
47+
48+
# Exclude from processing.
49+
# The following items will not be processed, by default.
50+
# Any item listed under the `exclude:` key here will be automatically added to
51+
# the internal list.
52+
#
53+
# Excluded items can be processed by explicitly listing the directories or
54+
# their entries' file path in the `include:` list.
55+
#
56+
exclude:
57+
- .sass-cache/
58+
- .jekyll-cache/
59+
- gemfiles/
60+
- Gemfile
61+
- Gemfile.lock
62+
- node_modules/
63+
- vendor/bundle/
64+
- vendor/cache/
65+
- vendor/gems/
66+
- vendor/ruby/
67+
68+
future: true
69+
70+
# Archives
71+
jekyll-archives:
72+
enabled: [categories, tags]
73+
layouts:
74+
category: category
75+
tag: tag
76+
permalinks:
77+
category: /categories/:name/
78+
tag: /tags/:name/
79+
80+
# Defaults
81+
defaults:
82+
- scope:
83+
path: ""
84+
type: posts
85+
values:
86+
layout: post
87+
comments: false
88+
toc: true
89+
permalink: /posts/:title/
90+
- scope:
91+
path: _drafts
92+
values:
93+
comments: false
94+
- scope:
95+
path: ""
96+
type: tabs
97+
values:
98+
layout: page
99+
permalink: /:title/

docs/_layouts/default.html

Lines changed: 0 additions & 20 deletions
This file was deleted.

docs/_posts/2025-06-13-first-lesson.md

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,38 @@
11
---
2-
layout: default
32
title: "First Lesson: Partition vs Broadcast"
4-
date: 2025-06-13
3+
date: 2025-06-13 12:00:00 -0800
4+
categories:
5+
- spark
6+
- optimization
7+
tags:
8+
- performance
9+
excerpt: "How switching from partitioned joins to broadcast joins reduced shuffle writes from 8GB to 500MB"
510
---
611

7-
## First Lesson: Partition vs Broadcast
812

9-
When joining a large 10 GB DataFrame with a small 200 MB lookup table, I discovered that:
1013

11-
- Using a **broadcast join** with `spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "300MB")` dropped shuffle write from 8 GB to 500 MB.
12-
- Conversely, relying on default partitioned joins caused redundant shuffles across executors.
14+
We have a Spark job on Databricks that joins an enormous table (player tracking records of coordinates on frames) with a dimension table (position number to player UID mapping), and then de-duplicates the result using a window function.
15+
16+
## The Deduplication Challenge
17+
18+
The core logic involved a window function to handle duplicate records (shown here as pseudo-SQL):
19+
20+
```sql
21+
WITH ranked AS (
22+
SELECT
23+
tracking.*,
24+
lineup.fielder_id,
25+
lineup.position_alpha,
26+
ROW_NUMBER() OVER (
27+
PARTITION BY game_id, pitch_uid, position_num, event_time
28+
ORDER BY processed_year DESC, processed_month DESC, processed_day DESC
29+
) AS rn
30+
FROM hawkeye_tracking tracking
31+
JOIN hawkeye_lineup lineup ON (...)
32+
)
33+
SELECT * EXCEPT (rn)
34+
FROM ranked
35+
WHERE rn = 1
36+
```
37+
1338

14-
<aside class="callout">
15-
💡 **Tip:** Always tune `spark.sql.shuffle.partitions = executors * cores_per_executor` after switching join strategies.
16-
</aside>

docs/assets/css/style.css

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,35 @@ body {
55
margin: auto;
66
padding: 1rem;
77
}
8-
pre {
9-
background: #f5f5f5;
10-
padding: 0.5rem;
11-
overflow-x: auto;
8+
9+
/* Override the theme's pre styling with more specific selectors */
10+
.content pre[class*="language-"] {
11+
background: #f5f5f5 !important;
12+
border: 1px solid #e0e0e0 !important;
13+
border-radius: 6px !important;
14+
padding: 1em !important;
15+
margin: 1.5em 0 !important;
16+
line-height: 1.5 !important;
1217
}
18+
19+
/* Also target the code element inside pre */
20+
.content pre[class*="language-"] code {
21+
background: none !important;
22+
padding: 0 !important;
23+
color: #333333 !important;
24+
font-size: 0.9em !important;
25+
}
26+
27+
/* Fallback for any plain pre tags */
28+
.content pre:not([class]) {
29+
background: #f5f5f5 !important;
30+
border: 1px solid #e0e0e0 !important;
31+
border-radius: 6px !important;
32+
padding: 1em !important;
33+
margin: 1.5em 0 !important;
34+
line-height: 1.5 !important;
35+
}
36+
1337
.callout {
1438
border-left: 4px solid #007acc;
1539
background: #f0f8ff;

docs/assets/img/avatar.png

228 KB
Loading

docs/index.html

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
---
2+
layout: home
3+
---

docs/index.md

Lines changed: 0 additions & 10 deletions
This file was deleted.

0 commit comments

Comments
 (0)