Skip to content

Commit 46811a9

Browse files
committed
presentation
1 parent 4988e7c commit 46811a9

4 files changed

Lines changed: 125 additions & 19 deletions

File tree

talk/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,5 @@
99
*.fls
1010
*.synctex*
1111
*.vrb
12+
13+

talk/led-cube.png

2.03 MB
Loading

talk/presentation.tex

Lines changed: 117 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1-
\documentclass{beamer}
1+
\documentclass[t]{beamer}
22
\usetheme{Madrid} % or another theme
33
\usepackage{tikz}
44
\usepackage{listings}
55
\usepackage{xcolor}
6+
\usepackage[T1]{fontenc}
7+
8+
\beamertemplatenavigationsymbolsempty
69

710
\definecolor{kwblue}{RGB}{0,102,204} % Keywords: blue
811
\definecolor{varblack}{RGB}{32,32,32} % Variables: near-black
@@ -22,7 +25,8 @@
2225
keywordstyle=[2]\color{typepur},
2326
backgroundcolor=\color{codebackground},
2427
breaklines=true,
25-
showstringspaces=false
28+
showstringspaces=false,
29+
columns=flexible
2630
}
2731

2832
\lstMakeShortInline[backgroundcolor=\color{codebackground}]|
@@ -192,36 +196,130 @@
192196
\end{itemize}
193197
\end{frame}
194198

199+
\begin{frame}{LED Cube Visualization}
200+
Take a 64x64x64 array (\(\sim 250\mathrm{k}\) elements) \lstinline|A|, apply a compute-bound op.
201+
202+
\begin{center}
203+
\includegraphics[width=0.6\textwidth]{led-cube.png}
204+
\end{center}
205+
206+
\end{frame}
195207

196208
\begin{frame}[fragile]{Algorithms}
197209
Take a 64x64x64 array (\(\sim 250\mathrm{k}\) elements) \lstinline|A|, apply a compute-bound op.
198-
199-
\only<1>{
200-
\begin{lstlisting}
210+
\begin{lstlisting}
201211
for(auto&& plane : A) {
202-
for(auto&& row : plane) {
203-
for(auto&& elem : row) {
204-
elem = op(elem);
212+
213+
for(auto&& row : plane) {
214+
for(auto&& elem : row) {
215+
elem = op(elem);
216+
}
205217
}
206-
}
218+
207219
}
208-
\end{lstlisting}
220+
\end{lstlisting}
221+
timing: \textbf{2.6 msec}
222+
\end{frame}
223+
224+
\begin{frame}[fragile]{Algorithms (optimized)}
225+
Take a 64x64x64 array (\(\sim 250\mathrm{k}\) elements) \lstinline|A|, apply a compute-bound op.
226+
\begin{lstlisting}
227+
std::for_each(A.begin(), A.end(),
228+
[](auto&& plane) {
229+
for(auto&& row : plane) {
230+
for(auto&& elem : row) {
231+
elem = op(elem);
232+
}
233+
}
209234
}
235+
);
236+
\end{lstlisting}
237+
timing: \textbf{2.6 msec}
238+
\end{frame}
210239

211-
\only<2>{
212-
\begin{lstlisting}
213-
std::for_each(A.begin(), A.end(), [](auto&& plane) {
214-
for(auto&& row : plane) {
215-
for(auto&& elem : row) {
216-
elem = op(elem);
240+
\begin{frame}[fragile]{Algorithms (optimized)}
241+
Take a 64x64x64 array (\(\sim 250\mathrm{k}\) elements) \lstinline|A|, apply a compute-bound op.
242+
\begin{lstlisting}
243+
std::for_each(std::execution::par, A.begin(), A.end(),
244+
[](auto&& plane) {
245+
for(auto&& row : plane) {
246+
for(auto&& elem : row) {
247+
elem = op(elem);
248+
}
217249
}
218250
}
219-
});
220-
\end{lstlisting}
251+
);
252+
\end{lstlisting}
253+
timing: \textbf{1.4 msec}
254+
\end{frame}
255+
256+
\begin{frame}[fragile]{Algorithms (optimized)}
257+
Take a 64x64x64 array (\(\sim 250\mathrm{k}\) elements) \lstinline|A|, apply a compute-bound op.
258+
\begin{lstlisting}
259+
thrust::for_each(thrust::cuda::par, A.begin(), A.end(),
260+
[] __device__ (auto&& plane) {
261+
for(auto&& row : plane) {
262+
for(auto&& elem : row) {
263+
elem = op(elem);
264+
}
265+
}
221266
}
267+
);
268+
\end{lstlisting}
269+
timing: \pause doesn't compile! (Thrust is unfriendly to proxy objects)
222270

223-
timing: \textbf{0.0026 sec}
271+
But even if it did compile... \includegraphics[width=0.2\textwidth]{led-cube.png}
224272
\end{frame}
225273

274+
\begin{frame}[fragile]{Algorithms (optimized)}
275+
Take a 64x64x64 array (\(\sim 250\mathrm{k}\) elements) \lstinline|A|, apply a compute-bound op.
276+
\begin{lstlisting}
277+
thrust::for_each(thrust::cuda::par, A.elements().begin(), A.elements().end(), [] __device__(auto& e) {
278+
e = op(e);
279+
});
280+
\end{lstlisting}
281+
timing: \textbf{0.3 msec}
282+
\end{frame}
283+
284+
\begin{frame}[fragile]{Algorithms (optimized)}
285+
Take a 64x64x64 array (\(\sim 250\mathrm{k}\) elements) \lstinline|A|, apply a compute-bound op.
286+
\begin{lstlisting}
287+
std::for_each(std::execution::par, A.elements().begin(), A.elements().end(), [] (auto& e) {
288+
e = op(e);
289+
});
290+
\end{lstlisting}
291+
timing: \textbf{0.7 msec}
292+
\end{frame}
293+
294+
\begin{frame}[fragile]{Summary}
295+
296+
\begin{table}
297+
\begin{tabular}{lcc}
298+
\textbf{Algorithm} & \textbf{Time (ms)} & \textbf{Speedup} \\
299+
\hline
300+
Nested loops & 2.6 & 1.0x \\
301+
STL (leading dimension) & 2.6 & 1.0x \\
302+
Parallel STL (leading dimension) & 1.4 & 1.9x \\
303+
\hline
304+
thrust::for\_each ([cuda::par, ] flat elements) & 0.3 & 8.7x \\
305+
std::for\_each (std::par, flat elements) & 0.7 & 3.7x \\
306+
std::for\_each (flat elements) & 3.0 & 0.9x \\
307+
\end{tabular}
308+
\end{table}
309+
310+
\begin{alertblock}{Takeaway}
311+
Prefer algorithms to raw loops,
312+
prefer to fuse loops
313+
\end{alertblock}
314+
315+
\begin{lstlisting}
316+
algorithm(... A.elements() ...);
317+
\end{lstlisting}
318+
319+
\end{frame}
320+
321+
322+
\begin{frame}[fragile]{...but I need my 3D structure}
323+
\end{frame}
226324

227325
\end{document}

test/for_each.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,12 @@ auto main() -> int { // NOLINT(bugprone-exception-escape)
105105
#endif
106106
#endif
107107
#endif
108+
{
109+
auto_timer const _{"std::for_each(elements)"};
110+
std::for_each(cpu.elements().begin(), cpu.elements().end(), [](auto&& elem) {
111+
elem += std::sqrt(std::pow(elem, 1.5) + std::sin(elem));
112+
});
113+
}
108114
}
109115

110116
return boost::report_errors();

0 commit comments

Comments
 (0)