|
1 | | -\documentclass{beamer} |
| 1 | +\documentclass[t]{beamer} |
2 | 2 | \usetheme{Madrid} % or another theme |
3 | 3 | \usepackage{tikz} |
4 | 4 | \usepackage{listings} |
5 | 5 | \usepackage{xcolor} |
| 6 | +\usepackage[T1]{fontenc} |
| 7 | + |
| 8 | +\beamertemplatenavigationsymbolsempty |
6 | 9 |
|
7 | 10 | \definecolor{kwblue}{RGB}{0,102,204} % Keywords: blue |
8 | 11 | \definecolor{varblack}{RGB}{32,32,32} % Variables: near-black |
|
22 | 25 | keywordstyle=[2]\color{typepur}, |
23 | 26 | backgroundcolor=\color{codebackground}, |
24 | 27 | breaklines=true, |
25 | | - showstringspaces=false |
| 28 | + showstringspaces=false, |
| 29 | + columns=flexible |
26 | 30 | } |
27 | 31 |
|
28 | 32 | \lstMakeShortInline[backgroundcolor=\color{codebackground}]| |
|
192 | 196 | \end{itemize} |
193 | 197 | \end{frame} |
194 | 198 |
|
| 199 | +\begin{frame}[fragile]{LED Cube Visualization} |
| 200 | +Take a 64x64x64 array (\(\sim 250\mathrm{k}\) elements) \lstinline|A|, apply a compute-bound op. |
| 201 | + |
| 202 | +\begin{center} |
| 203 | +\includegraphics[width=0.6\textwidth]{led-cube.png} |
| 204 | +\end{center} |
| 205 | + |
| 206 | +\end{frame} |
195 | 207 |
|
196 | 208 | \begin{frame}[fragile]{Algorithms} |
197 | 209 | Take a 64x64x64 array (\(\sim 250\mathrm{k}\) elements) \lstinline|A|, apply a compute-bound op. |
198 | | - |
199 | | - \only<1>{ |
200 | | - \begin{lstlisting} |
| 210 | + \begin{lstlisting} |
201 | 211 | for(auto&& plane : A) { |
202 | | - for(auto&& row : plane) { |
203 | | - for(auto&& elem : row) { |
204 | | - elem = op(elem); |
| 212 | + |
| 213 | + for(auto&& row : plane) { |
| 214 | + for(auto&& elem : row) { |
| 215 | + elem = op(elem); |
| 216 | + } |
205 | 217 | } |
206 | | - } |
| 218 | + |
207 | 219 | } |
208 | | - \end{lstlisting} |
| 220 | + \end{lstlisting} |
| 221 | + timing: \textbf{2.6 msec} |
| 222 | +\end{frame} |
| 223 | + |
| 224 | +\begin{frame}[fragile]{Algorithms (optimized)} |
| 225 | + Take a 64x64x64 array (\(\sim 250\mathrm{k}\) elements) \lstinline|A|, apply a compute-bound op. |
| 226 | + \begin{lstlisting} |
| 227 | +std::for_each(A.begin(), A.end(), |
| 228 | + [](auto&& plane) { |
| 229 | + for(auto&& row : plane) { |
| 230 | + for(auto&& elem : row) { |
| 231 | + elem = op(elem); |
| 232 | + } |
| 233 | + } |
209 | 234 | } |
| 235 | +); |
| 236 | + \end{lstlisting} |
| 237 | + timing: \textbf{2.6 msec} |
| 238 | +\end{frame} |
210 | 239 |
|
211 | | - \only<2>{ |
212 | | - \begin{lstlisting} |
213 | | -std::for_each(A.begin(), A.end(), [](auto&& plane) { |
214 | | - for(auto&& row : plane) { |
215 | | - for(auto&& elem : row) { |
216 | | - elem = op(elem); |
| 240 | +\begin{frame}[fragile]{Algorithms (optimized)} |
| 241 | + Take a 64x64x64 array (\(\sim 250\mathrm{k}\) elements) \lstinline|A|, apply a compute-bound op. |
| 242 | + \begin{lstlisting} |
| 243 | +std::for_each(std::execution::par, A.begin(), A.end(), |
| 244 | + [](auto&& plane) { |
| 245 | + for(auto&& row : plane) { |
| 246 | + for(auto&& elem : row) { |
| 247 | + elem = op(elem); |
| 248 | + } |
217 | 249 | } |
218 | 250 | } |
219 | | -}); |
220 | | - \end{lstlisting} |
| 251 | +); |
| 252 | + \end{lstlisting} |
| 253 | + timing: \textbf{1.4 msec} |
| 254 | +\end{frame} |
| 255 | + |
| 256 | +\begin{frame}[fragile]{Algorithms (optimized)} |
| 257 | + Take a 64x64x64 array (\(\sim 250\mathrm{k}\) elements) \lstinline|A|, apply a compute-bound op. |
| 258 | + \begin{lstlisting} |
| 259 | +thrust::for_each(thrust::cuda::par, A.begin(), A.end(), |
| 260 | + [](auto&& plane) __device__ { |
| 261 | + for(auto&& row : plane) { |
| 262 | + for(auto&& elem : row) { |
| 263 | + elem = op(elem); |
| 264 | + } |
| 265 | + } |
221 | 266 | } |
| 267 | +); |
| 268 | + \end{lstlisting} |
| 269 | + timing: \pause doesn't compile! (Thrust is unfriendly to proxy objects) |
222 | 270 |
|
223 | | - timing: \textbf{0.0026 sec} |
| 271 | + But even if it did compile... \includegraphics[width=0.2\textwidth]{led-cube.png} |
224 | 272 | \end{frame} |
225 | 273 |
|
| 274 | +\begin{frame}[fragile]{Algorithms (optimized)} |
| 275 | + Take a 64x64x64 array (\(\sim 250\mathrm{k}\) elements) \lstinline|A|, apply a compute-bound op. |
| 276 | + \begin{lstlisting} |
| 277 | +thrust::for_each(thrust::cuda::par, A.elements().begin(), A.elements().end(), [] __device__(auto& e) { |
| 278 | + e = op(e); |
| 279 | +}); |
| 280 | + \end{lstlisting} |
| 281 | + timing: \textbf{0.3 msec} |
| 282 | +\end{frame} |
| 283 | + |
| 284 | +\begin{frame}[fragile]{Algorithms (optimized)} |
| 285 | + Take a 64x64x64 array (\(\sim 250\mathrm{k}\) elements) \lstinline|A|, apply a compute-bound op. |
| 286 | + \begin{lstlisting} |
| 287 | +std::for_each(std::execution::par, A.elements().begin(), A.elements().end(), [] (auto& e) { |
| 288 | + e = op(e); |
| 289 | +}); |
| 290 | + \end{lstlisting} |
| 291 | + timing: \textbf{0.7 msec} |
| 292 | +\end{frame} |
| 293 | + |
| 294 | +\begin{frame}[fragile]{Summary} |
| 295 | + |
| 296 | +\begin{table} |
| 297 | +\begin{tabular}{lcc} |
| 298 | +\textbf{Algorithm} & \textbf{Time (ms)} & \textbf{Speedup} \\ |
| 299 | +\hline |
| 300 | +Nested loops & 2.6 & 1.0x \\ |
| 301 | +STL (leading dimension) & 2.6 & 1.0x \\ |
| 302 | +Parallel STL (leading dimension) & 1.4 & 1.9x \\ |
| 303 | +\hline |
| 304 | +thrust::for\_each ([cuda::par, ] flat elements) & 0.3 & 8.7x \\ |
| 305 | +std::for\_each (std::par, flat elements) & 0.7 & 3.7x \\ |
| 306 | +std::for\_each (flat elements) & 3.0 & 0.9x \\ |
| 307 | +\end{tabular} |
| 308 | +\end{table} |
| 309 | + |
| 310 | +\begin{alertblock}{Takeaway} |
| 311 | + Prefer algorithms to raw loops, |
| 312 | + prefer to fuse loops |
| 313 | +\end{alertblock} |
| 314 | + |
| 315 | + \begin{lstlisting} |
| 316 | + algorithm(... A.elements() ...); |
| 317 | + \end{lstlisting} |
| 318 | + |
| 319 | +\end{frame} |
| 320 | + |
| 321 | + |
| 322 | +\begin{frame}[fragile]{...but I need my 3D structure} |
| 323 | +\end{frame} |
226 | 324 |
|
227 | 325 | \end{document} |
0 commit comments