diff --git a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu index e6f1798f6ef72..a6b97286d1733 100644 --- a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu @@ -123,7 +123,7 @@ template __global__ void EmbedLayerNormKernel( int hidden_size, const int* input_ids, const int* segment_ids, const T* beta, const T* gamma, const T* word_embedding, const T* position_embedding, const T* segment_embedding, - const T epsilon, T* output, T* embedding_sum, const int* position_ids, const bool broadcast_position_ids) { + float epsilon, T* output, T* embedding_sum, const int* position_ids, const bool broadcast_position_ids) { KeyValuePairSum pair_sum; // 1. lookup word and segment of the block // blockIdx.x = position in the sequence @@ -134,7 +134,7 @@ __global__ void EmbedLayerNormKernel( __shared__ int segment_id; __shared__ int position_id; - const T rld = T(1.f / hidden_size); + const float rld = 1.f / hidden_size; const int sequence_position = blockIdx.y * gridDim.x + blockIdx.x; if (threadIdx.x == 0) { word_id = input_ids[sequence_position]; @@ -162,7 +162,7 @@ __global__ void EmbedLayerNormKernel( // the output offset is given by b * (sequence_length * hidden_size) + s * hidden_size const int output_offset = sequence_position * hidden_size; - cub::KeyValuePair thread_data(0, 0); + cub::KeyValuePair thread_data(0.f, 0.f); for (int it = threadIdx.x; it < hidden_size; it += TPB) { const T w(word_embedding[word_offset + it]); @@ -177,8 +177,9 @@ __global__ void EmbedLayerNormKernel( embedding_sum[output_offset + it] = val; } - const T rldval = rld * val; - thread_data = pair_sum(thread_data, cub::KeyValuePair(rldval, rldval * val)); + const float val_f = static_cast(val); + const float rldval = rld * val_f; + thread_data = pair_sum(thread_data, cub::KeyValuePair(rldval, rldval * val_f)); } // 3. layer norm on the sum @@ -190,7 +191,7 @@ Status EmbedSkipLayerNorm( cudaStream_t stream, int hidden_size, int batch_size, int sequence_length, const int* input_ids, const int* segment_ids, const T* beta, const T* gamma, const T* word_embedding, const T* position_embedding, const T* segment_embedding, - const T epsilon, T* output, T* embedding_sum, const int* position_ids, + float epsilon, T* output, T* embedding_sum, const int* position_ids, const bool broadcast_position_ids) { constexpr int tpb = 256; const dim3 grid(sequence_length, batch_size, 1); @@ -238,7 +239,7 @@ Status LaunchEmbedLayerNormKernel( stream, hidden_size, batch_size, sequence_length, input_ids, segment_ids, reinterpret_cast(beta), reinterpret_cast(gamma), reinterpret_cast(word_embedding), reinterpret_cast(position_embedding), - reinterpret_cast(segment_embedding), __float2half_rn(epsilon), + reinterpret_cast(segment_embedding), epsilon, reinterpret_cast(output), reinterpret_cast(embedding_sum), position_ids, broadcast_position_ids); } else { diff --git a/onnxruntime/contrib_ops/cuda/bert/layer_norm.cuh b/onnxruntime/contrib_ops/cuda/bert/layer_norm.cuh index e5af4aac935ad..0ed2c125405e3 100644 --- a/onnxruntime/contrib_ops/cuda/bert/layer_norm.cuh +++ b/onnxruntime/contrib_ops/cuda/bert/layer_norm.cuh @@ -74,97 +74,80 @@ struct KeyValuePairSum { const cub::KeyValuePair& b) { return cub::KeyValuePair(a.key + b.key, a.value + b.value); } - - __device__ inline cub::KeyValuePair operator()(const cub::KeyValuePair& a, - const cub::KeyValuePair& b) { - const half2 a2 = __halves2half2(a.key, a.value); - const half2 b2 = __halves2half2(b.key, b.value); - const half2 res = AddHalf2(a2, b2); - return cub::KeyValuePair(__low2half(res), __high2half(res)); - } - - __device__ inline cub::KeyValuePair operator()(const cub::KeyValuePair& a, - const cub::KeyValuePair& b) { - return cub::KeyValuePair(AddHalf2(a.key, b.key), AddHalf2(a.value, b.value)); - } - - __device__ inline cub::KeyValuePair operator()(const cub::KeyValuePair& a, - const cub::KeyValuePair& b) { - const nv_bfloat162 a2 = __halves2bfloat162(a.key, a.value); - const nv_bfloat162 b2 = __halves2bfloat162(b.key, b.value); - const nv_bfloat162 res = AddHalf2(a2, b2); - return cub::KeyValuePair(__low2bfloat16(res), __high2bfloat16(res)); - } }; template __device__ inline void LayerNorm( - const cub::KeyValuePair& thread_data, const int ld, const int offset, const T* beta, - const T* gamma, const T epsilon, T* output) { + const cub::KeyValuePair& thread_data, const int ld, const int offset, const T* beta, + const T* gamma, const float epsilon, T* output) { // Assuming thread_data is already divided by ld + // Uses fp32 accumulation for mean/variance to avoid overflow in fp16/bf16. - using BlockReduce = cub::BlockReduce, TPB>; + using BlockReduce = cub::BlockReduce, TPB>; __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ T mu; // mean - __shared__ T rsigma; // 1 / std.dev. + __shared__ float mu; // mean + __shared__ float rsigma; // 1 / std.dev. KeyValuePairSum pair_sum; const auto sum_kv = BlockReduce(temp_storage).Reduce(thread_data, pair_sum); if (threadIdx.x == 0) { mu = sum_kv.key; - rsigma = Rsqrt(sum_kv.value - mu * mu + epsilon); + rsigma = rsqrtf(sum_kv.value - mu * mu + epsilon); } __syncthreads(); for (int i = threadIdx.x; i < ld; i += TPB) { const int idx = offset + i; - const T val = output[idx]; - const T g(gamma[i]); - const T b = (nullptr == beta) ? (T)0 : beta[i]; - output[idx] = g * (val - mu) * rsigma + b; + const float val = static_cast(output[idx]); + const float g = static_cast(gamma[i]); + const float b = (nullptr == beta) ? 0.f : static_cast(beta[i]); + output[idx] = static_cast(g * (val - mu) * rsigma + b); } } template __device__ inline void SimplifiedLayerNorm( - const T& thread_data, const int ld, const int offset, const T* gamma, const T epsilon, T* output) { + const float& thread_data, const int ld, const int offset, const T* gamma, const float epsilon, T* output) { // Assuming thread_data is already divided by ld + // Uses fp32 accumulation to avoid overflow in fp16/bf16. - using BlockReduce = cub::BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ T rsigma; // 1 / std.dev. + __shared__ float rsigma; // 1 / std.dev. - const T sum = BlockReduce(temp_storage).Sum(thread_data); + const float sum = BlockReduce(temp_storage).Sum(thread_data); if (threadIdx.x == 0) { - rsigma = Rsqrt(sum + epsilon); + rsigma = rsqrtf(sum + epsilon); } __syncthreads(); for (int i = threadIdx.x; i < ld; i += TPB) { const int idx = offset + i; - const T val = output[idx]; - const T g(gamma[i]); - output[idx] = g * val * rsigma; + const float val = static_cast(output[idx]); + const float g = static_cast(gamma[i]); + output[idx] = static_cast(g * val * rsigma); } } template -__device__ inline void LayerNormSmall(const T* input_v, const cub::KeyValuePair& thread_data, +__device__ inline void LayerNormSmall(const T* input_v, const cub::KeyValuePair& thread_data, const int ld, const int idx, const T* beta, const T* gamma, - const T epsilon, T* output) { + const float epsilon, T* output) { // Assuming thread_data is already divided by ld // Small settings: the block covers the leading dimension TPB >= ld. The input // value is available in a register + // Uses fp32 accumulation for mean/variance to avoid overflow in fp16/bf16. using VecT = aligned_vector; - using BlockReduce = cub::BlockReduce, TPB>; + using BlockReduce = cub::BlockReduce, TPB>; __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ T mu; // mean - __shared__ T rsigma; // 1 / std.dev. - T beta_v[ILP], gamma_v[ILP], output_v[ILP]; + __shared__ float mu; // mean + __shared__ float rsigma; // 1 / std.dev. + T gamma_v[ILP], output_v[ILP]; const bool is_valid = ILP * threadIdx.x < ld; + T beta_v[ILP]; if (is_valid) { if (beta != nullptr) { VecT* beta_val = reinterpret_cast(&beta_v); @@ -176,20 +159,21 @@ __device__ inline void LayerNormSmall(const T* input_v, const cub::KeyValuePair< } KeyValuePairSum pair_sum; - const cub::KeyValuePair sum_kv = BlockReduce(temp_storage).Reduce(thread_data, pair_sum); + const cub::KeyValuePair sum_kv = BlockReduce(temp_storage).Reduce(thread_data, pair_sum); if (threadIdx.x == 0) { mu = sum_kv.key; - rsigma = Rsqrt(sum_kv.value - mu * mu + epsilon); + rsigma = rsqrtf(sum_kv.value - mu * mu + epsilon); } __syncthreads(); if (is_valid) { #pragma unroll for (int i = 0; i < ILP; i++) { - output_v[i] = (beta != nullptr) - ? gamma_v[i] * (input_v[i] - mu) * rsigma + beta_v[i] - : gamma_v[i] * (input_v[i] - mu) * rsigma; + const float in_f = static_cast(input_v[i]); + const float g_f = static_cast(gamma_v[i]); + const float b_f = (beta != nullptr) ? static_cast(beta_v[i]) : 0.f; + output_v[i] = static_cast(g_f * (in_f - mu) * rsigma + b_f); } VecT* output_val = reinterpret_cast(&output_v); @@ -198,15 +182,16 @@ __device__ inline void LayerNormSmall(const T* input_v, const cub::KeyValuePair< } template -__device__ inline void SimplifiedLayerNormSmall(const T* input_v, const T& thread_data, const int ld, const int idx, - const T* gamma, const T epsilon, T* output) { +__device__ inline void SimplifiedLayerNormSmall(const T* input_v, const float& thread_data, const int ld, const int idx, + const T* gamma, const float epsilon, T* output) { // Assuming thread_data is already divided by ld // Small settings: the block covers the leading dimension TPB >= ld. The input // value is available in a register + // Uses fp32 accumulation to avoid overflow in fp16/bf16. using VecT = aligned_vector; - using BlockReduce = cub::BlockReduce; + using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ T rsigma; // 1 / std.dev. + __shared__ float rsigma; // 1 / std.dev. const bool is_valid = ILP * threadIdx.x < ld; @@ -217,17 +202,19 @@ __device__ inline void SimplifiedLayerNormSmall(const T* input_v, const T& threa *gamma_val = *reinterpret_cast(&gamma[threadIdx.x * ILP]); } - const T sum = BlockReduce(temp_storage).Sum(thread_data); + const float sum = BlockReduce(temp_storage).Sum(thread_data); if (threadIdx.x == 0) { - rsigma = Rsqrt(sum + epsilon); + rsigma = rsqrtf(sum + epsilon); } __syncthreads(); if (is_valid) { #pragma unroll for (int i = 0; i < ILP; i++) { - output_v[i] = gamma_v[i] * input_v[i] * rsigma; + const float in_f = static_cast(input_v[i]); + const float g_f = static_cast(gamma_v[i]); + output_v[i] = static_cast(g_f * in_f * rsigma); } VecT* output_val = reinterpret_cast(&output_v); diff --git a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu index 88ab3b5831afe..7f5169639ff8d 100644 --- a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu @@ -37,24 +37,6 @@ namespace contrib { namespace cuda { namespace { -template -T maybe2half(float x); - -template <> -float maybe2half(float x) { - return x; -} - -template <> -half maybe2half(float x) { - return __float2half_rn(x); -} - -template <> -nv_bfloat16 maybe2half(float x) { - return __float2bfloat16_rn(x); -} - // Using only power of 2 numbers will lead to waste of compute for same size such as 768, which is a very common case // in BERT. Ideally we can step by wrap_size * num_unroll, but listing too many steps will cause long compile time. constexpr int kSizes[] = {128, 320, 384, 640, 768, 1024, 1280, 2048, 4096, 5120, 8192}; @@ -90,15 +72,16 @@ bool CanVectorized(void* output, void* sum_output, const void* input, const void template __global__ void SkipLayerNormKernel( - T* output, T* sum_output, const T* input, const T* skip, const T* bias, const T* gamma, const T* beta, T epsilon, - const int ld, int skip_size) { - const T reverse_ld = T(1.f / ld); + T* output, T* sum_output, const T* input, const T* skip, const T* bias, const T* gamma, const T* beta, + float epsilon, const int ld, int skip_size) { + const float reverse_ld = 1.f / ld; const int offset = blockIdx.x * ld; const bool has_bias = (bias != nullptr); // Reduce sum of x and x^2, and the results are divided by ld. + // Uses fp32 accumulation to avoid overflow in fp16/bf16. KeyValuePairSum pair_sum; - cub::KeyValuePair thread_data(0, 0); + cub::KeyValuePair thread_data(0.f, 0.f); for (int i = threadIdx.x; i < ld; i += TPB) { const int idx = offset + i; @@ -109,8 +92,9 @@ __global__ void SkipLayerNormKernel( } val += skip[idx % skip_size]; - const T rldval = reverse_ld * val; - thread_data = pair_sum(thread_data, cub::KeyValuePair(rldval, rldval * val)); + const float val_f = static_cast(val); + const float rldval = reverse_ld * val_f; + thread_data = pair_sum(thread_data, cub::KeyValuePair(rldval, rldval * val_f)); if (sum_output != nullptr) { sum_output[idx] = val; @@ -129,15 +113,15 @@ __global__ void SkipLayerNormKernel( // Vectorized kernel template __global__ void SkipLayerNormKernelSmall( - T* output, T* sum_output, const T* input, const T* skip, const T* bias, const T* gamma, const T* beta, T epsilon, - int ld, int skip_size) { - const T rld = T(1.f / ld); + T* output, T* sum_output, const T* input, const T* skip, const T* bias, const T* gamma, const T* beta, + float epsilon, int ld, int skip_size) { + const float rld = 1.f / ld; const int idx = blockIdx.x * ld + threadIdx.x * ILP; using VecT = aligned_vector; T sum_v[ILP]; - cub::KeyValuePair thread_data(T(0.f), T(0.f)); + cub::KeyValuePair thread_data(0.f, 0.f); if (ILP * threadIdx.x < ld) { // load data under this guard to avoid reading out-of-bounds T skip_v[ILP], bias_v[ILP]; @@ -155,8 +139,8 @@ __global__ void SkipLayerNormKernelSmall( *bias_val = *reinterpret_cast(&bias[threadIdx.x * ILP]); } - T rldval_sum = T(0.f); - T rldvalsq_sum = T(0.f); + float rldval_sum = 0.f; + float rldvalsq_sum = 0.f; const bool has_sum_output = (sum_output != nullptr); #pragma unroll @@ -166,16 +150,17 @@ __global__ void SkipLayerNormKernelSmall( } sum_v[i] += skip_v[i]; - const T rldval = rld * sum_v[i]; + const float val_f = static_cast(sum_v[i]); + const float rldval = rld * val_f; rldval_sum += rldval; - rldvalsq_sum += rldval * sum_v[i]; + rldvalsq_sum += rldval * val_f; } if (has_sum_output) { *(reinterpret_cast(&sum_output[idx])) = *reinterpret_cast(&sum_v); } - thread_data = cub::KeyValuePair(rldval_sum, rldvalsq_sum); + thread_data = cub::KeyValuePair(rldval_sum, rldvalsq_sum); } if (Simplified) { @@ -203,11 +188,11 @@ void LaunchSkipLayerNormKernel( #define LAUNCH_SKIP_LAYER_NORM_KERNEL_SMALL(num_unroll) \ SkipLayerNormKernelSmall<<>>( \ - output, sum_output, input, skip, bias, gamma, beta, maybe2half(epsilon), ld, skip_size) + output, sum_output, input, skip, bias, gamma, beta, epsilon, ld, skip_size) #define LAUNCH_SKIP_LAYER_NORM_KERNEL() \ SkipLayerNormKernel<<>>( \ - output, sum_output, input, skip, bias, gamma, beta, maybe2half(epsilon), ld, skip_size) + output, sum_output, input, skip, bias, gamma, beta, epsilon, ld, skip_size) #define CASE_NEXT_SIZE(next_size_value) \ case next_size_value: { \ diff --git a/onnxruntime/test/python/transformers/parse_nsys.py b/onnxruntime/test/python/transformers/parse_nsys.py new file mode 100644 index 0000000000000..361e89904efdc --- /dev/null +++ b/onnxruntime/test/python/transformers/parse_nsys.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +""" +Parse nsys SQLite output to extract CUDA kernel timings. + +Usage: + # First, profile with nsys: + nsys profile -o sln_fp16 --export=sqlite python profile_skip_layer_norm.py --mode fp16 --warmup 5 --repeat 100 + nsys profile -o gqa_int8 --export=sqlite python profile_gqa.py --mode int8 --warmup 5 --repeat 10 + + # Then parse the results (using NVTX marker to exclude warmup): + python parse_nsys.py sln_fp16.sqlite --nvtx-range benchmark + python parse_nsys.py gqa_int8.sqlite --nvtx-range benchmark --output results.json + python parse_nsys.py gqa_int8.sqlite --format csv --output results.csv + + # Alternative: skip first N calls per kernel to exclude warmup: + python parse_nsys.py sln_fp16.sqlite --skip-first 5 +""" + +import argparse +import json +import sqlite3 +import sys +from pathlib import Path + + +def parse_nsys_sqlite( + db_path: str, + kernel_patterns: list[str] | None = None, + skip_first: int = 0, + nvtx_range: str | None = None, +) -> list[dict]: + """ + Parse nsys SQLite database and extract kernel timing information. + + Args: + db_path: Path to the .sqlite file exported by nsys + kernel_patterns: List of SQL LIKE patterns to filter kernels (default: onnxruntime patterns) + skip_first: Number of initial kernel calls to skip per kernel type (e.g., to exclude warmup) + nvtx_range: If specified, only include kernels launched within this NVTX range + + Returns: + List of dicts with kernel timing info + """ + if kernel_patterns is None: + kernel_patterns = [ + "%onnxruntime%", + ] + + conn = sqlite3.connect(db_path) + + # Build WHERE clause for kernel patterns using parameterized queries + pattern_placeholders = " OR ".join(["s.value LIKE ?" for _ in kernel_patterns]) + params: list = list(kernel_patterns) + + if nvtx_range: + # Filter kernels that launched within the specified NVTX range + if skip_first > 0: + query = f""" + WITH numbered AS ( + SELECT + s.value as kernel_name, + k.end - k.start as duration_ns, + ROW_NUMBER() OVER (PARTITION BY s.value ORDER BY k.start) as call_num + FROM CUPTI_ACTIVITY_KIND_KERNEL k + JOIN StringIds s ON k.demangledName = s.id + JOIN NVTX_EVENTS n ON k.start >= n.start AND k.start <= n.end + JOIN StringIds ns ON n.textId = ns.id + WHERE ({pattern_placeholders}) AND ns.value = ? + ) + SELECT + kernel_name, + SUM(duration_ns) as total_ns, + COUNT(*) as call_count, + MIN(duration_ns) as min_ns, + MAX(duration_ns) as max_ns, + AVG(duration_ns) as avg_ns + FROM numbered + WHERE call_num > ? + GROUP BY kernel_name + ORDER BY total_ns DESC + """ + params.append(nvtx_range) + params.append(skip_first) + else: + query = f""" + SELECT + s.value as kernel_name, + SUM(k.end - k.start) as total_ns, + COUNT(*) as call_count, + MIN(k.end - k.start) as min_ns, + MAX(k.end - k.start) as max_ns, + AVG(k.end - k.start) as avg_ns + FROM CUPTI_ACTIVITY_KIND_KERNEL k + JOIN StringIds s ON k.demangledName = s.id + JOIN NVTX_EVENTS n ON k.start >= n.start AND k.start <= n.end + JOIN StringIds ns ON n.textId = ns.id + WHERE ({pattern_placeholders}) AND ns.value = ? + GROUP BY s.value + ORDER BY total_ns DESC + """ + params.append(nvtx_range) + elif skip_first > 0: + # Use window function to number calls and skip first N per kernel type + query = f""" + WITH numbered AS ( + SELECT + s.value as kernel_name, + k.end - k.start as duration_ns, + ROW_NUMBER() OVER (PARTITION BY s.value ORDER BY k.start) as call_num + FROM CUPTI_ACTIVITY_KIND_KERNEL k + JOIN StringIds s ON k.demangledName = s.id + WHERE {pattern_placeholders} + ) + SELECT + kernel_name, + SUM(duration_ns) as total_ns, + COUNT(*) as call_count, + MIN(duration_ns) as min_ns, + MAX(duration_ns) as max_ns, + AVG(duration_ns) as avg_ns + FROM numbered + WHERE call_num > ? + GROUP BY kernel_name + ORDER BY total_ns DESC + """ + params.append(skip_first) + else: + # Original query without skipping + query = f""" + SELECT + s.value as kernel_name, + SUM(k.end - k.start) as total_ns, + COUNT(*) as call_count, + MIN(k.end - k.start) as min_ns, + MAX(k.end - k.start) as max_ns, + AVG(k.end - k.start) as avg_ns + FROM CUPTI_ACTIVITY_KIND_KERNEL k + JOIN StringIds s ON k.demangledName = s.id + WHERE {pattern_placeholders} + GROUP BY s.value + ORDER BY total_ns DESC + """ + + results = [] + try: + cursor = conn.execute(query, params) + rows = cursor.fetchall() + for row in rows: + results.append( + { + "kernel_name": row[0], + "total_ms": row[1] / 1e6, # ns to ms + "call_count": row[2], + "min_us": row[3] / 1e3, # ns to us + "max_us": row[4] / 1e3, + "avg_us": row[5] / 1e3, + } + ) + except sqlite3.OperationalError as e: + print(f"SQL Error: {e}", file=sys.stderr) + + conn.close() + return results + + +def list_all_kernels(db_path: str) -> list[str]: + """List all kernel names in the database for debugging.""" + conn = sqlite3.connect(db_path) + + try: + # Join with StringIds to get actual kernel names + cursor = conn.execute(""" + SELECT DISTINCT s.value + FROM CUPTI_ACTIVITY_KIND_KERNEL k + JOIN StringIds s ON k.demangledName = s.id + ORDER BY s.value + """) + return [row[0] for row in cursor.fetchall()] + except sqlite3.OperationalError as e: + print(f"SQL Error: {e}", file=sys.stderr) + return [] + finally: + conn.close() + + +def format_kernel_name(kernel_name: str) -> str: + prefix_list = [ + "void onnxruntime::contrib::cuda::", + "void onnxruntime::", + "onnxruntime::contrib::cuda::", + "onnxruntime::", + ] + for prefix in prefix_list: + if kernel_name.startswith(prefix): + return kernel_name[len(prefix) :] + return kernel_name + + +def format_table(results: list[dict], prefix: str) -> str: + """Format results as a human-readable table.""" + if not results: + return "No matching kernels found." + + kernel_name_len_limit = 64 + lines = [] + lines.append( + f"{prefix}{'Kernel Name':<{kernel_name_len_limit}} {'Total(ms)':>10} {'Calls':>8} {'Avg(us)':>10} {'Min(us)':>10} {'Max(us)':>10}" + ) + lines.append("-" * 120) + + for r in results: + kernel_name = format_kernel_name(r["kernel_name"]) + name = ( + kernel_name[:kernel_name_len_limit] + "..." if len(kernel_name) > kernel_name_len_limit - 3 else kernel_name + ) + lines.append( + f"{name:<{kernel_name_len_limit}} {r['total_ms']:>10.3f} {r['call_count']:>8d} {r['avg_us']:>10.2f} {r['min_us']:>10.2f} {r['max_us']:>10.2f}" + ) + + return "\n".join(lines) + + +def format_csv(results: list[dict]) -> str: + """Format results as CSV.""" + lines = ["kernel_name,total_ms,call_count,avg_us,min_us,max_us"] + for r in results: + lines.append( + f'"{r["kernel_name"]}",{r["total_ms"]:.6f},{r["call_count"]},{r["avg_us"]:.3f},{r["min_us"]:.3f},{r["max_us"]:.3f}' + ) + return "\n".join(lines) + + +def main(): + parser = argparse.ArgumentParser( + description="Parse nsys SQLite output for CUDA kernel timings", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Profile and parse (using NVTX range to exclude warmup): + nsys profile -o sln --export=sqlite python profile_skip_layer_norm.py --warmup 5 --repeat 100 + python parse_nsys.py sln.sqlite --nvtx-range benchmark + + # Alternative: skip first N warmup calls per kernel: + python parse_nsys.py sln.sqlite --skip-first 5 + + # Export to JSON: + python parse_nsys.py sln.sqlite --nvtx-range benchmark --format json --output results.json + + # List all kernels (for debugging): + python parse_nsys.py sln.sqlite --list-kernels + """, + ) + parser.add_argument("sqlite_file", help="Path to nsys SQLite export file") + parser.add_argument( + "--format", choices=["table", "json", "csv"], default="table", help="Output format (default: table)" + ) + parser.add_argument("--output", "-o", help="Output file (default: stdout)") + parser.add_argument("--list-kernels", action="store_true", help="List all kernel names in the database") + parser.add_argument("--pattern", action="append", help="Add custom kernel name pattern (SQL LIKE syntax)") + parser.add_argument("--tag", default="", help="Tag for kernel name in output table. Example tag: 'fp16' or 'int8'") + parser.add_argument( + "--nvtx-range", + metavar="NAME", + help="Only include kernels launched within this NVTX range (e.g., 'benchmark')", + ) + parser.add_argument( + "--skip-first", + type=int, + default=0, + metavar="N", + help="Skip first N calls per kernel type (e.g., to exclude warmup iterations)", + ) + + args = parser.parse_args() + + if not Path(args.sqlite_file).exists(): + print(f"Error: File not found: {args.sqlite_file}", file=sys.stderr) + sys.exit(1) + + if args.list_kernels: + kernels = list_all_kernels(args.sqlite_file) + print(f"Found {len(kernels)} unique kernels:") + for k in kernels: + print(f" {k}") + return + + # Parse kernel timings + patterns = args.pattern if args.pattern else None + results = parse_nsys_sqlite(args.sqlite_file, patterns, skip_first=args.skip_first, nvtx_range=args.nvtx_range) + + if args.nvtx_range: + print(f"(Filtering kernels within NVTX range: '{args.nvtx_range}')\n") + elif args.skip_first > 0: + print(f"(Skipping first {args.skip_first} calls per kernel)\n") + + # Format output + if args.format == "json": + output = json.dumps(results, indent=2) + elif args.format == "csv": + output = format_csv(results) + else: + output = format_table(results, args.tag + " " if args.tag else "") + + # Write output + if args.output: + with open(args.output, "w") as f: + f.write(output) + print(f"Results written to {args.output}") + else: + print(output) + + +if __name__ == "__main__": + main() diff --git a/onnxruntime/test/python/transformers/profile_skip_layer_norm.py b/onnxruntime/test/python/transformers/profile_skip_layer_norm.py new file mode 100644 index 0000000000000..272a06227f653 --- /dev/null +++ b/onnxruntime/test/python/transformers/profile_skip_layer_norm.py @@ -0,0 +1,176 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +""" +Profiling script for SkipLayerNormalization CUDA kernel. + +Usage: + cd onnxruntime/test/python/transformers + python profile_skip_layer_norm.py + + # Profile with Nsight Systems (timeline analysis) and extract kernel timings: + nsys profile -o sln_fp16 --export=sqlite python profile_skip_layer_norm.py --mode fp16 --warmup 5 --repeat 100 + python parse_nsys.py sln_fp16.sqlite --nvtx-range benchmark + +""" + +import argparse +import os +import tempfile +import time + +import numpy as np +from onnx import TensorProto, helper, save_model + +import onnxruntime as ort + +# Optional NVTX support for nsys range markers +try: + import nvtx + + HAS_NVTX = True +except ImportError: + HAS_NVTX = False + + class DummyNvtxRange: + def __init__(self, name): + pass + + def __enter__(self): + return self + + def __exit__(self, *args): + pass + + class nvtx: # noqa: N801 + @staticmethod + def annotate(name, color=None): + return DummyNvtxRange(name) + + +def create_skip_layer_norm_model(batch_size, seq_len, hidden_size, data_type, simplified=False): + """Create an ONNX model with a single SkipLayerNormalization op.""" + onnx_type = TensorProto.FLOAT16 if data_type == np.float16 else TensorProto.FLOAT + + input_tensor = helper.make_tensor_value_info("INPUT", onnx_type, [batch_size, seq_len, hidden_size]) + skip_tensor = helper.make_tensor_value_info("SKIP", onnx_type, [batch_size, seq_len, hidden_size]) + gamma_tensor = helper.make_tensor_value_info("GAMMA", onnx_type, [hidden_size]) + beta_tensor = helper.make_tensor_value_info("BETA", onnx_type, [hidden_size]) + bias_tensor = helper.make_tensor_value_info("BIAS", onnx_type, [hidden_size]) + + output_tensor = helper.make_tensor_value_info("OUTPUT", onnx_type, [batch_size, seq_len, hidden_size]) + + op_type = "SkipSimplifiedLayerNormalization" if simplified else "SkipLayerNormalization" + if simplified: + inputs = ["INPUT", "SKIP", "GAMMA", "BIAS"] + input_list = [input_tensor, skip_tensor, gamma_tensor, bias_tensor] + else: + inputs = ["INPUT", "SKIP", "GAMMA", "BETA", "BIAS"] + input_list = [input_tensor, skip_tensor, gamma_tensor, beta_tensor, bias_tensor] + + node = helper.make_node( + op_type, + inputs=inputs, + outputs=["OUTPUT", "", "", ""], + domain="com.microsoft", + epsilon=1e-5, + ) + + graph = helper.make_graph([node], "skip_layer_norm_profile", input_list, [output_tensor]) + + opset_imports = [ + helper.make_opsetid("", 17), + helper.make_opsetid("com.microsoft", 1), + ] + + model = helper.make_model(graph, opset_imports=opset_imports) + model.ir_version = 7 + return model + + +def run_profiling(args): + """Run profiling for SkipLayerNormalization.""" + data_type = np.float16 if args.mode == "fp16" else np.float32 + + print(f"\n{'=' * 70}") + print("SkipLayerNormalization Profiling") + print(f"{'=' * 70}") + print(f"Config: batch={args.batch_size}, seq_len={args.seq_len}, hidden_size={args.hidden_size}") + print(f" mode={args.mode}, simplified={args.simplified}") + print(f" warmup={args.warmup}, repeat={args.repeat}") + print(f"{'=' * 70}\n") + + model = create_skip_layer_norm_model( + args.batch_size, args.seq_len, args.hidden_size, data_type, simplified=args.simplified + ) + + with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f: + model_path = f.name + save_model(model, model_path) + + try: + sess_opt = ort.SessionOptions() + sess_opt.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL + sess = ort.InferenceSession(model_path, sess_options=sess_opt, providers=["CUDAExecutionProvider"]) + + # Create inputs + np.random.seed(42) + if args.simplified: + feeds = { + "INPUT": np.random.rand(args.batch_size, args.seq_len, args.hidden_size).astype(data_type), + "SKIP": np.random.rand(args.batch_size, args.seq_len, args.hidden_size).astype(data_type), + "GAMMA": np.random.rand(args.hidden_size).astype(data_type), + "BIAS": np.random.rand(args.hidden_size).astype(data_type), + } + else: + feeds = { + "INPUT": np.random.rand(args.batch_size, args.seq_len, args.hidden_size).astype(data_type), + "SKIP": np.random.rand(args.batch_size, args.seq_len, args.hidden_size).astype(data_type), + "GAMMA": np.random.rand(args.hidden_size).astype(data_type), + "BETA": np.random.rand(args.hidden_size).astype(data_type), + "BIAS": np.random.rand(args.hidden_size).astype(data_type), + } + + # Warmup + with nvtx.annotate("warmup", color="yellow"): + for _ in range(args.warmup): + sess.run(None, feeds) + + # Benchmark with NVTX annotation + with nvtx.annotate("benchmark", color="green"): + start = time.perf_counter() + for _ in range(args.repeat): + sess.run(None, feeds) + end = time.perf_counter() + + avg_ms = (end - start) * 1000 / args.repeat + elem_size = 2 if data_type == np.float16 else 4 + total_elements = args.batch_size * args.seq_len * args.hidden_size + bytes_transferred = 4 * total_elements * elem_size + throughput_gbps = bytes_transferred / (avg_ms * 1e-3) / 1e9 + + print(f" Average time: {avg_ms:.4f} ms") + print(f" Throughput: {throughput_gbps:.2f} GB/s") + + finally: + os.unlink(model_path) + + +def main(): + parser = argparse.ArgumentParser(description="Profile SkipLayerNormalization CUDA kernel") + parser.add_argument("--mode", choices=["fp16", "fp32"], default="fp16", help="Data type") + parser.add_argument("--batch-size", type=int, default=1, help="Batch size") + parser.add_argument("--seq-len", type=int, default=2048, help="Sequence length") + parser.add_argument("--hidden-size", type=int, default=4096, help="Hidden size") + parser.add_argument("--simplified", action="store_true", help="Use SkipSimplifiedLayerNormalization") + parser.add_argument("--warmup", type=int, default=5, help="Warmup iterations") + parser.add_argument("--repeat", type=int, default=100, help="Benchmark iterations") + + args = parser.parse_args() + run_profiling(args) + + +if __name__ == "__main__": + main() diff --git a/onnxruntime/test/python/transformers/profile_skip_layer_norm.sh b/onnxruntime/test/python/transformers/profile_skip_layer_norm.sh new file mode 100755 index 0000000000000..0e638fc13ea19 --- /dev/null +++ b/onnxruntime/test/python/transformers/profile_skip_layer_norm.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +# +# Profile SkipLayerNormalization CUDA kernel with nsys. +# +# Usage: +# ./profile_skip_layer_norm.sh # Profile with defaults (fp16, H=4096) +# ./profile_skip_layer_norm.sh --hidden-size 8192 # Different hidden size +# ./profile_skip_layer_norm.sh --fp32 --batch-size 4 # FP32 mode, batch=4 +# + +set -e +set -o pipefail + +# Default parameters +BATCH_SIZE="" +SEQ_LEN="" +HIDDEN_SIZE="" +MODE="--mode fp16" +SIMPLIFIED="" +OUTPUT_NAME="sln_profile" + +while [[ "$#" -gt 0 ]]; do + case $1 in + --batch-size) + BATCH_SIZE="--batch-size $2" + shift + ;; + --seq-len) + SEQ_LEN="--seq-len $2" + shift + ;; + --hidden-size) + HIDDEN_SIZE="--hidden-size $2" + shift + ;; + --fp32) + MODE="--mode fp32" + ;; + --simplified) + SIMPLIFIED="--simplified" + ;; + -o|--output) + OUTPUT_NAME="$2" + shift + ;; + *) + echo "Unknown option: $1" + echo "Usage: $0 [--batch-size N] [--seq-len N] [--hidden-size N] [--fp32] [--simplified] [-o NAME]" + exit 1 + ;; + esac + shift +done + +EXTRA_ARGS="${BATCH_SIZE} ${SEQ_LEN} ${HIDDEN_SIZE} ${MODE} ${SIMPLIFIED}" + +# Check nvtx availability (optional, for NVTX range markers) +HAVE_NVTX=0 +if python -c "import nvtx" 2>/dev/null; then + HAVE_NVTX=1 +else + echo "Note: 'nvtx' package not installed. NVTX range markers will be disabled." + echo " Install with: pip install nvtx" + echo " Falling back to --skip-first to exclude warmup iterations." +fi + +echo "" +echo "========================================" +echo " Profiling: SkipLayerNormalization" +echo "========================================" +rm -f "${OUTPUT_NAME}.nsys-rep" "${OUTPUT_NAME}.sqlite" +nsys profile -o "${OUTPUT_NAME}" --export=sqlite \ + python profile_skip_layer_norm.py --warmup 5 --repeat 100 $EXTRA_ARGS +echo "" +echo "---- Kernel results ----" +if [[ "$HAVE_NVTX" -eq 1 ]]; then + python parse_nsys.py "${OUTPUT_NAME}.sqlite" --nvtx-range benchmark +else + python parse_nsys.py "${OUTPUT_NAME}.sqlite" --skip-first 5 +fi + +echo "" +echo "Done."