diff --git a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu
index e6f1798f6ef72..a6b97286d1733 100644
--- a/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/embed_layer_norm_impl.cu
@@ -123,7 +123,7 @@ template <typename T, unsigned TPB>
 __global__ void EmbedLayerNormKernel(
     int hidden_size, const int* input_ids, const int* segment_ids, const T* beta, const T* gamma,
     const T* word_embedding, const T* position_embedding, const T* segment_embedding,
-    const T epsilon, T* output, T* embedding_sum, const int* position_ids, const bool broadcast_position_ids) {
+    float epsilon, T* output, T* embedding_sum, const int* position_ids, const bool broadcast_position_ids) {
   KeyValuePairSum pair_sum;
   // 1. lookup word and segment of the block
   // blockIdx.x = position in the sequence
@@ -134,7 +134,7 @@ __global__ void EmbedLayerNormKernel(
   __shared__ int segment_id;
   __shared__ int position_id;
 
-  const T rld = T(1.f / hidden_size);
+  const float rld = 1.f / hidden_size;
   const int sequence_position = blockIdx.y * gridDim.x + blockIdx.x;
   if (threadIdx.x == 0) {
     word_id = input_ids[sequence_position];
@@ -162,7 +162,7 @@ __global__ void EmbedLayerNormKernel(
   // the output offset is given by b * (sequence_length * hidden_size) + s * hidden_size
   const int output_offset = sequence_position * hidden_size;
 
-  cub::KeyValuePair<T, T> thread_data(0, 0);
+  cub::KeyValuePair<float, float> thread_data(0.f, 0.f);
 
   for (int it = threadIdx.x; it < hidden_size; it += TPB) {
     const T w(word_embedding[word_offset + it]);
@@ -177,8 +177,9 @@ __global__ void EmbedLayerNormKernel(
       embedding_sum[output_offset + it] = val;
     }
 
-    const T rldval = rld * val;
-    thread_data = pair_sum(thread_data, cub::KeyValuePair<T, T>(rldval, rldval * val));
+    const float val_f = static_cast<float>(val);
+    const float rldval = rld * val_f;
+    thread_data = pair_sum(thread_data, cub::KeyValuePair<float, float>(rldval, rldval * val_f));
   }
 
   // 3. layer norm on the sum
@@ -190,7 +191,7 @@ Status EmbedSkipLayerNorm(
     cudaStream_t stream, int hidden_size, int batch_size, int sequence_length,
     const int* input_ids, const int* segment_ids, const T* beta, const T* gamma,
     const T* word_embedding, const T* position_embedding, const T* segment_embedding,
-    const T epsilon, T* output, T* embedding_sum, const int* position_ids,
+    float epsilon, T* output, T* embedding_sum, const int* position_ids,
     const bool broadcast_position_ids) {
   constexpr int tpb = 256;
   const dim3 grid(sequence_length, batch_size, 1);
@@ -238,7 +239,7 @@ Status LaunchEmbedLayerNormKernel(
         stream, hidden_size, batch_size, sequence_length, input_ids, segment_ids,
         reinterpret_cast<const half*>(beta), reinterpret_cast<const half*>(gamma),
         reinterpret_cast<const half*>(word_embedding), reinterpret_cast<const half*>(position_embedding),
-        reinterpret_cast<const half*>(segment_embedding), __float2half_rn(epsilon),
+        reinterpret_cast<const half*>(segment_embedding), epsilon,
         reinterpret_cast<half*>(output), reinterpret_cast<half*>(embedding_sum), position_ids,
         broadcast_position_ids);
   } else {
diff --git a/onnxruntime/contrib_ops/cuda/bert/layer_norm.cuh b/onnxruntime/contrib_ops/cuda/bert/layer_norm.cuh
index e5af4aac935ad..0ed2c125405e3 100644
--- a/onnxruntime/contrib_ops/cuda/bert/layer_norm.cuh
+++ b/onnxruntime/contrib_ops/cuda/bert/layer_norm.cuh
@@ -74,97 +74,80 @@ struct KeyValuePairSum {
                                                                const cub::KeyValuePair<float, float>& b) {
     return cub::KeyValuePair<float, float>(a.key + b.key, a.value + b.value);
   }
-
-  __device__ inline cub::KeyValuePair<half, half> operator()(const cub::KeyValuePair<half, half>& a,
-                                                             const cub::KeyValuePair<half, half>& b) {
-    const half2 a2 = __halves2half2(a.key, a.value);
-    const half2 b2 = __halves2half2(b.key, b.value);
-    const half2 res = AddHalf2(a2, b2);
-    return cub::KeyValuePair<half, half>(__low2half(res), __high2half(res));
-  }
-
-  __device__ inline cub::KeyValuePair<half2, half2> operator()(const cub::KeyValuePair<half2, half2>& a,
-                                                               const cub::KeyValuePair<half2, half2>& b) {
-    return cub::KeyValuePair<half2, half2>(AddHalf2(a.key, b.key), AddHalf2(a.value, b.value));
-  }
-
-  __device__ inline cub::KeyValuePair<nv_bfloat16, nv_bfloat16> operator()(const cub::KeyValuePair<nv_bfloat16, nv_bfloat16>& a,
-                                                                           const cub::KeyValuePair<nv_bfloat16, nv_bfloat16>& b) {
-    const nv_bfloat162 a2 = __halves2bfloat162(a.key, a.value);
-    const nv_bfloat162 b2 = __halves2bfloat162(b.key, b.value);
-    const nv_bfloat162 res = AddHalf2(a2, b2);
-    return cub::KeyValuePair<nv_bfloat16, nv_bfloat16>(__low2bfloat16(res), __high2bfloat16(res));
-  }
 };
 
 template <typename T, int TPB>
 __device__ inline void LayerNorm(
-    const cub::KeyValuePair<T, T>& thread_data, const int ld, const int offset, const T* beta,
-    const T* gamma, const T epsilon, T* output) {
+    const cub::KeyValuePair<float, float>& thread_data, const int ld, const int offset, const T* beta,
+    const T* gamma, const float epsilon, T* output) {
   // Assuming thread_data is already divided by ld
+  // Uses fp32 accumulation for mean/variance to avoid overflow in fp16/bf16.
 
-  using BlockReduce = cub::BlockReduce<cub::KeyValuePair<T, T>, TPB>;
+  using BlockReduce = cub::BlockReduce<cub::KeyValuePair<float, float>, TPB>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
-  __shared__ T mu;      // mean
-  __shared__ T rsigma;  // 1 / std.dev.
+  __shared__ float mu;      // mean
+  __shared__ float rsigma;  // 1 / std.dev.
 
   KeyValuePairSum pair_sum;
   const auto sum_kv = BlockReduce(temp_storage).Reduce(thread_data, pair_sum);
 
   if (threadIdx.x == 0) {
     mu = sum_kv.key;
-    rsigma = Rsqrt(sum_kv.value - mu * mu + epsilon);
+    rsigma = rsqrtf(sum_kv.value - mu * mu + epsilon);
   }
   __syncthreads();
 
   for (int i = threadIdx.x; i < ld; i += TPB) {
     const int idx = offset + i;
-    const T val = output[idx];
-    const T g(gamma[i]);
-    const T b = (nullptr == beta) ? (T)0 : beta[i];
-    output[idx] = g * (val - mu) * rsigma + b;
+    const float val = static_cast<float>(output[idx]);
+    const float g = static_cast<float>(gamma[i]);
+    const float b = (nullptr == beta) ? 0.f : static_cast<float>(beta[i]);
+    output[idx] = static_cast<T>(g * (val - mu) * rsigma + b);
   }
 }
 
 template <typename T, int TPB>
 __device__ inline void SimplifiedLayerNorm(
-    const T& thread_data, const int ld, const int offset, const T* gamma, const T epsilon, T* output) {
+    const float& thread_data, const int ld, const int offset, const T* gamma, const float epsilon, T* output) {
   // Assuming thread_data is already divided by ld
+  // Uses fp32 accumulation to avoid overflow in fp16/bf16.
 
-  using BlockReduce = cub::BlockReduce<T, TPB>;
+  using BlockReduce = cub::BlockReduce<float, TPB>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
-  __shared__ T rsigma;  // 1 / std.dev.
+  __shared__ float rsigma;  // 1 / std.dev.
 
-  const T sum = BlockReduce(temp_storage).Sum(thread_data);
+  const float sum = BlockReduce(temp_storage).Sum(thread_data);
 
   if (threadIdx.x == 0) {
-    rsigma = Rsqrt(sum + epsilon);
+    rsigma = rsqrtf(sum + epsilon);
   }
   __syncthreads();
 
   for (int i = threadIdx.x; i < ld; i += TPB) {
     const int idx = offset + i;
-    const T val = output[idx];
-    const T g(gamma[i]);
-    output[idx] = g * val * rsigma;
+    const float val = static_cast<float>(output[idx]);
+    const float g = static_cast<float>(gamma[i]);
+    output[idx] = static_cast<T>(g * val * rsigma);
   }
 }
 
 template <typename T, int TPB, int ILP>
-__device__ inline void LayerNormSmall(const T* input_v, const cub::KeyValuePair<T, T>& thread_data,
+__device__ inline void LayerNormSmall(const T* input_v, const cub::KeyValuePair<float, float>& thread_data,
                                       const int ld, const int idx, const T* beta, const T* gamma,
-                                      const T epsilon, T* output) {
+                                      const float epsilon, T* output) {
   // Assuming thread_data is already divided by ld
   // Small settings: the block covers the leading dimension TPB >= ld. The input
   // value is available in a register
+  // Uses fp32 accumulation for mean/variance to avoid overflow in fp16/bf16.
   using VecT = aligned_vector<T, ILP>;
-  using BlockReduce = cub::BlockReduce<cub::KeyValuePair<T, T>, TPB>;
+  using BlockReduce = cub::BlockReduce<cub::KeyValuePair<float, float>, TPB>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
-  __shared__ T mu;      // mean
-  __shared__ T rsigma;  // 1 / std.dev.
-  T beta_v[ILP], gamma_v[ILP], output_v[ILP];
+  __shared__ float mu;      // mean
+  __shared__ float rsigma;  // 1 / std.dev.
+  T gamma_v[ILP], output_v[ILP];
 
   const bool is_valid = ILP * threadIdx.x < ld;
+  T beta_v[ILP];
   if (is_valid) {
     if (beta != nullptr) {
       VecT* beta_val = reinterpret_cast<VecT*>(&beta_v);
@@ -176,20 +159,21 @@ __device__ inline void LayerNormSmall(const T* input_v, const cub::KeyValuePair<
   }
 
   KeyValuePairSum pair_sum;
-  const cub::KeyValuePair<T, T> sum_kv = BlockReduce(temp_storage).Reduce(thread_data, pair_sum);
+  const cub::KeyValuePair<float, float> sum_kv = BlockReduce(temp_storage).Reduce(thread_data, pair_sum);
 
   if (threadIdx.x == 0) {
     mu = sum_kv.key;
-    rsigma = Rsqrt(sum_kv.value - mu * mu + epsilon);
+    rsigma = rsqrtf(sum_kv.value - mu * mu + epsilon);
   }
   __syncthreads();
 
   if (is_valid) {
 #pragma unroll
     for (int i = 0; i < ILP; i++) {
-      output_v[i] = (beta != nullptr)
-                        ? gamma_v[i] * (input_v[i] - mu) * rsigma + beta_v[i]
-                        : gamma_v[i] * (input_v[i] - mu) * rsigma;
+      const float in_f = static_cast<float>(input_v[i]);
+      const float g_f = static_cast<float>(gamma_v[i]);
+      const float b_f = (beta != nullptr) ? static_cast<float>(beta_v[i]) : 0.f;
+      output_v[i] = static_cast<T>(g_f * (in_f - mu) * rsigma + b_f);
     }
 
     VecT* output_val = reinterpret_cast<VecT*>(&output_v);
@@ -198,15 +182,16 @@ __device__ inline void LayerNormSmall(const T* input_v, const cub::KeyValuePair<
 }
 
 template <typename T, int TPB, int ILP>
-__device__ inline void SimplifiedLayerNormSmall(const T* input_v, const T& thread_data, const int ld, const int idx,
-                                                const T* gamma, const T epsilon, T* output) {
+__device__ inline void SimplifiedLayerNormSmall(const T* input_v, const float& thread_data, const int ld, const int idx,
+                                                const T* gamma, const float epsilon, T* output) {
   // Assuming thread_data is already divided by ld
   // Small settings: the block covers the leading dimension TPB >= ld. The input
   // value is available in a register
+  // Uses fp32 accumulation to avoid overflow in fp16/bf16.
   using VecT = aligned_vector<T, ILP>;
-  using BlockReduce = cub::BlockReduce<T, TPB>;
+  using BlockReduce = cub::BlockReduce<float, TPB>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
-  __shared__ T rsigma;  // 1 / std.dev.
+  __shared__ float rsigma;  // 1 / std.dev.
 
   const bool is_valid = ILP * threadIdx.x < ld;
 
@@ -217,17 +202,19 @@ __device__ inline void SimplifiedLayerNormSmall(const T* input_v, const T& threa
     *gamma_val = *reinterpret_cast<const VecT*>(&gamma[threadIdx.x * ILP]);
   }
 
-  const T sum = BlockReduce(temp_storage).Sum(thread_data);
+  const float sum = BlockReduce(temp_storage).Sum(thread_data);
 
   if (threadIdx.x == 0) {
-    rsigma = Rsqrt(sum + epsilon);
+    rsigma = rsqrtf(sum + epsilon);
   }
   __syncthreads();
 
   if (is_valid) {
 #pragma unroll
     for (int i = 0; i < ILP; i++) {
-      output_v[i] = gamma_v[i] * input_v[i] * rsigma;
+      const float in_f = static_cast<float>(input_v[i]);
+      const float g_f = static_cast<float>(gamma_v[i]);
+      output_v[i] = static_cast<T>(g_f * in_f * rsigma);
     }
 
     VecT* output_val = reinterpret_cast<VecT*>(&output_v);
diff --git a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu
index 88ab3b5831afe..7f5169639ff8d 100644
--- a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu
@@ -37,24 +37,6 @@ namespace contrib {
 namespace cuda {
 
 namespace {
-template <typename T>
-T maybe2half(float x);
-
-template <>
-float maybe2half(float x) {
-  return x;
-}
-
-template <>
-half maybe2half(float x) {
-  return __float2half_rn(x);
-}
-
-template <>
-nv_bfloat16 maybe2half(float x) {
-  return __float2bfloat16_rn(x);
-}
-
 // Using only power of 2 numbers will lead to waste of compute for same size such as 768, which is a very common case
 // in BERT. Ideally we can step by wrap_size * num_unroll, but listing too many steps will cause long compile time.
 constexpr int kSizes[] = {128, 320, 384, 640, 768, 1024, 1280, 2048, 4096, 5120, 8192};
@@ -90,15 +72,16 @@ bool CanVectorized(void* output, void* sum_output, const void* input, const void
 
 template <typename T, unsigned TPB, bool Simplified>
 __global__ void SkipLayerNormKernel(
-    T* output, T* sum_output, const T* input, const T* skip, const T* bias, const T* gamma, const T* beta, T epsilon,
-    const int ld, int skip_size) {
-  const T reverse_ld = T(1.f / ld);
+    T* output, T* sum_output, const T* input, const T* skip, const T* bias, const T* gamma, const T* beta,
+    float epsilon, const int ld, int skip_size) {
+  const float reverse_ld = 1.f / ld;
   const int offset = blockIdx.x * ld;
   const bool has_bias = (bias != nullptr);
 
   // Reduce sum of x and x^2, and the results are divided by ld.
+  // Uses fp32 accumulation to avoid overflow in fp16/bf16.
   KeyValuePairSum pair_sum;
-  cub::KeyValuePair<T, T> thread_data(0, 0);
+  cub::KeyValuePair<float, float> thread_data(0.f, 0.f);
 
   for (int i = threadIdx.x; i < ld; i += TPB) {
     const int idx = offset + i;
@@ -109,8 +92,9 @@ __global__ void SkipLayerNormKernel(
     }
     val += skip[idx % skip_size];
 
-    const T rldval = reverse_ld * val;
-    thread_data = pair_sum(thread_data, cub::KeyValuePair<T, T>(rldval, rldval * val));
+    const float val_f = static_cast<float>(val);
+    const float rldval = reverse_ld * val_f;
+    thread_data = pair_sum(thread_data, cub::KeyValuePair<float, float>(rldval, rldval * val_f));
 
     if (sum_output != nullptr) {
       sum_output[idx] = val;
@@ -129,15 +113,15 @@ __global__ void SkipLayerNormKernel(
 // Vectorized kernel
 template <typename T, unsigned TPB, int ILP, bool Simplified>
 __global__ void SkipLayerNormKernelSmall(
-    T* output, T* sum_output, const T* input, const T* skip, const T* bias, const T* gamma, const T* beta, T epsilon,
-    int ld, int skip_size) {
-  const T rld = T(1.f / ld);
+    T* output, T* sum_output, const T* input, const T* skip, const T* bias, const T* gamma, const T* beta,
+    float epsilon, int ld, int skip_size) {
+  const float rld = 1.f / ld;
   const int idx = blockIdx.x * ld + threadIdx.x * ILP;
 
   using VecT = aligned_vector<T, ILP>;
   T sum_v[ILP];
 
-  cub::KeyValuePair<T, T> thread_data(T(0.f), T(0.f));
+  cub::KeyValuePair<float, float> thread_data(0.f, 0.f);
 
   if (ILP * threadIdx.x < ld) {  // load data under this guard to avoid reading out-of-bounds
     T skip_v[ILP], bias_v[ILP];
@@ -155,8 +139,8 @@ __global__ void SkipLayerNormKernelSmall(
       *bias_val = *reinterpret_cast<const VecT*>(&bias[threadIdx.x * ILP]);
     }
 
-    T rldval_sum = T(0.f);
-    T rldvalsq_sum = T(0.f);
+    float rldval_sum = 0.f;
+    float rldvalsq_sum = 0.f;
     const bool has_sum_output = (sum_output != nullptr);
 
 #pragma unroll
@@ -166,16 +150,17 @@ __global__ void SkipLayerNormKernelSmall(
       }
       sum_v[i] += skip_v[i];
 
-      const T rldval = rld * sum_v[i];
+      const float val_f = static_cast<float>(sum_v[i]);
+      const float rldval = rld * val_f;
       rldval_sum += rldval;
-      rldvalsq_sum += rldval * sum_v[i];
+      rldvalsq_sum += rldval * val_f;
     }
 
     if (has_sum_output) {
       *(reinterpret_cast<VecT*>(&sum_output[idx])) = *reinterpret_cast<VecT*>(&sum_v);
     }
 
-    thread_data = cub::KeyValuePair<T, T>(rldval_sum, rldvalsq_sum);
+    thread_data = cub::KeyValuePair<float, float>(rldval_sum, rldvalsq_sum);
   }
 
   if (Simplified) {
@@ -203,11 +188,11 @@ void LaunchSkipLayerNormKernel(
 
 #define LAUNCH_SKIP_LAYER_NORM_KERNEL_SMALL(num_unroll)                                                  \
   SkipLayerNormKernelSmall<T, block_size, num_unroll, Simplified><<<grid_size, block_size, 0, stream>>>( \
-      output, sum_output, input, skip, bias, gamma, beta, maybe2half<T>(epsilon), ld, skip_size)
+      output, sum_output, input, skip, bias, gamma, beta, epsilon, ld, skip_size)
 
 #define LAUNCH_SKIP_LAYER_NORM_KERNEL()                                                 \
   SkipLayerNormKernel<T, block_size, Simplified><<<grid_size, block_size, 0, stream>>>( \
-      output, sum_output, input, skip, bias, gamma, beta, maybe2half<T>(epsilon), ld, skip_size)
+      output, sum_output, input, skip, bias, gamma, beta, epsilon, ld, skip_size)
 
 #define CASE_NEXT_SIZE(next_size_value)                                         \
   case next_size_value: {                                                       \
diff --git a/onnxruntime/test/python/transformers/parse_nsys.py b/onnxruntime/test/python/transformers/parse_nsys.py
new file mode 100644
index 0000000000000..361e89904efdc
--- /dev/null
+++ b/onnxruntime/test/python/transformers/parse_nsys.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+"""
+Parse nsys SQLite output to extract CUDA kernel timings.
+
+Usage:
+  # First, profile with nsys:
+  nsys profile -o sln_fp16 --export=sqlite python profile_skip_layer_norm.py --mode fp16 --warmup 5 --repeat 100
+  nsys profile -o gqa_int8 --export=sqlite python profile_gqa.py --mode int8 --warmup 5 --repeat 10
+
+  # Then parse the results (using NVTX marker to exclude warmup):
+  python parse_nsys.py sln_fp16.sqlite --nvtx-range benchmark
+  python parse_nsys.py gqa_int8.sqlite --nvtx-range benchmark --output results.json
+  python parse_nsys.py gqa_int8.sqlite --format csv --output results.csv
+
+  # Alternative: skip first N calls per kernel to exclude warmup:
+  python parse_nsys.py sln_fp16.sqlite --skip-first 5
+"""
+
+import argparse
+import json
+import sqlite3
+import sys
+from pathlib import Path
+
+
+def parse_nsys_sqlite(
+    db_path: str,
+    kernel_patterns: list[str] | None = None,
+    skip_first: int = 0,
+    nvtx_range: str | None = None,
+) -> list[dict]:
+    """
+    Parse nsys SQLite database and extract kernel timing information.
+
+    Args:
+        db_path: Path to the .sqlite file exported by nsys
+        kernel_patterns: List of SQL LIKE patterns to filter kernels (default: onnxruntime patterns)
+        skip_first: Number of initial kernel calls to skip per kernel type (e.g., to exclude warmup)
+        nvtx_range: If specified, only include kernels launched within this NVTX range
+
+    Returns:
+        List of dicts with kernel timing info
+    """
+    if kernel_patterns is None:
+        kernel_patterns = [
+            "%onnxruntime%",
+        ]
+
+    conn = sqlite3.connect(db_path)
+
+    # Build WHERE clause for kernel patterns using parameterized queries
+    pattern_placeholders = " OR ".join(["s.value LIKE ?" for _ in kernel_patterns])
+    params: list = list(kernel_patterns)
+
+    if nvtx_range:
+        # Filter kernels that launched within the specified NVTX range
+        if skip_first > 0:
+            query = f"""
+            WITH numbered AS (
+                SELECT
+                    s.value as kernel_name,
+                    k.end - k.start as duration_ns,
+                    ROW_NUMBER() OVER (PARTITION BY s.value ORDER BY k.start) as call_num
+                FROM CUPTI_ACTIVITY_KIND_KERNEL k
+                JOIN StringIds s ON k.demangledName = s.id
+                JOIN NVTX_EVENTS n ON k.start >= n.start AND k.start <= n.end
+                JOIN StringIds ns ON n.textId = ns.id
+                WHERE ({pattern_placeholders}) AND ns.value = ?
+            )
+            SELECT
+                kernel_name,
+                SUM(duration_ns) as total_ns,
+                COUNT(*) as call_count,
+                MIN(duration_ns) as min_ns,
+                MAX(duration_ns) as max_ns,
+                AVG(duration_ns) as avg_ns
+            FROM numbered
+            WHERE call_num > ?
+            GROUP BY kernel_name
+            ORDER BY total_ns DESC
+            """
+            params.append(nvtx_range)
+            params.append(skip_first)
+        else:
+            query = f"""
+            SELECT
+                s.value as kernel_name,
+                SUM(k.end - k.start) as total_ns,
+                COUNT(*) as call_count,
+                MIN(k.end - k.start) as min_ns,
+                MAX(k.end - k.start) as max_ns,
+                AVG(k.end - k.start) as avg_ns
+            FROM CUPTI_ACTIVITY_KIND_KERNEL k
+            JOIN StringIds s ON k.demangledName = s.id
+            JOIN NVTX_EVENTS n ON k.start >= n.start AND k.start <= n.end
+            JOIN StringIds ns ON n.textId = ns.id
+            WHERE ({pattern_placeholders}) AND ns.value = ?
+            GROUP BY s.value
+            ORDER BY total_ns DESC
+            """
+            params.append(nvtx_range)
+    elif skip_first > 0:
+        # Use window function to number calls and skip first N per kernel type
+        query = f"""
+        WITH numbered AS (
+            SELECT
+                s.value as kernel_name,
+                k.end - k.start as duration_ns,
+                ROW_NUMBER() OVER (PARTITION BY s.value ORDER BY k.start) as call_num
+            FROM CUPTI_ACTIVITY_KIND_KERNEL k
+            JOIN StringIds s ON k.demangledName = s.id
+            WHERE {pattern_placeholders}
+        )
+        SELECT
+            kernel_name,
+            SUM(duration_ns) as total_ns,
+            COUNT(*) as call_count,
+            MIN(duration_ns) as min_ns,
+            MAX(duration_ns) as max_ns,
+            AVG(duration_ns) as avg_ns
+        FROM numbered
+        WHERE call_num > ?
+        GROUP BY kernel_name
+        ORDER BY total_ns DESC
+        """
+        params.append(skip_first)
+    else:
+        # Original query without skipping
+        query = f"""
+        SELECT
+            s.value as kernel_name,
+            SUM(k.end - k.start) as total_ns,
+            COUNT(*) as call_count,
+            MIN(k.end - k.start) as min_ns,
+            MAX(k.end - k.start) as max_ns,
+            AVG(k.end - k.start) as avg_ns
+        FROM CUPTI_ACTIVITY_KIND_KERNEL k
+        JOIN StringIds s ON k.demangledName = s.id
+        WHERE {pattern_placeholders}
+        GROUP BY s.value
+        ORDER BY total_ns DESC
+        """
+
+    results = []
+    try:
+        cursor = conn.execute(query, params)
+        rows = cursor.fetchall()
+        for row in rows:
+            results.append(
+                {
+                    "kernel_name": row[0],
+                    "total_ms": row[1] / 1e6,  # ns to ms
+                    "call_count": row[2],
+                    "min_us": row[3] / 1e3,  # ns to us
+                    "max_us": row[4] / 1e3,
+                    "avg_us": row[5] / 1e3,
+                }
+            )
+    except sqlite3.OperationalError as e:
+        print(f"SQL Error: {e}", file=sys.stderr)
+
+    conn.close()
+    return results
+
+
+def list_all_kernels(db_path: str) -> list[str]:
+    """List all kernel names in the database for debugging."""
+    conn = sqlite3.connect(db_path)
+
+    try:
+        # Join with StringIds to get actual kernel names
+        cursor = conn.execute("""
+            SELECT DISTINCT s.value
+            FROM CUPTI_ACTIVITY_KIND_KERNEL k
+            JOIN StringIds s ON k.demangledName = s.id
+            ORDER BY s.value
+        """)
+        return [row[0] for row in cursor.fetchall()]
+    except sqlite3.OperationalError as e:
+        print(f"SQL Error: {e}", file=sys.stderr)
+        return []
+    finally:
+        conn.close()
+
+
+def format_kernel_name(kernel_name: str) -> str:
+    prefix_list = [
+        "void onnxruntime::contrib::cuda::",
+        "void onnxruntime::",
+        "onnxruntime::contrib::cuda::",
+        "onnxruntime::",
+    ]
+    for prefix in prefix_list:
+        if kernel_name.startswith(prefix):
+            return kernel_name[len(prefix) :]
+    return kernel_name
+
+
+def format_table(results: list[dict], prefix: str) -> str:
+    """Format results as a human-readable table."""
+    if not results:
+        return "No matching kernels found."
+
+    kernel_name_len_limit = 64
+    lines = []
+    lines.append(
+        f"{prefix}{'Kernel Name':<{kernel_name_len_limit}} {'Total(ms)':>10} {'Calls':>8} {'Avg(us)':>10} {'Min(us)':>10} {'Max(us)':>10}"
+    )
+    lines.append("-" * 120)
+
+    for r in results:
+        kernel_name = format_kernel_name(r["kernel_name"])
+        name = (
+            kernel_name[:kernel_name_len_limit] + "..." if len(kernel_name) > kernel_name_len_limit - 3 else kernel_name
+        )
+        lines.append(
+            f"{name:<{kernel_name_len_limit}} {r['total_ms']:>10.3f} {r['call_count']:>8d} {r['avg_us']:>10.2f} {r['min_us']:>10.2f} {r['max_us']:>10.2f}"
+        )
+
+    return "\n".join(lines)
+
+
+def format_csv(results: list[dict]) -> str:
+    """Format results as CSV."""
+    lines = ["kernel_name,total_ms,call_count,avg_us,min_us,max_us"]
+    for r in results:
+        lines.append(
+            f'"{r["kernel_name"]}",{r["total_ms"]:.6f},{r["call_count"]},{r["avg_us"]:.3f},{r["min_us"]:.3f},{r["max_us"]:.3f}'
+        )
+    return "\n".join(lines)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Parse nsys SQLite output for CUDA kernel timings",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Profile and parse (using NVTX range to exclude warmup):
+  nsys profile -o sln --export=sqlite python profile_skip_layer_norm.py --warmup 5 --repeat 100
+  python parse_nsys.py sln.sqlite --nvtx-range benchmark
+
+  # Alternative: skip first N warmup calls per kernel:
+  python parse_nsys.py sln.sqlite --skip-first 5
+
+  # Export to JSON:
+  python parse_nsys.py sln.sqlite --nvtx-range benchmark --format json --output results.json
+
+  # List all kernels (for debugging):
+  python parse_nsys.py sln.sqlite --list-kernels
+        """,
+    )
+    parser.add_argument("sqlite_file", help="Path to nsys SQLite export file")
+    parser.add_argument(
+        "--format", choices=["table", "json", "csv"], default="table", help="Output format (default: table)"
+    )
+    parser.add_argument("--output", "-o", help="Output file (default: stdout)")
+    parser.add_argument("--list-kernels", action="store_true", help="List all kernel names in the database")
+    parser.add_argument("--pattern", action="append", help="Add custom kernel name pattern (SQL LIKE syntax)")
+    parser.add_argument("--tag", default="", help="Tag for kernel name in output table. Example tag: 'fp16' or 'int8'")
+    parser.add_argument(
+        "--nvtx-range",
+        metavar="NAME",
+        help="Only include kernels launched within this NVTX range (e.g., 'benchmark')",
+    )
+    parser.add_argument(
+        "--skip-first",
+        type=int,
+        default=0,
+        metavar="N",
+        help="Skip first N calls per kernel type (e.g., to exclude warmup iterations)",
+    )
+
+    args = parser.parse_args()
+
+    if not Path(args.sqlite_file).exists():
+        print(f"Error: File not found: {args.sqlite_file}", file=sys.stderr)
+        sys.exit(1)
+
+    if args.list_kernels:
+        kernels = list_all_kernels(args.sqlite_file)
+        print(f"Found {len(kernels)} unique kernels:")
+        for k in kernels:
+            print(f"  {k}")
+        return
+
+    # Parse kernel timings
+    patterns = args.pattern if args.pattern else None
+    results = parse_nsys_sqlite(args.sqlite_file, patterns, skip_first=args.skip_first, nvtx_range=args.nvtx_range)
+
+    if args.nvtx_range:
+        print(f"(Filtering kernels within NVTX range: '{args.nvtx_range}')\n")
+    elif args.skip_first > 0:
+        print(f"(Skipping first {args.skip_first} calls per kernel)\n")
+
+    # Format output
+    if args.format == "json":
+        output = json.dumps(results, indent=2)
+    elif args.format == "csv":
+        output = format_csv(results)
+    else:
+        output = format_table(results, args.tag + " " if args.tag else "")
+
+    # Write output
+    if args.output:
+        with open(args.output, "w") as f:
+            f.write(output)
+        print(f"Results written to {args.output}")
+    else:
+        print(output)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/onnxruntime/test/python/transformers/profile_skip_layer_norm.py b/onnxruntime/test/python/transformers/profile_skip_layer_norm.py
new file mode 100644
index 0000000000000..272a06227f653
--- /dev/null
+++ b/onnxruntime/test/python/transformers/profile_skip_layer_norm.py
@@ -0,0 +1,176 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+"""
+Profiling script for SkipLayerNormalization CUDA kernel.
+
+Usage:
+  cd onnxruntime/test/python/transformers
+  python profile_skip_layer_norm.py
+
+  # Profile with Nsight Systems (timeline analysis) and extract kernel timings:
+  nsys profile -o sln_fp16 --export=sqlite python profile_skip_layer_norm.py --mode fp16 --warmup 5 --repeat 100
+  python parse_nsys.py sln_fp16.sqlite --nvtx-range benchmark
+
+"""
+
+import argparse
+import os
+import tempfile
+import time
+
+import numpy as np
+from onnx import TensorProto, helper, save_model
+
+import onnxruntime as ort
+
+# Optional NVTX support for nsys range markers
+try:
+    import nvtx
+
+    HAS_NVTX = True
+except ImportError:
+    HAS_NVTX = False
+
+    class DummyNvtxRange:
+        def __init__(self, name):
+            pass
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *args):
+            pass
+
+    class nvtx:  # noqa: N801
+        @staticmethod
+        def annotate(name, color=None):
+            return DummyNvtxRange(name)
+
+
+def create_skip_layer_norm_model(batch_size, seq_len, hidden_size, data_type, simplified=False):
+    """Create an ONNX model with a single SkipLayerNormalization op."""
+    onnx_type = TensorProto.FLOAT16 if data_type == np.float16 else TensorProto.FLOAT
+
+    input_tensor = helper.make_tensor_value_info("INPUT", onnx_type, [batch_size, seq_len, hidden_size])
+    skip_tensor = helper.make_tensor_value_info("SKIP", onnx_type, [batch_size, seq_len, hidden_size])
+    gamma_tensor = helper.make_tensor_value_info("GAMMA", onnx_type, [hidden_size])
+    beta_tensor = helper.make_tensor_value_info("BETA", onnx_type, [hidden_size])
+    bias_tensor = helper.make_tensor_value_info("BIAS", onnx_type, [hidden_size])
+
+    output_tensor = helper.make_tensor_value_info("OUTPUT", onnx_type, [batch_size, seq_len, hidden_size])
+
+    op_type = "SkipSimplifiedLayerNormalization" if simplified else "SkipLayerNormalization"
+    if simplified:
+        inputs = ["INPUT", "SKIP", "GAMMA", "BIAS"]
+        input_list = [input_tensor, skip_tensor, gamma_tensor, bias_tensor]
+    else:
+        inputs = ["INPUT", "SKIP", "GAMMA", "BETA", "BIAS"]
+        input_list = [input_tensor, skip_tensor, gamma_tensor, beta_tensor, bias_tensor]
+
+    node = helper.make_node(
+        op_type,
+        inputs=inputs,
+        outputs=["OUTPUT", "", "", ""],
+        domain="com.microsoft",
+        epsilon=1e-5,
+    )
+
+    graph = helper.make_graph([node], "skip_layer_norm_profile", input_list, [output_tensor])
+
+    opset_imports = [
+        helper.make_opsetid("", 17),
+        helper.make_opsetid("com.microsoft", 1),
+    ]
+
+    model = helper.make_model(graph, opset_imports=opset_imports)
+    model.ir_version = 7
+    return model
+
+
+def run_profiling(args):
+    """Run profiling for SkipLayerNormalization."""
+    data_type = np.float16 if args.mode == "fp16" else np.float32
+
+    print(f"\n{'=' * 70}")
+    print("SkipLayerNormalization Profiling")
+    print(f"{'=' * 70}")
+    print(f"Config: batch={args.batch_size}, seq_len={args.seq_len}, hidden_size={args.hidden_size}")
+    print(f"        mode={args.mode}, simplified={args.simplified}")
+    print(f"        warmup={args.warmup}, repeat={args.repeat}")
+    print(f"{'=' * 70}\n")
+
+    model = create_skip_layer_norm_model(
+        args.batch_size, args.seq_len, args.hidden_size, data_type, simplified=args.simplified
+    )
+
+    with tempfile.NamedTemporaryFile(suffix=".onnx", delete=False) as f:
+        model_path = f.name
+        save_model(model, model_path)
+
+    try:
+        sess_opt = ort.SessionOptions()
+        sess_opt.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
+        sess = ort.InferenceSession(model_path, sess_options=sess_opt, providers=["CUDAExecutionProvider"])
+
+        # Create inputs
+        np.random.seed(42)
+        if args.simplified:
+            feeds = {
+                "INPUT": np.random.rand(args.batch_size, args.seq_len, args.hidden_size).astype(data_type),
+                "SKIP": np.random.rand(args.batch_size, args.seq_len, args.hidden_size).astype(data_type),
+                "GAMMA": np.random.rand(args.hidden_size).astype(data_type),
+                "BIAS": np.random.rand(args.hidden_size).astype(data_type),
+            }
+        else:
+            feeds = {
+                "INPUT": np.random.rand(args.batch_size, args.seq_len, args.hidden_size).astype(data_type),
+                "SKIP": np.random.rand(args.batch_size, args.seq_len, args.hidden_size).astype(data_type),
+                "GAMMA": np.random.rand(args.hidden_size).astype(data_type),
+                "BETA": np.random.rand(args.hidden_size).astype(data_type),
+                "BIAS": np.random.rand(args.hidden_size).astype(data_type),
+            }
+
+        # Warmup
+        with nvtx.annotate("warmup", color="yellow"):
+            for _ in range(args.warmup):
+                sess.run(None, feeds)
+
+        # Benchmark with NVTX annotation
+        with nvtx.annotate("benchmark", color="green"):
+            start = time.perf_counter()
+            for _ in range(args.repeat):
+                sess.run(None, feeds)
+            end = time.perf_counter()
+
+        avg_ms = (end - start) * 1000 / args.repeat
+        elem_size = 2 if data_type == np.float16 else 4
+        total_elements = args.batch_size * args.seq_len * args.hidden_size
+        bytes_transferred = 4 * total_elements * elem_size
+        throughput_gbps = bytes_transferred / (avg_ms * 1e-3) / 1e9
+
+        print(f"  Average time: {avg_ms:.4f} ms")
+        print(f"  Throughput:   {throughput_gbps:.2f} GB/s")
+
+    finally:
+        os.unlink(model_path)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Profile SkipLayerNormalization CUDA kernel")
+    parser.add_argument("--mode", choices=["fp16", "fp32"], default="fp16", help="Data type")
+    parser.add_argument("--batch-size", type=int, default=1, help="Batch size")
+    parser.add_argument("--seq-len", type=int, default=2048, help="Sequence length")
+    parser.add_argument("--hidden-size", type=int, default=4096, help="Hidden size")
+    parser.add_argument("--simplified", action="store_true", help="Use SkipSimplifiedLayerNormalization")
+    parser.add_argument("--warmup", type=int, default=5, help="Warmup iterations")
+    parser.add_argument("--repeat", type=int, default=100, help="Benchmark iterations")
+
+    args = parser.parse_args()
+    run_profiling(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/onnxruntime/test/python/transformers/profile_skip_layer_norm.sh b/onnxruntime/test/python/transformers/profile_skip_layer_norm.sh
new file mode 100755
index 0000000000000..0e638fc13ea19
--- /dev/null
+++ b/onnxruntime/test/python/transformers/profile_skip_layer_norm.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+#
+# Profile SkipLayerNormalization CUDA kernel with nsys.
+#
+# Usage:
+#   ./profile_skip_layer_norm.sh                           # Profile with defaults (fp16, H=4096)
+#   ./profile_skip_layer_norm.sh --hidden-size 8192        # Different hidden size
+#   ./profile_skip_layer_norm.sh --fp32 --batch-size 4     # FP32 mode, batch=4
+#
+
+set -e
+set -o pipefail
+
+# Default parameters
+BATCH_SIZE=""
+SEQ_LEN=""
+HIDDEN_SIZE=""
+MODE="--mode fp16"
+SIMPLIFIED=""
+OUTPUT_NAME="sln_profile"
+
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        --batch-size)
+            BATCH_SIZE="--batch-size $2"
+            shift
+            ;;
+        --seq-len)
+            SEQ_LEN="--seq-len $2"
+            shift
+            ;;
+        --hidden-size)
+            HIDDEN_SIZE="--hidden-size $2"
+            shift
+            ;;
+        --fp32)
+            MODE="--mode fp32"
+            ;;
+        --simplified)
+            SIMPLIFIED="--simplified"
+            ;;
+        -o|--output)
+            OUTPUT_NAME="$2"
+            shift
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Usage: $0 [--batch-size N] [--seq-len N] [--hidden-size N] [--fp32] [--simplified] [-o NAME]"
+            exit 1
+            ;;
+    esac
+    shift
+done
+
+EXTRA_ARGS="${BATCH_SIZE} ${SEQ_LEN} ${HIDDEN_SIZE} ${MODE} ${SIMPLIFIED}"
+
+# Check nvtx availability (optional, for NVTX range markers)
+HAVE_NVTX=0
+if python -c "import nvtx" 2>/dev/null; then
+    HAVE_NVTX=1
+else
+    echo "Note: 'nvtx' package not installed. NVTX range markers will be disabled."
+    echo "      Install with: pip install nvtx"
+    echo "      Falling back to --skip-first to exclude warmup iterations."
+fi
+
+echo ""
+echo "========================================"
+echo "  Profiling: SkipLayerNormalization"
+echo "========================================"
+rm -f "${OUTPUT_NAME}.nsys-rep" "${OUTPUT_NAME}.sqlite"
+nsys profile -o "${OUTPUT_NAME}" --export=sqlite \
+    python profile_skip_layer_norm.py --warmup 5 --repeat 100 $EXTRA_ARGS
+echo ""
+echo "---- Kernel results ----"
+if [[ "$HAVE_NVTX" -eq 1 ]]; then
+    python parse_nsys.py "${OUTPUT_NAME}.sqlite" --nvtx-range benchmark
+else
+    python parse_nsys.py "${OUTPUT_NAME}.sqlite" --skip-first 5
+fi
+
+echo ""
+echo "Done."