// sample_cublasLt_LtSgemmSimpleAutoTuning.cu
/*
* SPDX-FileCopyrightText: Copyright (c) 2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cstdio>
#include <cstring>
#include <vector>
#include <algorithm>

#include <cublasLt.h>
#include <cuda_runtime_api.h>

#include "sample_cublasLt_LtSgemmSimpleAutoTuning.h"
#include "helpers.h"
float median(std::vector<float> &times) {
    const size_t size = times.size();
    if (size == 0) {
        return 0;
    }
    std::sort(times.begin(), times.end());
    const size_t mid = size / 2;
    if (size % 2 == 0) {
        return (times[mid] + times[mid - 1]) / 2;
    } else {
        return times[mid];
    }
}

/// Sample wrapper executing single-precision gemm algorithm auto-tuning by querying cublasLt heuristics for the best
/// algorithms, iterating over the results, and picking the algorithm that has the best (median) runtime for the
/// given problem; an illustrative usage sketch follows the function
///
/// pointer mode is always host; to change it, configure the appropriate matmul descriptor attribute
/// matmul does not use the cublas handle's configuration of math mode (tensor ops are implicitly allowed here); to
/// change this, configure the appropriate attribute in the preference handle
void LtSgemmSimpleAutoTuning(cublasLtHandle_t ltHandle,
                             cublasOperation_t transa,
                             cublasOperation_t transb,
                             int m,
                             int n,
                             int k,
                             const float *alpha, /* host pointer */
                             const float *A,
                             int lda,
                             const float *B,
                             int ldb,
                             const float *beta, /* host pointer */
                             float *C,
                             int ldc,
                             void *workspace,
                             size_t workspaceSize,
                             cublasLtMatmulAlgo_t &algo) {
    cublasLtMatmulDesc_t operationDesc = NULL;
    cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
    cublasLtMatmulPreference_t preference = NULL;

    const int requestedAlgoCount = 8;
    int returnedResults = 0;
    cublasLtMatmulHeuristicResult_t heuristicResult[requestedAlgoCount] = {0};
    int bestAlgoIdx = 0;
    float time = 0;
    float bestAlgoTime = 0;

    // initialize to NULL so the cleanup guards below never test indeterminate handles
    cudaStream_t stream = NULL;
    cudaEvent_t startEvent = NULL, stopEvent = NULL;

    // create operation descriptor; see cublasLtMatmulDescAttributes_t for details about defaults; here we just need
    // to set the transforms for A and B
    checkCublasStatus(cublasLtMatmulDescCreate(&operationDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F));
    checkCublasStatus(
        cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(transa)));
    checkCublasStatus(
        cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(transb)));
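
    // Illustrative addition, not part of the original sample: the default pointer mode reads
    // alpha/beta from host memory; to pass them as device pointers instead, one could set
    //     cublasLtPointerMode_t pointerMode = CUBLASLT_POINTER_MODE_DEVICE;
    //     checkCublasStatus(cublasLtMatmulDescSetAttribute(
    //         operationDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pointerMode, sizeof(pointerMode)));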
    // create matrix descriptors; the defaults suit us here, so there is no need to set any extra attributes
    checkCublasStatus(cublasLtMatrixLayoutCreate(&Adesc, CUDA_R_32F, transa == CUBLAS_OP_N ? m : k,
                                                 transa == CUBLAS_OP_N ? k : m, lda));
    checkCublasStatus(cublasLtMatrixLayoutCreate(&Bdesc, CUDA_R_32F, transb == CUBLAS_OP_N ? k : n,
                                                 transb == CUBLAS_OP_N ? n : k, ldb));
    checkCublasStatus(cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_32F, m, n, ldc));
    // create preference handle; here we could use extra attributes to disable tensor ops or to make sure the
    // selected algo works with badly aligned A, B, C; for simplicity we just assume A, B, C are always well
    // aligned (e.g. they come directly from cudaMalloc)
    checkCublasStatus(cublasLtMatmulPreferenceCreate(&preference));
    checkCublasStatus(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
                                                           &workspaceSize, sizeof(workspaceSize)));
    // ask the heuristics for the top candidates to benchmark; there is no guarantee every returned algo will run
    // (e.g. if A is badly aligned), so more algos (e.g. 32) can be requested and tried one by one until one works
    checkCublasStatus(cublasLtMatmulAlgoGetHeuristic(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, preference,
                                                     requestedAlgoCount, heuristicResult, &returnedResults));

    if (returnedResults == 0) {
        checkCublasStatus(CUBLAS_STATUS_NOT_SUPPORTED);
    }

    checkCudaStatus(cudaStreamCreate(&stream));
    checkCudaStatus(cudaEventCreate(&startEvent));
    checkCudaStatus(cudaEventCreate(&stopEvent));

    constexpr int repeatAlgoCheck = 5;
    std::vector<float> algoTimes(repeatAlgoCheck);
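
    // Added note: the first launch of an algo may pay one-time setup costs, and GPU clocks can
    // fluctuate between runs; ranking by the median of repeatAlgoCheck timed runs keeps a single
    // slow outlier from deciding the winner.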
    for (int algoIdx = 0; algoIdx < returnedResults; algoIdx++) {
        for (int checkIdx = 0; checkIdx < repeatAlgoCheck; checkIdx++) {
            checkCudaStatus(cudaEventRecord(startEvent, stream));
            checkCublasStatus(cublasLtMatmul(ltHandle, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C,
                                             Cdesc, &heuristicResult[algoIdx].algo, workspace, workspaceSize,
                                             stream));
            checkCudaStatus(cudaEventRecord(stopEvent, stream));
            checkCudaStatus(cudaEventSynchronize(stopEvent));
            checkCudaStatus(cudaEventElapsedTime(&time, startEvent, stopEvent));
            algoTimes[checkIdx] = time;
        }
        time = median(algoTimes);
        if (algoIdx == 0 || time < bestAlgoTime) {
            bestAlgoTime = time;
            bestAlgoIdx = algoIdx;
        }
    }
memcpy(&algo, &heuristicResult[bestAlgoIdx].algo, sizeof(algo));
// descriptors are no longer needed as all GPU work was already enqueued
if (preference) checkCublasStatus(cublasLtMatmulPreferenceDestroy(preference));
if (Cdesc) checkCublasStatus(cublasLtMatrixLayoutDestroy(Cdesc));
if (Bdesc) checkCublasStatus(cublasLtMatrixLayoutDestroy(Bdesc));
if (Adesc) checkCublasStatus(cublasLtMatrixLayoutDestroy(Adesc));
if (operationDesc) cublasLtMatmulDescDestroy(operationDesc);
if (stream) checkCudaStatus(cudaStreamDestroy(stream));
if (startEvent) checkCudaStatus(cudaEventDestroy(startEvent));
if (stopEvent) checkCudaStatus(cudaEventDestroy(stopEvent));
}
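
/// Illustrative usage sketch (an addition, not part of the original sample; the real driver lives
/// in the sample's main program). Assumptions: dA, dB and dC are column-major device buffers from
/// cudaMalloc with leading dimensions m, k and m respectively, and workspace was also cudaMalloc'ed.
static void runTunedSgemm(cublasLtHandle_t ltHandle,
                          int m, int n, int k,
                          const float *dA, const float *dB, float *dC,
                          void *workspace, size_t workspaceSize) {
    const float alpha = 1.0f;
    const float beta = 0.0f;
    cublasLtMatmulAlgo_t algo;

    // tune once: algo receives the fastest of the benchmarked heuristic candidates ...
    LtSgemmSimpleAutoTuning(ltHandle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, dA, m, dB, k,
                            &beta, dC, m, workspace, workspaceSize, algo);

    // ... and can then be passed straight to cublasLtMatmul (with matching descriptors and a
    // workspace at least as large) for every subsequent gemm of this configuration, skipping
    // both the heuristic query and the timing loop.
}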