From 626bad5c8fd222ea4cce5c22e7efd281110e47b9 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 6 Jul 2018 15:29:02 +0800 Subject: [PATCH 01/49] Fix building. The build is performed by cmake. Add missing header and use cblas/cblas.h instead of cblas.h. --- src/common/utils.h | 4 ++-- src/cpu/h2o4gpuglm.cpp | 1 + src/cpu/h2o4gpukmeans.cpp | 2 +- src/gpu/h2o4gpuglm.cu | 1 + 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/common/utils.h b/src/common/utils.h index 77b15827d..10c97ac03 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -4,7 +4,7 @@ */ #pragma once #include -#include "cblas.h" +#include "cblas/cblas.h" template void self_dot(std::vector array_in, int n, int dim, @@ -18,4 +18,4 @@ void compute_distances(std::vector data_in, void compute_distances(std::vector data_in, std::vector centroids_in, std::vector &pairwise_distances, - int n, int dim, int k); \ No newline at end of file + int n, int dim, int k); diff --git a/src/cpu/h2o4gpuglm.cpp b/src/cpu/h2o4gpuglm.cpp index 39b664621..953428400 100644 --- a/src/cpu/h2o4gpuglm.cpp +++ b/src/cpu/h2o4gpuglm.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/src/cpu/h2o4gpukmeans.cpp b/src/cpu/h2o4gpukmeans.cpp index acc4c105d..45d3940c9 100644 --- a/src/cpu/h2o4gpukmeans.cpp +++ b/src/cpu/h2o4gpukmeans.cpp @@ -10,7 +10,7 @@ #include #include //#include "mkl.h" -#include "cblas.h" +#include "cblas/cblas.h" #include #include diff --git a/src/gpu/h2o4gpuglm.cu b/src/gpu/h2o4gpuglm.cu index ec16a828e..3eafafac3 100644 --- a/src/gpu/h2o4gpuglm.cu +++ b/src/gpu/h2o4gpuglm.cu @@ -10,6 +10,7 @@ #include #include +#include #include #include From a7cf46263eaa1dff4922f9581c216b07c507665f Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 6 Jul 2018 23:31:51 +0800 Subject: [PATCH 02/49] Enable building tests with cmake and make swig optional. * Add test target to CMakeLists.txt. * Fix include path in cu files in tests/cpp/gpu/kmeans. * Make SWIG an optional dependencies. 
--- CMakeLists.txt | 82 ++++++++----------- src/swig/CMakeLists.txt | 48 +++++++++++ tests/cpp/gpu/kmeans/test_kmeans_centroids.cu | 2 +- tests/cpp/gpu/kmeans/test_kmeans_h2o4gpu.cu | 4 +- tests/cpp/gpu/kmeans/test_kmeans_labels.cu | 2 +- 5 files changed, 84 insertions(+), 54 deletions(-) create mode 100644 src/swig/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index c88dd0502..eb8264337 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,13 +7,12 @@ SET_DEFAULT_CONFIGURATION_RELEASE() FIND_PACKAGE(OpenMP) FIND_PACKAGE(BLAS REQUIRED) -FIND_PACKAGE(SWIG REQUIRED) -FIND_PACKAGE(PythonLibs REQUIRED) # SWIG - -INCLUDE(${SWIG_USE_FILE}) #============= OPTIONS & SETTINGS OPTION(USE_CUDA "Build with GPU acceleration" ON) +OPTION(USE_SWIG "Use swig to generate language bindings." ON) +OPTION(BUILD_TESTS "Build tests." ON) +OPTION(USE_SYSTEM_GTEST "Use system google tests." ON) OPTION(DEV_BUILD "Dev build" OFF) # Compiler flags @@ -22,12 +21,6 @@ SET(CMAKE_CXX_STANDARD_REQUIRED ON) SET(CMAKE_POSITION_INDEPENDENT_CODE ON) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w") -# PythonLibs' PYTHON_INCLUDE_PATH doesn't take into account virtualenv etc. -# Open to suggestions how to do this better. 
-EXECUTE_PROCESS(COMMAND python -c "import numpy; print(numpy.get_include())" - OUTPUT_VARIABLE PYTHON_INCLUDE_PATH_CUST - OUTPUT_STRIP_TRAILING_WHITESPACE) - if(OpenMP_CXX_FOUND OR OPENMP_FOUND) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif() @@ -49,13 +42,10 @@ FILE(GLOB_RECURSE COMMON_SOURCES ) INCLUDE_DIRECTORIES( - src/include - src/cpu/include - # Here and not in target_include_directories b/c cmake < 3.7 which we use in Dockerfiles does not support it - src/gpu/include - ${PYTHON_INCLUDE_PATH} - ${PYTHON_INCLUDE_PATH_CUST} -) + src/include + src/cpu/include + # Here and not in target_include_directories b/c cmake < 3.7 which we use in Dockerfiles does not support it + src/gpu/include) ADD_LIBRARY(commonh2o4gpu OBJECT ${COMMON_SOURCES}) #============= BUILD COMMON CPU/GPU CODE @@ -70,25 +60,6 @@ ADD_LIBRARY(cpuh2o4gpu STATIC ${CPU_SOURCES} $) TARGET_LINK_LIBRARIES(cpuh2o4gpu ${BLAS_LIBRARIES}) #============= BUILD CPU LIBRARY -#============= SWIG -SET(CMAKE_SWIG_FLAGS -Werror) -#============= SWIG - -#============= CPU SWIG -SET_SOURCE_FILES_PROPERTIES(src/swig/ch2o4gpu_cpu.i PROPERTIES CPLUSPLUS ON) - -if (${CMAKE_VERSION} VERSION_LESS "3.8.0") - SWIG_ADD_MODULE(ch2o4gpu_cpu python src/swig/ch2o4gpu_cpu.i) -else() - SWIG_ADD_LIBRARY(ch2o4gpu_cpu LANGUAGE python SOURCES src/swig/ch2o4gpu_cpu.i) -endif() - -SWIG_LINK_LIBRARIES(ch2o4gpu_cpu cpuh2o4gpu ${PYTHON_LIBRARIES}) - -SET_TARGET_PROPERTIES(${SWIG_MODULE_ch2o4gpu_cpu_REAL_NAME} PROPERTIES - LINK_FLAGS ${OpenMP_CXX_FLAGS}) -#============= CPU SWIG - if(USE_CUDA) FIND_PACKAGE(CUDA 8.0 REQUIRED) FIND_PACKAGE(NVML REQUIRED) @@ -140,18 +111,29 @@ if(USE_CUDA) ${NVTX_LIBRARY} ${NVML_LIBRARY}) #============= BUILD GPU LIBRARY - - #============= GPU SWIG - SET_SOURCE_FILES_PROPERTIES(src/swig/ch2o4gpu_gpu.i PROPERTIES CPLUSPLUS ON) - - if (${CMAKE_VERSION} VERSION_LESS "3.8.0") - SWIG_ADD_MODULE(ch2o4gpu_gpu python src/swig/ch2o4gpu_gpu.i) - else() - SWIG_ADD_LIBRARY(ch2o4gpu_gpu LANGUAGE 
python SOURCES src/swig/ch2o4gpu_gpu.i) - endif() - SWIG_LINK_LIBRARIES(ch2o4gpu_gpu gpuh2o4gpu ${PYTHON_LIBRARIES}) - - SET_TARGET_PROPERTIES(${SWIG_MODULE_ch2o4gpu_gpu_REAL_NAME} PROPERTIES - LINK_FLAGS ${OpenMP_CXX_FLAGS}) - #============= GPU SWIG endif() + +if(USE_SWIG) + add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/swig) +endif(USE_SWIG) + +#============= Tests +if(BUILD_TESTS) + ENABLE_TESTING() + if (USE_SYSTEM_GTEST) + FIND_PACKAGE(GTest REQUIRED) + else () + ADD_SUBDIRECTORY(${CMAKE_CURRENT_LIST_DIR}/tests/googletest) + SET(GTEST_LIBRARIES gtest gtest_main) + endif (USE_SYSTEM_GTEST) + if (USE_CUDA) + FILE(GLOB_RECURSE CUDA_TEST_SOURCES "tests/cpp/*.cu") + CUDA_COMPILE(CUDA_TEST_OBJS ${CUDA_TEST_SOURCES}) + endif(USE_CUDA) + + ADD_EXECUTABLE(test-h2o4gpu ${CUDA_TEST_OBJS}) + TARGET_LINK_LIBRARIES(test-h2o4gpu ${GTEST_LIBRARIES}) + TARGET_LINK_LIBRARIES(test-h2o4gpu gpuh2o4gpu) + ADD_TEST(TestH2O4GPU test-h2o4gpu) +endif(BUILD_TESTS) +#============= Tests diff --git a/src/swig/CMakeLists.txt b/src/swig/CMakeLists.txt new file mode 100644 index 000000000..3ee50c0c4 --- /dev/null +++ b/src/swig/CMakeLists.txt @@ -0,0 +1,48 @@ +FIND_PACKAGE(SWIG REQUIRED) +FIND_PACKAGE(PythonLibs REQUIRED) + +INCLUDE(${SWIG_USE_FILE}) +# PythonLibs' PYTHON_INCLUDE_PATH doesn't take into account virtualenv etc. +# Open to suggestions how to do this better. 
+include_directories( + ${PYTHON_INCLUDE_PATH} + ${PYTHON_INCLUDE_PATH_CUST}) + +EXECUTE_PROCESS(COMMAND python -c "import numpy; print(numpy.get_include())" + OUTPUT_VARIABLE PYTHON_INCLUDE_PATH_CUST + OUTPUT_STRIP_TRAILING_WHITESPACE) + +#============= SWIG +SET(CMAKE_SWIG_FLAGS -Werror) +#============= SWIG + +#============= CPU SWIG +SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_LIST_DIR}/ch2o4gpu_cpu.i PROPERTIES CPLUSPLUS ON) + +if (${CMAKE_VERSION} VERSION_LESS "3.8.0") + SWIG_ADD_MODULE(ch2o4gpu_cpu python ${CMAKE_CURRENT_LIST_DIR}/ch2o4gpu_cpu.i) +else() + SWIG_ADD_LIBRARY(ch2o4gpu_cpu LANGUAGE python SOURCES ${CMAKE_CURRENT_LIST_DIR}/ch2o4gpu_cpu.i) +endif() + +SWIG_LINK_LIBRARIES(ch2o4gpu_cpu cpuh2o4gpu ${PYTHON_LIBRARIES}) + +SET_TARGET_PROPERTIES(${SWIG_MODULE_ch2o4gpu_cpu_REAL_NAME} PROPERTIES + LINK_FLAGS ${OpenMP_CXX_FLAGS}) +#============= CPU SWIG + +#============= GPU SWIG +if(USE_CUDA) + SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_LIST_DIR}/ch2o4gpu_gpu.i PROPERTIES CPLUSPLUS ON) + + if (${CMAKE_VERSION} VERSION_LESS "3.8.0") + SWIG_ADD_MODULE(ch2o4gpu_gpu python ${CMAKE_CURRENT_LIST_DIR}/ch2o4gpu_gpu.i) + else() + SWIG_ADD_LIBRARY(ch2o4gpu_gpu LANGUAGE python SOURCES ${CMAKE_CURRENT_LIST_DIR}/ch2o4gpu_gpu.i) + endif() + SWIG_LINK_LIBRARIES(ch2o4gpu_gpu gpuh2o4gpu ${PYTHON_LIBRARIES}) + + SET_TARGET_PROPERTIES(${SWIG_MODULE_ch2o4gpu_gpu_REAL_NAME} PROPERTIES + LINK_FLAGS ${OpenMP_CXX_FLAGS}) +endif(USE_CUDA) +#============= GPU SWIG diff --git a/tests/cpp/gpu/kmeans/test_kmeans_centroids.cu b/tests/cpp/gpu/kmeans/test_kmeans_centroids.cu index c8bdc53fd..2a1bad0bf 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_centroids.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_centroids.cu @@ -1,6 +1,6 @@ #include "gtest/gtest.h" -#include "../src/gpu/kmeans/kmeans_centroids.h" +#include "../../../../src/gpu/kmeans/kmeans_centroids.h" #include TEST(KMeansCentroids, CalculateCentroids) { diff --git a/tests/cpp/gpu/kmeans/test_kmeans_h2o4gpu.cu 
b/tests/cpp/gpu/kmeans/test_kmeans_h2o4gpu.cu index a02eb00c0..ddc30d525 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_h2o4gpu.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_h2o4gpu.cu @@ -1,7 +1,7 @@ #include "gtest/gtest.h" -#include "../src/gpu/kmeans/kmeans_h2o4gpu.h" -#include "../src/gpu/kmeans/kmeans_labels.h" +#include "../../../../src/gpu/kmeans/kmeans_h2o4gpu.h" +#include "../../../../src/gpu/kmeans/kmeans_labels.h" #include TEST(KMeans, CountsPerCentroids) { diff --git a/tests/cpp/gpu/kmeans/test_kmeans_labels.cu b/tests/cpp/gpu/kmeans/test_kmeans_labels.cu index f702681bc..aa54c8f1c 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_labels.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_labels.cu @@ -1,6 +1,6 @@ #include "gtest/gtest.h" -#include "../src/gpu/kmeans/kmeans_labels.h" +#include "../../../../src/gpu/kmeans/kmeans_labels.h" #include #include From c39c1bb964b1f62c410136a6250c2cedf6fa2e40 Mon Sep 17 00:00:00 2001 From: fis Date: Sun, 22 Jul 2018 16:43:18 +0800 Subject: [PATCH 03/49] Enable specifying CUDA compute capability. --- CMakeLists.txt | 129 +++++++++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 64 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eb8264337..910ab0c88 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,26 +20,26 @@ SET(CMAKE_CXX_STANDARD 11) SET(CMAKE_CXX_STANDARD_REQUIRED ON) SET(CMAKE_POSITION_INDEPENDENT_CODE ON) SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w") +SET(GPU_COMPUTE_VER "" CACHE STRING + "Semicolon separated list of compute versions to be built against, e.g. 
-DGPU_COMPUTE_VER='35;61'") if(OpenMP_CXX_FOUND OR OPENMP_FOUND) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif() #============= OPTIONS & SETTINGS # TODO probably to be removed after POGS is out in favor of XGboost GLM ADD_DEFINITIONS( - -D_GITHASH_=0 - -DH2O4GPU_DOUBLE - -DH2O4GPU_SINGLE -) + -D_GITHASH_=0 + -DH2O4GPU_DOUBLE + -DH2O4GPU_SINGLE) #============= BUILD COMMON CPU/GPU CODE FILE(GLOB_RECURSE COMMON_SOURCES - src/common/*.cpp - src/common/*.h - src/interface_c/*.cpp - src/interface_c/*.h - ) + src/common/*.cpp + src/common/*.h + src/interface_c/*.cpp + src/interface_c/*.h) INCLUDE_DIRECTORIES( src/include @@ -52,69 +52,70 @@ ADD_LIBRARY(commonh2o4gpu OBJECT ${COMMON_SOURCES}) #============= BUILD CPU LIBRARY FILE(GLOB_RECURSE CPU_SOURCES - src/cpu/*.cpp - src/cpu/*.h - ) + src/cpu/*.cpp + src/cpu/*.h) ADD_LIBRARY(cpuh2o4gpu STATIC ${CPU_SOURCES} $) TARGET_LINK_LIBRARIES(cpuh2o4gpu ${BLAS_LIBRARIES}) #============= BUILD CPU LIBRARY if(USE_CUDA) - FIND_PACKAGE(CUDA 8.0 REQUIRED) - FIND_PACKAGE(NVML REQUIRED) - - #============= BUILD GPU LIBRARY - ADD_DEFINITIONS( - -DCUDA_MAJOR=${CUDA_VERSION_MAJOR} - -DHAVECUDA - ) - - if(DEV_BUILD) - MESSAGE(STATUS "Building DEVELOPER compute capability version.") - SET(GPU_COMPUTE_VER 61 CACHE STRING - "Space separated list of compute versions to be built against") - else() - MESSAGE(STATUS "Building RELEASE compute capability version.") - SET(GPU_COMPUTE_VER 35;50;52;60;61 CACHE STRING - "Space separated list of compute versions to be built against") - endif() - - if(((CUDA_VERSION_MAJOR EQUAL 9) OR (CUDA_VERSION_MAJOR GREATER 9)) AND NOT DEV_BUILD) - MESSAGE(STATUS "CUDA 9.0 detected, adding Volta compute capability (7.0).") - SET(GPU_COMPUTE_VER "${GPU_COMPUTE_VER};70") - endif() - - SET(GENCODE_FLAGS "") - FORMAT_GENCODE_FLAGS("${GPU_COMPUTE_VER}" GENCODE_FLAGS) - SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; 
-std=c++11;--expt-extended-lambda;--expt-relaxed-constexpr;${GENCODE_FLAGS};-lineinfo; -w;") - - FILE(GLOB_RECURSE GPU_SOURCES - src/*.cu - src/*.cuh - src/common/*.cpp - src/common/*.h - ) - - CUDA_ADD_LIBRARY(gpuh2o4gpu ${GPU_SOURCES} $ STATIC) - - if($ENV{USENVTX}) - MESSAGE(STATUS "Building with NVTX support on.") - SET(NVTX_LIBRARY nvToolsExt) - endif() - - TARGET_LINK_LIBRARIES(gpuh2o4gpu - ${CUDA_CUBLAS_LIBRARIES} - ${CUDA_cusolver_LIBRARY} - ${CUDA_cusparse_LIBRARY} - ${BLAS_LIBRARIES} - ${NVTX_LIBRARY} - ${NVML_LIBRARY}) - #============= BUILD GPU LIBRARY + FIND_PACKAGE(CUDA 8.0 REQUIRED) + FIND_PACKAGE(NVML REQUIRED) + + #============= BUILD GPU LIBRARY + ADD_DEFINITIONS( + -DCUDA_MAJOR=${CUDA_VERSION_MAJOR} + -DHAVECUDA + ) + + if(DEV_BUILD AND NOT GPU_COMPUTE_VER) + MESSAGE(STATUS "Building DEVELOPER compute capability version.") + SET(GPU_COMPUTE_VER 61) + elseif(NOT GPU_COMPUTE_VER) + MESSAGE(STATUS "Building RELEASE compute capability version.") + SET(GPU_COMPUTE_VER 35;50;52;60;61) + endif() + + if(((CUDA_VERSION_MAJOR EQUAL 9) + OR (CUDA_VERSION_MAJOR GREATER 9)) + AND NOT DEV_BUILD + AND NOT GPU_COMPUTE_VER) + MESSAGE(STATUS "CUDA 9.0 detected, adding Volta compute capability (7.0).") + SET(GPU_COMPUTE_VER "${GPU_COMPUTE_VER};70") + endif() + + SET(GENCODE_FLAGS "") + FORMAT_GENCODE_FLAGS("${GPU_COMPUTE_VER}" GENCODE_FLAGS) + MESSAGE("CUDA architecture flags ${GENCODE_FLAGS}") + + SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -std=c++11;--expt-extended-lambda;--expt-relaxed-constexpr;${GENCODE_FLAGS};-lineinfo; -w;") + + FILE(GLOB_RECURSE GPU_SOURCES + src/*.cu + src/*.cuh + src/common/*.cpp + src/common/*.h) + + CUDA_ADD_LIBRARY(gpuh2o4gpu ${GPU_SOURCES} $ STATIC) + + if($ENV{USENVTX}) + MESSAGE(STATUS "Building with NVTX support on.") + SET(NVTX_LIBRARY nvToolsExt) + endif() + + TARGET_LINK_LIBRARIES(gpuh2o4gpu + ${CUDA_CUBLAS_LIBRARIES} + ${CUDA_cusolver_LIBRARY} + ${CUDA_cusparse_LIBRARY} + ${BLAS_LIBRARIES} + ${NVTX_LIBRARY} + 
${NVML_LIBRARY}) + #============= BUILD GPU LIBRARY endif() if(USE_SWIG) - add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/swig) + ADD_SUBDIRECTORY(${CMAKE_CURRENT_LIST_DIR}/src/swig) endif(USE_SWIG) #============= Tests From e28cd4b66cbdbe204ba07b588cd545cad2cce448 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 13 Jul 2018 15:09:50 +0800 Subject: [PATCH 04/49] [WIP, KMeans||] skeleton for the algorithm. --- CMakeLists.txt | 9 +- src/gpu/kmeans/array.cu | 139 ++ src/gpu/kmeans/array.cuh | 107 ++ src/gpu/kmeans/kmeans_general.h | 42 + src/gpu/kmeans/kmeans_h2o4gpu.cu | 1653 +++++++++++----------- src/gpu/kmeans/kmeans_init.cu | 136 ++ src/gpu/kmeans/kmeans_init.cuh | 35 + tests/cpp/gpu/kmeans/test_kmeans_init.cu | 49 + 8 files changed, 1349 insertions(+), 821 deletions(-) create mode 100644 src/gpu/kmeans/array.cu create mode 100644 src/gpu/kmeans/array.cuh create mode 100644 src/gpu/kmeans/kmeans_init.cu create mode 100644 src/gpu/kmeans/kmeans_init.cuh create mode 100644 tests/cpp/gpu/kmeans/test_kmeans_init.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 910ab0c88..a140f8d04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,6 +62,7 @@ TARGET_LINK_LIBRARIES(cpuh2o4gpu ${BLAS_LIBRARIES}) if(USE_CUDA) FIND_PACKAGE(CUDA 8.0 REQUIRED) FIND_PACKAGE(NVML REQUIRED) + find_package(Eigen3 3.3 REQUIRED NO_MODULE) #============= BUILD GPU LIBRARY ADD_DEFINITIONS( @@ -92,6 +93,7 @@ if(USE_CUDA) SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -std=c++11;--expt-extended-lambda;--expt-relaxed-constexpr;${GENCODE_FLAGS};-lineinfo; -w;") FILE(GLOB_RECURSE GPU_SOURCES + src/*.cu.cc src/*.cu src/*.cuh src/common/*.cpp @@ -104,13 +106,15 @@ if(USE_CUDA) SET(NVTX_LIBRARY nvToolsExt) endif() + include_directories(${EIGEN3_INCLUDE_DIR}) TARGET_LINK_LIBRARIES(gpuh2o4gpu ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} ${CUDA_cusparse_LIBRARY} ${BLAS_LIBRARIES} ${NVTX_LIBRARY} - ${NVML_LIBRARY}) + ${NVML_LIBRARY} + Eigen3::Eigen) #============= BUILD GPU LIBRARY 
endif() @@ -133,8 +137,9 @@ if(BUILD_TESTS) endif(USE_CUDA) ADD_EXECUTABLE(test-h2o4gpu ${CUDA_TEST_OBJS}) - TARGET_LINK_LIBRARIES(test-h2o4gpu ${GTEST_LIBRARIES}) TARGET_LINK_LIBRARIES(test-h2o4gpu gpuh2o4gpu) + TARGET_LINK_LIBRARIES(test-h2o4gpu ${GTEST_LIBRARIES}) + ADD_TEST(TestH2O4GPU test-h2o4gpu) endif(BUILD_TESTS) #============= Tests diff --git a/src/gpu/kmeans/array.cu b/src/gpu/kmeans/array.cu new file mode 100644 index 000000000..1296ae45d --- /dev/null +++ b/src/gpu/kmeans/array.cu @@ -0,0 +1,139 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#include +#include + +#include "array.cuh" +#include "kmeans_general.h" + +namespace H2O4GPU { +namespace Array { + +template +CUDAArray::CUDAArray() { + CUBLAS_CHECK(cublasCreate(&blas_handle)); +} + +template +CUDAArray::CUDAArray(size_t _size) { + this->_d_vector.resize(_size); + CUBLAS_CHECK(cublasCreate(&blas_handle)); +} + +template +CUDAArray::CUDAArray(Dims _other) { + _dims = _other; + _d_vector.resize(_dims[0] * _dims[1]); + CUBLAS_CHECK(cublasCreate(&blas_handle)); +} + +template +CUDAArray::CUDAArray(const thrust::device_vector& _d_vec, + const Dims _dims) { + this->_d_vector = _d_vec; + this->_dims = _dims; + CUBLAS_CHECK(cublasCreate(&blas_handle)); +} + +template +CUDAArray::~CUDAArray() { + // if (blas_handle != NULL) + // CUBLAS_CHECK(cublasDestroy(blas_handle)); +} + +template +void CUDAArray::operator=(const CUDAArray& _other) { + _dims = _other._dims; + _d_vector = _other._d_vector; + + for (size_t i = 0; i < _d_vector.size(); ++i) { + std::cout << _d_vector[i] << ' '; + } + std::cout << std::endl; +} + +template +void CUDAArray::print() const { + std::cout << "Array: ["; + for (size_t i = 0; i < 4; ++i) { + std::cout << _dims[i] << ", "; + } + std::cout << "\b\b]" << std::endl; + for (size_t i = 0; i < _dims[0]; ++i) { + for (size_t j = 0; j < _dims[1]; ++j) { + std::cout << _d_vector[i*_dims[0]+j] << ' '; + } + std::cout << 
std::endl; + } + std::cout << std::endl; +} + +// return 1 row +template +CUDAArray CUDAArray::index(size_t _idx) { + + Dims new_dim (1, _dims[1], 0, 0); + CUDAArray result (new_dim); + thrust::device_vector _row (_dims[1]); + + thrust::copy(_d_vector.begin() + _idx * _dims[1], + _d_vector.begin() + (_idx+1) * _dims[1], + result._d_vector.begin()); + return result; +} + +template +T* CUDAArray::get() { + return _d_vector.data().get(); +} + +template +thrust::device_vector& CUDAArray::device_vector() { + return _d_vector; +} + +template +size_t CUDAArray::stride() { + return _stride; +} + +template +size_t CUDAArray::size () const { + return _h_vector.size(); +} + +template +size_t CUDAArray::n_gpu() const { + return _n_gpu; +} + +template +Dims CUDAArray::dims() const { + return _dims; +} + + +#define INSTANTIATE(T) \ + template CUDAArray::CUDAArray(); \ + template CUDAArray::CUDAArray(size_t _size); \ + template CUDAArray::CUDAArray(const thrust::device_vector& _d_vec, \ + const Dims _dims); \ + template CUDAArray::~CUDAArray(); \ + template void CUDAArray::operator=(const CUDAArray& _other); \ + template void CUDAArray::print() const; \ + template CUDAArray CUDAArray::index(size_t dim0); \ + template T * CUDAArray::get(); \ + template thrust::device_vector& CUDAArray::device_vector(); \ + template size_t CUDAArray::stride(); \ + template size_t CUDAArray::size () const; \ + template size_t CUDAArray::n_gpu() const; \ + template Dims CUDAArray::dims() const; \ + +INSTANTIATE(float) +INSTANTIATE(double) + +} // namespace H204GPU +} // namespace Array diff --git a/src/gpu/kmeans/array.cuh b/src/gpu/kmeans/array.cuh new file mode 100644 index 000000000..5cf069bf8 --- /dev/null +++ b/src/gpu/kmeans/array.cuh @@ -0,0 +1,107 @@ +/*! + * Copyright 2018 H2O.ai, Inc. 
+ * License Apache License Version 2.0 (see LICENSE for details) + */ + +#ifndef CUDA_ARRAY_H_ +#define CUDA_ARRAY_H_ + +#include +#include + +namespace H2O4GPU { +namespace Array { + +constexpr float esp = 1e-16f; + +struct Dims { + size_t dims[4]; + Dims() { + for (size_t i = 0; i < 4; ++i) { + dims[i] = 0; + } + } + Dims(size_t _dims[4]) { + for (size_t i = 0; i < 4; ++i) { + dims[i] = _dims[i]; + } + } + Dims (size_t d0, size_t d1, size_t d2, size_t d3) { + dims[0] = d0; + dims[1] = d1; + dims[2] = d2; + dims[3] = d3; + } + size_t operator[](size_t _idx) const { + return dims[_idx]; + } + void operator=(const Dims& _other) { + for (size_t i = 0; i < 4; ++i) { + dims[i] = _other.dims[i]; + } + } +}; + +template +class CUDAArray { + private: + thrust::host_vector _h_vector; + thrust::device_vector _d_vector; + + Dims _dims; + + bool _is_synced; + + size_t _stride; + + size_t _n_gpu; + + cublasHandle_t blas_handle; + + public: + CUDAArray(); + CUDAArray(size_t _size); + CUDAArray(Dims _dims); + CUDAArray(const thrust::device_vector& _d_vec, const Dims _dims); + + virtual ~CUDAArray(); + + void operator=(const CUDAArray& _other); + + void print() const; + + CUDAArray index(size_t dim0); + + thrust::device_vector& device_vector(); + + size_t stride(); + + size_t size () const; + + size_t n_gpu() const; + + T* get(); + + Dims dims () const; +}; + + +template +CUDAArray div(CUDAArray _lhs, T _rhs) { + if (_rhs < esp) { + throw std::runtime_error("Value under flow"); + } + // cublasScal(blas_handle, lhs); +} + +template +CUDAArray min_element(CUDAArray& _value) { + T result = thrust::min_element + (_value._d_vector.begin(), _value._d_vector.end()); + return result; +} + +} // namespace Array +} // namespace H2O4GPU + +#endif diff --git a/src/gpu/kmeans/kmeans_general.h b/src/gpu/kmeans/kmeans_general.h index 6f1a13e91..a64901a7d 100644 --- a/src/gpu/kmeans/kmeans_general.h +++ b/src/gpu/kmeans/kmeans_general.h @@ -4,6 +4,7 @@ */ #pragma once #include 
"../../common/logger.h" +#include "stdio.h" #define MAX_NGPUS 16 #define VERBOSE 0 @@ -24,3 +25,44 @@ exit(EXIT_FAILURE); \ } \ } while(0) + +#define CUBLAS_CHECK(cmd) do { \ + cublasStatus_t status = cmd; \ + if ( status != CUBLAS_STATUS_SUCCESS) { \ + const char* errmsg = nullptr; \ + switch(status) { \ + case CUBLAS_STATUS_NOT_INITIALIZED: \ + errmsg = "library not initialized"; \ + break; \ + \ + case CUBLAS_STATUS_ALLOC_FAILED: \ + errmsg = "resource allocation failed"; \ + break; \ + \ + case CUBLAS_STATUS_INVALID_VALUE: \ + errmsg = "an invalid numeric value was used as an argument"; \ + break; \ + \ + case CUBLAS_STATUS_ARCH_MISMATCH: \ + errmsg = "an absent device architectural feature is required"; \ + break; \ + \ + case CUBLAS_STATUS_MAPPING_ERROR: \ + errmsg = "an access to GPU memory space failed"; \ + break; \ + \ + case CUBLAS_STATUS_EXECUTION_FAILED: \ + errmsg = "the GPU program failed to execute"; \ + break; \ + \ + case CUBLAS_STATUS_INTERNAL_ERROR: \ + errmsg = "an internal operation failed"; \ + break; \ + \ + default: \ + errmsg = "unknown error"; \ + break; \ + } \ + printf("%s", errmsg); \ + } \ + } while (false) diff --git a/src/gpu/kmeans/kmeans_h2o4gpu.cu b/src/gpu/kmeans/kmeans_h2o4gpu.cu index d05750a86..e9a183e56 100644 --- a/src/gpu/kmeans/kmeans_h2o4gpu.cu +++ b/src/gpu/kmeans/kmeans_h2o4gpu.cu @@ -1,37 +1,37 @@ /*! - * Copyright 2017-2018 H2O.ai, Inc. + * copyright 2017-2018 H2O.ai, Inc. 
* License Apache License Version 2.0 (see LICENSE for details) */ -#include -#include -#include -#include -#include -#include +#include "../../common/utils.h" #include "cuda.h" -#include -#include -#include "solver/kmeans.h" -#include "kmeans_impl.h" #include "kmeans_general.h" #include "kmeans_h2o4gpu.h" -#include +#include "kmeans_impl.h" +#include "solver/kmeans.h" #include -#include -#include #include -#include "../../common/utils.h" +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include /** * METHODS FOR DATA COPYING AND GENERATION */ -template +template void random_data(int verbose, thrust::device_vector &array, int m, int n) { thrust::host_vector host_array(m * n); for (int i = 0; i < m * n; i++) { - host_array[i] = (T) rand() / (T) RAND_MAX; + host_array[i] = (T)rand() / (T)RAND_MAX; } array = host_array; } @@ -48,22 +48,23 @@ void random_data(int verbose, thrust::device_vector &array, int m, int n) { * @param npergpu * @param d */ -template -void copy_data(int verbose, const char ord, thrust::device_vector &array, const T *srcdata, - int q, int n, size_t npergpu, int d) { +template +void copy_data(int verbose, const char ord, thrust::device_vector &array, + const T *srcdata, int q, int n, size_t npergpu, int d) { if (ord == 'c') { thrust::host_vector host_array(npergpu * d); log_debug(verbose, "Copy data COL ORDER -> ROW ORDER"); for (size_t i = 0; i < npergpu * d; i++) { - size_t indexi = i % d; // col + size_t indexi = i % d; // col size_t indexj = i / d + q * npergpu; // row (shifted by which gpu) host_array[i] = srcdata[indexi * n + indexj]; } array = host_array; } else { log_debug(verbose, "Copy data ROW ORDER not changed"); - thrust::host_vector host_array(srcdata + q * npergpu * d, srcdata + q * npergpu * d + npergpu * d); + thrust::host_vector host_array(srcdata + q * npergpu * d, + srcdata + q * npergpu * d + npergpu * d); array = host_array; } } @@ -81,16 +82,18 @@ void copy_data(int 
verbose, const char ord, thrust::device_vector &array, con * @param npergpu * @param d */ -template -void copy_data_shuffled(int verbose, std::vector v, const char ord, thrust::device_vector &array, - const T *srcdata, int q, int n, int npergpu, int d) { +template +void copy_data_shuffled(int verbose, std::vector v, const char ord, + thrust::device_vector &array, const T *srcdata, + int q, int n, int npergpu, int d) { thrust::host_vector host_array(npergpu * d); if (ord == 'c') { log_debug(verbose, "Copy data shuffle COL ORDER -> ROW ORDER"); for (int i = 0; i < npergpu; i++) { for (size_t j = 0; j < d; j++) { - host_array[i * d + j] = srcdata[v[q * npergpu + i] + j * n]; // shift by which gpu + host_array[i * d + j] = + srcdata[v[q * npergpu + i] + j * n]; // shift by which gpu } } } else { @@ -98,16 +101,18 @@ void copy_data_shuffled(int verbose, std::vector v, const char ord, thrust: for (int i = 0; i < npergpu; i++) { for (size_t j = 0; j < d; j++) { - host_array[i * d + j] = srcdata[v[q * npergpu + i] * d + j]; // shift by which gpu + host_array[i * d + j] = + srcdata[v[q * npergpu + i] * d + j]; // shift by which gpu } } } array = host_array; } -template -void copy_centroids_shuffled(int verbose, std::vector v, const char ord, thrust::device_vector &array, - const T *srcdata, int n, int k, int d) { +template +void copy_centroids_shuffled(int verbose, std::vector v, const char ord, + thrust::device_vector &array, const T *srcdata, + int n, int k, int d) { copy_data_shuffled(verbose, v, ord, array, srcdata, 0, n, k, d); } @@ -125,30 +130,34 @@ void copy_centroids_shuffled(int verbose, std::vector v, const char ord, th * @param d * @param k */ -template +template void random_centroids(int verbose, int seed, const char ord, - thrust::device_vector &array, const T *srcdata, - int q, int n, int npergpu, int d, int k) { + thrust::device_vector &array, const T *srcdata, int q, + int n, int npergpu, int d, int k) { thrust::host_vector host_array(k * d); if (seed < 0) 
{ - std::random_device rd; //Will be used to obtain a seed for the random number engine + std::random_device + rd; // Will be used to obtain a seed for the random number engine seed = rd(); } std::mt19937 gen(seed); - std::uniform_int_distribution<> dis(0, n - 1); // random i in range from 0..n-1 (i.e. only 1 gpu gets centroids) + std::uniform_int_distribution<> dis( + 0, + n - 1); // random i in range from 0..n-1 (i.e. only 1 gpu gets centroids) if (ord == 'c') { log_debug(verbose, "Random centroids COL ORDER -> ROW ORDER"); for (int i = 0; i < k; i++) { // clusters - size_t reali = dis(gen); // + q*npergpu; // row sampled (called indexj above) + size_t reali = + dis(gen); // + q*npergpu; // row sampled (called indexj above) for (size_t j = 0; j < d; j++) { // cols host_array[i * d + j] = srcdata[reali + j * n]; } } } else { log_debug(verbose, "Random centroids ROW ORDER not changed"); - for (int i = 0; i < k; i++) { // rows - size_t reali = dis(gen); // + q*npergpu ; // row sampled + for (int i = 0; i < k; i++) { // rows + size_t reali = dis(gen); // + q*npergpu ; // row sampled for (size_t j = 0; j < d; j++) { // cols host_array[i * d + j] = srcdata[reali * d + j]; } @@ -157,968 +166,974 @@ void random_centroids(int verbose, int seed, const char ord, array = host_array; } -/** - * KMEANS METHODS FIT, PREDICT, TRANSFORM - */ + /** + * KMEANS METHODS FIT, PREDICT, TRANSFORM + */ -#define __HBAR__ \ - "----------------------------------------------------------------------------\n" +#define __HBAR__ \ + "--------------------------------------------------------------------------" \ + "--\n" namespace h2o4gpukmeans { -template -int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, - size_t rows, size_t cols, const char ord, - int k, int max_iterations, int init_from_data, - T threshold, - const T *srcdata, T **pred_centroids, int **pred_labels); - - template - int pick_point_idx_weighted( - int seed, - std::vector *data, - thrust::host_vector weights) { - 
T weighted_sum = 0; - - for(int i = 0; i < weights.size(); i++) { - if(data) { - weighted_sum += (data->data()[i] * weights.data()[i]); - } else { - weighted_sum += weights.data()[i]; - } - } +template +int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, size_t rows, + size_t cols, const char ord, int k, int max_iterations, + int init_from_data, T threshold, const T *srcdata, + T **pred_centroids, int **pred_labels); - T best_prob = 0.0; - int best_prob_idx = 0; +template +int pick_point_idx_weighted(int seed, std::vector *data, + thrust::host_vector weights) { + T weighted_sum = 0; - std::mt19937 mt(seed); - std::uniform_real_distribution<> dist(0.0, 1.0); + for (int i = 0; i < weights.size(); i++) { + if (data) { + weighted_sum += (data->data()[i] * weights.data()[i]); + } else { + weighted_sum += weights.data()[i]; + } + } - int i = 0; - for(i = 0; i <= weights.size(); i++) { - if(weights.size() == i) { - break; - } + T best_prob = 0.0; + int best_prob_idx = 0; - T prob_threshold = (T) dist(mt); + std::mt19937 mt(seed); + std::uniform_real_distribution<> dist(0.0, 1.0); - T data_val = weights.data()[i]; - if (data) { - data_val *= data->data()[i]; - } + int i = 0; + for (i = 0; i <= weights.size(); i++) { + if (weights.size() == i) { + break; + } - T prob_x = (data_val / weighted_sum); + T prob_threshold = (T)dist(mt); - if(prob_x > prob_threshold) { - break; - } + T data_val = weights.data()[i]; + if (data) { + data_val *= data->data()[i]; + } - if (prob_x >= best_prob) { - best_prob = prob_x; - best_prob_idx = i; - } + T prob_x = (data_val / weighted_sum); + + if (prob_x > prob_threshold) { + break; } - return weights.size() == i ? best_prob_idx : i; + if (prob_x >= best_prob) { + best_prob = prob_x; + best_prob_idx = i; + } } - /** - * Copies cols records, starting at position idx*cols from data to centroids. Removes them afterwards from data. - * Removes record from weights at position idx. 
- * @tparam T - * @param idx - * @param cols - * @param data - * @param weights - * @param centroids - */ - template - void add_centroid(int idx, int cols, - thrust::host_vector &data, - thrust::host_vector &weights, - std::vector ¢roids) { - for (int i = 0; i < cols; i++) { - centroids.push_back(data[idx * cols + i]); - } - weights[idx] = 0; + return weights.size() == i ? best_prob_idx : i; +} + +/** + * Copies cols records, starting at position idx*cols from data to centroids. + * Removes them afterwards from data. Removes record from weights at position + * idx. + * @tparam T + * @param idx + * @param cols + * @param data + * @param weights + * @param centroids + */ +template +void add_centroid(int idx, int cols, thrust::host_vector &data, + thrust::host_vector &weights, std::vector ¢roids) { + for (int i = 0; i < cols; i++) { + centroids.push_back(data[idx * cols + i]); } + weights[idx] = 0; +} - /** - * K-Means++ algorithm - * @tparam T - * @param seed - * @param data - * @param weights - * @param k - * @param cols - * @param centroids - */ - template - void kmeans_plus_plus( - int verbose, - int seed, - thrust::host_vector data, - thrust::host_vector weights, - int k, - int cols, - thrust::host_vector ¢roids) { - - std::vector std_centroids(0); - std_centroids.reserve(k * cols); - - int centroid_idx = pick_point_idx_weighted( - seed, - (std::vector *) NULL, - weights - ); +/** + * K-Means++ algorithm + * @tparam T + * @param seed + * @param data + * @param weights + * @param k + * @param cols + * @param centroids + */ +template +void kmeans_plus_plus(int verbose, int seed, thrust::host_vector data, + thrust::host_vector weights, int k, int cols, + thrust::host_vector ¢roids) { - add_centroid(centroid_idx, cols, data, weights, std_centroids); + std::vector std_centroids(0); + std_centroids.reserve(k * cols); + + int centroid_idx = + pick_point_idx_weighted(seed, (std::vector *)NULL, weights); - std::vector best_pairwise_distances(data.size() / cols); // one 
for each row in data - std::vector std_data(data.begin(), data.end()); + add_centroid(centroid_idx, cols, data, weights, std_centroids); - compute_distances(std_data, - std_centroids, - best_pairwise_distances, - data.size() / cols, cols, 1); + std::vector best_pairwise_distances(data.size() / + cols); // one for each row in data + std::vector std_data(data.begin(), data.end()); - std::vector curr_pairwise_distances( std_data.size() / cols); + compute_distances(std_data, std_centroids, best_pairwise_distances, + data.size() / cols, cols, 1); - for (int iter = 0; iter < k - 1; iter++) { - log_verbose(verbose, "KMeans++ - Iteraton %d/%d.", iter, k-1); + std::vector curr_pairwise_distances(std_data.size() / cols); - centroid_idx = pick_point_idx_weighted( - seed, - &best_pairwise_distances, - weights - ); + for (int iter = 0; iter < k - 1; iter++) { + log_verbose(verbose, "KMeans++ - Iteraton %d/%d.", iter, k - 1); - add_centroid(centroid_idx, cols, data, weights, std_centroids); + centroid_idx = + pick_point_idx_weighted(seed, &best_pairwise_distances, weights); - std::vector most_recent_centroids; - most_recent_centroids.reserve(cols); - add_centroid(centroid_idx, cols, data, weights, most_recent_centroids); + add_centroid(centroid_idx, cols, data, weights, std_centroids); - best_pairwise_distances[centroid_idx] = 0; + std::vector most_recent_centroids; + most_recent_centroids.reserve(cols); + add_centroid(centroid_idx, cols, data, weights, most_recent_centroids); - compute_distances(std_data, - most_recent_centroids, - curr_pairwise_distances, - std_data.size() / cols, cols, 1); + best_pairwise_distances[centroid_idx] = 0; - for (int i = 0; i < curr_pairwise_distances.size(); i++) { - best_pairwise_distances[i] = std::min(curr_pairwise_distances[i], best_pairwise_distances[i]); - } + compute_distances(std_data, most_recent_centroids, curr_pairwise_distances, + std_data.size() / cols, cols, 1); - std::fill(curr_pairwise_distances.begin(), 
curr_pairwise_distances.end(), (T)0.0); + for (int i = 0; i < curr_pairwise_distances.size(); i++) { + best_pairwise_distances[i] = + std::min(curr_pairwise_distances[i], best_pairwise_distances[i]); } - centroids.assign(std_centroids.begin(), std_centroids.end()); + std::fill(curr_pairwise_distances.begin(), curr_pairwise_distances.end(), + (T)0.0); } - template - struct min_calc_functor { - T* all_costs_ptr; - T* min_costs_ptr; - T max = std::numeric_limits::max(); - int potential_k_rows; - int rows_per_run; - - min_calc_functor(T* _all_costs_ptr, T* _min_costs_ptr, int _potential_k_rows, int _rows_per_run) { - all_costs_ptr = _all_costs_ptr; - min_costs_ptr = _min_costs_ptr; - potential_k_rows = _potential_k_rows; - rows_per_run = _rows_per_run; - } + centroids.assign(std_centroids.begin(), std_centroids.end()); +} - __host__ __device__ - void operator()(int idx) const { - T best = max; - for (int j = 0; j < potential_k_rows; j++) { - best = min(best, std::abs(all_costs_ptr[j * rows_per_run + idx])); - } - min_costs_ptr[idx] = min(min_costs_ptr[idx], best); - } - }; +template struct min_calc_functor { + T *all_costs_ptr; + T *min_costs_ptr; + T max = std::numeric_limits::max(); + int potential_k_rows; + int rows_per_run; + + min_calc_functor(T *_all_costs_ptr, T *_min_costs_ptr, int _potential_k_rows, + int _rows_per_run) { + all_costs_ptr = _all_costs_ptr; + min_costs_ptr = _min_costs_ptr; + potential_k_rows = _potential_k_rows; + rows_per_run = _rows_per_run; + } - /** - * K-Means|| initialization method implementation as described in "Scalable K-Means++". - * - * This is a probabilistic method, which tries to choose points as much spread out as possible as centroids. - * - * In case it finds more than k centroids a K-Means++ algorithm is ran on potential centroids to pick k best suited ones. 
- * - * http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf - * - * @tparam T - * @param verbose - * @param seed - * @param ord - * @param data - * @param data_dots - * @param centroids - * @param rows - * @param cols - * @param k - * @param num_gpu - * @param threshold - */ - template - thrust::host_vector kmeans_parallel(int verbose, int seed, const char ord, - thrust::device_vector **data, - thrust::device_vector **data_dots, - size_t rows, int cols, int k, int num_gpu, T threshold) { - if (seed < 0) { - std::random_device rd; - int seed = rd(); + __host__ __device__ void operator()(int idx) const { + T best = max; + for (int j = 0; j < potential_k_rows; j++) { + best = min(best, std::abs(all_costs_ptr[j * rows_per_run + idx])); } + min_costs_ptr[idx] = min(min_costs_ptr[idx], best); + } +}; - size_t rows_per_gpu = rows / num_gpu; - - std::mt19937 gen(seed); - std::uniform_int_distribution<> dis(0, rows - 1); - // Find the position (GPU idx and idx on that GPU) of the initial centroid - int first_center = dis(gen); - int first_center_idx = first_center % rows_per_gpu; - int first_center_gpu = first_center / rows_per_gpu; +/** + * K-Means|| initialization method implementation as described in "Scalable + * K-Means++". + * + * This is a probabilistic method, which tries to choose points as much spread + * out as possible as centroids. + * + * In case it finds more than k centroids a K-Means++ algorithm is ran on + * potential centroids to pick k best suited ones. 
+ * + * http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf + * + * @tparam T + * @param verbose + * @param seed + * @param ord + * @param data + * @param data_dots + * @param centroids + * @param rows + * @param cols + * @param k + * @param num_gpu + * @param threshold + */ +template +thrust::host_vector kmeans_parallel(int verbose, int seed, const char ord, + thrust::device_vector **data, + thrust::device_vector **data_dots, + size_t rows, int cols, int k, + int num_gpu, T threshold) { + if (seed < 0) { + std::random_device rd; + int seed = rd(); + } - log_verbose(verbose, "KMeans|| - Initial centroid %d on GPU %d.", first_center_idx, first_center_gpu); + size_t rows_per_gpu = rows / num_gpu; - // Copies the initial centroid to potential centroids vector. That vector will store all potential centroids found - // in the previous iteration. - thrust::host_vector h_potential_centroids(cols); - std::vector> h_potential_centroids_per_gpu(num_gpu); + std::mt19937 gen(seed); + std::uniform_int_distribution<> dis(0, rows - 1); + + // Find the position (GPU idx and idx on that GPU) of the initial centroid + int first_center = dis(gen); + int first_center_gpu = first_center / rows_per_gpu; // gpu id + int first_center_idx = first_center % rows_per_gpu; // id on that gpu + + log_verbose(verbose, "KMeans|| - Initial centroid %d on GPU %d.", + first_center_idx, first_center_gpu); + + // Copies the initial centroid to potential centroids vector. That vector will + // store all potential centroids found in the previous iteration. 
+ thrust::host_vector h_potential_centroids(cols); + std::vector> h_potential_centroids_per_gpu(num_gpu); + + CUDACHECK(cudaSetDevice(first_center_gpu)); + + // copy the first center to h_potential_centroids + thrust::copy((*data[first_center_gpu]).begin() + first_center_idx * cols, + (*data[first_center_gpu]).begin() + + (first_center_idx + 1) * cols, + h_potential_centroids.begin()); + + thrust::host_vector h_all_potential_centroids = h_potential_centroids; + + // Initial the cost-to-potential-centroids and + // cost-to-closest-potential-centroid matrices. Initial cost is +infinity + std::vector> d_min_costs(num_gpu); + for (int q = 0; q < num_gpu; q++) { + CUDACHECK(cudaSetDevice(q)); + d_min_costs[q].resize(rows_per_gpu); + thrust::fill(d_min_costs[q].begin(), d_min_costs[q].end(), + std::numeric_limits::max()); + } - CUDACHECK(cudaSetDevice(first_center_gpu)); + double t0 = timer(); - thrust::copy( - (*data[first_center_gpu]).begin() + first_center_idx * cols, - (*data[first_center_gpu]).begin() + (first_center_idx + 1) * cols, - h_potential_centroids.begin() - ); + // The original white paper claims 8 should be enough + int max_iter = std::min(8, (int)(2 + log(k))); + for (int counter = 0; counter < max_iter; counter++) { + log_verbose(verbose, "KMeans|| - Iteration %d.", counter); + T total_min_cost = 0.0; - thrust::host_vector h_all_potential_centroids = h_potential_centroids; + int new_potential_centroids = 0; +#pragma omp parallel for + for (int i = 0; i < num_gpu; i++) { + CUDACHECK(cudaSetDevice(i)); + + thrust::device_vector d_potential_centroids = h_potential_centroids; + + int potential_k_rows = d_potential_centroids.size() / cols; + + // Compute all the costs to each potential centroid from previous + // iteration + thrust::device_vector centroid_dots(potential_k_rows); + + kmeans::detail::batch_calculate_distances( + + verbose, 0, rows_per_gpu, cols, potential_k_rows, *data[i], + d_potential_centroids, *data_dots[i], centroid_dots, + + [&](int 
rows_per_run, size_t offset, + thrust::device_vector &pairwise_distances) { + // Find the closest potential center cost for each row + auto min_cost_counter = thrust::make_counting_iterator(0); + auto all_costs_ptr = + thrust::raw_pointer_cast(pairwise_distances.data()); + auto min_costs_ptr = + thrust::raw_pointer_cast(d_min_costs[i].data() + offset); + thrust::for_each( + min_cost_counter, min_cost_counter + rows_per_run, + // Functor instead of a lambda b/c nvcc is complaining about + // nesting a __device__ lambda inside a regular lambda + min_calc_functor(all_costs_ptr, min_costs_ptr, + potential_k_rows, rows_per_run)); + }); + } - // Initial the cost-to-potential-centroids and cost-to-closest-potential-centroid matrices. Initial cost is +infinity - std::vector> d_min_costs(num_gpu); - for (int q = 0; q < num_gpu; q++) { - CUDACHECK(cudaSetDevice(q)); - d_min_costs[q].resize(rows_per_gpu); - thrust::fill(d_min_costs[q].begin(), d_min_costs[q].end(), std::numeric_limits::max()); + for (int i = 0; i < num_gpu; i++) { + CUDACHECK(cudaSetDevice(i)); + total_min_cost += + thrust::reduce(d_min_costs[i].begin(), d_min_costs[i].end()); } - double t0 = timer(); + log_verbose(verbose, "KMeans|| - Total min cost from centers %g.", + total_min_cost); - // The original white paper claims 8 should be enough - int max_iter = std::min(8, (int)(2 + log(k)) ); - for (int counter = 0; counter < max_iter; counter++) { - log_verbose(verbose, "KMeans|| - Iteration %d.", counter); - T total_min_cost = 0.0; + if (total_min_cost == (T)0.0) { + continue; + } - int new_potential_centroids = 0; + std::set copy_from_gpus; #pragma omp parallel for - for (int i = 0; i < num_gpu; i++) { - CUDACHECK(cudaSetDevice(i)); - - thrust::device_vector d_potential_centroids = h_potential_centroids; - - int potential_k_rows = d_potential_centroids.size() / cols; - - // Compute all the costs to each potential centroid from previous iteration - thrust::device_vector centroid_dots(potential_k_rows); - - 
kmeans::detail::batch_calculate_distances(verbose, 0, rows_per_gpu, cols, potential_k_rows, - *data[i], d_potential_centroids, *data_dots[i], centroid_dots, - [&](int rows_per_run, size_t offset, thrust::device_vector &pairwise_distances) { - // Find the closest potential center cost for each row - auto min_cost_counter = thrust::make_counting_iterator(0); - auto all_costs_ptr = thrust::raw_pointer_cast(pairwise_distances.data()); - auto min_costs_ptr = thrust::raw_pointer_cast(d_min_costs[i].data() + offset); - thrust::for_each(min_cost_counter, - min_cost_counter + rows_per_run, - // Functor instead of a lambda b/c nvcc is complaining about - // nesting a __device__ lambda inside a regular lambda - min_calc_functor(all_costs_ptr, min_costs_ptr, potential_k_rows, rows_per_run)); - } - ); + for (int i = 0; i < num_gpu; i++) { + CUDACHECK(cudaSetDevice(i)); + + // Count how many potential centroids there are using probabilities + // The further the row is from the closest cluster center the higher the + // probability + auto pot_cent_filter_counter = thrust::make_counting_iterator(0); + auto min_costs_ptr = thrust::raw_pointer_cast(d_min_costs[i].data()); + int pot_cent_num = thrust::count_if( + pot_cent_filter_counter, pot_cent_filter_counter + rows_per_gpu, + [=] __device__(int idx) { + thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution<> dist(0.0, 1.0); + int device; + cudaGetDevice(&device); + rng.discard(idx + device * rows_per_gpu); + T prob_threshold = (T)dist(rng); + + T prob_x = ((2.0 * k * min_costs_ptr[idx]) / total_min_cost); + + return prob_x > prob_threshold; + }); + + log_debug(verbose, "KMeans|| - Potential centroids on GPU %d = %d.", i, + pot_cent_num); + + if (pot_cent_num > 0) { + copy_from_gpus.insert(i); + + // Copy all potential cluster centers + thrust::device_vector d_new_potential_centroids(pot_cent_num * cols); + + auto range = thrust::make_counting_iterator(0); + thrust::copy_if( + (*data[i]).begin(), 
(*data[i]).end(), range, + d_new_potential_centroids.begin(), + + [=] __device__(int idx) { + int row = idx / cols; + thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution<> dist(0.0, 1.0); + int device; + cudaGetDevice(&device); + rng.discard(row + device * rows_per_gpu); + T prob_threshold = (T)dist(rng); + + T prob_x = ((2.0 * k * min_costs_ptr[row]) / total_min_cost); + + return prob_x > prob_threshold; + }); + + h_potential_centroids_per_gpu[i].clear(); + h_potential_centroids_per_gpu[i].resize( + d_new_potential_centroids.size()); + + new_potential_centroids += d_new_potential_centroids.size(); + + thrust::copy(d_new_potential_centroids.begin(), + d_new_potential_centroids.end(), + h_potential_centroids_per_gpu[i].begin()); } + } - for (int i = 0; i < num_gpu; i++) { - CUDACHECK(cudaSetDevice(i)); - total_min_cost += thrust::reduce( - d_min_costs[i].begin(), - d_min_costs[i].end() - ); - } + log_verbose(verbose, "KMeans|| - New potential centroids %d.", + new_potential_centroids); - log_verbose(verbose, "KMeans|| - Total min cost from centers %g.", total_min_cost); + // Gather potential cluster centers from all GPUs + if (new_potential_centroids > 0) { + h_potential_centroids.clear(); + h_potential_centroids.resize(new_potential_centroids); - if(total_min_cost == (T) 0.0) { - continue; - } + int old_pot_centroids_size = h_all_potential_centroids.size(); + h_all_potential_centroids.resize(old_pot_centroids_size + + new_potential_centroids); - std::set copy_from_gpus; -#pragma omp parallel for + int offset = 0; for (int i = 0; i < num_gpu; i++) { - CUDACHECK(cudaSetDevice(i)); - - // Count how many potential centroids there are using probabilities - // The further the row is from the closest cluster center the higher the probability - auto pot_cent_filter_counter = thrust::make_counting_iterator(0); - auto min_costs_ptr = thrust::raw_pointer_cast(d_min_costs[i].data()); - int pot_cent_num = thrust::count_if( - pot_cent_filter_counter, - 
pot_cent_filter_counter + rows_per_gpu, [=]__device__(int idx){ - thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution<> dist(0.0, 1.0); - int device; - cudaGetDevice(&device); - rng.discard(idx + device * rows_per_gpu); - T prob_threshold = (T) dist(rng); - - T prob_x = (( 2.0 * k * min_costs_ptr[idx]) / total_min_cost); - - return prob_x > prob_threshold; - } - ); - - log_debug(verbose, "KMeans|| - Potential centroids on GPU %d = %d.", i, pot_cent_num); - - if (pot_cent_num > 0) { - copy_from_gpus.insert(i); - - // Copy all potential cluster centers - thrust::device_vector d_new_potential_centroids(pot_cent_num * cols); - - auto range = thrust::make_counting_iterator(0); - thrust::copy_if( - (*data[i]).begin(), (*data[i]).end(), range, - d_new_potential_centroids.begin(), [=] __device__(int idx){ - int row = idx / cols; - thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution<> dist(0.0, 1.0); - int device; - cudaGetDevice(&device); - rng.discard(row + device * rows_per_gpu); - T prob_threshold = (T) dist(rng); - - T prob_x = (( 2.0 * k * min_costs_ptr[row]) / total_min_cost); - - return prob_x > prob_threshold; - }); - - h_potential_centroids_per_gpu[i].clear(); - h_potential_centroids_per_gpu[i].resize(d_new_potential_centroids.size()); - - new_potential_centroids += d_new_potential_centroids.size(); - - thrust::copy( - d_new_potential_centroids.begin(), - d_new_potential_centroids.end(), - h_potential_centroids_per_gpu[i].begin() - ); - + if (copy_from_gpus.find(i) != copy_from_gpus.end()) { + thrust::copy(h_potential_centroids_per_gpu[i].begin(), + h_potential_centroids_per_gpu[i].end(), + h_potential_centroids.begin() + offset); + offset += h_potential_centroids_per_gpu[i].size(); } - } - log_verbose(verbose, "KMeans|| - New potential centroids %d.", new_potential_centroids); - - // Gather potential cluster centers from all GPUs - if (new_potential_centroids > 0) { - h_potential_centroids.clear(); - 
h_potential_centroids.resize(new_potential_centroids); - - int old_pot_centroids_size = h_all_potential_centroids.size(); - h_all_potential_centroids.resize(old_pot_centroids_size + new_potential_centroids); - - int offset = 0; - for (int i = 0; i < num_gpu; i++) { - if(copy_from_gpus.find(i) != copy_from_gpus.end()) { - thrust::copy( - h_potential_centroids_per_gpu[i].begin(), - h_potential_centroids_per_gpu[i].end(), - h_potential_centroids.begin() + offset - ); - offset += h_potential_centroids_per_gpu[i].size(); - } - } - - thrust::copy( - h_potential_centroids.begin(), - h_potential_centroids.end(), - h_all_potential_centroids.begin() + old_pot_centroids_size - ); - } + thrust::copy(h_potential_centroids.begin(), h_potential_centroids.end(), + h_all_potential_centroids.begin() + old_pot_centroids_size); } + } - double timeloop = static_cast(timer() - t0); + double timeloop = static_cast(timer() - t0); - thrust::host_vector final_centroids(0); - int potential_centroids_num = h_all_potential_centroids.size() / cols; + thrust::host_vector final_centroids(0); + int potential_centroids_num = h_all_potential_centroids.size() / cols; - if (potential_centroids_num <= k) { - final_centroids.resize(k * cols); - thrust::copy( - h_all_potential_centroids.begin(), - h_all_potential_centroids.end(), - final_centroids.begin() - ); - // TODO what if potential_centroids_num < k ?? we don't want 0s - } else { - // If we found more than k potential cluster centers we need to take only a subset - // This is done using a weighted k-means++ method, since the set should be very small - // it should converge very fast and is all done on the CPU. 
- thrust::host_vector weights(potential_centroids_num); - - double tc0 = timer(); - - // Weights correspond to the number of data points assigned to each potential cluster center - count_pts_per_centroid( - verbose, num_gpu, - rows_per_gpu, cols, - data, data_dots, - h_all_potential_centroids, - weights - ); - - double timecount = static_cast(timer() - tc0); - - double tkpp = timer(); - - kmeans_plus_plus( - verbose, - seed, - h_all_potential_centroids, - weights, - k, cols, - final_centroids - ); - - double timekpp = static_cast(timer() - tkpp); - - log_verbose(verbose, "KMeans|| - Time loop: %g Time count: %g Time kpp: %g.", timeloop, timecount, timekpp); - } + if (potential_centroids_num <= k) { + final_centroids.resize(k * cols); + thrust::copy(h_all_potential_centroids.begin(), + h_all_potential_centroids.end(), final_centroids.begin()); + // TODO what if potential_centroids_num < k ?? we don't want 0s + } else { + // If we found more than k potential cluster centers we need to take only a + // subset This is done using a weighted k-means++ method, since the set + // should be very small it should converge very fast and is all done on the + // CPU. + thrust::host_vector weights(potential_centroids_num); - return final_centroids; - } + double tc0 = timer(); - volatile std::atomic_int flaggpu(0); + // Weights correspond to the number of data points assigned to each + // potential cluster center + count_pts_per_centroid(verbose, num_gpu, rows_per_gpu, cols, data, + data_dots, h_all_potential_centroids, weights); - inline void my_function_gpu(int sig) { // can be called asynchronously - fprintf(stderr, "Caught signal %d. 
Terminating shortly.\n", sig); - flaggpu = 1; + double timecount = static_cast(timer() - tc0); + + double tkpp = timer(); + + kmeans_plus_plus(verbose, seed, h_all_potential_centroids, weights, k, cols, + final_centroids); + + double timekpp = static_cast(timer() - tkpp); + + log_verbose(verbose, + "KMeans|| - Time loop: %g Time count: %g Time kpp: %g.", + timeloop, timecount, timekpp); } - std::vector kmeans_init(int verbose, int *final_n_gpu, int n_gputry, int gpu_idtry, int rows) { - if (rows > std::numeric_limits::max()) { - fprintf(stderr, "rows > %d not implemented\n", std::numeric_limits::max()); - fflush(stderr); - exit(0); - } + return final_centroids; +} - std::signal(SIGINT, my_function_gpu); - std::signal(SIGTERM, my_function_gpu); +volatile std::atomic_int flaggpu(0); - // no more gpus than visible gpus - int n_gpuvis; - cudaGetDeviceCount(&n_gpuvis); - int n_gpu = std::min(n_gpuvis, n_gputry); +inline void my_function_gpu(int sig) { // can be called asynchronously + fprintf(stderr, "Caught signal %d. Terminating shortly.\n", sig); + flaggpu = 1; +} - // no more than rows - n_gpu = std::min(n_gpu, rows); +std::vector kmeans_init(int verbose, int *final_n_gpu, int n_gputry, + int gpu_idtry, int rows) { + if (rows > std::numeric_limits::max()) { + fprintf(stderr, "rows > %d not implemented\n", + std::numeric_limits::max()); + fflush(stderr); + exit(0); + } - if (verbose) { - std::cout << n_gpu << " gpus." 
<< std::endl; - } + std::signal(SIGINT, my_function_gpu); + std::signal(SIGTERM, my_function_gpu); - int gpu_id = gpu_idtry % n_gpuvis; + // no more gpus than visible gpus + int n_gpuvis; + cudaGetDeviceCount(&n_gpuvis); + int n_gpu = std::min(n_gpuvis, n_gputry); - // setup GPU list to use - std::vector dList(n_gpu); - for (int idx = 0; idx < n_gpu; idx++) { - int device_idx = (gpu_id + idx) % n_gpuvis; - dList[idx] = device_idx; - } + // no more than rows + n_gpu = std::min(n_gpu, rows); - *final_n_gpu = n_gpu; - return dList; + if (verbose) { + std::cout << n_gpu << " gpus." << std::endl; } - template - H2O4GPUKMeans::H2O4GPUKMeans(const T *A, int k, int n, int d) { - _A = A; - _k = k; - _n = n; - _d = d; + int gpu_id = gpu_idtry % n_gpuvis; + + // setup GPU list to use + std::vector dList(n_gpu); + for (int idx = 0; idx < n_gpu; idx++) { + int device_idx = (gpu_id + idx) % n_gpuvis; + dList[idx] = device_idx; } - template - int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, - size_t rows, size_t cols, const char ord, - int k, int max_iterations, int init_from_data, - T threshold, - const T *srcdata, T **pred_centroids, int **pred_labels) { - // init random seed if use the C function rand() - if (seed >= 0) { - srand(seed); - } else { - srand(unsigned(time(NULL))); - } + *final_n_gpu = n_gpu; + return dList; +} - // no more clusters than rows - if (k > rows) { - k = static_cast(rows); - fprintf(stderr, "Number of clusters adjusted to be equal to number of rows.\n"); - fflush(stderr); - } +template +H2O4GPUKMeans::H2O4GPUKMeans(const T *A, int k, int n, int d) { + _A = A; + _k = k; + _n = n; + _d = d; +} + +template +int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, size_t rows, + size_t cols, const char ord, int k, int max_iterations, + int init_from_data, T threshold, const T *srcdata, + T **pred_centroids, int **pred_labels) { + // init random seed if use the C function rand() + if (seed >= 0) { + srand(seed); + } else { + 
srand(unsigned(time(NULL))); + } + + // no more clusters than rows + if (k > rows) { + k = static_cast(rows); + fprintf(stderr, + "Number of clusters adjusted to be equal to number of rows.\n"); + fflush(stderr); + } - int n_gpu; - std::vector dList = kmeans_init(verbose, &n_gpu, n_gputry, gpu_idtry, rows); + int n_gpu; + // device list + std::vector dList = + kmeans_init(verbose, &n_gpu, n_gputry, gpu_idtry, rows); - double t0t = timer(); - thrust::device_vector *data[n_gpu]; - thrust::device_vector *labels[n_gpu]; - thrust::device_vector *d_centroids[n_gpu]; - thrust::device_vector *data_dots[n_gpu]; + double t0t = timer(); + thrust::device_vector *data[n_gpu]; + thrust::device_vector *labels[n_gpu]; + thrust::device_vector *d_centroids[n_gpu]; + thrust::device_vector *data_dots[n_gpu]; #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - CUDACHECK(cudaSetDevice(dList[q])); - data[q] = new thrust::device_vector(rows / n_gpu * cols); - d_centroids[q] = new thrust::device_vector(k * cols); - data_dots[q] = new thrust::device_vector(rows / n_gpu); + for (int device_idx = 0; device_idx < n_gpu; device_idx++) { + CUDACHECK(cudaSetDevice(dList[device_idx])); + data[device_idx] = new thrust::device_vector(rows / n_gpu * cols); + d_centroids[device_idx] = new thrust::device_vector(k * cols); + data_dots[device_idx] = new thrust::device_vector(rows / n_gpu); - kmeans::detail::labels_init(); - } + kmeans::detail::labels_init(); + } - log_debug(verbose, "Number of points: %d", rows); - log_debug(verbose, "Number of dimensions: %d", cols); - log_debug(verbose, "Number of clusters: %d", k); - log_debug(verbose, "Max. number of iterations: %d", max_iterations); - log_debug(verbose, "Stopping threshold: %d", threshold); + log_debug(verbose, "Number of points: %d", rows); + log_debug(verbose, "Number of dimensions: %d", cols); + log_debug(verbose, "Number of clusters: %d", k); + log_debug(verbose, "Max. 
number of iterations: %d", max_iterations); + log_debug(verbose, "Stopping threshold: %d", threshold); - std::vector v(rows); - std::iota(std::begin(v), std::end(v), 0); // Fill with 0, 1, ..., rows. + std::vector v(rows); + std::iota(std::begin(v), std::end(v), 0); // Fill with 0, 1, ..., rows. - if (seed >= 0) { - std::shuffle(v.begin(), v.end(), std::default_random_engine(seed)); - } else { - std::random_shuffle(v.begin(), v.end()); - } + if (seed >= 0) { + std::shuffle(v.begin(), v.end(), std::default_random_engine(seed)); + } else { + std::random_shuffle(v.begin(), v.end()); + } // Copy the data to devices #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - CUDACHECK(cudaSetDevice(dList[q])); - if (verbose) { std::cout << "Copying data to device: " << dList[q] << std::endl; } + for (int q = 0; q < n_gpu; q++) { + CUDACHECK(cudaSetDevice(dList[q])); + if (verbose) { + std::cout << "Copying data to device: " << dList[q] << std::endl; + } - copy_data(verbose, ord, *data[q], &srcdata[0], q, rows, rows / n_gpu, cols); + copy_data(verbose, ord, *data[q], &srcdata[0], q, rows, rows / n_gpu, cols); - // Pre-compute the data matrix norms - kmeans::detail::make_self_dots(rows / n_gpu, cols, *data[q], *data_dots[q]); - } + // Pre-compute the data matrix norms + kmeans::detail::make_self_dots(rows / n_gpu, cols, *data[q], *data_dots[q]); + } - // Get random points as centroids - int bytecount = cols * k * sizeof(T); // all centroids - if (0 == init_from_data) { - log_debug(verbose, "KMeans - Using random initialization."); + // Get random points as centroids + int bytecount = cols * k * sizeof(T); // all centroids + if (0 == init_from_data) { + log_debug(verbose, "KMeans - Using random initialization."); - int masterq = 0; - CUDACHECK(cudaSetDevice(dList[masterq])); - copy_centroids_shuffled(verbose, v, ord, *d_centroids[masterq], &srcdata[0], rows, k, cols); + int masterq = 0; + CUDACHECK(cudaSetDevice(dList[masterq])); + copy_centroids_shuffled(verbose, v, 
ord, *d_centroids[masterq], &srcdata[0], + rows, k, cols); - // Copy centroids to all devices - std::vector < cudaStream_t * > streams; - streams.resize(n_gpu); + // Copy centroids to all devices + std::vector streams; + streams.resize(n_gpu); #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - if (q == masterq) continue; - - CUDACHECK(cudaSetDevice(dList[q])); - if (verbose > 0) { - std::cout << "Copying centroid data to device: " << dList[q] << std::endl; - } + for (int q = 0; q < n_gpu; q++) { + if (q == masterq) + continue; - streams[q] = reinterpret_cast(malloc(sizeof(cudaStream_t))); - cudaStreamCreate(streams[q]); - cudaMemcpyPeerAsync(thrust::raw_pointer_cast(&(*d_centroids[q])[0]), - dList[q], - thrust::raw_pointer_cast(&(*d_centroids[masterq])[0]), - dList[masterq], - bytecount, - *(streams[q])); - } -//#pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - if (q == masterq) continue; - cudaSetDevice(dList[q]); - cudaStreamDestroy(*(streams[q])); -#if(DEBUGKMEANS) - thrust::host_vector h_centroidq=*d_centroids[q]; - for(int ii=0;ii 0) { + std::cout << "Copying centroid data to device: " << dList[q] + << std::endl; } - } else if (1 == init_from_data) { // kmeans|| - log_debug(verbose, "KMeans - Using K-Means|| initialization."); - thrust::host_vector final_centroids = kmeans_parallel(verbose, seed, ord, data, data_dots, rows, cols, k, n_gpu, threshold); - -#pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - CUDACHECK(cudaSetDevice(dList[q])); - cudaMemcpy( - thrust::raw_pointer_cast(&(*d_centroids[q])[0]), - thrust::raw_pointer_cast(&final_centroids[0]), - bytecount, - cudaMemcpyHostToDevice); + streams[q] = + reinterpret_cast(malloc(sizeof(cudaStream_t))); + cudaStreamCreate(streams[q]); + cudaMemcpyPeerAsync(thrust::raw_pointer_cast(&(*d_centroids[q])[0]), + dList[q], + thrust::raw_pointer_cast(&(*d_centroids[masterq])[0]), + dList[masterq], bytecount, *(streams[q])); + } + //#pragma omp parallel for + for (int q = 0; q < 
n_gpu; q++) { + if (q == masterq) + continue; + cudaSetDevice(dList[q]); + cudaStreamDestroy(*(streams[q])); +#if (DEBUGKMEANS) + thrust::host_vector h_centroidq = *d_centroids[q]; + for (int ii = 0; ii < k * d; ii++) { + fprintf(stderr, "q=%d initcent[%d]=%g\n", q, ii, h_centroidq[ii]); + fflush(stderr); } - +#endif } + } else if (1 == init_from_data) { // kmeans|| + log_debug(verbose, "KMeans - Using K-Means|| initialization."); + + thrust::host_vector final_centroids = kmeans_parallel( + verbose, seed, ord, data, data_dots, rows, cols, k, n_gpu, threshold); #pragma omp parallel for for (int q = 0; q < n_gpu; q++) { CUDACHECK(cudaSetDevice(dList[q])); - labels[q] = new thrust::device_vector(rows / n_gpu); + cudaMemcpy(thrust::raw_pointer_cast(&(*d_centroids[q])[0]), + thrust::raw_pointer_cast(&final_centroids[0]), bytecount, + cudaMemcpyHostToDevice); } + } - double timetransfer = static_cast(timer() - t0t); +#pragma omp parallel for + for (int q = 0; q < n_gpu; q++) { + CUDACHECK(cudaSetDevice(dList[q])); + labels[q] = new thrust::device_vector(rows / n_gpu); + } - double t0 = timer(); + double timetransfer = static_cast(timer() - t0t); - int iter = kmeans::kmeans(verbose, &flaggpu, rows, cols, k, data, labels, d_centroids, data_dots, - dList, n_gpu, max_iterations, threshold, true); + double t0 = timer(); - if (iter < 0) { - log_error(verbose, "KMeans algorithm failed."); - return iter; - } + int iter = kmeans::kmeans(verbose, &flaggpu, rows, cols, k, data, labels, + d_centroids, data_dots, dList, n_gpu, + max_iterations, threshold, true); - double timefit = static_cast(timer() - t0); + if (iter < 0) { + log_error(verbose, "KMeans algorithm failed."); + return iter; + } - double t1 = timer(); + double timefit = static_cast(timer() - t0); + + double t1 = timer(); // copy result of centroids (sitting entirely on each device) back to host // TODO FIXME: When do delete ctr and h_labels memory??? 
thrust::host_vector *ctr = new thrust::host_vector(*d_centroids[0]); *pred_centroids = ctr->data(); - // copy assigned labels - thrust::host_vector *h_labels = new thrust::host_vector(rows); -//#pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - int offset = labels[q]->size()*q; - h_labels->insert(h_labels->begin() + offset, labels[q]->begin(), labels[q]->end()); - } + // copy assigned labels + thrust::host_vector *h_labels = new thrust::host_vector(rows); + //#pragma omp parallel for + for (int q = 0; q < n_gpu; q++) { + int offset = labels[q]->size() * q; + h_labels->insert(h_labels->begin() + offset, labels[q]->begin(), + labels[q]->end()); + } - *pred_labels = h_labels->data(); + *pred_labels = h_labels->data(); - // debug - if (verbose >= H2O4GPU_LOG_VERBOSE) { - for (unsigned int ii = 0; ii < k; ii++) { - fprintf(stderr, "ii=%d of k=%d ", ii, k); - for (unsigned int jj = 0; jj < cols; jj++) { - fprintf(stderr, "%g ", (*pred_centroids)[cols * ii + jj]); - } - fprintf(stderr, "\n"); - fflush(stderr); + // debug + if (verbose >= H2O4GPU_LOG_VERBOSE) { + for (unsigned int ii = 0; ii < k; ii++) { + fprintf(stderr, "ii=%d of k=%d ", ii, k); + for (unsigned int jj = 0; jj < cols; jj++) { + fprintf(stderr, "%g ", (*pred_centroids)[cols * ii + jj]); } + fprintf(stderr, "\n"); + fflush(stderr); } + } #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - CUDACHECK(cudaSetDevice(dList[q])); - delete (data[q]); - delete (labels[q]); - delete (d_centroids[q]); - delete (data_dots[q]); - kmeans::detail::labels_close(); - } + for (int q = 0; q < n_gpu; q++) { + CUDACHECK(cudaSetDevice(dList[q])); + delete (data[q]); + delete (labels[q]); + delete (d_centroids[q]); + delete (data_dots[q]); + kmeans::detail::labels_close(); + } - double timecleanup = static_cast(timer() - t1); + double timecleanup = static_cast(timer() - t1); - if (verbose) { - std::cout << " Time fit: " << timefit << " s" << std::endl; - fprintf(stderr, "Timetransfer: %g Timefit: %g 
Timecleanup: %g\n", timetransfer, timefit, timecleanup); - fflush(stderr); - } - - return 0; + if (verbose) { + std::cout << " Time fit: " << timefit << " s" << std::endl; + fprintf(stderr, "Timetransfer: %g Timefit: %g Timecleanup: %g\n", + timetransfer, timefit, timecleanup); + fflush(stderr); } - template - int kmeans_predict(int verbose, int gpu_idtry, int n_gputry, - size_t rows, size_t cols, - const char ord, int k, - const T *srcdata, const T *centroids, int **pred_labels) { - // Print centroids - if (verbose >= H2O4GPU_LOG_VERBOSE) { - std::cout << std::endl; - for (int i = 0; i < cols * k; i++) { - std::cout << centroids[i] << " "; - if (i % cols == 1) { - std::cout << std::endl; - } + return 0; +} + +template +int kmeans_predict(int verbose, int gpu_idtry, int n_gputry, size_t rows, + size_t cols, const char ord, int k, const T *srcdata, + const T *centroids, int **pred_labels) { + // Print centroids + if (verbose >= H2O4GPU_LOG_VERBOSE) { + std::cout << std::endl; + for (int i = 0; i < cols * k; i++) { + std::cout << centroids[i] << " "; + if (i % cols == 1) { + std::cout << std::endl; } } + } - int n_gpu; - std::vector dList = kmeans_init(verbose, &n_gpu, n_gputry, gpu_idtry, rows); + int n_gpu; + std::vector dList = + kmeans_init(verbose, &n_gpu, n_gputry, gpu_idtry, rows); - thrust::device_vector *d_data[n_gpu]; - thrust::device_vector *d_centroids[n_gpu]; - thrust::device_vector *data_dots[n_gpu]; - thrust::device_vector *centroid_dots[n_gpu]; - thrust::host_vector *h_labels = new thrust::host_vector(0); + thrust::device_vector *d_data[n_gpu]; + thrust::device_vector *d_centroids[n_gpu]; + thrust::device_vector *data_dots[n_gpu]; + thrust::device_vector *centroid_dots[n_gpu]; + thrust::host_vector *h_labels = new thrust::host_vector(0); #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - CUDACHECK(cudaSetDevice(dList[q])); - kmeans::detail::labels_init(); + for (int q = 0; q < n_gpu; q++) { + CUDACHECK(cudaSetDevice(dList[q])); + 
kmeans::detail::labels_init(); - data_dots[q] = new thrust::device_vector(rows / n_gpu); - centroid_dots[q] = new thrust::device_vector(k); + data_dots[q] = new thrust::device_vector(rows / n_gpu); + centroid_dots[q] = new thrust::device_vector(k); - d_centroids[q] = new thrust::device_vector(k * cols); - d_data[q] = new thrust::device_vector(rows / n_gpu * cols); + d_centroids[q] = new thrust::device_vector(k * cols); + d_data[q] = new thrust::device_vector(rows / n_gpu * cols); - copy_data(verbose, 'r', *d_centroids[q], ¢roids[0], 0, k, k, cols); + copy_data(verbose, 'r', *d_centroids[q], ¢roids[0], 0, k, k, cols); - copy_data(verbose, ord, *d_data[q], &srcdata[0], q, rows, rows / n_gpu, cols); + copy_data(verbose, ord, *d_data[q], &srcdata[0], q, rows, rows / n_gpu, + cols); - kmeans::detail::make_self_dots(rows / n_gpu, cols, *d_data[q], *data_dots[q]); + kmeans::detail::make_self_dots(rows / n_gpu, cols, *d_data[q], + *data_dots[q]); - thrust::device_vector d_labels(rows / n_gpu); + thrust::device_vector d_labels(rows / n_gpu); - kmeans::detail::batch_calculate_distances(verbose, q, rows / n_gpu, cols, k, - *d_data[q], *d_centroids[q], *data_dots[q], *centroid_dots[q], - [&](int n, size_t offset, thrust::device_vector &pairwise_distances) { - kmeans::detail::relabel(n, k, pairwise_distances, d_labels, offset); - } - ); + kmeans::detail::batch_calculate_distances( + verbose, q, rows / n_gpu, cols, k, *d_data[q], *d_centroids[q], + *data_dots[q], *centroid_dots[q], + [&](int n, size_t offset, + thrust::device_vector &pairwise_distances) { + kmeans::detail::relabel(n, k, pairwise_distances, d_labels, offset); + }); - h_labels->insert(h_labels->end(), d_labels.begin(), d_labels.end()); - } + h_labels->insert(h_labels->end(), d_labels.begin(), d_labels.end()); + } - *pred_labels = h_labels->data(); + *pred_labels = h_labels->data(); #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - safe_cuda(cudaSetDevice(dList[q])); - kmeans::detail::labels_close(); - 
delete (data_dots[q]); - delete (centroid_dots[q]); - delete (d_centroids[q]); - delete (d_data[q]); - } - - return 0; + for (int q = 0; q < n_gpu; q++) { + safe_cuda(cudaSetDevice(dList[q])); + kmeans::detail::labels_close(); + delete (data_dots[q]); + delete (centroid_dots[q]); + delete (d_centroids[q]); + delete (d_data[q]); } - template - int kmeans_transform(int verbose, - int gpu_idtry, int n_gputry, - size_t rows, size_t cols, const char ord, int k, - const T *srcdata, const T *centroids, - T **preds) { - // Print centroids - if (verbose >= H2O4GPU_LOG_VERBOSE) { - std::cout << std::endl; - for (int i = 0; i < cols * k; i++) { - std::cout << centroids[i] << " "; - if (i % cols == 1) { - std::cout << std::endl; - } + return 0; +} + +template +int kmeans_transform(int verbose, int gpu_idtry, int n_gputry, size_t rows, + size_t cols, const char ord, int k, const T *srcdata, + const T *centroids, T **preds) { + // Print centroids + if (verbose >= H2O4GPU_LOG_VERBOSE) { + std::cout << std::endl; + for (int i = 0; i < cols * k; i++) { + std::cout << centroids[i] << " "; + if (i % cols == 1) { + std::cout << std::endl; } } + } - int n_gpu; - std::vector dList = kmeans_init(verbose, &n_gpu, n_gputry, gpu_idtry, rows); + int n_gpu; + std::vector dList = + kmeans_init(verbose, &n_gpu, n_gputry, gpu_idtry, rows); - thrust::device_vector *d_data[n_gpu]; - thrust::device_vector *d_centroids[n_gpu]; - thrust::device_vector *d_pairwise_distances[n_gpu]; - thrust::device_vector *data_dots[n_gpu]; - thrust::device_vector *centroid_dots[n_gpu]; + thrust::device_vector *d_data[n_gpu]; + thrust::device_vector *d_centroids[n_gpu]; + thrust::device_vector *d_pairwise_distances[n_gpu]; + thrust::device_vector *data_dots[n_gpu]; + thrust::device_vector *centroid_dots[n_gpu]; #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - CUDACHECK(cudaSetDevice(dList[q])); - kmeans::detail::labels_init(); + for (int q = 0; q < n_gpu; q++) { + CUDACHECK(cudaSetDevice(dList[q])); + 
kmeans::detail::labels_init(); - data_dots[q] = new thrust::device_vector(rows / n_gpu); - centroid_dots[q] = new thrust::device_vector(k); - d_pairwise_distances[q] = new thrust::device_vector(rows / n_gpu * k); + data_dots[q] = new thrust::device_vector(rows / n_gpu); + centroid_dots[q] = new thrust::device_vector(k); + d_pairwise_distances[q] = new thrust::device_vector(rows / n_gpu * k); - d_centroids[q] = new thrust::device_vector(k * cols); - d_data[q] = new thrust::device_vector(rows / n_gpu * cols); + d_centroids[q] = new thrust::device_vector(k * cols); + d_data[q] = new thrust::device_vector(rows / n_gpu * cols); - copy_data(verbose, 'r', *d_centroids[q], ¢roids[0], 0, k, k, cols); + copy_data(verbose, 'r', *d_centroids[q], ¢roids[0], 0, k, k, cols); - copy_data(verbose, ord, *d_data[q], &srcdata[0], q, rows, rows / n_gpu, cols); + copy_data(verbose, ord, *d_data[q], &srcdata[0], q, rows, rows / n_gpu, + cols); - kmeans::detail::make_self_dots(rows / n_gpu, cols, *d_data[q], *data_dots[q]); + kmeans::detail::make_self_dots(rows / n_gpu, cols, *d_data[q], + *data_dots[q]); - // TODO batch this - kmeans::detail::calculate_distances(verbose, q, rows / n_gpu, cols, k, - *d_data[q], 0, *d_centroids[q], *data_dots[q], - *centroid_dots[q], *d_pairwise_distances[q]); - } + // TODO batch this + kmeans::detail::calculate_distances( + verbose, q, rows / n_gpu, cols, k, *d_data[q], 0, *d_centroids[q], + *data_dots[q], *centroid_dots[q], *d_pairwise_distances[q]); + } - // Move the resulting labels into host memory from all devices - thrust::host_vector *h_pairwise_distances = new thrust::host_vector(0); + // Move the resulting labels into host memory from all devices + thrust::host_vector *h_pairwise_distances = new thrust::host_vector(0); #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - h_pairwise_distances->insert(h_pairwise_distances->end(), - d_pairwise_distances[q]->begin(), - d_pairwise_distances[q]->end()); - } - *preds = 
h_pairwise_distances->data(); - - // Print centroids - if (verbose >= H2O4GPU_LOG_VERBOSE) { - std::cout << std::endl; - for (int i = 0; i < rows * cols; i++) { - std::cout << h_pairwise_distances->data()[i] << " "; - if (i % cols == 1) { - std::cout << std::endl; - } + for (int q = 0; q < n_gpu; q++) { + h_pairwise_distances->insert(h_pairwise_distances->end(), + d_pairwise_distances[q]->begin(), + d_pairwise_distances[q]->end()); + } + *preds = h_pairwise_distances->data(); + + // Print centroids + if (verbose >= H2O4GPU_LOG_VERBOSE) { + std::cout << std::endl; + for (int i = 0; i < rows * cols; i++) { + std::cout << h_pairwise_distances->data()[i] << " "; + if (i % cols == 1) { + std::cout << std::endl; } } + } #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - safe_cuda(cudaSetDevice(dList[q])); - kmeans::detail::labels_close(); - delete (d_pairwise_distances[q]); - delete (data_dots[q]); - delete (centroid_dots[q]); - delete (d_centroids[q]); - delete (d_data[q]); - } - - return 0; + for (int q = 0; q < n_gpu; q++) { + safe_cuda(cudaSetDevice(dList[q])); + kmeans::detail::labels_close(); + delete (d_pairwise_distances[q]); + delete (data_dots[q]); + delete (centroid_dots[q]); + delete (d_centroids[q]); + delete (d_data[q]); } - template - int makePtr_dense(int dopredict, int verbose, int seed, int gpu_idtry, int n_gputry, size_t rows, size_t cols, - const char ord, int k, int max_iterations, int init_from_data, - T threshold, const T *srcdata, const T *centroids, - T **pred_centroids, int **pred_labels) { - if (dopredict == 0) { - return kmeans_fit(verbose, seed, gpu_idtry, n_gputry, rows, cols, - ord, k, max_iterations, init_from_data, threshold, - srcdata, pred_centroids, pred_labels); - } else { - return kmeans_predict(verbose, gpu_idtry, n_gputry, rows, cols, - ord, k, - srcdata, centroids, pred_labels); - } + return 0; +} + +template +int makePtr_dense(int dopredict, int verbose, int seed, int gpu_idtry, + int n_gputry, size_t rows, size_t 
cols, const char ord, int k, + int max_iterations, int init_from_data, T threshold, + const T *srcdata, const T *centroids, T **pred_centroids, + int **pred_labels) { + if (dopredict == 0) { + return kmeans_fit(verbose, seed, gpu_idtry, n_gputry, rows, cols, ord, k, + max_iterations, init_from_data, threshold, srcdata, + pred_centroids, pred_labels); + } else { + return kmeans_predict(verbose, gpu_idtry, n_gputry, rows, cols, ord, k, + srcdata, centroids, pred_labels); } +} - template int - makePtr_dense(int dopredict, int verbose, int seed, int gpu_id, int n_gpu, size_t rows, size_t cols, - const char ord, int k, int max_iterations, int init_from_data, - float threshold, const float *srcdata, - const float *centroids, float **pred_centroids, int **pred_labels); - - template int - makePtr_dense(int dopredict, int verbose, int seed, int gpu_id, int n_gpu, size_t rows, size_t cols, - const char ord, int k, int max_iterations, int init_from_data, - double threshold, const double *srcdata, - const double *centroids, double **pred_centroids, int **pred_labels); - - template int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, - size_t rows, size_t cols, - const char ord, int k, int max_iterations, - int init_from_data, float threshold, - const float *srcdata, - float **pred_centroids, int **pred_labels); - - template int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, - size_t rows, size_t cols, - const char ord, int k, int max_iterations, - int init_from_data, double threshold, - const double *srcdata, - double **pred_centroids, int **pred_labels); - - template int kmeans_predict(int verbose, int gpu_idtry, int n_gputry, - size_t rows, size_t cols, - const char ord, int k, - const float *srcdata, const float *centroids, int **pred_labels); - - template int kmeans_predict(int verbose, int gpu_idtry, int n_gputry, - size_t rows, size_t cols, - const char ord, int k, - const double *srcdata, const double *centroids, int **pred_labels); - - 
template int kmeans_transform(int verbose, - int gpu_id, int n_gpu, - size_t m, size_t n, const char ord, int k, - const float *src_data, const float *centroids, - float **preds); - - template int kmeans_transform(int verbose, - int gpu_id, int n_gpu, - size_t m, size_t n, const char ord, int k, - const double *src_data, const double *centroids, - double **preds); - - // Explicit template instantiation. +template int makePtr_dense(int dopredict, int verbose, int seed, + int gpu_id, int n_gpu, size_t rows, + size_t cols, const char ord, int k, + int max_iterations, int init_from_data, + float threshold, const float *srcdata, + const float *centroids, + float **pred_centroids, int **pred_labels); + +template int makePtr_dense(int dopredict, int verbose, int seed, + int gpu_id, int n_gpu, size_t rows, + size_t cols, const char ord, int k, + int max_iterations, int init_from_data, + double threshold, const double *srcdata, + const double *centroids, + double **pred_centroids, int **pred_labels); + +template int kmeans_fit(int verbose, int seed, int gpu_idtry, + int n_gputry, size_t rows, size_t cols, + const char ord, int k, int max_iterations, + int init_from_data, float threshold, + const float *srcdata, float **pred_centroids, + int **pred_labels); + +template int kmeans_fit(int verbose, int seed, int gpu_idtry, + int n_gputry, size_t rows, size_t cols, + const char ord, int k, int max_iterations, + int init_from_data, double threshold, + const double *srcdata, double **pred_centroids, + int **pred_labels); + +template int kmeans_predict(int verbose, int gpu_idtry, int n_gputry, + size_t rows, size_t cols, const char ord, + int k, const float *srcdata, + const float *centroids, int **pred_labels); + +template int kmeans_predict(int verbose, int gpu_idtry, int n_gputry, + size_t rows, size_t cols, const char ord, + int k, const double *srcdata, + const double *centroids, int **pred_labels); + +template int kmeans_transform(int verbose, int gpu_id, int n_gpu, + size_t 
m, size_t n, const char ord, int k, + const float *src_data, + const float *centroids, float **preds); + +template int kmeans_transform(int verbose, int gpu_id, int n_gpu, + size_t m, size_t n, const char ord, int k, + const double *src_data, + const double *centroids, double **preds); + +// Explicit template instantiation. #if !defined(H2O4GPU_DOUBLE) || H2O4GPU_DOUBLE == 1 - template - class H2O4GPUKMeans; +template class H2O4GPUKMeans; #endif #if !defined(H2O4GPU_SINGLE) || H2O4GPU_SINGLE == 1 - template - class H2O4GPUKMeans; +template class H2O4GPUKMeans; #endif -} // namespace h2o4gpukmeans +} // namespace h2o4gpukmeans - /* - * Interface for other languages - */ +/* + * Interface for other languages + */ - // Fit and Predict - int make_ptr_float_kmeans(int dopredict, int verbose, int seed, int gpu_id, int n_gpu, size_t mTrain, size_t n, - const char ord, int k, int max_iterations, int init_from_data, - float threshold, const float *srcdata, - const float *centroids, float **pred_centroids, int **pred_labels) { - return h2o4gpukmeans::makePtr_dense(dopredict, verbose, seed, gpu_id, n_gpu, mTrain, n, ord, k, - max_iterations, init_from_data, threshold, - srcdata, centroids, pred_centroids, pred_labels); - } +// Fit and Predict +int make_ptr_float_kmeans(int dopredict, int verbose, int seed, int gpu_id, + int n_gpu, size_t mTrain, size_t n, const char ord, + int k, int max_iterations, int init_from_data, + float threshold, const float *srcdata, + const float *centroids, float **pred_centroids, + int **pred_labels) { + return h2o4gpukmeans::makePtr_dense( + dopredict, verbose, seed, gpu_id, n_gpu, mTrain, n, ord, k, + max_iterations, init_from_data, threshold, srcdata, centroids, + pred_centroids, pred_labels); +} - int make_ptr_double_kmeans(int dopredict, int verbose, int seed, int gpu_id, int n_gpu, size_t mTrain, size_t n, - const char ord, int k, int max_iterations, int init_from_data, - double threshold, const double *srcdata, - const double *centroids, 
double **pred_centroids, int **pred_labels) { - return h2o4gpukmeans::makePtr_dense(dopredict, verbose, seed, gpu_id, n_gpu, mTrain, n, ord, k, - max_iterations, init_from_data, threshold, - srcdata, centroids, pred_centroids, pred_labels); - } +int make_ptr_double_kmeans(int dopredict, int verbose, int seed, int gpu_id, + int n_gpu, size_t mTrain, size_t n, const char ord, + int k, int max_iterations, int init_from_data, + double threshold, const double *srcdata, + const double *centroids, double **pred_centroids, + int **pred_labels) { + return h2o4gpukmeans::makePtr_dense( + dopredict, verbose, seed, gpu_id, n_gpu, mTrain, n, ord, k, + max_iterations, init_from_data, threshold, srcdata, centroids, + pred_centroids, pred_labels); +} - // Transform - int kmeans_transform_float(int verbose, - int gpu_id, int n_gpu, - size_t m, size_t n, const char ord, int k, - const float *src_data, const float *centroids, - float **preds) { - return h2o4gpukmeans::kmeans_transform(verbose, gpu_id, n_gpu, m, n, ord, k, src_data, centroids, preds); - } +// Transform +int kmeans_transform_float(int verbose, int gpu_id, int n_gpu, size_t m, + size_t n, const char ord, int k, + const float *src_data, const float *centroids, + float **preds) { + return h2o4gpukmeans::kmeans_transform( + verbose, gpu_id, n_gpu, m, n, ord, k, src_data, centroids, preds); +} - int kmeans_transform_double(int verbose, - int gpu_id, int n_gpu, - size_t m, size_t n, const char ord, int k, - const double *src_data, const double *centroids, - double **preds) { - return h2o4gpukmeans::kmeans_transform(verbose, gpu_id, n_gpu, m, n, ord, k, src_data, centroids, preds); - } +int kmeans_transform_double(int verbose, int gpu_id, int n_gpu, size_t m, + size_t n, const char ord, int k, + const double *src_data, const double *centroids, + double **preds) { + return h2o4gpukmeans::kmeans_transform( + verbose, gpu_id, n_gpu, m, n, ord, k, src_data, centroids, preds); +} diff --git a/src/gpu/kmeans/kmeans_init.cu 
b/src/gpu/kmeans/kmeans_init.cu new file mode 100644 index 000000000..584a7ec72 --- /dev/null +++ b/src/gpu/kmeans/kmeans_init.cu @@ -0,0 +1,136 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#include +#include +#include + +#include + +#include + +#include "kmeans_general.h" +#include "array.cuh" +#include "kmeans_h2o4gpu.h" + +#include "kmeans_init.cuh" + +namespace H2O4GPU { +namespace KMeans { + +using namespace Array; + +// K-Means|| implementation +template +__device__ float vector_dot(T lhs_start, T lhs_end, T rhs_start) { + float result = 0; + for (T lhs_iter = lhs_start, rhs_iter = rhs_start; + lhs_iter != lhs_end; + ++lhs_iter, ++rhs_iter) { + result += (*lhs_iter) * (*rhs_iter); + } + return result; +} + +template +__global__ void min_distance(T* __restrict__ result, + T* __restrict__ data, size_t stride, + T* __restrict__ cendroids, size_t n_centroids) { + for (size_t i = 0; i < n_centroids; ++i) { + result[i] = + vector_dot(data, data+stride, data); + // vector_dot(data, data+stride, &cendroids[i*stride]) + + // vector_dot(&cendroids[i*stride], &cendroids[(i+1)*stride], &cendroids[i*stride]); + } + T minimum = std::numeric_limits::max(); + for (size_t i = 0; i < n_centroids; ++i) { + if (result[i] < minimum) { + minimum = result[i]; + } + } +} + + +template +CUDAArray KmeansLlInit::sample_centroids(CUDAArray& data, CUDAArray& prob) { + size_t n_new_centroids = thrust::count_if( + data.device_vector().begin(), data.device_vector().end(), + [=] __device__ (int idx) { + thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution<> dist(0.0, 1.0); + rng.discard(idx); + T threshold = (T)dist(rng); + // T prob = prob[i]; + T prob = 0.1f; + return prob > threshold; + }); + + CUDAArray centroids (n_new_centroids); + thrust::copy_if(data.device_vector().begin(), data.device_vector().end(), + centroids.device_vector().begin(), + [=] __device__ (int idx) { + 
thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution<> dist(0.0, 1.0); + // rng.discard(row + device * rows_per_gpu); + T prob_threshold = (T)dist(rng); + + // T prob_x = ((2.0 * k * min_costs_ptr[row]) / total_min_cost); + T prob_x = 0.1f; + + return prob_x > prob_threshold; + }); + return centroids; +} + +template +CUDAArray KmeansLlInit::operator()(CUDAArray& data) { + if (seed < 0) { + std::random_device rd; + seed = rd(); + } + + std::mt19937 generator(seed); + std::uniform_int_distribution<> distribution(0, data.dims()[0] - 1); + size_t idx = distribution(generator); + CUDAArray centroids = data.index(idx); + + CUDAArray distances (Dims(data.dims()[0], 1, 0, 0)); + + distances.print(); + + min_distance<<<256, data.size() / 256>>>( + distances.get(), data.get(), data.dims()[1], + centroids.get(), 1); + + cudaDeviceSynchronize(); + + // T potential = * min_element(distances.begin(), distances.end()); + T potential = 1.0f; + // for (size_t i = 0; i < log(potential); ++i) { + // min_distance(distances.device_ptr(), + // data.device_ptr(), data.stride(), + // centroids.device_ptr(), centroids.size()); + // T potential = * thrust::min_element(distances.begin(), distances.end()); + // T potential = 1.0f; + // CUDAArray prob = div(distances, potential); + + // CUDAArray new_centroids = sample_centroids(data, prob); + // thrust::copy(new_centroids.begin(), new_centroids.end(), centroids.begin()); + // } + + // re-cluster + // kmeans_plus_plus(centroids); +} + +#define INSTANTIATE(T) \ + template CUDAArray KmeansLlInit::operator()(CUDAArray& data); \ + template CUDAArray KmeansLlInit::sample_centroids( \ + CUDAArray& data, CUDAArray& prob); + +INSTANTIATE(float) +INSTANTIATE(double) + +} // namespace Kmeans +} // namespace H2O4GPU diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh new file mode 100644 index 000000000..14eadc9bc --- /dev/null +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -0,0 +1,35 @@ +/*! 
+ * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#include "array.cuh" + +namespace H2O4GPU{ +namespace KMeans { + +template +class KmeansInitBase { + public: + virtual ~KmeansInitBase() {} + virtual Array::CUDAArray operator()(Array::CUDAArray& data) = 0; +}; + +template +struct KmeansLlInit : public KmeansInitBase { + private: + double over_sample; + int seed; + + public: + KmeansLlInit () : over_sample (2.0), seed (0) {} + virtual ~KmeansLlInit () override {} + + Array::CUDAArray sample_centroids(Array::CUDAArray& data, + Array::CUDAArray& prob); + + Array::CUDAArray operator()(Array::CUDAArray& data) override; +}; + +} // namespace Kmeans +} // namespace H2O4GPU diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu new file mode 100644 index 000000000..46cfe61c7 --- /dev/null +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -0,0 +1,49 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#include + +#include "../../../../src/gpu/kmeans/array.cuh" +#include "../../../../src/gpu/kmeans/kmeans_init.cuh" + +#include +#include + +TEST(KmeansLL, KmeansLLInit) { + + int k = 2; + + H2O4GPU::Array::Dims dims {4, 4, 0, 0}; + H2O4GPU::KMeans::KmeansLlInit kmeans_ll_init; + + thrust::host_vector _h_data (16); + + for (size_t i = 0; i < 4; ++i) { + _h_data[i] = i; + } + for (size_t i = 4; i < 8; ++i) { + _h_data[i] = i - 2; + } + + for (size_t i = 8; i < 12; ++i) { + _h_data[i] = i; + } + for (size_t i = 12; i < 16; ++i) { + _h_data[i] = i + 2; + } + + thrust::device_vector _d_data; + _d_data = _h_data; + + H2O4GPU::Array::CUDAArray data (_d_data, dims); + + kmeans_ll_init (data); + + std::cout << "Host" << std::endl; + for (size_t i = 0; i < 16; ++i) { + std::cout << _h_data[i] << ','; + } + std::cout << std::endl; +} \ No newline at end of file From 1cdb79b1d7ee03f8afc8b09064e6078ea360d9d0 Mon Sep 17 00:00:00 2001 
From: fis Date: Mon, 16 Jul 2018 09:32:48 +0800 Subject: [PATCH 05/49] Basic for KmMatrix and KMeans. Using Eigen could make KMeans work. But when it comes to expanding the cluster, the memory copy is expensive. Also, it's hard to launch separated kernel without memory copy with Eigen. So, proceed with a home brew matrix. --- CMakeLists.txt | 3 +- src/gpu/kmeans/KmMatrix/KmMatrix.cpp | 229 ++++++++++++++++++ src/gpu/kmeans/KmMatrix/KmMatrix.hpp | 191 +++++++++++++++ src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu | 140 +++++++++++ src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh | 60 +++++ src/gpu/kmeans/array.cu | 139 ----------- src/gpu/kmeans/array.cuh | 107 --------- src/gpu/kmeans/kmeans_init.cu | 293 +++++++++++++++++------ src/gpu/kmeans/kmeans_init.cuh | 139 ++++++++++- tests/cpp/gpu/kmeans/test_kmeans_init.cu | 45 ++-- 10 files changed, 999 insertions(+), 347 deletions(-) create mode 100644 src/gpu/kmeans/KmMatrix/KmMatrix.cpp create mode 100644 src/gpu/kmeans/KmMatrix/KmMatrix.hpp create mode 100644 src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu create mode 100644 src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh delete mode 100644 src/gpu/kmeans/array.cu delete mode 100644 src/gpu/kmeans/array.cuh diff --git a/CMakeLists.txt b/CMakeLists.txt index a140f8d04..96e5b8a62 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,9 +93,10 @@ if(USE_CUDA) SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -std=c++11;--expt-extended-lambda;--expt-relaxed-constexpr;${GENCODE_FLAGS};-lineinfo; -w;") FILE(GLOB_RECURSE GPU_SOURCES - src/*.cu.cc src/*.cu src/*.cuh + src/gpu/kmeans/KmMatrix/*.cpp + src/gpu/kmeans/KmMatrix/*.hpp src/common/*.cpp src/common/*.h) diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp new file mode 100644 index 000000000..058994926 --- /dev/null +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp @@ -0,0 +1,229 @@ +/*! + * Copyright 2018 H2O.ai, Inc. 
+ * License Apache License Version 2.0 (see LICENSE for details) + */ + +#include "KmMatrix.hpp" +#if defined (USE_CUDA) +#include "KmMatrixCuda.cuh" +#endif + +namespace H2O4GPU { +namespace KMeans { + +// ============================== +// KmMatrixImpl implementation +// ============================== + +template +KmMatrixImpl::KmMatrixImpl(KmMatrix *_matrix) + : matrix_(_matrix){} + + +// ============================== +// KmMatrix implementation +// ============================== + +template +KmMatrix::KmMatrix() : + param_ (0, 0, nullptr) { + init_impls(); +#if defined (USE_CUDA) + use_cuda = true; + impls[0].reset(new CudaKmMatrixImpl(this)); +#elif + use_cuda = false; + impls[0] = nullptr; +#endif +} + +template +KmMatrix::KmMatrix(size_t _rows, size_t _cols) : + param_ (_rows, _cols, nullptr) { + init_impls(); +#if defined (USE_CUDA) + use_cuda = true; + impls[0].reset(new CudaKmMatrixImpl(this)); +#elif + use_cuda = false; +#endif +} + +template +KmMatrix::KmMatrix(thrust::host_vector _other, + size_t _rows, size_t _cols) : + param_ (_rows, _cols, nullptr) { + init_impls(); +#if defined (USE_CUDA) + use_cuda = true; + impls[0].reset(new CudaKmMatrixImpl(_other, this)); +#elif + use_cuda = false; +#endif +} + +template +KmMatrix::KmMatrix(const KmMatrix& _other) : + param_(_other.param_) { + for (size_t i = 0; i < 4; ++i) { + impls[i] = _other.impls[i]; + } + use_cuda = _other.use_cuda; + name_ = _other.name_ + "(copied)"; +} + +template +KmMatrix::KmMatrix(KmMatrix&& _other) : + param_(_other.param_){ + for (size_t i = 0; i < 4; ++i) { + impls[i] = std::move(_other.impls[i]); + } + use_cuda = _other.use_cuda; + name_ = std::move(_other.name_); +} + +template +void KmMatrix::operator=(const KmMatrix& _other) { + for (size_t i = 0; i < 4; ++i) { + impls[i] = _other.impls[i]; + } + param_ = _other.param_; + use_cuda = _other.use_cuda; + name_ = _other.name_ + "(copied)"; +} + +template +void KmMatrix::operator=(KmMatrix&& _other) { + for (size_t i = 0; i < 4; 
++i) { + impls[i] = std::move(_other.impls[i]); + } + param_ = _other.param_; + use_cuda = _other.use_cuda; + name_ = std::move(_other.name_); +} + +template +KmMatrix::KmMatrix(const KmMatrixProxy& _other) : + param_ (_other.param()){ + init_impls(); +#if defined (USE_CUDA) + use_cuda = true; + impls[0].reset(new CudaKmMatrixImpl(_other, this)); +#elif + use_cuda = false; +#endif +} + +template +void KmMatrix::init_impls() { + for (size_t i = 0; i < 4; ++i) { + impls[i] = nullptr; + } +} + +template +KmMatrix::~KmMatrix() { + // std::cout << "name: " << name_ << std::endl; + // for (size_t i = 0; i < 4; ++i) { + // if (impls[i] != nullptr) + // delete impls[i]; + // } +} + + +template +size_t KmMatrix::size() const { + return param_.rows * param_.cols; +} + +template +size_t KmMatrix::rows() const { + return param_.rows; +} + +template +size_t KmMatrix::cols() const { + return param_.cols; +} + +template +kParam KmMatrix::k_param () const { + return param_; +} + +template +T* KmMatrix::host_ptr() { + if (use_cuda) { + return impls[0]->host_ptr(); + } else { + // FIXME + return nullptr; + } +} + +template +T* KmMatrix::dev_ptr() { + if (use_cuda) { + return impls[CUDADense]->dev_ptr(); + } else { + return nullptr; + } +} + +template +KmMatrixProxy KmMatrix::row(size_t idx, bool dev_mem) { + size_t start = param_.cols * idx; + size_t stride = 1; + size_t end = param_.cols * (idx + 1); + + if (impls[0] != nullptr) { + if (dev_mem) { + std::cout << "row dev_mem" << std::endl; + return KmMatrixProxy(thrust::device_ptr(impls[0]->dev_ptr()), + start, end, stride, param_, true); + } else { + std::cout << "row host_mem" << std::endl; + return KmMatrixProxy(thrust::device_ptr(impls[0]->host_ptr()), + start, end, stride, param_, false); + } + } + std::cerr << "no cuda" << std::endl; + // FIXME + assert(false); + // return KmMatrixProxy(thrust::device_ptr(NULL), 0, 0, 0, param_, false); +} + +template +KmMatrixProxy KmMatrix::col(size_t idx) { + // FIXME + assert (false); + 
return KmMatrixProxy(nullptr, 0, 0, 0, param_, false); +} + +#define INSTANTIATE(T) \ + template KmMatrixImpl::KmMatrixImpl(KmMatrix *_matrix); \ + template KmMatrix::KmMatrix(); \ + template KmMatrix::KmMatrix(size_t _rows, size_t _cols); \ + template KmMatrix::KmMatrix(thrust::host_vector _other, \ + size_t _rows, size_t _cols); \ + template KmMatrix::KmMatrix(const KmMatrix& _other); \ + template KmMatrix::KmMatrix(KmMatrix&& _other); \ + template void KmMatrix::operator=(const KmMatrix& _other); \ + template void KmMatrix::operator=(KmMatrix&& _other); \ + template KmMatrix::KmMatrix(const KmMatrixProxy& _other); \ + template KmMatrix::~KmMatrix(); \ + template size_t KmMatrix::size() const; \ + template size_t KmMatrix::rows() const; \ + template size_t KmMatrix::cols() const; \ + template kParam KmMatrix::k_param () const; \ + template T * KmMatrix::host_ptr(); \ + template T * KmMatrix::dev_ptr(); \ + template KmMatrixProxy KmMatrix::row(size_t idx, bool dev_mem=true); + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) + +#undef INSTANTIATE +} +} // H2O4GPU diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp new file mode 100644 index 000000000..5b34fad4b --- /dev/null +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp @@ -0,0 +1,191 @@ +/*! + * Copyright 2018 H2O.ai, Inc. 
+ * License Apache License Version 2.0 (see LICENSE for details) + */ + +#ifndef KM_MATRIX_HPP_ +#define KM_MATRIX_HPP_ + +#include +#include +#include +#include +#include + +// FIXME +#define USE_CUDA 1 + +#if defined (USE_CUDA) +#include "KmMatrixCuda.cuh" +#endif + +namespace H2O4GPU { +namespace KMeans { + +template +class KmMatrixProxy; + +template +class KmMatrix; + +// Kernel parameter +template +struct kParam { + size_t rows; + size_t cols; + T *ptr; + + kParam(size_t _rows, size_t _cols, T *_ptr) + : rows (_rows), cols(_cols), ptr (_ptr) {} + kParam(const kParam& _other) { + rows = _other.rows; + cols = _other.cols; + ptr = _other.ptr; + } + kParam operator=(const kParam& _other) { + rows = _other.rows; + cols = _other.cols; + ptr = _other.ptr; + } +}; + +template +class KmMatrixImpl { + private: + KmMatrix * matrix_; + public: + KmMatrixImpl(KmMatrix *_matrix); + virtual ~KmMatrixImpl () {} + + virtual T* host_ptr() {} + virtual T* dev_ptr() {} + virtual bool on_device() const {} +}; + +template +class KmMatrix { + private: + + enum Backend { + CUDADense = 0, + CUDASparse = 1, + CPUDense = 2, + CPUSparse = 3 + }; + + std::shared_ptr> impls[4]; + kParam param_; + + bool use_cuda; + + void init_impls(); + + std::string name_; + + public: + explicit KmMatrix(); + KmMatrix(size_t _rows, size_t _cols); + KmMatrix(thrust::host_vector _other, size_t _rows, size_t _cols); + KmMatrix(const KmMatrixProxy& _other); + + KmMatrix(const KmMatrix& _other); + KmMatrix(KmMatrix&& _other); + + void operator=(const KmMatrix& _other); + void operator=(KmMatrix&& _other); + + virtual ~KmMatrix(); + + size_t size () const; + size_t rows () const; + size_t cols () const; + + T* host_ptr(); + T* dev_ptr(); + + kParam k_param () const; + + std::string name() const { return name_; } + void set_name (std::string _name) {name_ = _name;} + + KmMatrixProxy row(size_t idx, bool dev_mem=true); + KmMatrixProxy col(size_t idx); +}; + +template +std::ostream& operator<<(std::ostream& os, 
KmMatrix& m) { + std::cout << "matrix: " << m.name() << std::endl; + T * ptr = m.host_ptr(); + kParam param = m.k_param(); + for (size_t i = 0; i < param.rows; ++i) { + for (size_t j = 0; j < param.cols; ++j) { + std::cout << "(" << i << ", "<< j << ", " << i*param.cols + j << ")" << std::setw(6) << ptr[i*param.cols + j] << ' '; + } + std::cout << std::endl; + } + std::cout << "---" << std::endl; +} + +template +class KmMatrixProxy { + private: + thrust::device_ptr ptr_; + size_t start_; + size_t end_; + size_t stride_; + + bool on_device_; + + kParam param_; + + public: + size_t start() const { + return start_; + } + size_t end() const { + return end_; + } + size_t stride() const { + return stride_; + } + size_t size() const { + return (end_ - start_) / stride_; + } + bool on_device() const { + return on_device_; + } + thrust::device_ptr data() const { + return ptr_ + start_; + } + kParam param() const { + return param_; + } + + KmMatrixProxy(thrust::device_ptr _ptr, + size_t _start, size_t _end, size_t _stride, + kParam _param, bool _on_device) + : ptr_(_ptr), start_(_start), end_(_end), stride_(_stride), + param_(_param), on_device_(_on_device) {} + + void operator=(KmMatrix& _other) { + assert(size() == _other.size); + + assert (_other.size() == size()); + // FIXME + assert(stride_ == 1); + + if (on_device_) { + auto _other_dev_ptr = thrust::device_ptr(_other.dev_ptr()); + + thrust::copy(_other_dev_ptr, _other_dev_ptr + size(), ptr_); + } else { + thrust::copy(_other.host_ptr(), _other.host_ptr() + size(), + ptr_ + start_); + } + } +}; + +} // namespace KMeans +} // namespace H2O4GPU + +#endif diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu new file mode 100644 index 000000000..b941bcda6 --- /dev/null +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu @@ -0,0 +1,140 @@ +/*! + * Copyright 2018 H2O.ai, Inc. 
+ * License Apache License Version 2.0 (see LICENSE for details) + */ + +#include +#include + +#include + +#include "KmMatrix.hpp" +#include "KmMatrixCuda.cuh" + +namespace H2O4GPU { +namespace KMeans { + +template +CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par) : + KmMatrixImpl(_par){} + +template +CudaKmMatrixImpl::CudaKmMatrixImpl(const thrust::host_vector& _h_vec, + KmMatrix* _par) + : _on_device(false), KmMatrixImpl(_par) { + _h_vector.resize(_h_vec.size()); + thrust::copy(_h_vec.begin(), _h_vec.end(), _h_vector.begin()); + std::cout << "Copy host vector: " << _h_vector.size() << std::endl; + for (size_t i = 0; i < _h_vector.size(); ++i) { + std::cout << _h_vector[i] << ' '; + } + std::cout << std::endl; +} + +template +CudaKmMatrixImpl::CudaKmMatrixImpl(const KmMatrixProxy& _other, + KmMatrix* _par) + : KmMatrixImpl(_par) { + thrust::device_ptr ptr = _other.data(); + if (_other.on_device()) { + std::cout << "proxy to dev" << std::endl; + if (_other.stride() == 1) { + _d_vector.resize(_other.size()); + thrust::copy(ptr, ptr + _other.size(), _d_vector.begin()); + std::cout << "copied" << std::endl; + for (size_t i = 0; i < _d_vector.size(); ++i) { + std::cout << _d_vector[i] << ' '; + } + std::cout << std::endl; + } else { + // FIXME + assert(false); + } + _on_device = true; + } else { + if (_other.stride() == 1) { + _h_vector.resize(_other.size()); + std::cout << "_other.size(): " << _other.size() << std::endl; + for (size_t i = 0; i < _other.size(); ++i) { + std::cout << ptr[i] << ' '; + } + std::cout << std::endl; + thrust::copy(ptr, ptr + _other.size(), _h_vector.begin()); + + } else { + // FIXME + assert(false); + } + _on_device = false; + } +} + +template +CudaKmMatrixImpl::~CudaKmMatrixImpl() {} + +template +T* CudaKmMatrixImpl::host_ptr() { + device_to_host(); + std::cout << "host ptr: " << _h_vector.size() << std::endl; + for (size_t i = 0; i < 4; ++i) { + for (size_t j = 0; j < 4; ++j) { + std::cout << std::setw(6) << _h_vector[i*4+j] << ' '; + } 
+ std::cout << std::endl; + } + std::cout << std::endl; + return _h_vector.data(); +} + +template +T* CudaKmMatrixImpl::dev_ptr() { + host_to_device(); + T* ptr = thrust::raw_pointer_cast(_d_vector.data()); + return ptr; +} + +template +void CudaKmMatrixImpl::host_to_device() { + if (_on_device) + return; + _h_vector.resize(_d_vector.size()); + thrust::copy(_h_vector.begin(), _h_vector.end(), _d_vector.begin()); + _on_device = true; +} + +template +void CudaKmMatrixImpl::device_to_host() { + if (!_on_device) + return; + std::cout << "bring back to host" << std::endl; + std::cout << "_d_.size()" << _d_vector.size() << std::endl; + _h_vector.resize(_d_vector.size()); + thrust::copy(_d_vector.begin(), _d_vector.end(), _h_vector.begin()); + _on_device = false; +} + +template +bool CudaKmMatrixImpl::on_device() const { + return _on_device; +} + +#define INSTANTIATE(T) \ + template bool CudaKmMatrixImpl::on_device() const; \ + template void CudaKmMatrixImpl::device_to_host(); \ + template void CudaKmMatrixImpl::host_to_device(); \ + template T* CudaKmMatrixImpl::dev_ptr(); \ + template T* CudaKmMatrixImpl::host_ptr(); \ + template CudaKmMatrixImpl::~CudaKmMatrixImpl(); \ + template CudaKmMatrixImpl::CudaKmMatrixImpl( \ + const KmMatrixProxy& _other, KmMatrix* _par); \ + template CudaKmMatrixImpl::CudaKmMatrixImpl( \ + const thrust::host_vector& _h_vec, KmMatrix* _par); \ + template CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par); \ + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) + +#undef INSTANTIATE +} // namespace H204GPU +} // namespace Array diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh new file mode 100644 index 000000000..c636dad40 --- /dev/null +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh @@ -0,0 +1,60 @@ +#ifndef KM_MATRIX_CUDA_CUH_ +#define KM_MATRIX_CUDA_CUH_ + +#include "KmMatrix.hpp" + +namespace H2O4GPU { +namespace KMeans { + +template +class KmMatrix; + +template +class KmMatrixImpl; + 
+template +class KmMatrixProxy; + +struct CudaInfo { + int n_devices; + int * _devices; + + CudaInfo (int _n_devices) + : n_devices(_n_devices) { + _devices = new int[_n_devices]; + } + ~CudaInfo () { + delete [] _devices; + } +}; + +template +class CudaKmMatrixImpl : public KmMatrixImpl { + private: + thrust::device_vector _d_vector; + thrust::host_vector _h_vector; + + bool _on_device; + KmMatrix* _matrix; + + void host_to_device(); + void device_to_host(); + + public: + CudaKmMatrixImpl(KmMatrix * _par); + CudaKmMatrixImpl(const thrust::host_vector& _h_vec, + KmMatrix* _par); + CudaKmMatrixImpl(const KmMatrixProxy& _other, + KmMatrix* _par); + + virtual ~CudaKmMatrixImpl(); + + bool on_device() const override; + + virtual T* host_ptr() override; + virtual T* dev_ptr() override; +}; + +} // MkMatrix +} // H204GPU +#endif \ No newline at end of file diff --git a/src/gpu/kmeans/array.cu b/src/gpu/kmeans/array.cu deleted file mode 100644 index 1296ae45d..000000000 --- a/src/gpu/kmeans/array.cu +++ /dev/null @@ -1,139 +0,0 @@ -/*! - * Copyright 2018 H2O.ai, Inc. 
- * License Apache License Version 2.0 (see LICENSE for details) - */ - -#include -#include - -#include "array.cuh" -#include "kmeans_general.h" - -namespace H2O4GPU { -namespace Array { - -template -CUDAArray::CUDAArray() { - CUBLAS_CHECK(cublasCreate(&blas_handle)); -} - -template -CUDAArray::CUDAArray(size_t _size) { - this->_d_vector.resize(_size); - CUBLAS_CHECK(cublasCreate(&blas_handle)); -} - -template -CUDAArray::CUDAArray(Dims _other) { - _dims = _other; - _d_vector.resize(_dims[0] * _dims[1]); - CUBLAS_CHECK(cublasCreate(&blas_handle)); -} - -template -CUDAArray::CUDAArray(const thrust::device_vector& _d_vec, - const Dims _dims) { - this->_d_vector = _d_vec; - this->_dims = _dims; - CUBLAS_CHECK(cublasCreate(&blas_handle)); -} - -template -CUDAArray::~CUDAArray() { - // if (blas_handle != NULL) - // CUBLAS_CHECK(cublasDestroy(blas_handle)); -} - -template -void CUDAArray::operator=(const CUDAArray& _other) { - _dims = _other._dims; - _d_vector = _other._d_vector; - - for (size_t i = 0; i < _d_vector.size(); ++i) { - std::cout << _d_vector[i] << ' '; - } - std::cout << std::endl; -} - -template -void CUDAArray::print() const { - std::cout << "Array: ["; - for (size_t i = 0; i < 4; ++i) { - std::cout << _dims[i] << ", "; - } - std::cout << "\b\b]" << std::endl; - for (size_t i = 0; i < _dims[0]; ++i) { - for (size_t j = 0; j < _dims[1]; ++j) { - std::cout << _d_vector[i*_dims[0]+j] << ' '; - } - std::cout << std::endl; - } - std::cout << std::endl; -} - -// return 1 row -template -CUDAArray CUDAArray::index(size_t _idx) { - - Dims new_dim (1, _dims[1], 0, 0); - CUDAArray result (new_dim); - thrust::device_vector _row (_dims[1]); - - thrust::copy(_d_vector.begin() + _idx * _dims[1], - _d_vector.begin() + (_idx+1) * _dims[1], - result._d_vector.begin()); - return result; -} - -template -T* CUDAArray::get() { - return _d_vector.data().get(); -} - -template -thrust::device_vector& CUDAArray::device_vector() { - return _d_vector; -} - -template -size_t 
CUDAArray::stride() { - return _stride; -} - -template -size_t CUDAArray::size () const { - return _h_vector.size(); -} - -template -size_t CUDAArray::n_gpu() const { - return _n_gpu; -} - -template -Dims CUDAArray::dims() const { - return _dims; -} - - -#define INSTANTIATE(T) \ - template CUDAArray::CUDAArray(); \ - template CUDAArray::CUDAArray(size_t _size); \ - template CUDAArray::CUDAArray(const thrust::device_vector& _d_vec, \ - const Dims _dims); \ - template CUDAArray::~CUDAArray(); \ - template void CUDAArray::operator=(const CUDAArray& _other); \ - template void CUDAArray::print() const; \ - template CUDAArray CUDAArray::index(size_t dim0); \ - template T * CUDAArray::get(); \ - template thrust::device_vector& CUDAArray::device_vector(); \ - template size_t CUDAArray::stride(); \ - template size_t CUDAArray::size () const; \ - template size_t CUDAArray::n_gpu() const; \ - template Dims CUDAArray::dims() const; \ - -INSTANTIATE(float) -INSTANTIATE(double) - -} // namespace H204GPU -} // namespace Array diff --git a/src/gpu/kmeans/array.cuh b/src/gpu/kmeans/array.cuh deleted file mode 100644 index 5cf069bf8..000000000 --- a/src/gpu/kmeans/array.cuh +++ /dev/null @@ -1,107 +0,0 @@ -/*! - * Copyright 2018 H2O.ai, Inc. 
- * License Apache License Version 2.0 (see LICENSE for details) - */ - -#ifndef CUDA_ARRAY_H_ -#define CUDA_ARRAY_H_ - -#include -#include - -namespace H2O4GPU { -namespace Array { - -constexpr float esp = 1e-16f; - -struct Dims { - size_t dims[4]; - Dims() { - for (size_t i = 0; i < 4; ++i) { - dims[i] = 0; - } - } - Dims(size_t _dims[4]) { - for (size_t i = 0; i < 4; ++i) { - dims[i] = _dims[i]; - } - } - Dims (size_t d0, size_t d1, size_t d2, size_t d3) { - dims[0] = d0; - dims[1] = d1; - dims[2] = d2; - dims[3] = d3; - } - size_t operator[](size_t _idx) const { - return dims[_idx]; - } - void operator=(const Dims& _other) { - for (size_t i = 0; i < 4; ++i) { - dims[i] = _other.dims[i]; - } - } -}; - -template -class CUDAArray { - private: - thrust::host_vector _h_vector; - thrust::device_vector _d_vector; - - Dims _dims; - - bool _is_synced; - - size_t _stride; - - size_t _n_gpu; - - cublasHandle_t blas_handle; - - public: - CUDAArray(); - CUDAArray(size_t _size); - CUDAArray(Dims _dims); - CUDAArray(const thrust::device_vector& _d_vec, const Dims _dims); - - virtual ~CUDAArray(); - - void operator=(const CUDAArray& _other); - - void print() const; - - CUDAArray index(size_t dim0); - - thrust::device_vector& device_vector(); - - size_t stride(); - - size_t size () const; - - size_t n_gpu() const; - - T* get(); - - Dims dims () const; -}; - - -template -CUDAArray div(CUDAArray _lhs, T _rhs) { - if (_rhs < esp) { - throw std::runtime_error("Value under flow"); - } - // cublasScal(blas_handle, lhs); -} - -template -CUDAArray min_element(CUDAArray& _value) { - T result = thrust::min_element - (_value._d_vector.begin(), _value._d_vector.end()); - return result; -} - -} // namespace Array -} // namespace H2O4GPU - -#endif diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 584a7ec72..4535726d1 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -7,130 +7,267 @@ #include #include -#include +#define 
EIGNE_USE_GPU +#include "Eigen/Dense" #include #include "kmeans_general.h" -#include "array.cuh" #include "kmeans_h2o4gpu.h" #include "kmeans_init.cuh" +#include "KmMatrix/KmMatrix.hpp" namespace H2O4GPU { namespace KMeans { -using namespace Array; - -// K-Means|| implementation template -__device__ float vector_dot(T lhs_start, T lhs_end, T rhs_start) { - float result = 0; - for (T lhs_iter = lhs_start, rhs_iter = rhs_start; - lhs_iter != lhs_end; - ++lhs_iter, ++rhs_iter) { - result += (*lhs_iter) * (*rhs_iter); +__device__ __forceinline__ +T min_distance(VE_T(T) *x, MA_T(T) *centroids) { + + KmShardMem shared; + T * _distances = shared.ptr(); + + size_t n_rows = centroids->rows(); + for (size_t i = 0; i < centroids->rows(); ++i) { + auto temp = *x - centroids->row(i); + _distances[i] = temp.dot(temp); } + + __syncthreads(); + + Eigen::Map _distances_vec(_distances, n_rows, 1); + T result = _distances_vec.minCoeff(); return result; } template -__global__ void min_distance(T* __restrict__ result, - T* __restrict__ data, size_t stride, - T* __restrict__ cendroids, size_t n_centroids) { - for (size_t i = 0; i < n_centroids; ++i) { - result[i] = - vector_dot(data, data+stride, data); - // vector_dot(data, data+stride, &cendroids[i*stride]) + - // vector_dot(&cendroids[i*stride], &cendroids[(i+1)*stride], &cendroids[i*stride]); - } - T minimum = std::numeric_limits::max(); - for (size_t i = 0; i < n_centroids; ++i) { - if (result[i] < minimum) { - minimum = result[i]; - } +__global__ +void potential_kernel(kVParam _dis, kMParam _data, kMParam _cent) { + + MA_T(T) data = Eigen::Map(_data.ptr, _data.rows, _data.cols); + MA_T(T) centroids = Eigen::Map(_cent.ptr, _cent.rows, _cent.cols); + + Eigen::Map distances(_dis.ptr, _dis.size); + + size_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + if (tid < _dis.size) { + distances(tid) = min_distance(&( (VE_T(T)) data.row(tid)), + ¢roids); + printf("distance[%u] %f\n", tid, distances(tid)); } } +template +T 
KmeansLlInit::potential(MA_T(T)& data, MA_T(T)& centroids) { + + VE_T(T) distances (data.rows()); + + T* d_distances, * d_data, *d_centroids; + + CUDACHECK(cudaMalloc((void**)&d_distances, sizeof(T) * distances.size())); + CUDACHECK(cudaMalloc((void**)&d_data, sizeof(T) * data.size())); + CUDACHECK(cudaMalloc((void**)&d_centroids, sizeof(T) * centroids.size())); + + CUDACHECK(cudaMemcpy(d_distances, (void*)distances.data(), + sizeof(T) * distances.size(), + cudaMemcpyHostToDevice)); + CUDACHECK(cudaMemcpy(d_data, (void*)data.data(), + sizeof(T) * data.size(), + cudaMemcpyHostToDevice)); + CUDACHECK(cudaMemcpy(d_centroids, (void*)centroids.data(), + sizeof(T) * centroids.size(), + cudaMemcpyHostToDevice)); + + potential_kernel<<<256, div_roundup(data.rows(), 256), + sizeof(T)*centroids.rows()>>>( + kVParam(d_distances, distances.size()), + kMParam(d_data, data.rows(), data.cols()), + kMParam(d_centroids, centroids.rows(), centroids.cols())); + + CUDACHECK(cudaDeviceSynchronize()); + + thrust::device_ptr distances_vec (d_distances); + + T * temp = new T[distances.size()]; + CUDACHECK(cudaMemcpy(temp, d_distances, sizeof(T)*distances.size(), cudaMemcpyDeviceToHost)); + + T res = thrust::reduce(distances_vec, distances_vec + distances.size(), (T)0, + thrust::plus()); + + CUDACHECK(cudaFree(d_distances)); + CUDACHECK(cudaFree(d_data)); + + CUDACHECK(cudaFree(d_centroids)); + + CUDACHECK(cudaGetLastError()); + + return res; +} + +template +T KmeansLlInit::probability(MA_T(T)& data, MA_T(T)& controids) { + +} + +template +struct InplaceMulOp { + T a; + InplaceMulOp(T _a) : a(_a) {} + + __host__ __device__ + void operator()(T x) { + // *x = *x * a; + } +}; template -CUDAArray KmeansLlInit::sample_centroids(CUDAArray& data, CUDAArray& prob) { - size_t n_new_centroids = thrust::count_if( - data.device_vector().begin(), data.device_vector().end(), - [=] __device__ (int idx) { - thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution<> dist(0.0, 1.0); - 
rng.discard(idx); - T threshold = (T)dist(rng); - // T prob = prob[i]; - T prob = 0.1f; - return prob > threshold; - }); - - CUDAArray centroids (n_new_centroids); - thrust::copy_if(data.device_vector().begin(), data.device_vector().end(), - centroids.device_vector().begin(), - [=] __device__ (int idx) { - thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution<> dist(0.0, 1.0); - // rng.discard(row + device * rows_per_gpu); - T prob_threshold = (T)dist(rng); - - // T prob_x = ((2.0 * k * min_costs_ptr[row]) / total_min_cost); - T prob_x = 0.1f; - - return prob_x > prob_threshold; - }); +MA_T(T) KmeansLlInit::sample_centroids(MA_T(T)& data, MA_T(T)& centroids) { + VE_T(T) distances (data.rows()); + + T* d_distances, * d_data, *d_centroids; + + CUDACHECK(cudaMalloc((void**)&d_distances, sizeof(T) * distances.size())); + CUDACHECK(cudaMalloc((void**)&d_data, sizeof(T) * data.size())); + CUDACHECK(cudaMalloc((void**)&d_centroids, sizeof(T) * centroids.size())); + + CUDACHECK(cudaMemcpy(d_distances, (void*)distances.data(), + sizeof(T) * distances.size(), + cudaMemcpyHostToDevice)); + CUDACHECK(cudaMemcpy(d_data, (void*)data.data(), + sizeof(T) * data.size(), + cudaMemcpyHostToDevice)); + CUDACHECK(cudaMemcpy(d_centroids, (void*)centroids.data(), + sizeof(T) * centroids.size(), + cudaMemcpyHostToDevice)); + + potential_kernel<<<256, div_roundup(data.rows(), 256), + sizeof(T)*centroids.rows()>>>( + kVParam(d_distances, distances.size()), + kMParam(d_data, data.rows(), data.cols()), + kMParam(d_centroids, centroids.rows(), centroids.cols())); + + CUDACHECK(cudaDeviceSynchronize()); + + thrust::device_ptr distances_vec (d_distances); + + // T * temp = new T[distances.size()]; + // CUDACHECK(cudaMemcpy(temp, d_distances, sizeof(T)*distances.size(), cudaMemcpyDeviceToHost)); + + T pot = thrust::reduce(distances_vec, distances_vec + distances.size(), (T)0, + thrust::plus()); + + thrust::device_ptr& prob_vec = distances_vec; + thrust::for_each(prob_vec, 
prob_vec + distances.size(), InplaceMulOp(1/pot)); + + CUDACHECK(cudaDeviceSynchronize()); + + size_t _cols = data.cols(); + size_t _rows = data.rows(); + + std::cout << "distances.size()" << distances.size() << std::endl; + auto pot_cent_filter_counter = thrust::make_counting_iterator(0); + size_t n_new_centroids = + thrust::count_if(pot_cent_filter_counter, pot_cent_filter_counter + distances.size()-1, + [=] __device__(int idx) { + thrust::default_random_engine rng(0); + thrust::uniform_real_distribution<> dist(0.0, 1.0); + rng.discard(idx + _rows); + T threshold = (T) dist (rng); + printf("count:thresh[%u]: %f\n", idx, threshold); + // T prob = d_distances[idx] / pot; + T prob = 0.5; + printf("count:prob[%u]: %f\n", idx, prob); + return prob > threshold; }); + + std::cout << "n_new_centroids:" << n_new_centroids << std::endl; + thrust::device_vector d_new_centroids (n_new_centroids * data.cols()); + auto range = thrust::make_counting_iterator(0); + thrust::device_ptr d_data_vec (d_data); + + // thrust::copy_if( + // d_data_vec, d_data_vec + data.size(), + // range, + // d_new_centroids.begin(), + // [=] __device__ (int idx) { + // size_t row = idx / _cols; + // thrust::default_random_engine rng(seed); + // thrust::uniform_real_distribution<> dist(0.0f, 1.0f); + // rng.discard(row); + // T threshold = (T) dist (rng); + // printf("copy:thresh[%u]: %f", row, threshold); + // T prob = d_distances[row]; + // return prob > threshold;}); + + thrust::host_vector h_new_centroids (n_new_centroids); + thrust::copy(d_new_centroids.begin(), d_new_centroids.end(), + h_new_centroids.begin()); + + size_t old_rows = centroids.rows(); + centroids.conservativeResize(data.rows() + n_new_centroids, Eigen::NoChange); + + for (size_t i = 0; i < n_new_centroids; i ++) { + centroids.row(i+old_rows) = Eigen::Map (h_new_centroids.data(), 1, data.cols()); + } + + CUDACHECK(cudaFree(d_distances)); + CUDACHECK(cudaFree(d_data)); + + CUDACHECK(cudaFree(d_centroids)); + + 
CUDACHECK(cudaGetLastError()); + return centroids; } template -CUDAArray KmeansLlInit::operator()(CUDAArray& data) { +KmMatrix +KmeansLlInit::operator()(H2O4GPU::KMeans::KmMatrix& data) { + if (seed < 0) { std::random_device rd; seed = rd(); } - std::mt19937 generator(seed); - std::uniform_int_distribution<> distribution(0, data.dims()[0] - 1); - size_t idx = distribution(generator); - CUDAArray centroids = data.index(idx); + std::mt19937 generator(0); + thrust::host_vector vec (4); - CUDAArray distances (Dims(data.dims()[0], 1, 0, 0)); - - distances.print(); + std::uniform_int_distribution<> distribution(0, data.rows()); + size_t idx = distribution(generator); - min_distance<<<256, data.size() / 256>>>( - distances.get(), data.get(), data.dims()[1], - centroids.get(), 1); + KmMatrix centroids = data.row(idx); + std::cout << "centroids" << std::endl; + std::cout << centroids << std::endl; - cudaDeviceSynchronize(); + // MA_T(T) centroids = data.row(idx); - // T potential = * min_element(distances.begin(), distances.end()); - T potential = 1.0f; - // for (size_t i = 0; i < log(potential); ++i) { - // min_distance(distances.device_ptr(), - // data.device_ptr(), data.stride(), - // centroids.device_ptr(), centroids.size()); - // T potential = * thrust::min_element(distances.begin(), distances.end()); - // T potential = 1.0f; - // CUDAArray prob = div(distances, potential); + // std::cout << "data\n" << data << std::endl; + // T pot = potential(data, centroids); + // std::cout << "pot: " << pot << std::endl; - // CUDAArray new_centroids = sample_centroids(data, prob); - // thrust::copy(new_centroids.begin(), new_centroids.end(), centroids.begin()); + // for (size_t i = 0; i < std::log(pot); ++i) { + // sample_centroids(data, centroids); + // std::cout << "new centroids" << std::endl; + // std::cout << centroids << std::endl; // } // re-cluster // kmeans_plus_plus(centroids); + return data; } #define INSTANTIATE(T) \ - template CUDAArray 
KmeansLlInit::operator()(CUDAArray& data); \ - template CUDAArray KmeansLlInit::sample_centroids( \ - CUDAArray& data, CUDAArray& prob); + template KmMatrix KmeansLlInit::operator()( \ + KmMatrix& data); \ + template MA_T(T) KmeansLlInit::sample_centroids( \ + MA_T(T)& data, MA_T(T)& centroids); \ + template T KmeansLlInit::probability(MA_T(T)& data, MA_T(T)& controids); \ + INSTANTIATE(float) INSTANTIATE(double) +INSTANTIATE(int) } // namespace Kmeans } // namespace H2O4GPU diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index 14eadc9bc..608df8140 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -3,16 +3,137 @@ * License Apache License Version 2.0 (see LICENSE for details) */ -#include "array.cuh" +#include "Eigen/Dense" +#include "KmMatrix/KmMatrix.hpp" +// #include namespace H2O4GPU{ namespace KMeans { +// Wrappers for Eigen matrix and vector +template +struct EiMatrix; + +template <> +struct EiMatrix { + using type = Eigen::MatrixXf; +}; +template <> +struct EiMatrix { + using type = Eigen::MatrixXd; +}; +template <> +struct EiMatrix { + using type = Eigen::MatrixXi; +}; + +template +struct EiVector; +template <> +struct EiVector { + using type = Eigen::VectorXf; +}; +template <> +struct EiVector { + using type = Eigen::VectorXd; +}; +template <> +struct EiVector { + using type = Eigen::VectorXi; +}; + +// Work around for shared memory +// https://stackoverflow.com/questions/20497209/getting-cuda-error-declaration-is-incompatible-with-previous-variable-name +template +struct KmShardMem; + +template <> +struct KmShardMem { + __device__ float * ptr() { + extern __shared__ __align__(sizeof(float)) float s_float[]; + return s_float; + } +}; + +template <> +struct KmShardMem { + __device__ double * ptr() { + extern __shared__ __align__(sizeof(double)) double s_double[]; + return s_double; + } +}; + +template <> +struct KmShardMem { + __device__ int * ptr() { + extern __shared__ __align__(sizeof(int)) 
int s_int[]; + return s_int; + } +}; + +#define MA_T(T) \ + typename EiMatrix::type +#define VE_T(T) \ + typename EiVector::type + +template +struct kMParam { + T* ptr; + size_t rows; + size_t cols; + + kMParam(T* _ptr, size_t _rows, size_t _cols) : + ptr (_ptr), rows (_rows), cols (_cols) {} + kMParam(size_t _rows, size_t _cols): + rows (_rows), cols (_cols) {} + kMParam(size_t _cols) : cols (_cols) {} +}; + +template +struct kVParam { + T* ptr; + size_t size; + kVParam(T* _ptr, size_t _size) : ptr(_ptr), size(_size) {} +}; + +// template +// struct HostDeviceVector { +// private: +// kMParam param; +// // thrust::device_vector _d_vector; +// std::vector* _h_vector; + +// public: +// HostDeviceVector (const std::vector& _h_vec, size_t _cols) : +// param(_cols) { +// _h_vector = new std::vector(_h_vec); +// } +// HostDeviceVector (const std::vector& _h_vec, +// size_t _rows, size_t _cols) : +// param(_rows, _cols) { +// _h_vector = new std::vector(_h_vec); +// } +// ~HostDeviceVector() { delete _h_vector; } +// // HostDeviceVector (size_t _cols) : +// // param.rows {1}, param.cols (_cols) { +// // _d_vector.resize(_cols); +// // } +// size_t rows() { return param.rows; } +// size_t cols() { return param.cols; } +// size_t size() { return param.rows * param.cols; } + +// // kMParam kParam() { +// // param.ptr = _d_vector.data().get(); +// // return param; +// // } +// }; + + template class KmeansInitBase { public: virtual ~KmeansInitBase() {} - virtual Array::CUDAArray operator()(Array::CUDAArray& data) = 0; + virtual KmMatrix operator()(KmMatrix& data) = 0; }; template @@ -21,15 +142,23 @@ struct KmeansLlInit : public KmeansInitBase { double over_sample; int seed; + T potential(MA_T(T)& data, MA_T(T)& centroids); + T probability(MA_T(T)& data, MA_T(T)& controids); + public: KmeansLlInit () : over_sample (2.0), seed (0) {} virtual ~KmeansLlInit () override {} - Array::CUDAArray sample_centroids(Array::CUDAArray& data, - Array::CUDAArray& prob); + MA_T(T) 
sample_centroids(MA_T(T)& data, MA_T(T)& centroids); - Array::CUDAArray operator()(Array::CUDAArray& data) override; + // MA_T(T) operator()(MA_T(T)&) override; + KmMatrix operator()(KmMatrix& data) override; }; +template +T1 div_roundup(const T1 a, const T2 b) { + return static_cast(ceil(static_cast(a) / b)); +} + } // namespace Kmeans } // namespace H2O4GPU diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu index 46cfe61c7..3dfb1bf38 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -5,45 +5,56 @@ #include -#include "../../../../src/gpu/kmeans/array.cuh" +// #include "../../../../src/gpu/kmeans/Eigen/Dense" +#include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp" #include "../../../../src/gpu/kmeans/kmeans_init.cuh" #include -#include +#include TEST(KmeansLL, KmeansLLInit) { int k = 2; - H2O4GPU::Array::Dims dims {4, 4, 0, 0}; H2O4GPU::KMeans::KmeansLlInit kmeans_ll_init; - thrust::host_vector _h_data (16); + thrust::host_vector _h_data (16); for (size_t i = 0; i < 4; ++i) { - _h_data[i] = i; + _h_data[i] = double(i); } for (size_t i = 4; i < 8; ++i) { - _h_data[i] = i - 2; + _h_data[i] = double(i - 2); } for (size_t i = 8; i < 12; ++i) { - _h_data[i] = i; + _h_data[i] = double(i); } for (size_t i = 12; i < 16; ++i) { - _h_data[i] = i + 2; + _h_data[i] = double(i + 2); } - thrust::device_vector _d_data; - _d_data = _h_data; + H2O4GPU::KMeans::KmMatrix h_data (_h_data, 4, 4); + h_data.set_name ("h_data"); + std::cout << h_data << std::endl; - H2O4GPU::Array::CUDAArray data (_d_data, dims); + // H2O4GPU::KMeans::HostDeviceVector d_data (_h_data, 4, 4); - kmeans_ll_init (data); + // Eigen::MatrixXd _h_data (4, 4); - std::cout << "Host" << std::endl; - for (size_t i = 0; i < 16; ++i) { - std::cout << _h_data[i] << ','; - } - std::cout << std::endl; + // for (size_t i = 0; i < 4; ++i) { + // _h_data(0, i) = double(i); + // } + // for (size_t i = 4; i < 8; ++i) { + 
// _h_data(1, i-4) = double(i - 2); + // } + + // for (size_t i = 8; i < 12; ++i) { + // _h_data(2, i-8) = double(i); + // } + // for (size_t i = 12; i < 16; ++i) { + // _h_data(3, i-12) = double(i + 2); + // } + + auto result = kmeans_ll_init (h_data); } \ No newline at end of file From 42e034169292eaddeb0fd076eb0c7d54df8a2bd5 Mon Sep 17 00:00:00 2001 From: fis Date: Mon, 16 Jul 2018 22:51:59 +0800 Subject: [PATCH 06/49] [KmMatrix] Split up proxy, add tests, and fixes. * Split proxy implementation into cpp file to save some compile time. * Fix some memory error. * Add two tests. --- src/gpu/kmeans/KmMatrix/KmMatrix.cpp | 113 +++++++++++++++------- src/gpu/kmeans/KmMatrix/KmMatrix.hpp | 89 ++++++----------- src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu | 106 ++++++++++---------- src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh | 18 ++-- src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp | 63 ++++++++++++ tests/cpp/gpu/KmMatrix/test_proxy.cu | 50 ++++++++++ 6 files changed, 284 insertions(+), 155 deletions(-) create mode 100644 src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp create mode 100644 tests/cpp/gpu/KmMatrix/test_proxy.cu diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp index 058994926..e5bbc7141 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp @@ -30,7 +30,8 @@ KmMatrix::KmMatrix() : init_impls(); #if defined (USE_CUDA) use_cuda = true; - impls[0].reset(new CudaKmMatrixImpl(this)); + KmMatrixImpl * ptr = new CudaKmMatrixImpl(this); + impls[0].reset(ptr); #elif use_cuda = false; impls[0] = nullptr; @@ -43,20 +44,36 @@ KmMatrix::KmMatrix(size_t _rows, size_t _cols) : init_impls(); #if defined (USE_CUDA) use_cuda = true; - impls[0].reset(new CudaKmMatrixImpl(this)); + KmMatrixImpl * ptr = new CudaKmMatrixImpl(this); + impls[0].reset(ptr); #elif use_cuda = false; #endif } template -KmMatrix::KmMatrix(thrust::host_vector _other, +KmMatrix::KmMatrix(thrust::host_vector _vec, size_t _rows, size_t _cols) : 
param_ (_rows, _cols, nullptr) { init_impls(); #if defined (USE_CUDA) use_cuda = true; - impls[0].reset(new CudaKmMatrixImpl(_other, this)); + KmMatrixImpl * ptr = new CudaKmMatrixImpl(_vec, this); + impls[0].reset(ptr); +#elif + use_cuda = false; +#endif +} + +template +KmMatrix::KmMatrix(const KmMatrixProxy& _other) : + param_ (_other.param_){ + init_impls(); +#if defined (USE_CUDA) + use_cuda = true; + KmMatrixImpl * ptr = new CudaKmMatrixImpl( + _other.orgi_, _other.start(), _other.size(), _other.stride(), this); + impls[0].reset(ptr); #elif use_cuda = false; #endif @@ -79,7 +96,7 @@ KmMatrix::KmMatrix(KmMatrix&& _other) : impls[i] = std::move(_other.impls[i]); } use_cuda = _other.use_cuda; - name_ = std::move(_other.name_); + name_ = _other.name_ + "(copied [in move])"; } template @@ -99,19 +116,7 @@ void KmMatrix::operator=(KmMatrix&& _other) { } param_ = _other.param_; use_cuda = _other.use_cuda; - name_ = std::move(_other.name_); -} - -template -KmMatrix::KmMatrix(const KmMatrixProxy& _other) : - param_ (_other.param()){ - init_impls(); -#if defined (USE_CUDA) - use_cuda = true; - impls[0].reset(new CudaKmMatrixImpl(_other, this)); -#elif - use_cuda = false; -#endif + name_ = _other.name_ + "(copied [in move])"; } template @@ -170,37 +175,68 @@ T* KmMatrix::dev_ptr() { } } +template +bool KmMatrix::on_device() const { + if (use_cuda) { + return impls[CUDADense]->on_device(); + } else { + return false; + } +} + template KmMatrixProxy KmMatrix::row(size_t idx, bool dev_mem) { size_t start = param_.cols * idx; size_t stride = 1; size_t end = param_.cols * (idx + 1); - - if (impls[0] != nullptr) { - if (dev_mem) { - std::cout << "row dev_mem" << std::endl; - return KmMatrixProxy(thrust::device_ptr(impls[0]->dev_ptr()), - start, end, stride, param_, true); - } else { - std::cout << "row host_mem" << std::endl; - return KmMatrixProxy(thrust::device_ptr(impls[0]->host_ptr()), - start, end, stride, param_, false); - } - } - std::cerr << "no cuda" << std::endl; - // 
FIXME - assert(false); - // return KmMatrixProxy(thrust::device_ptr(NULL), 0, 0, 0, param_, false); + kParam param(1, param_.cols, nullptr); + return KmMatrixProxy(*this, start, end, stride, param); } template KmMatrixProxy KmMatrix::col(size_t idx) { // FIXME assert (false); - return KmMatrixProxy(nullptr, 0, 0, 0, param_, false); + return KmMatrixProxy(*this, 0, 0, 0); +} + +template +bool KmMatrix::operator==(const KmMatrix &_rhs) { + if (_rhs.use_cuda && use_cuda) { + std::shared_ptr> tmp = + std::dynamic_pointer_cast>(impls[CUDADense]); + bool res = std::dynamic_pointer_cast>( + impls[CUDADense])->equal(tmp); + // return std::dynamic_pointer_cast>(impls[CUDADense])->equal( + // _rhs.impls[CUDADense]); + return res; + } else { + // FIXME + assert(false); + return false; + } +} + +// ============================== +// Helper functions +// ============================== +template +std::ostream& operator<<(std::ostream& os, KmMatrix& m) { + std::cout << "matrix: " << m.name() << std::endl << "---" << std::endl; + T * ptr = m.host_ptr(); + kParam param = m.k_param(); + for (size_t i = 0; i < param.rows; ++i) { + for (size_t j = 0; j < param.cols; ++j) { + std::cout << std::setw(5) << ptr[i*param.cols + j] << ','; + } + std::cout << std::endl; + } + std::cout << "---" << std::endl; + return os; } #define INSTANTIATE(T) \ + /* Standard con(de)structors*/ \ template KmMatrixImpl::KmMatrixImpl(KmMatrix *_matrix); \ template KmMatrix::KmMatrix(); \ template KmMatrix::KmMatrix(size_t _rows, size_t _cols); \ @@ -212,13 +248,18 @@ KmMatrixProxy KmMatrix::col(size_t idx) { template void KmMatrix::operator=(KmMatrix&& _other); \ template KmMatrix::KmMatrix(const KmMatrixProxy& _other); \ template KmMatrix::~KmMatrix(); \ + /* Methods */ \ template size_t KmMatrix::size() const; \ template size_t KmMatrix::rows() const; \ template size_t KmMatrix::cols() const; \ template kParam KmMatrix::k_param () const; \ template T * KmMatrix::host_ptr(); \ template T * 
KmMatrix::dev_ptr(); \ - template KmMatrixProxy KmMatrix::row(size_t idx, bool dev_mem=true); + template bool KmMatrix::on_device() const; \ + template KmMatrixProxy KmMatrix::row(size_t idx, bool dev_mem=true); \ + template bool KmMatrix::operator==(const KmMatrix &_rhs); \ + /* Helper functions */ \ + template std::ostream& operator<<(std::ostream& os, KmMatrix& m); INSTANTIATE(float) INSTANTIATE(double) diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp index 5b34fad4b..36fc2c451 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp @@ -51,7 +51,7 @@ struct kParam { template class KmMatrixImpl { - private: + protected: KmMatrix * matrix_; public: KmMatrixImpl(KmMatrix *_matrix); @@ -59,6 +59,7 @@ class KmMatrixImpl { virtual T* host_ptr() {} virtual T* dev_ptr() {} + virtual size_t size() const {} virtual bool on_device() const {} }; @@ -66,6 +67,8 @@ template class KmMatrix { private: + bool use_cuda; + enum Backend { CUDADense = 0, CUDASparse = 1, @@ -76,8 +79,6 @@ class KmMatrix { std::shared_ptr> impls[4]; kParam param_; - bool use_cuda; - void init_impls(); std::string name_; @@ -94,6 +95,8 @@ class KmMatrix { void operator=(const KmMatrix& _other); void operator=(KmMatrix&& _other); + bool operator==(const KmMatrix& _rhs); + virtual ~KmMatrix(); size_t size () const; @@ -103,6 +106,8 @@ class KmMatrix { T* host_ptr(); T* dev_ptr(); + bool on_device() const; + kParam k_param () const; std::string name() const { return name_; } @@ -113,76 +118,36 @@ class KmMatrix { }; template -std::ostream& operator<<(std::ostream& os, KmMatrix& m) { - std::cout << "matrix: " << m.name() << std::endl; - T * ptr = m.host_ptr(); - kParam param = m.k_param(); - for (size_t i = 0; i < param.rows; ++i) { - for (size_t j = 0; j < param.cols; ++j) { - std::cout << "(" << i << ", "<< j << ", " << i*param.cols + j << ")" << std::setw(6) << ptr[i*param.cols + j] << ' '; - } - std::cout << std::endl; - } - 
std::cout << "---" << std::endl; -} +std::ostream& operator<<(std::ostream& os, KmMatrix& m); template class KmMatrixProxy { private: - thrust::device_ptr ptr_; + KmMatrix& orgi_; + + kParam param_; + size_t start_; size_t end_; size_t stride_; - bool on_device_; - - kParam param_; + size_t start() const; + size_t end() const; + size_t stride() const; public: - size_t start() const { - return start_; - } - size_t end() const { - return end_; - } - size_t stride() const { - return stride_; - } - size_t size() const { - return (end_ - start_) / stride_; - } - bool on_device() const { - return on_device_; - } - thrust::device_ptr data() const { - return ptr_ + start_; - } - kParam param() const { - return param_; - } - KmMatrixProxy(thrust::device_ptr _ptr, - size_t _start, size_t _end, size_t _stride, - kParam _param, bool _on_device) - : ptr_(_ptr), start_(_start), end_(_end), stride_(_stride), - param_(_param), on_device_(_on_device) {} - - void operator=(KmMatrix& _other) { - assert(size() == _other.size); - - assert (_other.size() == size()); - // FIXME - assert(stride_ == 1); - - if (on_device_) { - auto _other_dev_ptr = thrust::device_ptr(_other.dev_ptr()); - - thrust::copy(_other_dev_ptr, _other_dev_ptr + size(), ptr_); - } else { - thrust::copy(_other.host_ptr(), _other.host_ptr() + size(), - ptr_ + start_); - } - } + size_t size() const; + bool on_device() const; + + KmMatrixProxy(KmMatrix& _other, + size_t _start, size_t _end, size_t _stride, kParam& _param); + + bool operator==(const KmMatrix& _rhs); + bool operator==(const KmMatrixProxy& _rhs); + + void operator=(KmMatrix& _other); + friend KmMatrix; }; } // namespace KMeans diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu index b941bcda6..7d7d6c467 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu @@ -5,6 +5,7 @@ #include #include +#include #include @@ -24,48 +25,36 @@ CudaKmMatrixImpl::CudaKmMatrixImpl(const 
thrust::host_vector& _h_vec, : _on_device(false), KmMatrixImpl(_par) { _h_vector.resize(_h_vec.size()); thrust::copy(_h_vec.begin(), _h_vec.end(), _h_vector.begin()); - std::cout << "Copy host vector: " << _h_vector.size() << std::endl; - for (size_t i = 0; i < _h_vector.size(); ++i) { - std::cout << _h_vector[i] << ' '; - } - std::cout << std::endl; } template -CudaKmMatrixImpl::CudaKmMatrixImpl(const KmMatrixProxy& _other, - KmMatrix* _par) - : KmMatrixImpl(_par) { - thrust::device_ptr ptr = _other.data(); +CudaKmMatrixImpl::CudaKmMatrixImpl( + KmMatrix& _other, size_t _start, size_t _size, size_t _stride, + KmMatrix * _par) : + KmMatrixImpl(_par) { + assert (_size > 0); + + if (_size == 0) + return; + + T* raw_ptr; if (_other.on_device()) { - std::cout << "proxy to dev" << std::endl; - if (_other.stride() == 1) { - _d_vector.resize(_other.size()); - thrust::copy(ptr, ptr + _other.size(), _d_vector.begin()); - std::cout << "copied" << std::endl; - for (size_t i = 0; i < _d_vector.size(); ++i) { - std::cout << _d_vector[i] << ' '; - } - std::cout << std::endl; - } else { - // FIXME - assert(false); - } + raw_ptr = _other.dev_ptr(); + thrust::device_ptr ptr (raw_ptr); + ptr += _start; + _d_vector.resize(_size); _on_device = true; + thrust::copy(ptr, ptr + _size, _d_vector.begin()); } else { - if (_other.stride() == 1) { - _h_vector.resize(_other.size()); - std::cout << "_other.size(): " << _other.size() << std::endl; - for (size_t i = 0; i < _other.size(); ++i) { - std::cout << ptr[i] << ' '; - } - std::cout << std::endl; - thrust::copy(ptr, ptr + _other.size(), _h_vector.begin()); - - } else { - // FIXME - assert(false); + raw_ptr = _other.host_ptr(); + if (raw_ptr == nullptr) { + std::cerr << "nullptr: " << _other.name(); + abort(); } + raw_ptr += _start; + _h_vector.resize(_size); _on_device = false; + thrust::copy(raw_ptr, raw_ptr + _size, _h_vector.begin()); } } @@ -75,15 +64,7 @@ CudaKmMatrixImpl::~CudaKmMatrixImpl() {} template T* 
CudaKmMatrixImpl::host_ptr() { device_to_host(); - std::cout << "host ptr: " << _h_vector.size() << std::endl; - for (size_t i = 0; i < 4; ++i) { - for (size_t j = 0; j < 4; ++j) { - std::cout << std::setw(6) << _h_vector[i*4+j] << ' '; - } - std::cout << std::endl; - } - std::cout << std::endl; - return _h_vector.data(); + return thrust::raw_pointer_cast(_h_vector.data()); } template @@ -106,8 +87,6 @@ template void CudaKmMatrixImpl::device_to_host() { if (!_on_device) return; - std::cout << "bring back to host" << std::endl; - std::cout << "_d_.size()" << _d_vector.size() << std::endl; _h_vector.resize(_d_vector.size()); thrust::copy(_d_vector.begin(), _d_vector.end(), _h_vector.begin()); _on_device = false; @@ -118,18 +97,43 @@ bool CudaKmMatrixImpl::on_device() const { return _on_device; } +template +size_t CudaKmMatrixImpl::size() const { + if (_on_device) { + return _d_vector.size(); + } else { + return _h_vector.size(); + } +} + +template +bool CudaKmMatrixImpl::equal(std::shared_ptr>& _rhs) { + // FIXME, Is it floating compatible? 
+ _rhs->host_to_device(); + host_to_device(); + bool res = thrust::equal(_d_vector.begin(), _d_vector.end(), + _rhs->_d_vector.begin()); + return res; +} + #define INSTANTIATE(T) \ + /* Standard con(de)structors*/ \ + template CudaKmMatrixImpl::CudaKmMatrixImpl( \ + KmMatrix& _other, size_t _start, size_t _size, size_t _stride, \ + KmMatrix * _par); \ + template CudaKmMatrixImpl::CudaKmMatrixImpl( \ + const thrust::host_vector& _h_vec, KmMatrix* _par); \ + template CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par); \ + template CudaKmMatrixImpl::~CudaKmMatrixImpl(); \ + /* Member functions */ \ template bool CudaKmMatrixImpl::on_device() const; \ template void CudaKmMatrixImpl::device_to_host(); \ template void CudaKmMatrixImpl::host_to_device(); \ template T* CudaKmMatrixImpl::dev_ptr(); \ template T* CudaKmMatrixImpl::host_ptr(); \ - template CudaKmMatrixImpl::~CudaKmMatrixImpl(); \ - template CudaKmMatrixImpl::CudaKmMatrixImpl( \ - const KmMatrixProxy& _other, KmMatrix* _par); \ - template CudaKmMatrixImpl::CudaKmMatrixImpl( \ - const thrust::host_vector& _h_vec, KmMatrix* _par); \ - template CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par); \ + template size_t CudaKmMatrixImpl::size() const; \ + template bool CudaKmMatrixImpl::equal( \ + std::shared_ptr>& _rhs); INSTANTIATE(float) INSTANTIATE(double) diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh index c636dad40..a4a05f368 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh @@ -2,6 +2,7 @@ #define KM_MATRIX_CUDA_CUH_ #include "KmMatrix.hpp" +#include namespace H2O4GPU { namespace KMeans { @@ -44,17 +45,22 @@ class CudaKmMatrixImpl : public KmMatrixImpl { CudaKmMatrixImpl(KmMatrix * _par); CudaKmMatrixImpl(const thrust::host_vector& _h_vec, KmMatrix* _par); - CudaKmMatrixImpl(const KmMatrixProxy& _other, - KmMatrix* _par); - + CudaKmMatrixImpl(KmMatrix& _other, + size_t _start, size_t _size, size_t _stride, + 
KmMatrix * _par); virtual ~CudaKmMatrixImpl(); - bool on_device() const override; - virtual T* host_ptr() override; virtual T* dev_ptr() override; + + virtual size_t size() const override; + + virtual bool equal(std::shared_ptr>& _rhs); + + virtual bool on_device() const override; }; } // MkMatrix } // H204GPU -#endif \ No newline at end of file + +#endif diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp b/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp new file mode 100644 index 000000000..9cc2f1e29 --- /dev/null +++ b/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp @@ -0,0 +1,63 @@ +#include "KmMatrix.hpp" + +namespace H2O4GPU { +namespace KMeans { + +template +KmMatrixProxy::KmMatrixProxy(KmMatrix& _other, + size_t _start, size_t _end, size_t _stride, + kParam& _param) + : orgi_ (_other), start_(_start), end_(_end), stride_(_stride), + param_(_param) { + assert(size() > 0); +} + +template +bool KmMatrixProxy::on_device() const { + return orgi_.on_device(); +} + +template +size_t KmMatrixProxy::start() const { + return start_; +} + +template +size_t KmMatrixProxy::end() const { + return end_; +} + +template +size_t KmMatrixProxy::stride() const { + return stride_; +} + +template +size_t KmMatrixProxy::size() const { + return (end_ - start_) / stride_; +} + +template +void KmMatrixProxy::operator=(KmMatrix &_other) { + // FIXME + assert(false); +} + +#define INSTANTIATE(T) \ + template KmMatrixProxy::KmMatrixProxy(KmMatrix& _other, \ + size_t _start, size_t _end, \ + size_t _stride, \ + kParam& _param); \ + template bool KmMatrixProxy::on_device() const; \ + template size_t KmMatrixProxy::start() const; \ + template size_t KmMatrixProxy::end() const; \ + template size_t KmMatrixProxy::stride() const; \ + template size_t KmMatrixProxy::size() const; \ + template void KmMatrixProxy::operator=(KmMatrix &_other); + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) + +} +} diff --git a/tests/cpp/gpu/KmMatrix/test_proxy.cu b/tests/cpp/gpu/KmMatrix/test_proxy.cu new 
file mode 100644 index 000000000..d55e2c0dc --- /dev/null +++ b/tests/cpp/gpu/KmMatrix/test_proxy.cu @@ -0,0 +1,50 @@ +#include +#include +#include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp" + +#include + +// r --gtest_filter=KmMatrix.KmMatrixProxy +TEST(KmMatrix, KmMatrixProxy) { + size_t rows = 12, cols = 16; + thrust::host_vector vec (rows * cols); + for (size_t i = 0; i < rows * cols; ++i) { + vec[i] = i; + } + + H2O4GPU::KMeans::KmMatrix mat (vec, rows, cols); + mat.set_name ("mat"); + + H2O4GPU::KMeans::KmMatrix row = mat.row(1); + row.set_name ("row"); + + thrust::host_vector res (cols); + + for (size_t i = 0, v = 16; v < 32; ++i, ++v) { + res[i] = v; + std::cout << v << ' '; + } + + H2O4GPU::KMeans::KmMatrix res_mat (res, 1, cols); + res_mat.set_name("res_mat"); + + ASSERT_TRUE(res_mat == row); +} + +// r --gtest_filter=KmMatrix.KmMatrix +TEST(KmMatrix, KmMatrix) { + thrust::host_vector vec (2048 * 1024); + for (size_t i = 0; i < 2048 * 1024; ++i) { + vec[i] = i; + } + H2O4GPU::KMeans::KmMatrix mat (vec, 2048, 1024); + + ASSERT_TRUE (vec == vec); + + thrust::host_vector vec2 (2048 * 1024); + for (size_t i = 0; i < 2048 * 1024; ++i) { + vec2[i] = i + 1; + } + + ASSERT_FALSE(vec == vec2); +} \ No newline at end of file From 79529c636637b75e5b29de0363ceb379701a02ac Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 17 Jul 2018 06:58:48 +0800 Subject: [PATCH 07/49] More tests and fixes for KmMatrix. 
--- src/gpu/kmeans/KmMatrix/KmMatrix.cpp | 27 +++++------ src/gpu/kmeans/KmMatrix/KmMatrix.hpp | 4 +- src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu | 11 ++--- tests/cpp/gpu/KmMatrix/test_matrix.cu | 59 ++++++++++++++++++++++++ tests/cpp/gpu/KmMatrix/test_proxy.cu | 43 +++++++++-------- tests/cpp/gpu/kmeans/test_kmeans_init.cu | 20 -------- 6 files changed, 102 insertions(+), 62 deletions(-) create mode 100644 tests/cpp/gpu/KmMatrix/test_matrix.cu diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp index e5bbc7141..d73c689c6 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp @@ -69,6 +69,8 @@ template KmMatrix::KmMatrix(const KmMatrixProxy& _other) : param_ (_other.param_){ init_impls(); + name_ = _other.orgi_.name_ + "(" + std::to_string(_other.start()) + "," + + std::to_string(_other.end()) + ")"; #if defined (USE_CUDA) use_cuda = true; KmMatrixImpl * ptr = new CudaKmMatrixImpl( @@ -127,13 +129,7 @@ void KmMatrix::init_impls() { } template -KmMatrix::~KmMatrix() { - // std::cout << "name: " << name_ << std::endl; - // for (size_t i = 0; i < 4; ++i) { - // if (impls[i] != nullptr) - // delete impls[i]; - // } -} +KmMatrix::~KmMatrix() {} template @@ -152,8 +148,11 @@ size_t KmMatrix::cols() const { } template -kParam KmMatrix::k_param () const { - return param_; +kParam KmMatrix::k_param () { + T * ptr = dev_ptr(); + kParam param (param_); + param.ptr = ptr; + return param; } template @@ -201,14 +200,12 @@ KmMatrixProxy KmMatrix::col(size_t idx) { } template -bool KmMatrix::operator==(const KmMatrix &_rhs) { +bool KmMatrix::operator==(KmMatrix& _rhs) { if (_rhs.use_cuda && use_cuda) { std::shared_ptr> tmp = - std::dynamic_pointer_cast>(impls[CUDADense]); + std::dynamic_pointer_cast>(_rhs.impls[CUDADense]); bool res = std::dynamic_pointer_cast>( impls[CUDADense])->equal(tmp); - // return std::dynamic_pointer_cast>(impls[CUDADense])->equal( - // _rhs.impls[CUDADense]); return res; } else { // 
FIXME @@ -252,12 +249,12 @@ std::ostream& operator<<(std::ostream& os, KmMatrix& m) { template size_t KmMatrix::size() const; \ template size_t KmMatrix::rows() const; \ template size_t KmMatrix::cols() const; \ - template kParam KmMatrix::k_param () const; \ + template kParam KmMatrix::k_param (); \ template T * KmMatrix::host_ptr(); \ template T * KmMatrix::dev_ptr(); \ template bool KmMatrix::on_device() const; \ template KmMatrixProxy KmMatrix::row(size_t idx, bool dev_mem=true); \ - template bool KmMatrix::operator==(const KmMatrix &_rhs); \ + template bool KmMatrix::operator==(KmMatrix &_rhs); \ /* Helper functions */ \ template std::ostream& operator<<(std::ostream& os, KmMatrix& m); diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp index 36fc2c451..b7aaaeb35 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp @@ -95,7 +95,7 @@ class KmMatrix { void operator=(const KmMatrix& _other); void operator=(KmMatrix&& _other); - bool operator==(const KmMatrix& _rhs); + bool operator==(KmMatrix& _rhs); virtual ~KmMatrix(); @@ -108,7 +108,7 @@ class KmMatrix { bool on_device() const; - kParam k_param () const; + kParam k_param (); std::string name() const { return name_; } void set_name (std::string _name) {name_ = _name;} diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu index 7d7d6c467..5a2afd68d 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu @@ -38,6 +38,9 @@ CudaKmMatrixImpl::CudaKmMatrixImpl( return; T* raw_ptr; + + assert (raw_ptr != nullptr && raw_ptr != NULL); + if (_other.on_device()) { raw_ptr = _other.dev_ptr(); thrust::device_ptr ptr (raw_ptr); @@ -47,10 +50,6 @@ CudaKmMatrixImpl::CudaKmMatrixImpl( thrust::copy(ptr, ptr + _size, _d_vector.begin()); } else { raw_ptr = _other.host_ptr(); - if (raw_ptr == nullptr) { - std::cerr << "nullptr: " << _other.name(); - abort(); - } raw_ptr += 
_start; _h_vector.resize(_size); _on_device = false; @@ -78,7 +77,7 @@ template void CudaKmMatrixImpl::host_to_device() { if (_on_device) return; - _h_vector.resize(_d_vector.size()); + _d_vector.resize(_h_vector.size()); thrust::copy(_h_vector.begin(), _h_vector.end(), _d_vector.begin()); _on_device = true; } @@ -113,7 +112,7 @@ bool CudaKmMatrixImpl::equal(std::shared_ptr>& _rhs) { host_to_device(); bool res = thrust::equal(_d_vector.begin(), _d_vector.end(), _rhs->_d_vector.begin()); - return res; + return res; } #define INSTANTIATE(T) \ diff --git a/tests/cpp/gpu/KmMatrix/test_matrix.cu b/tests/cpp/gpu/KmMatrix/test_matrix.cu new file mode 100644 index 000000000..1665a4fba --- /dev/null +++ b/tests/cpp/gpu/KmMatrix/test_matrix.cu @@ -0,0 +1,59 @@ +#include +#include + +#include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp" + +// r --gtest_filter=KmMatrix.KmMatrix +TEST(KmMatrix, KmMatrixEqual) { + thrust::host_vector vec (2048 * 1024); + for (size_t i = 0; i < 2048 * 1024; ++i) { + vec[i] = i; + } + H2O4GPU::KMeans::KmMatrix mat (vec, 2048, 1024); + + ASSERT_TRUE (mat == mat); + + thrust::host_vector vec2 (2048 * 1024); + for (size_t i = 0; i < 2048 * 1024; ++i) { + vec2[i] = i + i; + } + H2O4GPU::KMeans::KmMatrix mat2 (vec2, 2048, 1024); + + ASSERT_FALSE(mat == mat2); +} + +TEST(KmMatrix, KmMatrixAssig) { + thrust::host_vector vec (2048 * 1024); + for (size_t i = 0; i < 2048 * 1024; ++i) { + vec[i] = i; + } + + H2O4GPU::KMeans::KmMatrix mat0 (vec, 2048, 1024); + H2O4GPU::KMeans::KmMatrix mat1 = mat0; + H2O4GPU::KMeans::KmMatrix mat2; + + mat2 = mat0; + + ASSERT_TRUE(mat0 == mat1); + ASSERT_TRUE(mat1 == mat2); +} + +TEST(KmMatrix, KmMatrixUtils) { + thrust::host_vector vec (12 * 16); + H2O4GPU::KMeans::KmMatrix mat (vec, 12, 16); + + ASSERT_EQ(mat.rows(), 12); + ASSERT_EQ(mat.cols(), 16); + ASSERT_EQ(mat.size(), 12 * 16); +} + +TEST(KmMatrix, KmMatrixKparam) { + thrust::host_vector vec (12 * 16); + thrust::fill(vec.begin(), vec.end(), 1); + 
H2O4GPU::KMeans::KmMatrix mat (vec, 12, 16); + + H2O4GPU::KMeans::kParam param = mat.k_param(); + ASSERT_EQ(param.ptr, mat.dev_ptr()); + ASSERT_EQ(param.rows, 12); + ASSERT_EQ(param.cols, 16); +} \ No newline at end of file diff --git a/tests/cpp/gpu/KmMatrix/test_proxy.cu b/tests/cpp/gpu/KmMatrix/test_proxy.cu index d55e2c0dc..732d27eef 100644 --- a/tests/cpp/gpu/KmMatrix/test_proxy.cu +++ b/tests/cpp/gpu/KmMatrix/test_proxy.cu @@ -2,10 +2,8 @@ #include #include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp" -#include - -// r --gtest_filter=KmMatrix.KmMatrixProxy -TEST(KmMatrix, KmMatrixProxy) { +// r --gtest_filter=KmMatrix.KmMatrixHostProxy +TEST(KmMatrix, KmMatrixProxyHostEqual) { size_t rows = 12, cols = 16; thrust::host_vector vec (rows * cols); for (size_t i = 0; i < rows * cols; ++i) { @@ -13,38 +11,45 @@ TEST(KmMatrix, KmMatrixProxy) { } H2O4GPU::KMeans::KmMatrix mat (vec, rows, cols); - mat.set_name ("mat"); H2O4GPU::KMeans::KmMatrix row = mat.row(1); - row.set_name ("row"); thrust::host_vector res (cols); for (size_t i = 0, v = 16; v < 32; ++i, ++v) { res[i] = v; - std::cout << v << ' '; } H2O4GPU::KMeans::KmMatrix res_mat (res, 1, cols); - res_mat.set_name("res_mat"); ASSERT_TRUE(res_mat == row); } -// r --gtest_filter=KmMatrix.KmMatrix -TEST(KmMatrix, KmMatrix) { - thrust::host_vector vec (2048 * 1024); - for (size_t i = 0; i < 2048 * 1024; ++i) { +// r --gtest_filter=KmMatrix.KmMatrixDevProxy +// FIXME +TEST(KmMatrix, KmMatrixProxyDevEqual) { + size_t rows = 12, cols = 16; + thrust::host_vector vec (rows * cols); + for (size_t i = 0; i < rows * cols; ++i) { vec[i] = i; } - H2O4GPU::KMeans::KmMatrix mat (vec, 2048, 1024); - ASSERT_TRUE (vec == vec); + H2O4GPU::KMeans::KmMatrix mat (vec, rows, cols); + mat.set_name ("mat"); - thrust::host_vector vec2 (2048 * 1024); - for (size_t i = 0; i < 2048 * 1024; ++i) { - vec2[i] = i + 1; + mat.dev_ptr(); + + H2O4GPU::KMeans::KmMatrix row = mat.row(1); + row.set_name ("row"); + + thrust::host_vector res 
(cols); + + for (size_t i = 0, v = 16; v < 16 + cols; ++i, ++v) { + res[i] = v; } - ASSERT_FALSE(vec == vec2); -} \ No newline at end of file + H2O4GPU::KMeans::KmMatrix res_mat (res, 1, cols); + res_mat.set_name("res"); + + ASSERT_TRUE(res_mat == row); +} diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu index 3dfb1bf38..2f9c5356b 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -35,26 +35,6 @@ TEST(KmeansLL, KmeansLLInit) { } H2O4GPU::KMeans::KmMatrix h_data (_h_data, 4, 4); - h_data.set_name ("h_data"); - std::cout << h_data << std::endl; - - // H2O4GPU::KMeans::HostDeviceVector d_data (_h_data, 4, 4); - - // Eigen::MatrixXd _h_data (4, 4); - - // for (size_t i = 0; i < 4; ++i) { - // _h_data(0, i) = double(i); - // } - // for (size_t i = 4; i < 8; ++i) { - // _h_data(1, i-4) = double(i - 2); - // } - - // for (size_t i = 8; i < 12; ++i) { - // _h_data(2, i-8) = double(i); - // } - // for (size_t i = 12; i < 16; ++i) { - // _h_data(3, i-12) = double(i + 2); - // } auto result = kmeans_ll_init (h_data); } \ No newline at end of file From 3b19006add4ed0d5c69416038432f04f77b5f1d2 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 18 Jul 2018 04:20:53 +0800 Subject: [PATCH 08/49] Fix KmMatrix initialization with size. 
--- src/gpu/kmeans/KmMatrix/KmMatrix.cpp | 2 +- src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu | 13 +++++++++++++ src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh | 4 ++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp index d73c689c6..bbd721544 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp @@ -44,7 +44,7 @@ KmMatrix::KmMatrix(size_t _rows, size_t _cols) : init_impls(); #if defined (USE_CUDA) use_cuda = true; - KmMatrixImpl * ptr = new CudaKmMatrixImpl(this); + KmMatrixImpl * ptr = new CudaKmMatrixImpl(this, _rows * _cols); impls[0].reset(ptr); #elif use_cuda = false; diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu index 5a2afd68d..aefb55f21 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu @@ -27,6 +27,15 @@ CudaKmMatrixImpl::CudaKmMatrixImpl(const thrust::host_vector& _h_vec, thrust::copy(_h_vec.begin(), _h_vec.end(), _h_vector.begin()); } +template +CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par, size_t _size) : + KmMatrixImpl(_par) { + if (_size == 0) return; + + _d_vector.resize(_size); + _on_device = true; +} + template CudaKmMatrixImpl::CudaKmMatrixImpl( KmMatrix& _other, size_t _start, size_t _size, size_t _stride, @@ -41,6 +50,8 @@ CudaKmMatrixImpl::CudaKmMatrixImpl( assert (raw_ptr != nullptr && raw_ptr != NULL); + std::cerr << "Warning: Copying data from " << _other.name() + << "." 
<< std::endl; if (_other.on_device()) { raw_ptr = _other.dev_ptr(); thrust::device_ptr ptr (raw_ptr); @@ -123,6 +134,8 @@ bool CudaKmMatrixImpl::equal(std::shared_ptr>& _rhs) { template CudaKmMatrixImpl::CudaKmMatrixImpl( \ const thrust::host_vector& _h_vec, KmMatrix* _par); \ template CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par); \ + template CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par, \ + size_t _size); \ template CudaKmMatrixImpl::~CudaKmMatrixImpl(); \ /* Member functions */ \ template bool CudaKmMatrixImpl::on_device() const; \ diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh index a4a05f368..5fb47267f 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh @@ -43,8 +43,8 @@ class CudaKmMatrixImpl : public KmMatrixImpl { public: CudaKmMatrixImpl(KmMatrix * _par); - CudaKmMatrixImpl(const thrust::host_vector& _h_vec, - KmMatrix* _par); + CudaKmMatrixImpl(const thrust::host_vector& _h_vec, KmMatrix* _par); + CudaKmMatrixImpl(KmMatrix * _par, size_t _size); CudaKmMatrixImpl(KmMatrix& _other, size_t _start, size_t _size, size_t _stride, KmMatrix * _par); From b58d86852d8b1aab9738413eeccd83db358559fd Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 18 Jul 2018 07:22:31 +0800 Subject: [PATCH 09/49] Add helper tools for GPU. 
--- src/gpu/kmeans/KmMatrix/GpuInfo.cuh | 40 ++++++++++++++++++++++++++++ src/gpu/kmeans/KmMatrix/KmConfig.h | 21 +++++++++++++++ src/gpu/kmeans/KmMatrix/KmMatrix.cpp | 11 ++++---- src/gpu/kmeans/KmMatrix/KmMatrix.hpp | 9 ++++--- src/gpu/kmeans/KmMatrix/utils.cuh | 27 +++++++++++++++++++ 5 files changed, 100 insertions(+), 8 deletions(-) create mode 100644 src/gpu/kmeans/KmMatrix/GpuInfo.cuh create mode 100644 src/gpu/kmeans/KmMatrix/KmConfig.h create mode 100644 src/gpu/kmeans/KmMatrix/utils.cuh diff --git a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh new file mode 100644 index 000000000..afb4dff5c --- /dev/null +++ b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh @@ -0,0 +1,40 @@ +#ifndef GPU_INFO_HPP_ +#define GPU_INFO_HPP_ + +#include "KmConfig.h" +#include "stdlib.h" + +class GpuInfo { + private: + int n_gpu_; + size_t* n_sm_; + public: + GpuInfo () { + CUDA_CHECK(cudaGetDeviceCount(&n_gpu_)); + n_sm_ = (size_t*) malloc (n_gpu_); + } + ~GpuInfo () { + free (n_sm_); + } + + size_t blocks (size_t _mul, int _device=0) { + if (has_device(_device)) { + return _mul * n_sm_[_device]; + } + return 0; + } + + bool has_device(int _device) { + return _device < n_gpu_ && _device > 0; + } + + static GpuInfo& ins() { + static GpuInfo obj; + return obj; + } + +}; + +// const GpuInfoImpl GpuInfo::impl = GpuInfoImpl(); + +#endif // GPU_INFO_HPP_ diff --git a/src/gpu/kmeans/KmMatrix/KmConfig.h b/src/gpu/kmeans/KmMatrix/KmConfig.h new file mode 100644 index 000000000..8d6cf6f12 --- /dev/null +++ b/src/gpu/kmeans/KmMatrix/KmConfig.h @@ -0,0 +1,21 @@ +#ifndef KM_CONFIG_H_ +#define KM_CONFIG_H_ + +#define USE_CUDA() 1 + +// Matrix host dev +#define M_HOSTDEV __host__ __device__ +#define M_DEVINLINE __device__ __forceinline__ +#define M_HOSTDEVINLINE __host__ __device__ __forceinline__ + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if( e != cudaSuccess ) { \ + printf("Cuda failure %s:%d '%s'\n", \ + __FILE__,__LINE__,cudaGetErrorString(e));\ + 
fflush( stdout ); \ + exit(EXIT_FAILURE); \ + } \ + } while(0) + +#endif // KM_CONFIG_H_ diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp index bbd721544..96f6c0895 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp @@ -4,7 +4,8 @@ */ #include "KmMatrix.hpp" -#if defined (USE_CUDA) +#include "KmConfig.h" +#if USE_CUDA() #include "KmMatrixCuda.cuh" #endif @@ -28,7 +29,7 @@ template KmMatrix::KmMatrix() : param_ (0, 0, nullptr) { init_impls(); -#if defined (USE_CUDA) +#if USE_CUDA() use_cuda = true; KmMatrixImpl * ptr = new CudaKmMatrixImpl(this); impls[0].reset(ptr); @@ -42,7 +43,7 @@ template KmMatrix::KmMatrix(size_t _rows, size_t _cols) : param_ (_rows, _cols, nullptr) { init_impls(); -#if defined (USE_CUDA) +#if USE_CUDA() use_cuda = true; KmMatrixImpl * ptr = new CudaKmMatrixImpl(this, _rows * _cols); impls[0].reset(ptr); @@ -56,7 +57,7 @@ KmMatrix::KmMatrix(thrust::host_vector _vec, size_t _rows, size_t _cols) : param_ (_rows, _cols, nullptr) { init_impls(); -#if defined (USE_CUDA) +#if USE_CUDA() use_cuda = true; KmMatrixImpl * ptr = new CudaKmMatrixImpl(_vec, this); impls[0].reset(ptr); @@ -71,7 +72,7 @@ KmMatrix::KmMatrix(const KmMatrixProxy& _other) : init_impls(); name_ = _other.orgi_.name_ + "(" + std::to_string(_other.start()) + "," + std::to_string(_other.end()) + ")"; -#if defined (USE_CUDA) +#if USE_CUDA() use_cuda = true; KmMatrixImpl * ptr = new CudaKmMatrixImpl( _other.orgi_, _other.start(), _other.size(), _other.stride(), this); diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp index b7aaaeb35..56e33d43a 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp @@ -12,10 +12,9 @@ #include #include -// FIXME -#define USE_CUDA 1 +#include "KmConfig.h" -#if defined (USE_CUDA) +#if USE_CUDA() #include "KmMatrixCuda.cuh" #endif @@ -47,6 +46,10 @@ struct kParam { cols = _other.cols; ptr = 
_other.ptr; } + + M_HOSTDEV size_t size() const { + return rows * cols; + } }; template diff --git a/src/gpu/kmeans/KmMatrix/utils.cuh b/src/gpu/kmeans/KmMatrix/utils.cuh new file mode 100644 index 000000000..59059d30d --- /dev/null +++ b/src/gpu/kmeans/KmMatrix/utils.cuh @@ -0,0 +1,27 @@ +#ifndef UTILS_CUH_ +#define UTILS_CUH_ + +#include "GpuInfo.cuh" + +namespace H2O4GPU { +namespace KMeans { + +M_DEVINLINE size_t global_thread_idx () { + return threadIdx.x + blockIdx.x * blockDim.x; +} + +M_DEVINLINE size_t grid_stride () { + return blockDim.x * gridDim.x; +} + +// This wrapper function is created to work around a possible bug in nvcc, +// which threats GpuInfo::ins() as calling base class method when used inside a +// class member function. +size_t get_blocks(size_t _mul, int _device=0) { + return GpuInfo::ins().blocks(_mul, _device); +} + +} // KMeans +} // H2O4GPU + +#endif // UTILS_CUH_ \ No newline at end of file From 6f771b8a04d47a6e3c35933349a8cc1871e42c1f Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 18 Jul 2018 22:03:00 +0800 Subject: [PATCH 10/49] cuBlas wrapper file. Only two functions are added. 
--- src/gpu/kmeans/KmMatrix/blas.cuh | 96 ++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 src/gpu/kmeans/KmMatrix/blas.cuh diff --git a/src/gpu/kmeans/KmMatrix/blas.cuh b/src/gpu/kmeans/KmMatrix/blas.cuh new file mode 100644 index 000000000..90c5e323f --- /dev/null +++ b/src/gpu/kmeans/KmMatrix/blas.cuh @@ -0,0 +1,96 @@ +#ifndef KM_BLAS_CUH_ +#define KM_BLAS_CUH_ + +#include +#include "KmConfig.h" + +// C++ Wrappers for cublas + +namespace H2O4GPU { +namespace KMeans { + +namespace Blas { +// LEVEL 1 +inline void axpy(cublasHandle_t handle, int n, + const float *alpha, + const float *x, int incx, + float *y, int incy) { + CUBLAS_CHECK(cublasSaxpy(handle, n, + alpha, + x, incx, + y, incy));} + +inline void axpy(cublasHandle_t handle, int n, + const double *alpha, + const double *x, int incx, + double *y, int incy) { + CUBLAS_CHECK(cublasDaxpy(handle, n, + alpha, + x, incx, + y, incy));} + +// LEVEL 3 +inline void gemm(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *B, + int ldb, + const float *beta, /* host or device pointer */ + float *C, + int ldc) { + CUBLAS_CHECK(cublasSgemm(handle, + transa, + transb, + m, + n, + k, + alpha, /* host or device pointer */ + A, + lda, + B, + ldb, + beta, /* host or device pointer */ + C, + ldc));} + +inline void gemm(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *B, + int ldb, + const double *beta, /* host or device pointer */ + double *C, + int ldc) { + CUBLAS_CHECK(cublasDgemm(handle, + transa, + transb, + m, + n, + k, + alpha, /* host or device pointer */ + A, + lda, + B, + ldb, + beta, /* host or device pointer */ + C, + ldc));} + +} // Blas + +} // KMeans +} // H2O4GPU + +#endif // 
KM_BLAS_CUH_ \ No newline at end of file From c901a7fa8ac1b85d5dd1d48250e358f8c049c7d0 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 18 Jul 2018 22:05:43 +0800 Subject: [PATCH 11/49] Factor out config code, cublas_handle, add MatrixDim. * Checking code is factored into KmConfig * GpuInfo now takes care of cublas_handle; * MatrixDim for latter use. --- src/gpu/kmeans/KmMatrix/GpuInfo.cuh | 39 ++++++++++++++++--- src/gpu/kmeans/KmMatrix/KmConfig.h | 57 ++++++++++++++++++++++++---- src/gpu/kmeans/KmMatrix/KmMatrix.hpp | 5 +++ src/gpu/kmeans/kmeans_general.h | 41 -------------------- 4 files changed, 87 insertions(+), 55 deletions(-) diff --git a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh index afb4dff5c..14c717d49 100644 --- a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh +++ b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh @@ -2,30 +2,57 @@ #define GPU_INFO_HPP_ #include "KmConfig.h" -#include "stdlib.h" + +#include + +#include +#include class GpuInfo { private: int n_gpu_; - size_t* n_sm_; + int* n_sm_; // number of gpu processors for each device + cublasHandle_t* handles_; // handle for each device + public: GpuInfo () { CUDA_CHECK(cudaGetDeviceCount(&n_gpu_)); - n_sm_ = (size_t*) malloc (n_gpu_); + n_sm_ = (int*) malloc (n_gpu_); + handles_ = (cublasHandle_t*) malloc (n_gpu_); + + for (int i = 0; i < n_gpu_; ++i) { + cudaDeviceGetAttribute(&n_sm_[i], cudaDevAttrMultiProcessorCount, i); + CUBLAS_CHECK(cublasCreate(&handles_[i])); + printf("n_sm[%d]: %d\n", i, n_sm_[i]); + } } ~GpuInfo () { free (n_sm_); + for (size_t i = 0; i < n_gpu_; ++i) { + CUBLAS_CHECK(cublasDestroy(handles_[i])); + } } - + // FIXME, get active device size_t blocks (size_t _mul, int _device=0) { if (has_device(_device)) { return _mul * n_sm_[_device]; + } else { + fprintf(stderr, "Doesn't have device: %d\n", _device); + abort(); + } + } + // FIXME, ditto + cublasHandle_t cublas_handle(int _device=0) { + if (has_device(_device)) { + return handles_[_device]; + } else { + 
fprintf(stderr, "Doesn't have device: %d\n", _device); + abort(); } - return 0; } bool has_device(int _device) { - return _device < n_gpu_ && _device > 0; + return _device < n_gpu_ && _device >= 0; } static GpuInfo& ins() { diff --git a/src/gpu/kmeans/KmMatrix/KmConfig.h b/src/gpu/kmeans/KmMatrix/KmConfig.h index 8d6cf6f12..06366f448 100644 --- a/src/gpu/kmeans/KmMatrix/KmConfig.h +++ b/src/gpu/kmeans/KmMatrix/KmConfig.h @@ -8,14 +8,55 @@ #define M_DEVINLINE __device__ __forceinline__ #define M_HOSTDEVINLINE __host__ __device__ __forceinline__ -#define CUDA_CHECK(cmd) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - printf("Cuda failure %s:%d '%s'\n", \ - __FILE__,__LINE__,cudaGetErrorString(e));\ - fflush( stdout ); \ - exit(EXIT_FAILURE); \ - } \ +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if( e != cudaSuccess ) { \ + printf("Cuda failure %s:%d '%s'\n", \ + __FILE__,__LINE__,cudaGetErrorString(e)); \ + fflush( stdout ); \ + exit(EXIT_FAILURE); \ + } \ } while(0) +#define CUBLAS_CHECK(cmd) do { \ + cublasStatus_t status = cmd; \ + if ( status != CUBLAS_STATUS_SUCCESS) { \ + const char* errmsg = nullptr; \ + switch(status) { \ + case CUBLAS_STATUS_NOT_INITIALIZED: \ + errmsg = "library not initialized"; \ + break; \ + \ + case CUBLAS_STATUS_ALLOC_FAILED: \ + errmsg = "resource allocation failed"; \ + break; \ + \ + case CUBLAS_STATUS_INVALID_VALUE: \ + errmsg = "an invalid numeric value was used as an argument"; \ + break; \ + \ + case CUBLAS_STATUS_ARCH_MISMATCH: \ + errmsg = "an absent device architectural feature is required"; \ + break; \ + \ + case CUBLAS_STATUS_MAPPING_ERROR: \ + errmsg = "an access to GPU memory space failed"; \ + break; \ + \ + case CUBLAS_STATUS_EXECUTION_FAILED: \ + errmsg = "the GPU program failed to execute"; \ + break; \ + \ + case CUBLAS_STATUS_INTERNAL_ERROR: \ + errmsg = "an internal operation failed"; \ + break; \ + \ + default: \ + errmsg = "unknown error"; \ + break; \ + } \ + printf("%s", errmsg); \ 
+ } \ + } while (false) + #endif // KM_CONFIG_H_ diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp index 56e33d43a..7b1c99e3a 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp @@ -52,6 +52,11 @@ struct kParam { } }; +enum class MatrixDim { + ROW, + COL +}; + template class KmMatrixImpl { protected: diff --git a/src/gpu/kmeans/kmeans_general.h b/src/gpu/kmeans/kmeans_general.h index a64901a7d..9697a48e3 100644 --- a/src/gpu/kmeans/kmeans_general.h +++ b/src/gpu/kmeans/kmeans_general.h @@ -25,44 +25,3 @@ exit(EXIT_FAILURE); \ } \ } while(0) - -#define CUBLAS_CHECK(cmd) do { \ - cublasStatus_t status = cmd; \ - if ( status != CUBLAS_STATUS_SUCCESS) { \ - const char* errmsg = nullptr; \ - switch(status) { \ - case CUBLAS_STATUS_NOT_INITIALIZED: \ - errmsg = "library not initialized"; \ - break; \ - \ - case CUBLAS_STATUS_ALLOC_FAILED: \ - errmsg = "resource allocation failed"; \ - break; \ - \ - case CUBLAS_STATUS_INVALID_VALUE: \ - errmsg = "an invalid numeric value was used as an argument"; \ - break; \ - \ - case CUBLAS_STATUS_ARCH_MISMATCH: \ - errmsg = "an absent device architectural feature is required"; \ - break; \ - \ - case CUBLAS_STATUS_MAPPING_ERROR: \ - errmsg = "an access to GPU memory space failed"; \ - break; \ - \ - case CUBLAS_STATUS_EXECUTION_FAILED: \ - errmsg = "the GPU program failed to execute"; \ - break; \ - \ - case CUBLAS_STATUS_INTERNAL_ERROR: \ - errmsg = "an internal operation failed"; \ - break; \ - \ - default: \ - errmsg = "unknown error"; \ - break; \ - } \ - printf("%s", errmsg); \ - } \ - } while (false) From 1db828249e1bfd4bceb2dad25a7950f29be52a2c Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 19 Jul 2018 17:07:36 +0800 Subject: [PATCH 12/49] Rename class member variables for CudaKmMatrix. 
--- src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu | 54 ++++++++++++------------ src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh | 8 ++-- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu index aefb55f21..a187ed043 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu @@ -22,9 +22,9 @@ CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par) : template CudaKmMatrixImpl::CudaKmMatrixImpl(const thrust::host_vector& _h_vec, KmMatrix* _par) - : _on_device(false), KmMatrixImpl(_par) { - _h_vector.resize(_h_vec.size()); - thrust::copy(_h_vec.begin(), _h_vec.end(), _h_vector.begin()); + : on_device_(false), KmMatrixImpl(_par) { + h_vector_.resize(_h_vec.size()); + thrust::copy(_h_vec.begin(), _h_vec.end(), h_vector_.begin()); } template @@ -32,8 +32,8 @@ CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par, size_t _size) : KmMatrixImpl(_par) { if (_size == 0) return; - _d_vector.resize(_size); - _on_device = true; + d_vector_.resize(_size); + on_device_ = true; } template @@ -56,15 +56,15 @@ CudaKmMatrixImpl::CudaKmMatrixImpl( raw_ptr = _other.dev_ptr(); thrust::device_ptr ptr (raw_ptr); ptr += _start; - _d_vector.resize(_size); - _on_device = true; - thrust::copy(ptr, ptr + _size, _d_vector.begin()); + d_vector_.resize(_size); + on_device_ = true; + thrust::copy(ptr, ptr + _size, d_vector_.begin()); } else { raw_ptr = _other.host_ptr(); raw_ptr += _start; - _h_vector.resize(_size); - _on_device = false; - thrust::copy(raw_ptr, raw_ptr + _size, _h_vector.begin()); + h_vector_.resize(_size); + on_device_ = false; + thrust::copy(raw_ptr, raw_ptr + _size, h_vector_.begin()); } } @@ -74,45 +74,45 @@ CudaKmMatrixImpl::~CudaKmMatrixImpl() {} template T* CudaKmMatrixImpl::host_ptr() { device_to_host(); - return thrust::raw_pointer_cast(_h_vector.data()); + return thrust::raw_pointer_cast(h_vector_.data()); } template T* CudaKmMatrixImpl::dev_ptr() { 
host_to_device(); - T* ptr = thrust::raw_pointer_cast(_d_vector.data()); + T* ptr = thrust::raw_pointer_cast(d_vector_.data()); return ptr; } template void CudaKmMatrixImpl::host_to_device() { - if (_on_device) + if (on_device_) return; - _d_vector.resize(_h_vector.size()); - thrust::copy(_h_vector.begin(), _h_vector.end(), _d_vector.begin()); - _on_device = true; + d_vector_.resize(h_vector_.size()); + thrust::copy(h_vector_.begin(), h_vector_.end(), d_vector_.begin()); + on_device_ = true; } template void CudaKmMatrixImpl::device_to_host() { - if (!_on_device) + if (!on_device_) return; - _h_vector.resize(_d_vector.size()); - thrust::copy(_d_vector.begin(), _d_vector.end(), _h_vector.begin()); - _on_device = false; + h_vector_.resize(d_vector_.size()); + thrust::copy(d_vector_.begin(), d_vector_.end(), h_vector_.begin()); + on_device_ = false; } template bool CudaKmMatrixImpl::on_device() const { - return _on_device; + return on_device_; } template size_t CudaKmMatrixImpl::size() const { - if (_on_device) { - return _d_vector.size(); + if (on_device_) { + return d_vector_.size(); } else { - return _h_vector.size(); + return h_vector_.size(); } } @@ -121,8 +121,8 @@ bool CudaKmMatrixImpl::equal(std::shared_ptr>& _rhs) { // FIXME, Is it floating compatible? 
_rhs->host_to_device(); host_to_device(); - bool res = thrust::equal(_d_vector.begin(), _d_vector.end(), - _rhs->_d_vector.begin()); + bool res = thrust::equal(d_vector_.begin(), d_vector_.end(), + _rhs->d_vector_.begin()); return res; } diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh index 5fb47267f..9fd7c5bee 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh @@ -32,11 +32,11 @@ struct CudaInfo { template class CudaKmMatrixImpl : public KmMatrixImpl { private: - thrust::device_vector _d_vector; - thrust::host_vector _h_vector; + thrust::device_vector d_vector_; + thrust::host_vector h_vector_; - bool _on_device; - KmMatrix* _matrix; + bool on_device_; + KmMatrix* matrix_; void host_to_device(); void device_to_host(); From 27b399feea91d4a7061ea6f870847a28190ca4c9 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 20 Jul 2018 01:18:11 +0800 Subject: [PATCH 13/49] Add stack for KmMatrix, fix impl ownership bug. 
--- src/gpu/kmeans/KmMatrix/KmConfig.h | 7 ++ src/gpu/kmeans/KmMatrix/KmMatrix.cpp | 127 +++++++++++++++-------- src/gpu/kmeans/KmMatrix/KmMatrix.hpp | 58 +++++++---- src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu | 61 +++++++++-- src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh | 19 +++- tests/cpp/gpu/KmMatrix/test_matrix.cu | 48 ++++++++- 6 files changed, 244 insertions(+), 76 deletions(-) diff --git a/src/gpu/kmeans/KmMatrix/KmConfig.h b/src/gpu/kmeans/KmMatrix/KmConfig.h index 06366f448..446401545 100644 --- a/src/gpu/kmeans/KmMatrix/KmConfig.h +++ b/src/gpu/kmeans/KmMatrix/KmConfig.h @@ -3,8 +3,11 @@ #define USE_CUDA() 1 +#include "stdio.h" + // Matrix host dev #define M_HOSTDEV __host__ __device__ +#define M_DEV __device__ #define M_DEVINLINE __device__ __forceinline__ #define M_HOSTDEVINLINE __host__ __device__ __forceinline__ @@ -59,4 +62,8 @@ } \ } while (false) +#define M_ERROR(msg) \ + printf("%s\n\t in %s, %u, %s\n", msg, __FILE__, __LINE__, __func__); \ + abort(); + #endif // KM_CONFIG_H_ diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp index 96f6c0895..7c43cccea 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp @@ -30,12 +30,11 @@ KmMatrix::KmMatrix() : param_ (0, 0, nullptr) { init_impls(); #if USE_CUDA() - use_cuda = true; KmMatrixImpl * ptr = new CudaKmMatrixImpl(this); - impls[0].reset(ptr); + impls[(int)Backend::CUDADense].reset(ptr); + backend_ = Backend::CUDADense; #elif - use_cuda = false; - impls[0] = nullptr; + backend_ = Backend::CPUDense; #endif } @@ -44,11 +43,11 @@ KmMatrix::KmMatrix(size_t _rows, size_t _cols) : param_ (_rows, _cols, nullptr) { init_impls(); #if USE_CUDA() - use_cuda = true; - KmMatrixImpl * ptr = new CudaKmMatrixImpl(this, _rows * _cols); - impls[0].reset(ptr); + KmMatrixImpl * ptr = new CudaKmMatrixImpl(_rows * _cols, this); + impls[(int)Backend::CUDADense].reset(ptr); + backend_ = Backend::CUDADense; #elif - use_cuda = false; + backend_ = 
Backend::CPUDense; #endif } @@ -58,11 +57,11 @@ KmMatrix::KmMatrix(thrust::host_vector _vec, param_ (_rows, _cols, nullptr) { init_impls(); #if USE_CUDA() - use_cuda = true; KmMatrixImpl * ptr = new CudaKmMatrixImpl(_vec, this); - impls[0].reset(ptr); + impls[(int)Backend::CUDADense].reset(ptr); + backend_ = Backend::CUDADense; #elif - use_cuda = false; + backend_ = Backend::CPUDense; #endif } @@ -73,42 +72,46 @@ KmMatrix::KmMatrix(const KmMatrixProxy& _other) : name_ = _other.orgi_.name_ + "(" + std::to_string(_other.start()) + "," + std::to_string(_other.end()) + ")"; #if USE_CUDA() - use_cuda = true; KmMatrixImpl * ptr = new CudaKmMatrixImpl( _other.orgi_, _other.start(), _other.size(), _other.stride(), this); - impls[0].reset(ptr); + impls[(int)Backend::CUDADense].reset(ptr); + backend_ = Backend::CUDADense; #elif - use_cuda = false; + backend_ = Backend::CPUDense; #endif } template KmMatrix::KmMatrix(const KmMatrix& _other) : param_(_other.param_) { - for (size_t i = 0; i < 4; ++i) { - impls[i] = _other.impls[i]; - } - use_cuda = _other.use_cuda; + copy_impls(_other.impls); + backend_ = _other.backend_; name_ = _other.name_ + "(copied)"; } template KmMatrix::KmMatrix(KmMatrix&& _other) : - param_(_other.param_){ + param_(_other.param_) { + copy_impls(_other.impls); + backend_ = _other.backend_; + name_ = _other.name_ + "(copied [in move])"; +} + +template +void KmMatrix::copy_impls(const std::shared_ptr>* _impls) { for (size_t i = 0; i < 4; ++i) { - impls[i] = std::move(_other.impls[i]); + if (_impls[i].get() != nullptr) { + impls[i] = _impls[i]; + impls[i]->set_interface(this); + } } - use_cuda = _other.use_cuda; - name_ = _other.name_ + "(copied [in move])"; } template void KmMatrix::operator=(const KmMatrix& _other) { - for (size_t i = 0; i < 4; ++i) { - impls[i] = _other.impls[i]; - } + copy_impls(_other.impls); param_ = _other.param_; - use_cuda = _other.use_cuda; + backend_ = _other.backend_; name_ = _other.name_ + "(copied)"; } @@ -116,9 +119,11 @@ 
template void KmMatrix::operator=(KmMatrix&& _other) { for (size_t i = 0; i < 4; ++i) { impls[i] = std::move(_other.impls[i]); + if (impls[i] != nullptr) + impls[i]->set_interface(this); } param_ = _other.param_; - use_cuda = _other.use_cuda; + backend_ = _other.backend_; name_ = _other.name_ + "(copied [in move])"; } @@ -158,7 +163,7 @@ kParam KmMatrix::k_param () { template T* KmMatrix::host_ptr() { - if (use_cuda) { + if (backend_ == Backend::CUDADense) { return impls[0]->host_ptr(); } else { // FIXME @@ -168,8 +173,8 @@ T* KmMatrix::host_ptr() { template T* KmMatrix::dev_ptr() { - if (use_cuda) { - return impls[CUDADense]->dev_ptr(); + if (backend_ == Backend::CUDADense) { + return impls[(int)Backend::CUDADense]->dev_ptr(); } else { return nullptr; } @@ -177,8 +182,8 @@ T* KmMatrix::dev_ptr() { template bool KmMatrix::on_device() const { - if (use_cuda) { - return impls[CUDADense]->on_device(); + if (backend_ == Backend::CUDADense) { + return impls[(int)Backend::CUDADense]->on_device(); } else { return false; } @@ -195,29 +200,54 @@ KmMatrixProxy KmMatrix::row(size_t idx, bool dev_mem) { template KmMatrixProxy KmMatrix::col(size_t idx) { - // FIXME - assert (false); + M_ERROR("Not implemented."); return KmMatrixProxy(*this, 0, 0, 0); } template bool KmMatrix::operator==(KmMatrix& _rhs) { - if (_rhs.use_cuda && use_cuda) { - std::shared_ptr> tmp = - std::dynamic_pointer_cast>(_rhs.impls[CUDADense]); - bool res = std::dynamic_pointer_cast>( - impls[CUDADense])->equal(tmp); + if (_rhs.backend_ == Backend::CUDADense && backend_ == Backend::CUDADense) { + // std::shared_ptr> tmp = + // std::dynamic_pointer_cast>( + // _rhs.impls[(int)Backend::CUDADense]); + bool res = impls[(int)Backend::CUDADense]->equal(_rhs); + // bool res = std::dynamic_pointer_cast>( + // impls[(int)Backend::CUDADense])->equal(*tmp); return res; } else { - // FIXME - assert(false); + M_ERROR("Not implemented."); return false; } } +template +KmMatrix KmMatrix::stack(KmMatrix &_second, + 
KmMatrixDim _dim) { + KmMatrix res; + + if (_dim == KmMatrixDim::ROW) { + if (cols() != _second.cols()) { + M_ERROR("Columns of first is not equal to second."); + } + + if (backend_ == Backend::CUDADense) { + res = impls[(int)Backend::CUDADense]->stack(_second, _dim); + } else { + M_ERROR("Not implemented."); + } + + } else { + M_ERROR("Not implemented."); + } + + return res; +} + + // ============================== // Helper functions // ============================== + template std::ostream& operator<<(std::ostream& os, KmMatrix& m) { std::cout << "matrix: " << m.name() << std::endl << "---" << std::endl; @@ -233,6 +263,12 @@ std::ostream& operator<<(std::ostream& os, KmMatrix& m) { return os; } +template +KmMatrix stack(KmMatrix& _first, KmMatrix& _second, + KmMatrixDim _dim) { + return _first.stack(_second, _dim); +} + #define INSTANTIATE(T) \ /* Standard con(de)structors*/ \ template KmMatrixImpl::KmMatrixImpl(KmMatrix *_matrix); \ @@ -242,6 +278,8 @@ std::ostream& operator<<(std::ostream& os, KmMatrix& m) { size_t _rows, size_t _cols); \ template KmMatrix::KmMatrix(const KmMatrix& _other); \ template KmMatrix::KmMatrix(KmMatrix&& _other); \ + template void KmMatrix::copy_impls( \ + const std::shared_ptr>* _impls); \ template void KmMatrix::operator=(const KmMatrix& _other); \ template void KmMatrix::operator=(KmMatrix&& _other); \ template KmMatrix::KmMatrix(const KmMatrixProxy& _other); \ @@ -256,8 +294,13 @@ std::ostream& operator<<(std::ostream& os, KmMatrix& m) { template bool KmMatrix::on_device() const; \ template KmMatrixProxy KmMatrix::row(size_t idx, bool dev_mem=true); \ template bool KmMatrix::operator==(KmMatrix &_rhs); \ + template KmMatrix KmMatrix::stack(KmMatrix &_second, \ + H2O4GPU::KMeans::KmMatrixDim _dim); \ /* Helper functions */ \ - template std::ostream& operator<<(std::ostream& os, KmMatrix& m); + template std::ostream& operator<<(std::ostream& os, KmMatrix& m); \ + template KmMatrix stack(KmMatrix& _first, KmMatrix& _second, \ + 
KmMatrixDim _dim); + INSTANTIATE(float) INSTANTIATE(double) diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp index 7b1c99e3a..f00a07a0b 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp @@ -6,7 +6,7 @@ #ifndef KM_MATRIX_HPP_ #define KM_MATRIX_HPP_ -#include +#include #include #include #include @@ -27,6 +27,18 @@ class KmMatrixProxy; template class KmMatrix; +enum class Backend { + CUDADense = 0, + CUDASparse = 1, + CPUDense = 2, + CPUSparse = 3 +}; + +enum class KmMatrixDim { + ROW, + COL +}; + // Kernel parameter template struct kParam { @@ -52,11 +64,6 @@ struct kParam { } }; -enum class MatrixDim { - ROW, - COL -}; - template class KmMatrixImpl { protected: @@ -65,32 +72,36 @@ class KmMatrixImpl { KmMatrixImpl(KmMatrix *_matrix); virtual ~KmMatrixImpl () {} - virtual T* host_ptr() {} - virtual T* dev_ptr() {} - virtual size_t size() const {} - virtual bool on_device() const {} + // FIXME + // Used in KmMatrix constructors to deal with temp return value. + // Maybe better solution. 
+ virtual void set_interface(KmMatrix* _par) = 0; + + virtual T* host_ptr() = 0; + virtual T* dev_ptr() = 0; + virtual size_t size() const = 0; + virtual bool on_device() const = 0; + + virtual KmMatrix stack(KmMatrix&, KmMatrixDim _dim) = 0; + virtual bool equal(KmMatrix& _val) = 0; + + friend KmMatrix; }; template class KmMatrix { private: - bool use_cuda; - - enum Backend { - CUDADense = 0, - CUDASparse = 1, - CPUDense = 2, - CPUSparse = 3 - }; + Backend backend_; std::shared_ptr> impls[4]; kParam param_; - void init_impls(); - std::string name_; + void init_impls(); + void copy_impls(const std::shared_ptr>* _impls); + public: explicit KmMatrix(); KmMatrix(size_t _rows, size_t _cols); @@ -123,8 +134,15 @@ class KmMatrix { KmMatrixProxy row(size_t idx, bool dev_mem=true); KmMatrixProxy col(size_t idx); + + KmMatrix stack(KmMatrix& _second, KmMatrixDim _dim); }; +template +KmMatrix stack(KmMatrix& _first, + KmMatrix& _second, + KmMatrixDim _dim); + template std::ostream& operator<<(std::ostream& os, KmMatrix& m); diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu index a187ed043..a27167734 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu @@ -9,8 +9,9 @@ #include -#include "KmMatrix.hpp" #include "KmMatrixCuda.cuh" +#include "KmMatrix.hpp" +#include "backend.hpp" namespace H2O4GPU { namespace KMeans { @@ -28,7 +29,7 @@ CudaKmMatrixImpl::CudaKmMatrixImpl(const thrust::host_vector& _h_vec, } template -CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par, size_t _size) : +CudaKmMatrixImpl::CudaKmMatrixImpl(size_t _size, KmMatrix * _par) : KmMatrixImpl(_par) { if (_size == 0) return; @@ -71,6 +72,11 @@ CudaKmMatrixImpl::CudaKmMatrixImpl( template CudaKmMatrixImpl::~CudaKmMatrixImpl() {} +template +void CudaKmMatrixImpl::set_interface(KmMatrix* _par) { + KmMatrixImpl::matrix_ = _par; +} + template T* CudaKmMatrixImpl::host_ptr() { device_to_host(); @@ -117,15 +123,48 @@ size_t 
CudaKmMatrixImpl::size() const { } template -bool CudaKmMatrixImpl::equal(std::shared_ptr>& _rhs) { - // FIXME, Is it floating compatible? - _rhs->host_to_device(); +bool CudaKmMatrixImpl::equal(KmMatrix& _rhs) { + T* rhs_raw_ptr = _rhs.dev_ptr(); host_to_device(); + thrust::device_ptr rhs_ptr (rhs_raw_ptr); + // FIXME, Is it floating compatible? bool res = thrust::equal(d_vector_.begin(), d_vector_.end(), - _rhs->d_vector_.begin()); + rhs_ptr); return res; } +template +KmMatrix CudaKmMatrixImpl::stack(KmMatrix& _second, + KmMatrixDim _dim) { + if (_dim == KmMatrixDim::ROW) { + if (KmMatrixImpl::matrix_->cols() != _second.cols()) { + M_ERROR("Columns of first is not equal to second."); + } + host_to_device(); + + T * sec_raw_ptr = _second.dev_ptr(); + thrust::device_ptr self_ptr = d_vector_.data(); + + thrust::device_ptr sec_ptr (sec_raw_ptr); + + KmMatrix res (KmMatrixImpl::matrix_->rows() + _second.rows(), + KmMatrixImpl::matrix_->cols()); + + T * res_raw_ptr = res.dev_ptr(); + thrust::device_ptr res_ptr (res_raw_ptr); + + thrust::copy(self_ptr, self_ptr + size(), res_ptr); + res_ptr = thrust::device_ptr(res_raw_ptr) + size(); + thrust::copy(sec_ptr, sec_ptr + _second.size(), res_ptr); + + return res; + } else { + // FIXME + M_ERROR("Not implemented."); + } +} + + #define INSTANTIATE(T) \ /* Standard con(de)structors*/ \ template CudaKmMatrixImpl::CudaKmMatrixImpl( \ @@ -134,9 +173,10 @@ bool CudaKmMatrixImpl::equal(std::shared_ptr>& _rhs) { template CudaKmMatrixImpl::CudaKmMatrixImpl( \ const thrust::host_vector& _h_vec, KmMatrix* _par); \ template CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par); \ - template CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par, \ - size_t _size); \ + template CudaKmMatrixImpl::CudaKmMatrixImpl(size_t _size, \ + KmMatrix * _par); \ template CudaKmMatrixImpl::~CudaKmMatrixImpl(); \ + template void CudaKmMatrixImpl::set_interface(KmMatrix* _par); \ /* Member functions */ \ template bool CudaKmMatrixImpl::on_device() const; \ 
template void CudaKmMatrixImpl::device_to_host(); \ @@ -144,8 +184,9 @@ bool CudaKmMatrixImpl::equal(std::shared_ptr>& _rhs) { template T* CudaKmMatrixImpl::dev_ptr(); \ template T* CudaKmMatrixImpl::host_ptr(); \ template size_t CudaKmMatrixImpl::size() const; \ - template bool CudaKmMatrixImpl::equal( \ - std::shared_ptr>& _rhs); + template bool CudaKmMatrixImpl::equal(KmMatrix& _rhs); \ + template KmMatrix CudaKmMatrixImpl::stack(KmMatrix& _second, \ + KmMatrixDim _dim); INSTANTIATE(float) INSTANTIATE(double) diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh index 9fd7c5bee..73f0c50b4 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh @@ -2,6 +2,7 @@ #define KM_MATRIX_CUDA_CUH_ #include "KmMatrix.hpp" +#include "thrust/device_vector.h"; #include namespace H2O4GPU { @@ -44,20 +45,34 @@ class CudaKmMatrixImpl : public KmMatrixImpl { public: CudaKmMatrixImpl(KmMatrix * _par); CudaKmMatrixImpl(const thrust::host_vector& _h_vec, KmMatrix* _par); - CudaKmMatrixImpl(KmMatrix * _par, size_t _size); + CudaKmMatrixImpl(size_t _size, KmMatrix * _par); CudaKmMatrixImpl(KmMatrix& _other, size_t _start, size_t _size, size_t _stride, KmMatrix * _par); + + CudaKmMatrixImpl(const CudaKmMatrixImpl&) = delete; + CudaKmMatrixImpl(CudaKmMatrixImpl&&) = delete; + virtual ~CudaKmMatrixImpl(); + virtual void set_interface(KmMatrix* _par) override; + + void operator=(const CudaKmMatrixImpl&) = delete; + void operator=(CudaKmMatrixImpl&&) = delete; + + KmMatrix stack(KmMatrix& _second, KmMatrixDim _dim) override; + virtual T* host_ptr() override; virtual T* dev_ptr() override; virtual size_t size() const override; - virtual bool equal(std::shared_ptr>& _rhs); + // virtual bool equal(std::shared_ptr>& _rhs); + virtual bool equal(KmMatrix& _rhs); virtual bool on_device() const override; + + friend KmMatrix; }; } // MkMatrix diff --git a/tests/cpp/gpu/KmMatrix/test_matrix.cu 
b/tests/cpp/gpu/KmMatrix/test_matrix.cu index 1665a4fba..4491ab5f7 100644 --- a/tests/cpp/gpu/KmMatrix/test_matrix.cu +++ b/tests/cpp/gpu/KmMatrix/test_matrix.cu @@ -3,7 +3,7 @@ #include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp" -// r --gtest_filter=KmMatrix.KmMatrix +// r --gtest_filter=KmMatrix.KmMatrixEqual TEST(KmMatrix, KmMatrixEqual) { thrust::host_vector vec (2048 * 1024); for (size_t i = 0; i < 2048 * 1024; ++i) { @@ -56,4 +56,48 @@ TEST(KmMatrix, KmMatrixKparam) { ASSERT_EQ(param.ptr, mat.dev_ptr()); ASSERT_EQ(param.rows, 12); ASSERT_EQ(param.cols, 16); -} \ No newline at end of file +} + +TEST(KmMatrix, KmMatrixCycle) { + thrust::host_vector vec (2048 * 1024); + for (size_t i = 0; i < 2048 * 1024; ++i) { + vec[i] = i; + } + H2O4GPU::KMeans::KmMatrix mat0 (vec, 2048, 1024); + for (size_t i = 0; i < 1000; ++i) { + H2O4GPU::KMeans::KmMatrix mat1 = mat0; + H2O4GPU::KMeans::KmMatrix mat2 = mat1; + mat0 = mat2; + } +} + +// r --gtest_filter=KmMatrix.Stack +TEST(KmMatrix, Stack) { + constexpr size_t rows = 16, cols = 16; + thrust::host_vector vec (rows * cols); + for (size_t i = 0; i < rows * cols; ++i) { + vec[i] = i; + } + H2O4GPU::KMeans::KmMatrix mat(vec, rows, cols); + + thrust::host_vector vec1 (rows * cols); + for (size_t i = rows * cols; i < 2 * rows * cols; ++i) { + vec1[i - rows * cols] = i; + } + H2O4GPU::KMeans::KmMatrix mat1(vec1, rows, cols); + + H2O4GPU::KMeans::KmMatrix calculated = + H2O4GPU::KMeans::stack(mat, mat1, H2O4GPU::KMeans::KmMatrixDim::ROW); + + thrust::host_vector res (2 * rows * cols); + for (size_t i = 0; i < rows * cols; ++i) { + res[i] = i; + } + for (size_t i = rows * cols; i < 2 * rows * cols; ++i) { + res[i] = i; + } + + H2O4GPU::KMeans::KmMatrix res_mat (res, 2 * rows, cols); + + ASSERT_TRUE(calculated == res_mat); +} From 70d0a1053bba114b6992b8e9d7672d376bf47505 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 20 Jul 2018 01:28:07 +0800 Subject: [PATCH 14/49] Add license. 
--- src/gpu/kmeans/KmMatrix/GpuInfo.cuh | 5 +++++ src/gpu/kmeans/KmMatrix/KmConfig.h | 8 +++++++- src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu | 1 - src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh | 7 +++++++ src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp | 5 +++++ src/gpu/kmeans/KmMatrix/blas.cuh | 5 +++++ src/gpu/kmeans/KmMatrix/utils.cuh | 5 +++++ 7 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh index 14c717d49..559a52c2d 100644 --- a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh +++ b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh @@ -1,3 +1,8 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + #ifndef GPU_INFO_HPP_ #define GPU_INFO_HPP_ diff --git a/src/gpu/kmeans/KmMatrix/KmConfig.h b/src/gpu/kmeans/KmMatrix/KmConfig.h index 446401545..dd0c43367 100644 --- a/src/gpu/kmeans/KmMatrix/KmConfig.h +++ b/src/gpu/kmeans/KmMatrix/KmConfig.h @@ -1,3 +1,8 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + #ifndef KM_CONFIG_H_ #define KM_CONFIG_H_ @@ -62,8 +67,9 @@ } \ } while (false) -#define M_ERROR(msg) \ +#define M_ERROR(msg) \ printf("%s\n\t in %s, %u, %s\n", msg, __FILE__, __LINE__, __func__); \ abort(); + #endif // KM_CONFIG_H_ diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu index a27167734..7eaa55c1d 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu @@ -11,7 +11,6 @@ #include "KmMatrixCuda.cuh" #include "KmMatrix.hpp" -#include "backend.hpp" namespace H2O4GPU { namespace KMeans { diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh index 73f0c50b4..95e9b27a5 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh @@ -1,3 +1,8 @@ +/*! + * Copyright 2018 H2O.ai, Inc. 
+ * License Apache License Version 2.0 (see LICENSE for details) + */ + #ifndef KM_MATRIX_CUDA_CUH_ #define KM_MATRIX_CUDA_CUH_ @@ -17,6 +22,8 @@ class KmMatrixImpl; template class KmMatrixProxy; +enum class KmMatrixDim; + struct CudaInfo { int n_devices; int * _devices; diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp b/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp index 9cc2f1e29..d7f70e82f 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp @@ -1,3 +1,8 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + #include "KmMatrix.hpp" namespace H2O4GPU { diff --git a/src/gpu/kmeans/KmMatrix/blas.cuh b/src/gpu/kmeans/KmMatrix/blas.cuh index 90c5e323f..619fca756 100644 --- a/src/gpu/kmeans/KmMatrix/blas.cuh +++ b/src/gpu/kmeans/KmMatrix/blas.cuh @@ -1,3 +1,8 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + #ifndef KM_BLAS_CUH_ #define KM_BLAS_CUH_ diff --git a/src/gpu/kmeans/KmMatrix/utils.cuh b/src/gpu/kmeans/KmMatrix/utils.cuh index 59059d30d..8dfbd6c53 100644 --- a/src/gpu/kmeans/KmMatrix/utils.cuh +++ b/src/gpu/kmeans/KmMatrix/utils.cuh @@ -1,3 +1,8 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + #ifndef UTILS_CUH_ #define UTILS_CUH_ From 8f65ea5808b5c4ed2d4c2de38eca804dd61c8b93 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 20 Jul 2018 16:58:29 +0800 Subject: [PATCH 15/49] Construct kmeans|| based on KmMatrix. Builds basic kmeans|| framework on top of KmMatrix. The algorithm is not working yet. 
--- src/gpu/kmeans/KmMatrix/GpuInfo.cuh | 3 +- src/gpu/kmeans/KmMatrix/KmMatrix.cpp | 2 +- src/gpu/kmeans/KmMatrix/blas.cuh | 117 +++++- src/gpu/kmeans/KmMatrix/utils.cuh | 18 +- src/gpu/kmeans/kmeans_init.cu | 528 +++++++++++++++++--------- src/gpu/kmeans/kmeans_init.cuh | 164 ++++---- tests/cpp/gpu/KmMatrix/test_matrix.cu | 14 +- 7 files changed, 553 insertions(+), 293 deletions(-) diff --git a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh index 559a52c2d..789cf08b5 100644 --- a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh +++ b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh @@ -13,6 +13,7 @@ #include #include +// Singleton class storing gpu info. class GpuInfo { private: int n_gpu_; @@ -67,6 +68,4 @@ class GpuInfo { }; -// const GpuInfoImpl GpuInfo::impl = GpuInfoImpl(); - #endif // GPU_INFO_HPP_ diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp index 7c43cccea..9fe8008b0 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp @@ -250,7 +250,7 @@ KmMatrix KmMatrix::stack(KmMatrix &_second, template std::ostream& operator<<(std::ostream& os, KmMatrix& m) { - std::cout << "matrix: " << m.name() << std::endl << "---" << std::endl; + std::cout << "\nmatrix: " << m.name() << std::endl << "---" << std::endl; T * ptr = m.host_ptr(); kParam param = m.k_param(); for (size_t i = 0; i < param.rows; ++i) { diff --git a/src/gpu/kmeans/KmMatrix/blas.cuh b/src/gpu/kmeans/KmMatrix/blas.cuh index 619fca756..fcbcd6339 100644 --- a/src/gpu/kmeans/KmMatrix/blas.cuh +++ b/src/gpu/kmeans/KmMatrix/blas.cuh @@ -17,9 +17,9 @@ namespace KMeans { namespace Blas { // LEVEL 1 inline void axpy(cublasHandle_t handle, int n, - const float *alpha, - const float *x, int incx, - float *y, int incy) { + const float *alpha, + const float *x, int incx, + float *y, int incy) { CUBLAS_CHECK(cublasSaxpy(handle, n, alpha, x, incx, @@ -50,19 +50,13 @@ inline void gemm(cublasHandle_t handle, float *C, int ldc) { 
CUBLAS_CHECK(cublasSgemm(handle, - transa, - transb, - m, - n, - k, + transa, transb, + m, n, k, alpha, /* host or device pointer */ - A, - lda, - B, - ldb, + A, lda, + B, ldb, beta, /* host or device pointer */ - C, - ldc));} + C, ldc));} inline void gemm(cublasHandle_t handle, cublasOperation_t transa, @@ -93,8 +87,101 @@ inline void gemm(cublasHandle_t handle, C, ldc));} -} // Blas +inline void gemm_batched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, int k, + const double *alpha, + const double *Aarray[], int lda, + const double *Barray[], int ldb, + const double *beta, + double *Carray[], int ldc, + int batchCount) { + CUBLAS_CHECK(cublasDgemmBatched(handle, + transa, + transb, + m, n, k, + alpha, + Aarray, lda, + Barray, ldb, + beta, + Carray, ldc, + batchCount)); +} +inline void gemm_batched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, int k, + const float *alpha, + const float *Aarray[], int lda, + const float *Barray[], int ldb, + const float *beta, + float *Carray[], int ldc, + int batchCount) { + CUBLAS_CHECK(cublasSgemmBatched(handle, + transa, + transb, + m, n, k, + alpha, + Aarray, lda, + Barray, ldb, + beta, + Carray, ldc, + batchCount)); +} + +inline void gemm_strided_batched( + cublasHandle_t handle, + cublasOperation_t transA, cublasOperation_t transB, + int M, int N, int K, + const double* alpha, + const double* A, int ldA, int strideA, + const double* B, int ldB, int strideB, + const double* beta, + double* C, int ldC, int strideC, + int batchCount) { + CUBLAS_CHECK(cublasDgemmStridedBatched(handle, + transA, + transB, + M, N, K, + alpha, + A, ldA, + strideA, + B, ldB, + strideB, + beta, + C, ldC, + strideC, + batchCount)); +} + +inline void gemm_strided_batched( + cublasHandle_t handle, + cublasOperation_t transA, cublasOperation_t transB, + int M, int N, int K, + const float* alpha, + const float* A, int ldA, int strideA, + const float* B, int 
ldB, int strideB, + const float* beta, + float* C, int ldC, int strideC, + int batchCount) { + CUBLAS_CHECK(cublasSgemmStridedBatched(handle, + transA, + transB, + M, N, K, + alpha, + A, ldA, + strideA, + B, ldB, + strideB, + beta, + C, ldC, + strideC, + batchCount)); +} + +} // Blas } // KMeans } // H2O4GPU diff --git a/src/gpu/kmeans/KmMatrix/utils.cuh b/src/gpu/kmeans/KmMatrix/utils.cuh index 8dfbd6c53..296ab08b2 100644 --- a/src/gpu/kmeans/KmMatrix/utils.cuh +++ b/src/gpu/kmeans/KmMatrix/utils.cuh @@ -15,15 +15,21 @@ M_DEVINLINE size_t global_thread_idx () { return threadIdx.x + blockIdx.x * blockDim.x; } -M_DEVINLINE size_t grid_stride () { +M_DEVINLINE size_t global_thread_idy () { + return threadIdx.y + blockIdx.y * blockDim.y; +} + +M_DEVINLINE size_t grid_stride_x () { return blockDim.x * gridDim.x; } -// This wrapper function is created to work around a possible bug in nvcc, -// which threats GpuInfo::ins() as calling base class method when used inside a -// class member function. 
-size_t get_blocks(size_t _mul, int _device=0) { - return GpuInfo::ins().blocks(_mul, _device); +M_DEVINLINE size_t grid_stride_y () { + return blockDim.y * gridDim.y; +} + +template +T1 M_HOSTDEVINLINE div_roundup(const T1 a, const T2 b) { + return static_cast(ceil(static_cast(a) / b)); } } // KMeans diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 4535726d1..150ee3f87 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -5,269 +5,417 @@ #include #include -#include -#define EIGNE_USE_GPU -#include "Eigen/Dense" +#include -#include +#include +#include +#include -#include "kmeans_general.h" -#include "kmeans_h2o4gpu.h" +#include #include "kmeans_init.cuh" + #include "KmMatrix/KmMatrix.hpp" +#include "KmMatrix/utils.cuh" +#include "KmMatrix/GpuInfo.cuh" +#include "KmMatrix/blas.cuh" + namespace H2O4GPU { namespace KMeans { -template -__device__ __forceinline__ -T min_distance(VE_T(T) *x, MA_T(T) *centroids) { +namespace kernel { - KmShardMem shared; - T * _distances = shared.ptr(); +__global__ void setup_random_states(curandState *state, size_t size) +{ + int id = threadIdx.x + blockIdx.x * threadIdx.x; + /* Each thread gets same seed, a different sequence + number, no offset */ + if (id < size) + curand_init(1234, id, 0, &state[id]); +} - size_t n_rows = centroids->rows(); - for (size_t i = 0; i < centroids->rows(); ++i) { - auto temp = *x - centroids->row(i); - _distances[i] = temp.dot(temp); - } +__global__ void generate_uniform_kernel(float *_res, + curandState *_state, + int _size) +{ + int idx = threadIdx.x + blockIdx.x * threadIdx.x; + if (idx < _size) { + float x; + curandState localState = _state[idx]; + x = curand_uniform(&localState); + _state[idx] = localState; + _res[idx] = x; + } +} + +__global__ void generate_uniform_kernel(double *_res, + curandState *_state, + int _size) +{ + int idx = threadIdx.x + blockIdx.x * threadIdx.x; + if (idx < _size) { + double x; + curandState localState = 
_state[idx]; + x = curand_uniform_double(&localState); + _state[idx] = localState; + _res[idx] = x; + } +} + +/* + * @tparam T Numeric type of the data + * @param _res The output matrix with shape m x 1 + * @param _val The input matrix with shape m x n + */ +template +__global__ void col_min_sequential(kParam _res, kParam _val) { + + size_t idx = global_thread_idx(); + size_t stride = grid_stride_x () * _val.cols; - __syncthreads(); + size_t n_batches = div_roundup(_val.cols, 128); - Eigen::Map _distances_vec(_distances, n_rows, 1); - T result = _distances_vec.minCoeff(); - return result; + for (size_t i = idx; i < _val.size(); i += stride) { + T min = std::numeric_limits::max(); + + for (size_t j = 0; j < _val.cols; ++j) { + T tmp = _val.ptr[i+j]; + if (tmp < min) + min = tmp; + _res.ptr[idx] = tmp; + } + } } +} // namespace kernel + + template -__global__ -void potential_kernel(kVParam _dis, kMParam _data, kMParam _cent) { +struct DotOp { + void dot(KmMatrix& _res, KmMatrix& _val) { + this->dot(_res, _val, _val); + } + void dot(KmMatrix& _res, KmMatrix& _lhs, + KmMatrix& _rhs) { + constexpr T alpha = 1.0; + constexpr T beta = 1.0; + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + Blas::gemm(handle, + CUBLAS_OP_T, CUBLAS_OP_N, // FIXME + _lhs.rows(), _rhs.cols(), _lhs.cols(), + &alpha, + _lhs.dev_ptr(), _lhs.cols(), + _rhs.dev_ptr(), _rhs.cols(), + &beta, + _res.dev_ptr(), _res.cols()); + } +}; - MA_T(T) data = Eigen::Map(_data.ptr, _data.rows, _data.cols); - MA_T(T) centroids = Eigen::Map(_cent.ptr, _cent.rows, _cent.cols); +template +struct VecBatchDotOp { + void dot(KmMatrix& _res, KmMatrix& _val) { + this->dot(_res, _val, _val); + } + void dot(KmMatrix& _res, KmMatrix& _lhs, KmMatrix& _rhs) { + constexpr T alpha = 1.0; + constexpr T beta = 1.0; + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + Blas::gemm_strided_batched( + handle, + // k-means use row major, so transpose the second vector. 
+ CUBLAS_OP_N, CUBLAS_OP_T, + 1, 1, _rhs.cols(), // m, n, k + &alpha, + _lhs.dev_ptr(), 1, _lhs.cols(), + _rhs.dev_ptr(), 1, _rhs.cols(), + &beta, + _res.dev_ptr(), _res.cols(), 1, // c should be columun vector + _lhs.rows()); + } +}; + +// FIXME: Using struct for operations is just keeping the possibility to create +// some unified operations for KmMatrix. For example, let KmMatrix +// inherit those left associative ops, or create a inferface for elementwise +// operations. +template +struct SumOp { + T sum(KmMatrix& _val) { + T* raw_ptr = _val.dev_ptr(); + thrust::device_ptr ptr (raw_ptr); + T res = thrust::reduce(ptr, ptr + _val.size(), (T)0, thrust::plus()); + return res; + } +}; + +template +struct MeanOp { + T mean(KmMatrix& _val) { + T res = SumOp().sum(_val); + return res; + } +}; - Eigen::Map distances(_dis.ptr, _dis.size); +template +struct MulOp { + void mul(KmMatrix& _res, KmMatrix& _lhs, T _rhs) { + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + Blas::axpy( + handle, _lhs.size(), // handle, n + &_rhs, // alpha + _lhs.dev_ptr(), 1, + _res.dev_ptr(), 1); + } +}; - size_t tid = threadIdx.x + blockIdx.x * blockDim.x; +template +struct MinOp { + + void min(KmMatrix& _res, KmMatrix& _val, KmMatrixDim _dim) { + size_t blocks = GpuInfo::ins().blocks(32); + if (_dim == KmMatrixDim::COL) { + kernel::col_min_sequential<<>>( + _res.k_param(), _val.k_param()); + } else { + // FIXME + M_ERROR("Not implemented"); + } + } +}; - if (tid < _dis.size) { - distances(tid) = min_distance(&( (VE_T(T)) data.row(tid)), - &centroids); - printf("distance[%u] %f\n", tid, distances(tid)); +namespace kernel { +// X^2 + Y^2 +template +__global__ void construct_distance_pairs_kernel( + kParam _distance_pairs, + kParam _data_dots, kParam _centroids_dots) { + + size_t idx = global_thread_idx(); // indexing data + size_t idy = global_thread_idy(); // indexing centroids + + size_t stride_x = grid_stride_x () * _data_dots.cols; + // strides only for data.
+ for (size_t i = idx; i < _data_dots.rows; i += stride_x) { + if (i < _data_dots.rows && idy < _centroids_dots.rows ) { + // i + idy: x^2 + y^2 between i^th data (a.k.a x) and idy^th + // centroid (a.k.a y) + _distance_pairs.ptr[i + idy] = + _data_dots.ptr[idx] + _centroids_dots.ptr[idy]; + } } } +} template -T KmeansLlInit::potential(MA_T(T)& data, MA_T(T)& centroids) { +struct PairWiseDistanceOp { + KmMatrix data_dot_; + KmMatrix centroids_dot_; + KmMatrix distance_pairs_; - VE_T(T) distances (data.rows()); + bool initialized_; - T* d_distances, * d_data, *d_centroids; + void initialize(size_t _n_data, size_t k, size_t _dim) { + // FIXME + } - CUDACHECK(cudaMalloc((void**)&d_distances, sizeof(T) * distances.size())); - CUDACHECK(cudaMalloc((void**)&d_data, sizeof(T) * data.size())); - CUDACHECK(cudaMalloc((void**)&d_centroids, sizeof(T) * centroids.size())); + PairWiseDistanceOp () : initialized_(false) {} - CUDACHECK(cudaMemcpy(d_distances, (void*)distances.data(), - sizeof(T) * distances.size(), - cudaMemcpyHostToDevice)); - CUDACHECK(cudaMemcpy(d_data, (void*)data.data(), - sizeof(T) * data.size(), - cudaMemcpyHostToDevice)); - CUDACHECK(cudaMemcpy(d_centroids, (void*)centroids.data(), - sizeof(T) * centroids.size(), - cudaMemcpyHostToDevice)); + PairWiseDistanceOp (KmMatrix& _data_dot, KmMatrix& _centroids_dot, + KmMatrix& _distance_pairs) : + data_dot_(_data_dot), centroids_dot_(_centroids_dot), + distance_pairs_(_distance_pairs), initialized_(true) { + data_dot_.set_name ("data dot"); + centroids_dot_.set_name ("centroids_dot"); + distance_pairs_.set_name ("distance pairs"); + } + + KmMatrix operator()(KmMatrix& _data, KmMatrix& _centroids) { + + kernel::construct_distance_pairs_kernel<<< + dim3(GpuInfo::ins().blocks(32), div_roundup(_centroids.rows(), 16)), + dim3(16, 16)>>>( + distance_pairs_.k_param(), + data_dot_.k_param(), + centroids_dot_.k_param()); + + CUDA_CHECK(cudaGetLastError()); + std::cout << std::endl; + std::cout << "in distance op" << 
std::endl; + std::cout << distance_pairs_ << std::endl; + + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + + T alpha = -2.0; + T beta = 1.0; + std::cout << "data.shape: " << _data.rows() << ", " << _data.cols() << + "\tcentroids.shape: " << _centroids.rows() << ", " << _centroids.cols() << + "\tdp.shape: " << distance_pairs_.rows() << ", " << distance_pairs_.cols() << + std::endl; + + std::cout << _data << std::endl; + std::cout << _centroids << std::endl; + + std::cout << _centroids.dev_ptr() << std::endl; + + Blas::gemm( + handle, + CUBLAS_OP_T, CUBLAS_OP_N, + // n, d, d/k + _data.rows(), _data.cols(), _data.cols(), + &alpha, + _data.dev_ptr(), _data.rows(), + _centroids.dev_ptr(), _centroids.cols(), + &beta, + distance_pairs_.dev_ptr(), distance_pairs_.rows()); + + std::cout << distance_pairs_ << std::endl; + std::cout << "return" << std::endl; + return distance_pairs_; + } +}; - potential_kernel<<<256, div_roundup(data.rows(), 256), - sizeof(T)*centroids.rows()>>>( - kVParam(d_distances, distances.size()), - kMParam(d_data, data.rows(), data.cols()), - kMParam(d_centroids, centroids.rows(), centroids.cols())); +template +KmMatrix KmeansLlInit::probability( + KmMatrix& _data, KmMatrix& _centroids) { - CUDACHECK(cudaDeviceSynchronize()); + _centroids.set_name ("centroids"); - thrust::device_ptr distances_vec (d_distances); + KmMatrix centroids_dot (_centroids.rows(), 1); + centroids_dot.set_name ("centroids_dot"); - T * temp = new T[distances.size()]; - CUDACHECK(cudaMemcpy(temp, d_distances, sizeof(T)*distances.size(), cudaMemcpyDeviceToHost)); + VecBatchDotOp().dot(centroids_dot, _centroids); - T res = thrust::reduce(distances_vec, distances_vec + distances.size(), (T)0, - thrust::plus()); + std::cout << data_dot_ << centroids_dot << std::endl; - CUDACHECK(cudaFree(d_distances)); - CUDACHECK(cudaFree(d_data)); + // FIXME: Time this + distance_pairs_ = KmMatrix(_data.rows(), _centroids.rows()); + PairWiseDistanceOp distance_op (data_dot_, 
centroids_dot, distance_pairs_); + distance_pairs_ = distance_op(_data, _centroids); - CUDACHECK(cudaFree(d_centroids)); + KmMatrix min_distances (_data.rows(), 1); + min_distances.set_name ("min distances"); - CUDACHECK(cudaGetLastError()); + MinOp().min(min_distances, distance_pairs_, KmMatrixDim::COL); - return res; -} + CUDA_CHECK(cudaGetLastError()); -template -T KmeansLlInit::probability(MA_T(T)& data, MA_T(T)& controids) { + T cost = SumOp().sum(min_distances); + + // Re-use min_distances to store prob + MulOp mul_op; + mul_op.mul(min_distances, min_distances, 1 / cost * over_sample_ * k_); + return min_distances; } + template -struct InplaceMulOp { - T a; - InplaceMulOp(T _a) : a(_a) {} +KmMatrix KmeansLlInit::sample_centroids(KmMatrix& _data, KmMatrix& _prob) { - __host__ __device__ - void operator()(T x) { - // *x = *x * a; - } -}; + KmMatrix distances (1, _data.rows()); -template -MA_T(T) KmeansLlInit::sample_centroids(MA_T(T)& data, MA_T(T)& centroids) { - VE_T(T) distances (data.rows()); - - T* d_distances, * d_data, *d_centroids; - - CUDACHECK(cudaMalloc((void**)&d_distances, sizeof(T) * distances.size())); - CUDACHECK(cudaMalloc((void**)&d_data, sizeof(T) * data.size())); - CUDACHECK(cudaMalloc((void**)&d_centroids, sizeof(T) * centroids.size())); - - CUDACHECK(cudaMemcpy(d_distances, (void*)distances.data(), - sizeof(T) * distances.size(), - cudaMemcpyHostToDevice)); - CUDACHECK(cudaMemcpy(d_data, (void*)data.data(), - sizeof(T) * data.size(), - cudaMemcpyHostToDevice)); - CUDACHECK(cudaMemcpy(d_centroids, (void*)centroids.data(), - sizeof(T) * centroids.size(), - cudaMemcpyHostToDevice)); - - potential_kernel<<<256, div_roundup(data.rows(), 256), - sizeof(T)*centroids.rows()>>>( - kVParam(d_distances, distances.size()), - kMParam(d_data, data.rows(), data.cols()), - kMParam(d_centroids, centroids.rows(), centroids.cols())); - - CUDACHECK(cudaDeviceSynchronize()); - - thrust::device_ptr distances_vec (d_distances); - - // T * temp = new 
T[distances.size()]; - // CUDACHECK(cudaMemcpy(temp, d_distances, sizeof(T)*distances.size(), cudaMemcpyDeviceToHost)); - - T pot = thrust::reduce(distances_vec, distances_vec + distances.size(), (T)0, - thrust::plus()); - - thrust::device_ptr& prob_vec = distances_vec; - thrust::for_each(prob_vec, prob_vec + distances.size(), InplaceMulOp(1/pot)); - - CUDACHECK(cudaDeviceSynchronize()); - - size_t _cols = data.cols(); - size_t _rows = data.rows(); - - std::cout << "distances.size()" << distances.size() << std::endl; - auto pot_cent_filter_counter = thrust::make_counting_iterator(0); - size_t n_new_centroids = - thrust::count_if(pot_cent_filter_counter, pot_cent_filter_counter + distances.size()-1, - [=] __device__(int idx) { - thrust::default_random_engine rng(0); - thrust::uniform_real_distribution<> dist(0.0, 1.0); - rng.discard(idx + _rows); - T threshold = (T) dist (rng); - printf("count:thresh[%u]: %f\n", idx, threshold); - // T prob = d_distances[idx] / pot; - T prob = 0.5; - printf("count:prob[%u]: %f\n", idx, prob); - return prob > threshold; }); - - std::cout << "n_new_centroids:" << n_new_centroids << std::endl; - thrust::device_vector d_new_centroids (n_new_centroids * data.cols()); - auto range = thrust::make_counting_iterator(0); - thrust::device_ptr d_data_vec (d_data); - - // thrust::copy_if( - // d_data_vec, d_data_vec + data.size(), - // range, - // d_new_centroids.begin(), - // [=] __device__ (int idx) { - // size_t row = idx / _cols; - // thrust::default_random_engine rng(seed); - // thrust::uniform_real_distribution<> dist(0.0f, 1.0f); - // rng.discard(row); - // T threshold = (T) dist (rng); - // printf("copy:thresh[%u]: %f", row, threshold); - // T prob = d_distances[row]; - // return prob > threshold;}); - - thrust::host_vector h_new_centroids (n_new_centroids); - thrust::copy(d_new_centroids.begin(), d_new_centroids.end(), - h_new_centroids.begin()); - - size_t old_rows = centroids.rows(); - centroids.conservativeResize(data.rows() + 
n_new_centroids, Eigen::NoChange); - - for (size_t i = 0; i < n_new_centroids; i ++) { - centroids.row(i+old_rows) = Eigen::Map (h_new_centroids.data(), 1, data.cols()); - } + T potential = SumOp().sum(_prob); - CUDACHECK(cudaFree(d_distances)); - CUDACHECK(cudaFree(d_data)); + MulOp().mul(_prob, _prob, 1 / potential); - CUDACHECK(cudaFree(d_centroids)); - CUDACHECK(cudaGetLastError()); + Generator uniform_dist(_data.rows()); + KmMatrix thresholds = uniform_dist.generate(); - return centroids; + T * thresholds_ptr = thresholds.dev_ptr(); + + // If use kParam, nvcc complains: + // identifier "H2O4GPU::KMeans::kParam ::kParam" is undefined in + // device code. + T* prob_ptr = _prob.k_param().ptr; + + auto prob_iter = thrust::make_counting_iterator(0); + size_t n_new_centroids = thrust::count_if(thrust::device, prob_iter, + prob_iter + _prob.size(), + [=] __device__ (int idx) { + float thresh = thresholds_ptr[idx]; + T prob_x = prob_ptr[idx]; + return prob_x > thresh; + }); + + KmMatrix new_centroids(n_new_centroids, _data.cols()); + thrust::device_ptr new_centroids_ptr (new_centroids.dev_ptr()); + + thrust::device_ptr data_ptr (_data.dev_ptr()); + + size_t cols = _data.cols(); + // renew iterator + prob_iter = thrust::make_counting_iterator(0); + thrust::copy_if(thrust::device, + data_ptr, data_ptr + _data.size(), prob_iter, + new_centroids_ptr, + [=] __device__(int idx) { + int row = idx / cols; + T thresh = thresholds_ptr[row]; + T prob_x = prob_ptr[idx]; + return prob_x > thresh; + }); + + return new_centroids; } template KmMatrix -KmeansLlInit::operator()(H2O4GPU::KMeans::KmMatrix& data) { +KmeansLlInit::operator()(KmMatrix& _data, size_t k) { - if (seed < 0) { + if (seed_ < 0) { std::random_device rd; - seed = rd(); + seed_ = rd(); } + k_ = k; std::mt19937 generator(0); - thrust::host_vector vec (4); - std::uniform_int_distribution<> distribution(0, data.rows()); + std::uniform_int_distribution<> distribution(0, _data.rows()); size_t idx = 
distribution(generator); - KmMatrix centroids = data.row(idx); - std::cout << "centroids" << std::endl; - std::cout << centroids << std::endl; + // Calculate X^2 (point-wise) + data_dot_ = KmMatrix(_data.rows(), 1); + VecBatchDotOp().dot(data_dot_, _data); + + // First centroid + KmMatrix centroids = _data.row(idx); - // MA_T(T) centroids = data.row(idx); + KmMatrix prob = probability(_data, centroids); - // std::cout << "data\n" << data << std::endl; - // T pot = potential(data, centroids); - // std::cout << "pot: " << pot << std::endl; + T cost = SumOp().sum(prob); + // FIXME + // for (size_t i = 0; i < std::log(cost); ++i) { + for (size_t i = 0; i < 1; ++i) { + std::cout << "looping" << std::endl; + KmMatrix new_centroids = sample_centroids(_data, centroids); + centroids = stack(centroids, new_centroids, KmMatrixDim::ROW); + prob = probability(_data, centroids); + centroids = stack(centroids, new_centroids, KmMatrixDim::ROW); + } - // for (size_t i = 0; i < std::log(pot); ++i) { - // sample_centroids(data, centroids); - // std::cout << "new centroids" << std::endl; - // std::cout << centroids << std::endl; - // } + if (centroids.rows() < k_) { + // FIXME: When n_centroids < k + } - // re-cluster + // FIXME: re-cluster // kmeans_plus_plus(centroids); - return data; + return centroids; } #define INSTANTIATE(T) \ template KmMatrix KmeansLlInit::operator()( \ - KmMatrix& data); \ - template MA_T(T) KmeansLlInit::sample_centroids( \ - MA_T(T)& data, MA_T(T)& centroids); \ - template T KmeansLlInit::probability(MA_T(T)& data, MA_T(T)& controids); \ - + KmMatrix& data, size_t k); \ + template KmMatrix KmeansLlInit::probability(KmMatrix& data, \ + KmMatrix& centroids); \ + template KmMatrix KmeansLlInit::sample_centroids( \ + KmMatrix& data, KmMatrix& centroids); \ INSTANTIATE(float) INSTANTIATE(double) -INSTANTIATE(int) +// FIXME: int is not supported due to random kernel } // namespace Kmeans } // namespace H2O4GPU diff --git a/src/gpu/kmeans/kmeans_init.cuh 
b/src/gpu/kmeans/kmeans_init.cuh index 608df8140..0aa59c561 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -2,45 +2,50 @@ * Copyright 2018 H2O.ai, Inc. * License Apache License Version 2.0 (see LICENSE for details) */ +#ifndef KMEANS_INIT_H_ +#define KMEANS_INIT_H_ -#include "Eigen/Dense" +#include +#include + +#include "KmMatrix/KmConfig.h" #include "KmMatrix/KmMatrix.hpp" -// #include +#include "KmMatrix/utils.cuh" namespace H2O4GPU{ namespace KMeans { // Wrappers for Eigen matrix and vector -template -struct EiMatrix; +// template +// struct EiMatrix; -template <> -struct EiMatrix { - using type = Eigen::MatrixXf; -}; -template <> -struct EiMatrix { - using type = Eigen::MatrixXd; -}; -template <> -struct EiMatrix { - using type = Eigen::MatrixXi; -}; +// template <> +// struct EiMatrix { +// using type = Eigen::MatrixXf; +// }; +// template <> +// struct EiMatrix { +// using type = Eigen::MatrixXd; +// }; +// template <> +// struct EiMatrix { +// using type = Eigen::MatrixXi; +// }; -template -struct EiVector; -template <> -struct EiVector { - using type = Eigen::VectorXf; -}; -template <> -struct EiVector { - using type = Eigen::VectorXd; -}; -template <> -struct EiVector { - using type = Eigen::VectorXi; -}; +// template +// struct EiVector; +// template <> +// struct EiVector { +// using type = Eigen::VectorXf; +// }; +// template <> +// struct EiVector { +// using type = Eigen::VectorXd; +// }; +// template <> +// struct EiVector { +// using type = Eigen::VectorXi; +// }; // Work around for shared memory // https://stackoverflow.com/questions/20497209/getting-cuda-error-declaration-is-incompatible-with-previous-variable-name @@ -96,69 +101,78 @@ struct kVParam { kVParam(T* _ptr, size_t _size) : ptr(_ptr), size(_size) {} }; -// template -// struct HostDeviceVector { -// private: -// kMParam param; -// // thrust::device_vector _d_vector; -// std::vector* _h_vector; - -// public: -// HostDeviceVector (const std::vector& 
_h_vec, size_t _cols) : -// param(_cols) { -// _h_vector = new std::vector(_h_vec); -// } -// HostDeviceVector (const std::vector& _h_vec, -// size_t _rows, size_t _cols) : -// param(_rows, _cols) { -// _h_vector = new std::vector(_h_vec); -// } -// ~HostDeviceVector() { delete _h_vector; } -// // HostDeviceVector (size_t _cols) : -// // param.rows {1}, param.cols (_cols) { -// // _d_vector.resize(_cols); -// // } -// size_t rows() { return param.rows; } -// size_t cols() { return param.cols; } -// size_t size() { return param.rows * param.cols; } - -// // kMParam kParam() { -// // param.ptr = _d_vector.data().get(); -// // return param; -// // } -// }; +namespace kernel { +__global__ void setup_random_states(curandState *state, size_t size); +__global__ void generate_uniform_kernel(float *_res, + curandState *_state, + int _size); +__global__ void generate_uniform_kernel(double *_res, + curandState *_state, + int _size); +} + +template +struct Generator { + // FIXME: Use KmMatrix + curandState *dev_states_; + size_t size_; + // FIXME: Cache random_numbers_ in a safer way. 
+ KmMatrix random_numbers_; + + Generator (size_t _size) : size_(_size) , random_numbers_(1, _size) { + CUDA_CHECK(cudaMalloc((void **)&dev_states_, _size * + sizeof(curandState))); + kernel::setup_random_states<<>>( + dev_states_, size_); + } + ~Generator () { + CUDA_CHECK(cudaFree(dev_states_)); + } + + KmMatrix generate() { + kernel::generate_uniform_kernel<<>> + (random_numbers_.k_param().ptr, dev_states_, size_); + return random_numbers_; + } +}; template class KmeansInitBase { public: virtual ~KmeansInitBase() {} - virtual KmMatrix operator()(KmMatrix& data) = 0; + virtual KmMatrix operator()(KmMatrix& data, size_t k) = 0; }; template struct KmeansLlInit : public KmeansInitBase { private: - double over_sample; - int seed; + double over_sample_; + int seed_; + int k_; + // Buffer like variables + // store the self dot product of each data point + KmMatrix data_dot_; + // store distances between each data point and centroids + KmMatrix distance_pairs_; - T potential(MA_T(T)& data, MA_T(T)& centroids); - T probability(MA_T(T)& data, MA_T(T)& controids); + KmMatrix probability(KmMatrix& data, KmMatrix& centroids); public: - KmeansLlInit () : over_sample (2.0), seed (0) {} + KmeansLlInit () : over_sample_ (2.0), seed_ (0), k_(0) { + data_dot_.set_name ("data_dot"); + distance_pairs_.set_name ("distance pairs"); + } virtual ~KmeansLlInit () override {} - MA_T(T) sample_centroids(MA_T(T)& data, MA_T(T)& centroids); - - // MA_T(T) operator()(MA_T(T)&) override; - KmMatrix operator()(KmMatrix& data) override; + KmMatrix sample_centroids(KmMatrix& data, KmMatrix& centroids); + KmMatrix operator()(KmMatrix& data, size_t k) override; }; -template -T1 div_roundup(const T1 a, const T2 b) { - return static_cast(ceil(static_cast(a) / b)); -} + +// FIXME: Make kmeans++ a derived class of KmeansInitBase } // namespace Kmeans } // namespace H2O4GPU + +#endif // KMEANS_INIT_H_ \ No newline at end of file diff --git a/tests/cpp/gpu/KmMatrix/test_matrix.cu 
b/tests/cpp/gpu/KmMatrix/test_matrix.cu index 4491ab5f7..af2865800 100644 --- a/tests/cpp/gpu/KmMatrix/test_matrix.cu +++ b/tests/cpp/gpu/KmMatrix/test_matrix.cu @@ -2,6 +2,7 @@ #include #include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp" +#include // r --gtest_filter=KmMatrix.KmMatrixEqual TEST(KmMatrix, KmMatrixEqual) { @@ -59,12 +60,17 @@ TEST(KmMatrix, KmMatrixKparam) { } TEST(KmMatrix, KmMatrixCycle) { - thrust::host_vector vec (2048 * 1024); - for (size_t i = 0; i < 2048 * 1024; ++i) { + size_t rows = 2048, cols = 1024; + thrust::host_vector vec (rows * cols); + for (size_t i = 0; i < rows * cols; ++i) { vec[i] = i; } - H2O4GPU::KMeans::KmMatrix mat0 (vec, 2048, 1024); - for (size_t i = 0; i < 1000; ++i) { + // Tweak this one to see if memory grows, there should be a better way to + // test memory leak. + size_t iters = std::pow(16, 1); + H2O4GPU::KMeans::KmMatrix mat0 (vec, rows, cols); + mat0.dev_ptr(); + for (size_t i = 0; i < iters; ++i) { H2O4GPU::KMeans::KmMatrix mat1 = mat0; H2O4GPU::KMeans::KmMatrix mat2 = mat1; mat0 = mat2; From b7e7446aba2239084a0f11f0f961bb8cbbe908c9 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 20 Jul 2018 18:55:00 +0800 Subject: [PATCH 16/49] Fix prob for kmeans||. --- src/gpu/kmeans/kmeans_init.cu | 56 ++++++++++++++-------------------- src/gpu/kmeans/kmeans_init.cuh | 3 +- 2 files changed, 25 insertions(+), 34 deletions(-) diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 150ee3f87..3f5693539 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -235,29 +235,17 @@ struct PairWiseDistanceOp { kernel::construct_distance_pairs_kernel<<< dim3(GpuInfo::ins().blocks(32), div_roundup(_centroids.rows(), 16)), - dim3(16, 16)>>>( + dim3(16, 16)>>>( // FIXME: Tune this. 
distance_pairs_.k_param(), data_dot_.k_param(), centroids_dot_.k_param()); CUDA_CHECK(cudaGetLastError()); - std::cout << std::endl; - std::cout << "in distance op" << std::endl; - std::cout << distance_pairs_ << std::endl; cublasHandle_t handle = GpuInfo::ins().cublas_handle(); T alpha = -2.0; T beta = 1.0; - std::cout << "data.shape: " << _data.rows() << ", " << _data.cols() << - "\tcentroids.shape: " << _centroids.rows() << ", " << _centroids.cols() << - "\tdp.shape: " << distance_pairs_.rows() << ", " << distance_pairs_.cols() << - std::endl; - - std::cout << _data << std::endl; - std::cout << _centroids << std::endl; - - std::cout << _centroids.dev_ptr() << std::endl; Blas::gemm( handle, @@ -270,12 +258,11 @@ struct PairWiseDistanceOp { &beta, distance_pairs_.dev_ptr(), distance_pairs_.rows()); - std::cout << distance_pairs_ << std::endl; - std::cout << "return" << std::endl; return distance_pairs_; } }; + template KmMatrix KmeansLlInit::probability( KmMatrix& _data, KmMatrix& _centroids) { @@ -301,13 +288,19 @@ KmMatrix KmeansLlInit::probability( CUDA_CHECK(cudaGetLastError()); + std::cout << min_distances << std::endl; + T cost = SumOp().sum(min_distances); + std::cout << "cost: " << cost << std::endl; - // Re-use min_distances to store prob MulOp mul_op; - mul_op.mul(min_distances, min_distances, 1 / cost * over_sample_ * k_); - return min_distances; + KmMatrix prob (min_distances.rows(), 1); + mul_op.mul(prob, min_distances, (over_sample_ * k_ * 1) / cost); + + std::cout << prob << std::endl; + + return prob; } @@ -316,11 +309,7 @@ KmMatrix KmeansLlInit::sample_centroids(KmMatrix& _data, KmMatrix& _ KmMatrix distances (1, _data.rows()); - T potential = SumOp().sum(_prob); - - MulOp().mul(_prob, _prob, 1 / potential); - - + // FIXME: Keep generator out. 
Generator uniform_dist(_data.rows()); KmMatrix thresholds = uniform_dist.generate(); @@ -357,19 +346,19 @@ KmMatrix KmeansLlInit::sample_centroids(KmMatrix& _data, KmMatrix& _ T prob_x = prob_ptr[idx]; return prob_x > thresh; }); - + std::cout << std::endl; return new_centroids; } template KmMatrix -KmeansLlInit::operator()(KmMatrix& _data, size_t k) { +KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { if (seed_ < 0) { std::random_device rd; seed_ = rd(); } - k_ = k; + k_ = _k; std::mt19937 generator(0); @@ -386,14 +375,15 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t k) { KmMatrix prob = probability(_data, centroids); T cost = SumOp().sum(prob); - // FIXME - // for (size_t i = 0; i < std::log(cost); ++i) { - for (size_t i = 0; i < 1; ++i) { - std::cout << "looping" << std::endl; - KmMatrix new_centroids = sample_centroids(_data, centroids); - centroids = stack(centroids, new_centroids, KmMatrixDim::ROW); + + for (size_t i = 0; i < std::log(cost); ++i) { prob = probability(_data, centroids); + KmMatrix new_centroids = sample_centroids(_data, prob); + new_centroids.set_name ("new centroids"); + std::cout << new_centroids << std::endl; centroids = stack(centroids, new_centroids, KmMatrixDim::ROW); + centroids.set_name ("centroids"); + std::cout << centroids << std::endl; } if (centroids.rows() < k_) { @@ -407,7 +397,7 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t k) { #define INSTANTIATE(T) \ template KmMatrix KmeansLlInit::operator()( \ - KmMatrix& data, size_t k); \ + KmMatrix& _data, size_t _k); \ template KmMatrix KmeansLlInit::probability(KmMatrix& data, \ KmMatrix& centroids); \ template KmMatrix KmeansLlInit::sample_centroids( \ diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index 0aa59c561..864b38476 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -159,7 +159,8 @@ struct KmeansLlInit : public KmeansInitBase { KmMatrix probability(KmMatrix& data, KmMatrix& centroids); public: - 
KmeansLlInit () : over_sample_ (2.0), seed_ (0), k_(0) { + KmeansLlInit (T _over_sample=2.0) : + over_sample_ (_over_sample), seed_ (0), k_(0) { data_dot_.set_name ("data_dot"); distance_pairs_.set_name ("distance pairs"); } From d50725978fb78880f40975bbfa3c2d9644409a76 Mon Sep 17 00:00:00 2001 From: fis Date: Sat, 21 Jul 2018 04:23:24 +0800 Subject: [PATCH 17/49] Fix K-Means|| along with added documents. * Fix copy_if prob_ptr index. * Add document for the algorithm object interface. * Move Generator into a member variable and complete its implementation. * Removes dead code. --- src/gpu/kmeans/KmMatrix/GpuInfo.cuh | 19 ++- src/gpu/kmeans/kmeans_init.cu | 109 +++++++--------- src/gpu/kmeans/kmeans_init.cuh | 155 +++++++++++++++-------- tests/cpp/gpu/kmeans/test_kmeans_init.cu | 4 +- 4 files changed, 165 insertions(+), 122 deletions(-) diff --git a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh index 789cf08b5..646260e8a 100644 --- a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh +++ b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh @@ -14,6 +14,7 @@ #include // Singleton class storing gpu info. +// Call GpuInfo::ins() to use the class; class GpuInfo { private: int n_gpu_; @@ -29,7 +30,6 @@ class GpuInfo { for (int i = 0; i < n_gpu_; ++i) { cudaDeviceGetAttribute(&n_sm_[i], cudaDevAttrMultiProcessorCount, i); CUBLAS_CHECK(cublasCreate(&handles_[i])); - printf("n_sm[%d]: %d\n", i, n_sm_[i]); } } ~GpuInfo () { @@ -38,6 +38,17 @@ class GpuInfo { CUBLAS_CHECK(cublasDestroy(handles_[i])); } } + + static GpuInfo& ins() { + static GpuInfo obj; + return obj; + } + + // Call the following methods with GpuInfo::ins(). For example: + // GpuInfo::ins().blocks(32) + + // Get number of blocks for grid strided loop kernel. + // returns _mul * MultiProcessorCount[device]. 
// FIXME, get active device size_t blocks (size_t _mul, int _device=0) { if (has_device(_device)) { @@ -60,12 +71,6 @@ class GpuInfo { bool has_device(int _device) { return _device < n_gpu_ && _device >= 0; } - - static GpuInfo& ins() { - static GpuInfo obj; - return obj; - } - }; #endif // GPU_INFO_HPP_ diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 3f5693539..62affdf9d 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -27,8 +27,7 @@ namespace KMeans { namespace kernel { -__global__ void setup_random_states(curandState *state, size_t size) -{ +__global__ void setup_random_states(curandState *state, size_t size) { int id = threadIdx.x + blockIdx.x * threadIdx.x; /* Each thread gets same seed, a different sequence number, no offset */ @@ -38,39 +37,38 @@ __global__ void setup_random_states(curandState *state, size_t size) __global__ void generate_uniform_kernel(float *_res, curandState *_state, - int _size) -{ + int _size) { int idx = threadIdx.x + blockIdx.x * threadIdx.x; if (idx < _size) { float x; - curandState localState = _state[idx]; - x = curand_uniform(&localState); - _state[idx] = localState; + curandState local_state = _state[idx]; + x = curand_uniform(&local_state); + _state[idx] = local_state; _res[idx] = x; } } __global__ void generate_uniform_kernel(double *_res, curandState *_state, - int _size) -{ + int _size) { int idx = threadIdx.x + blockIdx.x * threadIdx.x; if (idx < _size) { double x; - curandState localState = _state[idx]; - x = curand_uniform_double(&localState); - _state[idx] = localState; + curandState local_state = _state[idx]; + x = curand_uniform_double(&local_state); + _state[idx] = local_state; _res[idx] = x; } } /* + * Compute min value for each row. 
* @tparam T Numeric type of the data * @param _res The output matrix with shape m x 1 * @param _val The input matrix with shape m x n */ template -__global__ void col_min_sequential(kParam _res, kParam _val) { +__global__ void row_min_sequential(kParam _res, kParam _val) { size_t idx = global_thread_idx(); size_t stride = grid_stride_x () * _val.cols; @@ -175,8 +173,8 @@ struct MinOp { void min(KmMatrix& _res, KmMatrix& _val, KmMatrixDim _dim) { size_t blocks = GpuInfo::ins().blocks(32); - if (_dim == KmMatrixDim::COL) { - kernel::col_min_sequential<<>>( + if (_dim == KmMatrixDim::ROW) { + kernel::row_min_sequential<<>>( _res.k_param(), _val.k_param()); } else { // FIXME @@ -186,7 +184,7 @@ struct MinOp { }; namespace kernel { -// X^2 + Y^2 +// X^2 + Y^2, here only calculates the + operation. template __global__ void construct_distance_pairs_kernel( kParam _distance_pairs, @@ -195,6 +193,8 @@ __global__ void construct_distance_pairs_kernel( size_t idx = global_thread_idx(); // indexing data size_t idy = global_thread_idy(); // indexing centroids + // FIXME: Is using shared memory necessary? + size_t stride_x = grid_stride_x () * _data_dots.cols; // strides only for data. for (size_t i = idx; i < _data_dots.rows; i += stride_x) { @@ -206,8 +206,10 @@ __global__ void construct_distance_pairs_kernel( } } } -} +} // namespace kernel + +// Extracted as an independent Op for k-means use. template struct PairWiseDistanceOp { KmMatrix data_dot_; @@ -235,7 +237,7 @@ struct PairWiseDistanceOp { kernel::construct_distance_pairs_kernel<<< dim3(GpuInfo::ins().blocks(32), div_roundup(_centroids.rows(), 16)), - dim3(16, 16)>>>( // FIXME: Tune this. + dim3(32, 16)>>>( // FIXME: Tune this. 
distance_pairs_.k_param(), data_dot_.k_param(), centroids_dot_.k_param()); @@ -267,67 +269,50 @@ template KmMatrix KmeansLlInit::probability( KmMatrix& _data, KmMatrix& _centroids) { - _centroids.set_name ("centroids"); - KmMatrix centroids_dot (_centroids.rows(), 1); - centroids_dot.set_name ("centroids_dot"); VecBatchDotOp().dot(centroids_dot, _centroids); - std::cout << data_dot_ << centroids_dot << std::endl; - // FIXME: Time this distance_pairs_ = KmMatrix(_data.rows(), _centroids.rows()); PairWiseDistanceOp distance_op (data_dot_, centroids_dot, distance_pairs_); distance_pairs_ = distance_op(_data, _centroids); KmMatrix min_distances (_data.rows(), 1); - min_distances.set_name ("min distances"); - MinOp().min(min_distances, distance_pairs_, KmMatrixDim::COL); - - CUDA_CHECK(cudaGetLastError()); - - std::cout << min_distances << std::endl; + MinOp().min(min_distances, distance_pairs_, KmMatrixDim::ROW); T cost = SumOp().sum(min_distances); - std::cout << "cost: " << cost << std::endl; - - MulOp mul_op; KmMatrix prob (min_distances.rows(), 1); - mul_op.mul(prob, min_distances, (over_sample_ * k_ * 1) / cost); - - std::cout << prob << std::endl; + MulOp().mul(prob, min_distances, over_sample_ / cost); return prob; } template -KmMatrix KmeansLlInit::sample_centroids(KmMatrix& _data, KmMatrix& _prob) { - - KmMatrix distances (1, _data.rows()); +KmMatrix KmeansLlInit::sample_centroids( + KmMatrix& _data, KmMatrix& _prob) { - // FIXME: Keep generator out. - Generator uniform_dist(_data.rows()); - KmMatrix thresholds = uniform_dist.generate(); + KmMatrix thresholds = uniform_dist.generate(_data.rows()); T * thresholds_ptr = thresholds.dev_ptr(); // If use kParam, nvcc complains: // identifier "H2O4GPU::KMeans::kParam ::kParam" is undefined in // device code. 
- T* prob_ptr = _prob.k_param().ptr; + T* prob_ptr = _prob.dev_ptr(); auto prob_iter = thrust::make_counting_iterator(0); - size_t n_new_centroids = thrust::count_if(thrust::device, prob_iter, - prob_iter + _prob.size(), - [=] __device__ (int idx) { - float thresh = thresholds_ptr[idx]; - T prob_x = prob_ptr[idx]; - return prob_x > thresh; - }); + size_t n_new_centroids = thrust::count_if( + thrust::device, prob_iter, + prob_iter + _prob.size(), + [=] __device__ (int idx) { + float thresh = thresholds_ptr[idx]; + T prob_x = prob_ptr[idx]; + return prob_x > thresh; + }); KmMatrix new_centroids(n_new_centroids, _data.cols()); thrust::device_ptr new_centroids_ptr (new_centroids.dev_ptr()); @@ -337,16 +322,17 @@ KmMatrix KmeansLlInit::sample_centroids(KmMatrix& _data, KmMatrix& _ size_t cols = _data.cols(); // renew iterator prob_iter = thrust::make_counting_iterator(0); - thrust::copy_if(thrust::device, - data_ptr, data_ptr + _data.size(), prob_iter, - new_centroids_ptr, - [=] __device__(int idx) { - int row = idx / cols; - T thresh = thresholds_ptr[row]; - T prob_x = prob_ptr[idx]; - return prob_x > thresh; - }); - std::cout << std::endl; + thrust::copy_if( + thrust::device, + data_ptr, data_ptr + _data.size(), prob_iter, + new_centroids_ptr, + [=] __device__(int idx) { + size_t row = idx / cols; + T thresh = thresholds_ptr[row]; + T prob_x = prob_ptr[row]; + return prob_x > thresh; + }); + return new_centroids; } @@ -379,15 +365,12 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { for (size_t i = 0; i < std::log(cost); ++i) { prob = probability(_data, centroids); KmMatrix new_centroids = sample_centroids(_data, prob); - new_centroids.set_name ("new centroids"); - std::cout << new_centroids << std::endl; centroids = stack(centroids, new_centroids, KmMatrixDim::ROW); - centroids.set_name ("centroids"); - std::cout << centroids << std::endl; } if (centroids.rows() < k_) { // FIXME: When n_centroids < k + // Get random selection in? 
} // FIXME: re-cluster diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index 864b38476..3e71c09ab 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -15,38 +15,6 @@ namespace H2O4GPU{ namespace KMeans { -// Wrappers for Eigen matrix and vector -// template -// struct EiMatrix; - -// template <> -// struct EiMatrix { -// using type = Eigen::MatrixXf; -// }; -// template <> -// struct EiMatrix { -// using type = Eigen::MatrixXd; -// }; -// template <> -// struct EiMatrix { -// using type = Eigen::MatrixXi; -// }; - -// template -// struct EiVector; -// template <> -// struct EiVector { -// using type = Eigen::VectorXf; -// }; -// template <> -// struct EiVector { -// using type = Eigen::VectorXd; -// }; -// template <> -// struct EiVector { -// using type = Eigen::VectorXi; -// }; - // Work around for shared memory // https://stackoverflow.com/questions/20497209/getting-cuda-error-declaration-is-incompatible-with-previous-variable-name template @@ -76,11 +44,6 @@ struct KmShardMem { } }; -#define MA_T(T) \ - typename EiMatrix::type -#define VE_T(T) \ - typename EiVector::type - template struct kMParam { T* ptr; @@ -113,43 +76,99 @@ __global__ void generate_uniform_kernel(double *_res, } template -struct Generator { +struct UniformGenerator { + // private: + // FIXME: Use KmMatrix curandState *dev_states_; size_t size_; // FIXME: Cache random_numbers_ in a safer way. 
KmMatrix random_numbers_; - Generator (size_t _size) : size_(_size) , random_numbers_(1, _size) { - CUDA_CHECK(cudaMalloc((void **)&dev_states_, _size * - sizeof(curandState))); + void initialize (size_t _size) { + size_ = _size; + random_numbers_ = KmMatrix (1, size_); + + if (dev_states_ != nullptr) { + CUDA_CHECK(cudaFree(dev_states_)); + } + CUDA_CHECK(cudaMalloc((void **)&dev_states_, size_ * sizeof(curandState))); kernel::setup_random_states<<>>( dev_states_, size_); } - ~Generator () { - CUDA_CHECK(cudaFree(dev_states_)); + + // public: + UniformGenerator() : dev_states_ (nullptr), size_ (0){} + + UniformGenerator (size_t _size) { + if (_size == 0) { + M_ERROR("Zero size for generate is not allowed."); + } + initialize(_size); + } + + ~UniformGenerator () { + if (dev_states_ != nullptr) { + CUDA_CHECK(cudaFree(dev_states_)); + } } + UniformGenerator(const UniformGenerator& _rhs) = delete; + UniformGenerator(UniformGenerator&& _rhs) = delete; + void operator=(const UniformGenerator& _rhs) = delete; + void operator=(UniformGenerator&& _rhs) = delete; + KmMatrix generate() { kernel::generate_uniform_kernel<<>> (random_numbers_.k_param().ptr, dev_states_, size_); return random_numbers_; } + KmMatrix generate(size_t _size) { + if (_size == 0) { + M_ERROR("Zero size for generate is not allowed."); + } + if (_size != size_) { + initialize(_size); + } + return generate(); + } }; +/* + * Base class used for all K-Means initialization algorithms. + */ template class KmeansInitBase { public: virtual ~KmeansInitBase() {} + /* + * Select k centroids from data. + * + * @param data data points stored in row major matrix. + * @param k number of centroids. + */ virtual KmMatrix operator()(KmMatrix& data, size_t k) = 0; }; +/* + * Each instance of KmeansLlInit corresponds to one dataset, if a new data set + * is used, users need to create a new instance. 
+ * + * k-means|| algorithm based on the paper: + * + * Scalable K-Means++ + * + * + * @tparam Data type, supported types are float and double. + */ template struct KmeansLlInit : public KmeansInitBase { private: - double over_sample_; + T over_sample_; int seed_; int k_; + UniformGenerator uniform_dist; + // Buffer like variables // store the self dot product of each data point KmMatrix data_dot_; @@ -157,16 +176,50 @@ struct KmeansLlInit : public KmeansInitBase { KmMatrix distance_pairs_; KmMatrix probability(KmMatrix& data, KmMatrix& centroids); - public: - KmeansLlInit (T _over_sample=2.0) : - over_sample_ (_over_sample), seed_ (0), k_(0) { - data_dot_.set_name ("data_dot"); - distance_pairs_.set_name ("distance pairs"); - } + // sample_centroids should not be part of the interface, but following error + // is generated when put in private section: + // The enclosing parent function ("sample_centroids") for an extended + // __device__ lambda cannot have private or protected access within its class + KmMatrix sample_centroids(KmMatrix& data, KmMatrix& centroids); + + /* + * Initialize KmeansLlInit algorithm, with default: + * over_sample = 1.5, + * seed = 0, + */ + KmeansLlInit () : + over_sample_ (1.5f), seed_ (0), k_ (0) {} + + /* + * Initialize KmeansLlInit algorithm, with default: + * seed = 0, + * + * @param over_sample over_sample rate. + * \f$p_x = over_sample \times \frac{d^2(x, C)}{\Phi_X (C)}\f$ + * Note that when \f$over_sample != 1\f$, the probability for each data + * point doesn't add to 1. + */ + KmeansLlInit (T _over_sample) : + over_sample_ (_over_sample), seed_ (0), k_ (0) {} + + /* + * Initialize KmeansLlInit algorithm. + * + * @param seed Seed used to generate threshold for sampling centroids. + * @param over_sample over_sample rate. 
+ * p_x = over_sample \times \frac{d^2(x, C)}{\Phi_X (C)} + */ + KmeansLlInit (int _seed, T _over_sample) : + seed_(_seed), seed_(_seed), k_(0) {} + virtual ~KmeansLlInit () override {} - KmMatrix sample_centroids(KmMatrix& data, KmMatrix& centroids); + /* + * Select k centroids from data. + * @param data data points stored in row major matrix. + * @param k number of centroids. + */ KmMatrix operator()(KmMatrix& data, size_t k) override; }; diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu index 2f9c5356b..fc2e3ec10 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -36,5 +36,7 @@ TEST(KmeansLL, KmeansLLInit) { H2O4GPU::KMeans::KmMatrix h_data (_h_data, 4, 4); - auto result = kmeans_ll_init (h_data); + auto result = kmeans_ll_init (h_data, 1.0f); + result.set_name ("kmeans result"); + std::cout << result << std::endl; } \ No newline at end of file From 7a791690af7d662d01bbc558f1496875ffcdbb23 Mon Sep 17 00:00:00 2001 From: fis Date: Sat, 21 Jul 2018 18:54:36 +0800 Subject: [PATCH 18/49] Create Generatorbase and refactor code. * Create Generatorbase class for interface. * Move generator related code into KmMatrix. * Move shared memory related code into KmMatrix. * Create mock object based on Generatorbase for later test. 
--- src/gpu/kmeans/KmMatrix/Generator.cuh | 95 ++++++++++++ src/gpu/kmeans/KmMatrix/Generator.hpp | 20 +++ src/gpu/kmeans/KmMatrix/GeneratorKernels.cu | 45 ++++++ src/gpu/kmeans/KmMatrix/KmMatrix.hpp | 2 - src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh | 2 - src/gpu/kmeans/KmMatrix/utils.cuh | 31 ++++ src/gpu/kmeans/kmeans_init.cu | 56 ++------ src/gpu/kmeans/kmeans_init.cuh | 152 ++++---------------- tests/cpp/gpu/kmeans/test_kmeans_init.cu | 31 +++- 9 files changed, 252 insertions(+), 182 deletions(-) create mode 100644 src/gpu/kmeans/KmMatrix/Generator.cuh create mode 100644 src/gpu/kmeans/KmMatrix/Generator.hpp create mode 100644 src/gpu/kmeans/KmMatrix/GeneratorKernels.cu diff --git a/src/gpu/kmeans/KmMatrix/Generator.cuh b/src/gpu/kmeans/KmMatrix/Generator.cuh new file mode 100644 index 000000000..a2b6bb013 --- /dev/null +++ b/src/gpu/kmeans/KmMatrix/Generator.cuh @@ -0,0 +1,95 @@ +#include +#include + +#include + +#include "Generator.hpp" +#include "KmMatrix.hpp" +#include "utils.cuh" + + +namespace H2O4GPU { +namespace KMeans { + +namespace kernel { +// Split the definition to avoid multiple definition. +__global__ void setup_random_states(int _seed, curandState *_state, + size_t _size); + +__global__ void generate_uniform_kernel(float *_res, + curandState *_state, + int _size); + +__global__ void generate_uniform_kernel(double *_res, + curandState *_state, + int _size); +} + +template +struct UniformGenerator : public GeneratorBase { + private: + // FIXME: Use KmMatrix + curandState *dev_states_; + size_t size_; + // FIXME: Cache random_numbers_ in a safer way. 
+ KmMatrix random_numbers_; + int seed_; + + void initialize (size_t _size) { + size_ = _size; + random_numbers_ = KmMatrix (1, size_); + + if (dev_states_ != nullptr) { + CUDA_CHECK(cudaFree(dev_states_)); + } + CUDA_CHECK(cudaMalloc((void **)&dev_states_, size_ * sizeof(curandState))); + kernel::setup_random_states<<>>( + seed_, dev_states_, size_); + } + + public: + UniformGenerator() : dev_states_ (nullptr), size_ (0) { + std::random_device rd; + seed_ = rd(); + } + + UniformGenerator (size_t _size, int _seed) { + if (_size == 0) { + M_ERROR("Zero size for generate is not allowed."); + } + initialize(_size); + } + + UniformGenerator(int _seed) : + seed_(_seed), dev_states_(nullptr), size_ (0) {} + + ~UniformGenerator () { + if (dev_states_ != nullptr) { + CUDA_CHECK(cudaFree(dev_states_)); + } + } + + UniformGenerator(const UniformGenerator& _rhs) = delete; + UniformGenerator(UniformGenerator&& _rhs) = delete; + void operator=(const UniformGenerator& _rhs) = delete; + void operator=(UniformGenerator&& _rhs) = delete; + + KmMatrix generate() override { + kernel::generate_uniform_kernel<<>> + (random_numbers_.k_param().ptr, dev_states_, size_); + return random_numbers_; + } + + KmMatrix generate(size_t _size) override { + if (_size == 0) { + M_ERROR("Zero size for generate is not allowed."); + } + if (_size != size_) { + initialize(_size); + } + return generate(); + } +}; + +} // H2O4GPU +} // KMeans \ No newline at end of file diff --git a/src/gpu/kmeans/KmMatrix/Generator.hpp b/src/gpu/kmeans/KmMatrix/Generator.hpp new file mode 100644 index 000000000..3a37d8bdd --- /dev/null +++ b/src/gpu/kmeans/KmMatrix/Generator.hpp @@ -0,0 +1,20 @@ +#ifndef GENERATOR_HPP_ +#define GENERATOR_HPP_ + +#include "KmMatrix.hpp" + +namespace H2O4GPU { +namespace KMeans { + +template +class GeneratorBase { + public: + virtual KmMatrix generate() {}; + virtual KmMatrix generate(size_t _size) {}; +}; + +} +} + + +#endif // GENERATOR_HPP_ diff --git 
a/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu b/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu new file mode 100644 index 000000000..420712a85 --- /dev/null +++ b/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu @@ -0,0 +1,45 @@ +#include +#include + +namespace H2O4GPU { +namespace KMeans { +namespace kernel { + +__global__ void setup_random_states(int _seed, curandState *_state, + size_t _size) { + int id = threadIdx.x + blockIdx.x * threadIdx.x; + /* Each thread gets same seed, a different sequence + number, no offset */ + if (id < _size) + curand_init(_seed, id, 0, &_state[id]); +} + +__global__ void generate_uniform_kernel(float *_res, + curandState *_state, + int _size) { + int idx = threadIdx.x + blockIdx.x * threadIdx.x; + if (idx < _size) { + float x; + curandState local_state = _state[idx]; + x = curand_uniform(&local_state); + _state[idx] = local_state; + _res[idx] = x; + } +} + +__global__ void generate_uniform_kernel(double *_res, + curandState *_state, + int _size) { + int idx = threadIdx.x + blockIdx.x * threadIdx.x; + if (idx < _size) { + double x; + curandState local_state = _state[idx]; + x = curand_uniform_double(&local_state); + _state[idx] = local_state; + _res[idx] = x; + } +} + +} // namespace kernel +} // namespace KMeans +} // namespace H2O4GPU \ No newline at end of file diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp index f00a07a0b..579156854 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp @@ -84,8 +84,6 @@ class KmMatrixImpl { virtual KmMatrix stack(KmMatrix&, KmMatrixDim _dim) = 0; virtual bool equal(KmMatrix& _val) = 0; - - friend KmMatrix; }; template diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh index 95e9b27a5..0e9d7f520 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh @@ -78,8 +78,6 @@ class CudaKmMatrixImpl : public KmMatrixImpl { virtual bool 
equal(KmMatrix& _rhs); virtual bool on_device() const override; - - friend KmMatrix; }; } // MkMatrix diff --git a/src/gpu/kmeans/KmMatrix/utils.cuh b/src/gpu/kmeans/KmMatrix/utils.cuh index 296ab08b2..8555cf1e1 100644 --- a/src/gpu/kmeans/KmMatrix/utils.cuh +++ b/src/gpu/kmeans/KmMatrix/utils.cuh @@ -32,6 +32,37 @@ T1 M_HOSTDEVINLINE div_roundup(const T1 a, const T2 b) { return static_cast(ceil(static_cast(a) / b)); } + +// Work around for shared memory +// https://stackoverflow.com/questions/20497209/getting-cuda-error-declaration-is-incompatible-with-previous-variable-name +template +struct KmShardMem; + +template <> +struct KmShardMem { + __device__ float * ptr() { + extern __shared__ __align__(sizeof(float)) float s_float[]; + return s_float; + } +}; + +template <> +struct KmShardMem { + __device__ double * ptr() { + extern __shared__ __align__(sizeof(double)) double s_double[]; + return s_double; + } +}; + +template <> +struct KmShardMem { + __device__ int * ptr() { + extern __shared__ __align__(sizeof(int)) int s_int[]; + return s_int; + } +}; + + } // KMeans } // H2O4GPU diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 62affdf9d..4c38c5892 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -27,40 +27,6 @@ namespace KMeans { namespace kernel { -__global__ void setup_random_states(curandState *state, size_t size) { - int id = threadIdx.x + blockIdx.x * threadIdx.x; - /* Each thread gets same seed, a different sequence - number, no offset */ - if (id < size) - curand_init(1234, id, 0, &state[id]); -} - -__global__ void generate_uniform_kernel(float *_res, - curandState *_state, - int _size) { - int idx = threadIdx.x + blockIdx.x * threadIdx.x; - if (idx < _size) { - float x; - curandState local_state = _state[idx]; - x = curand_uniform(&local_state); - _state[idx] = local_state; - _res[idx] = x; - } -} - -__global__ void generate_uniform_kernel(double *_res, - curandState *_state, - int _size) { - 
int idx = threadIdx.x + blockIdx.x * threadIdx.x; - if (idx < _size) { - double x; - curandState local_state = _state[idx]; - x = curand_uniform_double(&local_state); - _state[idx] = local_state; - _res[idx] = x; - } -} - /* * Compute min value for each row. * @tparam T Numeric type of the data @@ -134,9 +100,9 @@ struct VecBatchDotOp { } }; -// FIXME: Using struct for operations is just keeping the possibility to create -// some unified operations for KmMatrix. For example, let KmMatrix -// inherit those left associative ops, or create a inferface for elementwise +// FIXME: Using struct for operations is just keeping the possibility of +// creating an unified operations for KmMatrix. For example, let KmMatrix +// inherit those left associative ops, or create an inferface for elementwise // operations. template struct SumOp { @@ -295,7 +261,7 @@ template KmMatrix KmeansLlInit::sample_centroids( KmMatrix& _data, KmMatrix& _prob) { - KmMatrix thresholds = uniform_dist.generate(_data.rows()); + KmMatrix thresholds = generator_->generate(_data.rows()); T * thresholds_ptr = thresholds.dev_ptr(); @@ -378,13 +344,13 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { return centroids; } -#define INSTANTIATE(T) \ - template KmMatrix KmeansLlInit::operator()( \ - KmMatrix& _data, size_t _k); \ - template KmMatrix KmeansLlInit::probability(KmMatrix& data, \ - KmMatrix& centroids); \ - template KmMatrix KmeansLlInit::sample_centroids( \ - KmMatrix& data, KmMatrix& centroids); \ +#define INSTANTIATE(T) \ + template KmMatrix KmeansLlInit::operator()( \ + KmMatrix& _data, size_t _k); \ + template KmMatrix KmeansLlInit::probability( \ + KmMatrix& data, KmMatrix& centroids); \ + template KmMatrix KmeansLlInit::sample_centroids( \ + KmMatrix& data, KmMatrix& centroids); \ INSTANTIATE(float) INSTANTIATE(double) diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index 3e71c09ab..de15a82d2 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ 
b/src/gpu/kmeans/kmeans_init.cuh @@ -2,138 +2,23 @@ * Copyright 2018 H2O.ai, Inc. * License Apache License Version 2.0 (see LICENSE for details) */ + #ifndef KMEANS_INIT_H_ #define KMEANS_INIT_H_ -#include -#include + +#include #include "KmMatrix/KmConfig.h" #include "KmMatrix/KmMatrix.hpp" #include "KmMatrix/utils.cuh" +#include "KmMatrix/Generator.hpp" +#include "KmMatrix/Generator.cuh" +#include "KmMatrix/GpuInfo.cuh" namespace H2O4GPU{ namespace KMeans { -// Work around for shared memory -// https://stackoverflow.com/questions/20497209/getting-cuda-error-declaration-is-incompatible-with-previous-variable-name -template -struct KmShardMem; - -template <> -struct KmShardMem { - __device__ float * ptr() { - extern __shared__ __align__(sizeof(float)) float s_float[]; - return s_float; - } -}; - -template <> -struct KmShardMem { - __device__ double * ptr() { - extern __shared__ __align__(sizeof(double)) double s_double[]; - return s_double; - } -}; - -template <> -struct KmShardMem { - __device__ int * ptr() { - extern __shared__ __align__(sizeof(int)) int s_int[]; - return s_int; - } -}; - -template -struct kMParam { - T* ptr; - size_t rows; - size_t cols; - - kMParam(T* _ptr, size_t _rows, size_t _cols) : - ptr (_ptr), rows (_rows), cols (_cols) {} - kMParam(size_t _rows, size_t _cols): - rows (_rows), cols (_cols) {} - kMParam(size_t _cols) : cols (_cols) {} -}; - -template -struct kVParam { - T* ptr; - size_t size; - kVParam(T* _ptr, size_t _size) : ptr(_ptr), size(_size) {} -}; - -namespace kernel { - -__global__ void setup_random_states(curandState *state, size_t size); -__global__ void generate_uniform_kernel(float *_res, - curandState *_state, - int _size); -__global__ void generate_uniform_kernel(double *_res, - curandState *_state, - int _size); -} - -template -struct UniformGenerator { - // private: - - // FIXME: Use KmMatrix - curandState *dev_states_; - size_t size_; - // FIXME: Cache random_numbers_ in a safer way. 
- KmMatrix random_numbers_; - - void initialize (size_t _size) { - size_ = _size; - random_numbers_ = KmMatrix (1, size_); - - if (dev_states_ != nullptr) { - CUDA_CHECK(cudaFree(dev_states_)); - } - CUDA_CHECK(cudaMalloc((void **)&dev_states_, size_ * sizeof(curandState))); - kernel::setup_random_states<<>>( - dev_states_, size_); - } - - // public: - UniformGenerator() : dev_states_ (nullptr), size_ (0){} - - UniformGenerator (size_t _size) { - if (_size == 0) { - M_ERROR("Zero size for generate is not allowed."); - } - initialize(_size); - } - - ~UniformGenerator () { - if (dev_states_ != nullptr) { - CUDA_CHECK(cudaFree(dev_states_)); - } - } - - UniformGenerator(const UniformGenerator& _rhs) = delete; - UniformGenerator(UniformGenerator&& _rhs) = delete; - void operator=(const UniformGenerator& _rhs) = delete; - void operator=(UniformGenerator&& _rhs) = delete; - - KmMatrix generate() { - kernel::generate_uniform_kernel<<>> - (random_numbers_.k_param().ptr, dev_states_, size_); - return random_numbers_; - } - KmMatrix generate(size_t _size) { - if (_size == 0) { - M_ERROR("Zero size for generate is not allowed."); - } - if (_size != size_) { - initialize(_size); - } - return generate(); - } -}; - /* * Base class used for all K-Means initialization algorithms. */ @@ -167,7 +52,8 @@ struct KmeansLlInit : public KmeansInitBase { T over_sample_; int seed_; int k_; - UniformGenerator uniform_dist; + // UniformGenerator uniform_dist; + std::unique_ptr> generator_; // Buffer like variables // store the self dot product of each data point @@ -189,7 +75,8 @@ struct KmeansLlInit : public KmeansInitBase { * seed = 0, */ KmeansLlInit () : - over_sample_ (1.5f), seed_ (0), k_ (0) {} + over_sample_ (1.5f), seed_ (-1), k_ (0), + generator_ (new UniformGenerator) {} /* * Initialize KmeansLlInit algorithm, with default: @@ -201,17 +88,30 @@ struct KmeansLlInit : public KmeansInitBase { * point doesn't add to 1. 
*/ KmeansLlInit (T _over_sample) : - over_sample_ (_over_sample), seed_ (0), k_ (0) {} + over_sample_ (_over_sample), seed_ (-1), k_ (0), + generator_ (new UniformGenerator) {} /* * Initialize KmeansLlInit algorithm. * * @param seed Seed used to generate threshold for sampling centroids. * @param over_sample over_sample rate. - * p_x = over_sample \times \frac{d^2(x, C)}{\Phi_X (C)} + * \f$p_x = over_sample \times \frac{d^2(x, C)}{\Phi_X (C)}\f$ */ KmeansLlInit (int _seed, T _over_sample) : - seed_(_seed), seed_(_seed), k_(0) {} + seed_(_seed), k_(0), + generator_ (new UniformGenerator(seed_)) {} + + /* + * Initialize KmeansLlInit algorithm. + * + * @param gen Unique pointer to a generator used to generate threshold for + * sampling centroids. + * @param over_sample over_sample rate. + * \f$p_x = over_sample \times \frac{d^2(x, C)}{\Phi_X (C)}\f$ + */ + KmeansLlInit (std::unique_ptr>& _gen, T _over_sample) : + generator_(std::move(_gen)), over_sample_ (1.5f), seed_ (-1), k_(0) {} virtual ~KmeansLlInit () override {} diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu index fc2e3ec10..ceab19603 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -5,18 +5,35 @@ #include -// #include "../../../../src/gpu/kmeans/Eigen/Dense" #include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp" +#include "../../../../src/gpu/kmeans/KmMatrix/Generator.hpp"" #include "../../../../src/gpu/kmeans/kmeans_init.cuh" #include #include +#include + +using namespace H2O4GPU::KMeans; + +template +struct GeneratorMock : GeneratorBase { + public: + KmMatrix generate() override {} + + KmMatrix generate(size_t _size) override { + thrust::host_vector random_numbers (_size); + for (size_t i = 0; i < _size; ++i) { + random_numbers[i] = 1 / _size; + } + KmMatrix res (random_numbers, 1, _size); + return res; + } +}; TEST(KmeansLL, KmeansLLInit) { - int k = 2; - - H2O4GPU::KMeans::KmeansLlInit 
kmeans_ll_init; + std::unique_ptr> mock_ptr (new GeneratorMock); + KmeansLlInit kmeans_ll_init (mock_ptr, 2.5); thrust::host_vector _h_data (16); @@ -36,7 +53,7 @@ TEST(KmeansLL, KmeansLLInit) { H2O4GPU::KMeans::KmMatrix h_data (_h_data, 4, 4); - auto result = kmeans_ll_init (h_data, 1.0f); - result.set_name ("kmeans result"); + auto result = kmeans_ll_init(h_data, 1.0f); + result.set_name("kmeans with mock"); std::cout << result << std::endl; -} \ No newline at end of file +} From 8c21e8e2118c78860867ffd5d29c5bc6daa0a943 Mon Sep 17 00:00:00 2001 From: fis Date: Mon, 23 Jul 2018 09:35:53 +0800 Subject: [PATCH 19/49] Rename shared memory structs, add license. --- src/gpu/kmeans/KmMatrix/Generator.cuh | 5 +++++ src/gpu/kmeans/KmMatrix/Generator.hpp | 5 +++++ src/gpu/kmeans/KmMatrix/GeneratorKernels.cu | 5 +++++ src/gpu/kmeans/KmMatrix/utils.cuh | 8 ++++---- src/gpu/kmeans/kmeans_init.cuh | 2 +- 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/gpu/kmeans/KmMatrix/Generator.cuh b/src/gpu/kmeans/KmMatrix/Generator.cuh index a2b6bb013..890729f3c 100644 --- a/src/gpu/kmeans/KmMatrix/Generator.cuh +++ b/src/gpu/kmeans/KmMatrix/Generator.cuh @@ -1,3 +1,8 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + #include #include diff --git a/src/gpu/kmeans/KmMatrix/Generator.hpp b/src/gpu/kmeans/KmMatrix/Generator.hpp index 3a37d8bdd..308a7b040 100644 --- a/src/gpu/kmeans/KmMatrix/Generator.hpp +++ b/src/gpu/kmeans/KmMatrix/Generator.hpp @@ -1,3 +1,8 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + #ifndef GENERATOR_HPP_ #define GENERATOR_HPP_ diff --git a/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu b/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu index 420712a85..b1bd799f0 100644 --- a/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu +++ b/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu @@ -1,3 +1,8 @@ +/*! + * Copyright 2018 H2O.ai, Inc. 
+ * License Apache License Version 2.0 (see LICENSE for details) + */ + #include #include diff --git a/src/gpu/kmeans/KmMatrix/utils.cuh b/src/gpu/kmeans/KmMatrix/utils.cuh index 8555cf1e1..606359fda 100644 --- a/src/gpu/kmeans/KmMatrix/utils.cuh +++ b/src/gpu/kmeans/KmMatrix/utils.cuh @@ -36,10 +36,10 @@ T1 M_HOSTDEVINLINE div_roundup(const T1 a, const T2 b) { // Work around for shared memory // https://stackoverflow.com/questions/20497209/getting-cuda-error-declaration-is-incompatible-with-previous-variable-name template -struct KmShardMem; +struct KmSharedMem; template <> -struct KmShardMem { +struct KmSharedMem { __device__ float * ptr() { extern __shared__ __align__(sizeof(float)) float s_float[]; return s_float; @@ -47,7 +47,7 @@ struct KmShardMem { }; template <> -struct KmShardMem { +struct KmSharedMem { __device__ double * ptr() { extern __shared__ __align__(sizeof(double)) double s_double[]; return s_double; @@ -55,7 +55,7 @@ struct KmShardMem { }; template <> -struct KmShardMem { +struct KmSharedMem { __device__ int * ptr() { extern __shared__ __align__(sizeof(int)) int s_int[]; return s_int; diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index de15a82d2..372c24dfd 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -52,7 +52,7 @@ struct KmeansLlInit : public KmeansInitBase { T over_sample_; int seed_; int k_; - // UniformGenerator uniform_dist; + std::unique_ptr> generator_; // Buffer like variables From 458e54e43c88ddbee0f0da6accf0a7c252cfd128 Mon Sep 17 00:00:00 2001 From: fis Date: Mon, 23 Jul 2018 14:57:07 +0800 Subject: [PATCH 20/49] Fix handles_ memory leak. 
--- src/gpu/kmeans/KmMatrix/GpuInfo.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh index 646260e8a..723f9e557 100644 --- a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh +++ b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh @@ -37,6 +37,7 @@ class GpuInfo { for (size_t i = 0; i < n_gpu_; ++i) { CUBLAS_CHECK(cublasDestroy(handles_[i])); } + free (handles_); } static GpuInfo& ins() { From d3b18df8782d4c3a41ddd11b79929b6d10a5bf2c Mon Sep 17 00:00:00 2001 From: fis Date: Mon, 23 Jul 2018 14:57:26 +0800 Subject: [PATCH 21/49] Add argmin op. Fix min op for redundant memory access. --- src/gpu/kmeans/kmeans_init.cu | 43 +++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 4c38c5892..17ca61c41 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -33,14 +33,12 @@ namespace kernel { * @param _res The output matrix with shape m x 1 * @param _val The input matrix with shape m x n */ -template +template __global__ void row_min_sequential(kParam _res, kParam _val) { size_t idx = global_thread_idx(); size_t stride = grid_stride_x () * _val.cols; - size_t n_batches = div_roundup(_val.cols, 128); - for (size_t i = idx; i < _val.size(); i += stride) { T min = std::numeric_limits::max(); @@ -48,8 +46,31 @@ __global__ void row_min_sequential(kParam _res, kParam _val) { T tmp = _val.ptr[i+j]; if (tmp < min) min = tmp; - _res.ptr[idx] = tmp; } + + _res.ptr[idx] = min; + } +} + +template +__global__ void row_argmin_sequential(kParam _res, kParam _val) { + + size_t idx = global_thread_idx(); + size_t stride = grid_stride_x () * _val.cols; + + for (size_t i = idx; i < _val.size(); i += stride) { + T min = std::numeric_limits::max(); + int min_idx = -1; + + for (size_t j = 0; j < _val.cols; ++j) { + T tmp = _val.ptr[i+j]; + if (tmp < min) { + min_idx = i; + min = tmp; + } + } + + _res.ptr[idx] 
= min_idx; } } @@ -134,6 +155,20 @@ struct MulOp { } }; +template +struct ArgMinOp { + void argmin(KmMatrix& _res, KmMatrix& _val, KmMatrixDim _dim) { + size_t blocks = GpuInfo::ins().blocks(32); + if (_dim == KmMatrixDim::ROW) { + kernel::row_argmin_sequential<<>>( + _res.k_param(), _val.k_param()); + } else { + // FIXME + M_ERROR("Not implemented"); + } + } +}; + template struct MinOp { From 9d5a09470b56c11cc774c23648774c42b173b635 Mon Sep 17 00:00:00 2001 From: fis Date: Mon, 23 Jul 2018 22:31:41 +0800 Subject: [PATCH 22/49] Add centroids weighting. --- src/gpu/kmeans/KmMatrix/KmConfig.h | 8 +++- src/gpu/kmeans/kmeans_init.cu | 71 +++++++++++++++++++++++++----- src/gpu/kmeans/kmeans_init.cuh | 1 + 3 files changed, 67 insertions(+), 13 deletions(-) diff --git a/src/gpu/kmeans/KmMatrix/KmConfig.h b/src/gpu/kmeans/KmMatrix/KmConfig.h index dd0c43367..fa718a221 100644 --- a/src/gpu/kmeans/KmMatrix/KmConfig.h +++ b/src/gpu/kmeans/KmMatrix/KmConfig.h @@ -68,8 +68,14 @@ } while (false) #define M_ERROR(msg) \ - printf("%s\n\t in %s, %u, %s\n", msg, __FILE__, __LINE__, __func__); \ + printf("%s\n\t in %s, %u, %s\n", msg, __FILE__, __LINE__, \ + __PRETTY_FUNCTION__); \ abort(); +#define M_USER_ERROR(msg) \ + fprintf(stderr, \ + "%s\n\t in %s, %u, %s\n", msg, __FILE__, __LINE__, \ + __PRETTY_FUNCTION__); \ + exit(1) #endif // KM_CONFIG_H_ diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 17ca61c41..cd7ca84e9 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -157,11 +158,13 @@ struct MulOp { template struct ArgMinOp { - void argmin(KmMatrix& _res, KmMatrix& _val, KmMatrixDim _dim) { + KmMatrix argmin(KmMatrix& _val, KmMatrixDim _dim) { size_t blocks = GpuInfo::ins().blocks(32); if (_dim == KmMatrixDim::ROW) { + KmMatrix _res(_val.rows(), 1); kernel::row_argmin_sequential<<>>( _res.k_param(), _val.k_param()); + return _res; } else { // FIXME 
M_ERROR("Not implemented"); @@ -172,11 +175,13 @@ struct ArgMinOp { template struct MinOp { - void min(KmMatrix& _res, KmMatrix& _val, KmMatrixDim _dim) { + KmMatrix min(KmMatrix& _val, KmMatrixDim _dim) { size_t blocks = GpuInfo::ins().blocks(32); if (_dim == KmMatrixDim::ROW) { + KmMatrix _res(_val.rows(), 1); kernel::row_min_sequential<<>>( _res.k_param(), _val.k_param()); + return _res; } else { // FIXME M_ERROR("Not implemented"); @@ -265,6 +270,33 @@ struct PairWiseDistanceOp { } }; +template +KmMatrix KmeansLlInit::weight_centroids(KmMatrix& _centroids) { + KmMatrix min_indices = ArgMinOp().argmin(_centroids, KmMatrixDim::ROW); + KmMatrix weights (1, _centroids.rows()); + + size_t temp_storage_bytes = 0; + void *d_temp_storage = NULL; + + cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + min_indices.dev_ptr(), + weights.dev_ptr(), + _centroids.rows(), + (T)0.0, + (T)_centroids.rows(), + (int)_centroids.rows()); + + CUDA_CHECK(cudaMalloc((void**)&d_temp_storage, temp_storage_bytes)); + cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + min_indices.dev_ptr(), + weights.dev_ptr(), + _centroids.rows(), + (T)0.0, + (T)_centroids.rows(), + (int)_centroids.rows()); + CUDA_CHECK(cudaFree(d_temp_storage)); + return weights; +} template KmMatrix KmeansLlInit::probability( @@ -279,9 +311,7 @@ KmMatrix KmeansLlInit::probability( PairWiseDistanceOp distance_op (data_dot_, centroids_dot, distance_pairs_); distance_pairs_ = distance_op(_data, _centroids); - KmMatrix min_distances (_data.rows(), 1); - - MinOp().min(min_distances, distance_pairs_, KmMatrixDim::ROW); + KmMatrix min_distances = MinOp().min(distance_pairs_, KmMatrixDim::ROW); T cost = SumOp().sum(min_distances); @@ -341,6 +371,16 @@ template KmMatrix KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { + if (_k > _data.size()) { + char err_msg[128]; + sprintf( + err_msg, + "k must be less than or equal to the number of data points" + ", k: %u, data points: %u", + 
_k, _data.rows()); + M_USER_ERROR(err_msg); + } + if (seed_ < 0) { std::random_device rd; seed_ = rd(); @@ -372,20 +412,27 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { if (centroids.rows() < k_) { // FIXME: When n_centroids < k // Get random selection in? + M_ERROR("Not implemented."); } + KmMatrix weights = weight_centroids(centroids); + weights.set_name ("weights"); + std::cout << weights << std::endl; + // FIXME: re-cluster // kmeans_plus_plus(centroids); return centroids; } -#define INSTANTIATE(T) \ - template KmMatrix KmeansLlInit::operator()( \ - KmMatrix& _data, size_t _k); \ - template KmMatrix KmeansLlInit::probability( \ - KmMatrix& data, KmMatrix& centroids); \ - template KmMatrix KmeansLlInit::sample_centroids( \ - KmMatrix& data, KmMatrix& centroids); \ +#define INSTANTIATE(T) \ + template KmMatrix KmeansLlInit::operator()( \ + KmMatrix& _data, size_t _k); \ + template KmMatrix KmeansLlInit::weight_centroids( \ + KmMatrix& centroids); \ + template KmMatrix KmeansLlInit::probability( \ + KmMatrix& data, KmMatrix& centroids); \ + template KmMatrix KmeansLlInit::sample_centroids( \ + KmMatrix& data, KmMatrix& centroids); \ INSTANTIATE(float) INSTANTIATE(double) diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index 372c24dfd..68f7a64c3 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -61,6 +61,7 @@ struct KmeansLlInit : public KmeansInitBase { // store distances between each data point and centroids KmMatrix distance_pairs_; + KmMatrix weight_centroids(KmMatrix& centroids); KmMatrix probability(KmMatrix& data, KmMatrix& centroids); public: // sample_centroids should not be part of the interface, but following error From 27a5ec144e3ffa01e5ea95e78e0e99ebc86676bd Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 24 Jul 2018 02:08:18 +0800 Subject: [PATCH 23/49] Add simple re-clustering. 
--- src/gpu/kmeans/kmeans_init.cu | 64 +++++++++++++++++++----- src/gpu/kmeans/kmeans_init.cuh | 6 +-- tests/cpp/gpu/kmeans/test_kmeans_init.cu | 2 +- 3 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index cd7ca84e9..4bc827080 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -54,7 +54,7 @@ __global__ void row_min_sequential(kParam _res, kParam _val) { } template -__global__ void row_argmin_sequential(kParam _res, kParam _val) { +__global__ void row_argmin_sequential(kParam _res, kParam _val) { size_t idx = global_thread_idx(); size_t stride = grid_stride_x () * _val.cols; @@ -158,10 +158,10 @@ struct MulOp { template struct ArgMinOp { - KmMatrix argmin(KmMatrix& _val, KmMatrixDim _dim) { + KmMatrix argmin(KmMatrix& _val, KmMatrixDim _dim) { size_t blocks = GpuInfo::ins().blocks(32); if (_dim == KmMatrixDim::ROW) { - KmMatrix _res(_val.rows(), 1); + KmMatrix _res(_val.rows(), 1); kernel::row_argmin_sequential<<>>( _res.k_param(), _val.k_param()); return _res; @@ -270,14 +270,26 @@ struct PairWiseDistanceOp { } }; +// We use counting to construct the weight as described in the paper. Counting +// is performed by histogram algorithm. +// For re-cluster, the paper suggests using K-Means++, but that will require +// copying data back to host. So we simply use those selected centroids with +// highest probability. + +// FIXME: +// Operations performed in K-Means|| loop leads to a-approximate. +// Intuitively, choosing those centroids with highest probability should not +// break this property. But I haven't make the proof. +// And benchmarking should be performed to check the result. 
template -KmMatrix KmeansLlInit::weight_centroids(KmMatrix& _centroids) { - KmMatrix min_indices = ArgMinOp().argmin(_centroids, KmMatrixDim::ROW); +KmMatrix KmeansLlInit::recluster(KmMatrix& _centroids) { + KmMatrix min_indices = ArgMinOp().argmin(_centroids, KmMatrixDim::ROW); KmMatrix weights (1, _centroids.rows()); size_t temp_storage_bytes = 0; void *d_temp_storage = NULL; + // determine the temp_storage_bytes cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, min_indices.dev_ptr(), weights.dev_ptr(), @@ -295,7 +307,38 @@ KmMatrix KmeansLlInit::weight_centroids(KmMatrix& _centroids) { (T)_centroids.rows(), (int)_centroids.rows()); CUDA_CHECK(cudaFree(d_temp_storage)); - return weights; + + // Sort the indices by weights in ascending order, then use those at front + // as result. + thrust::sort_by_key(thrust::device, + weights.dev_ptr(), + weights.dev_ptr() + weights.size(), + min_indices.dev_ptr(), + thrust::greater()); + + int * min_indices_ptr = min_indices.dev_ptr(); + + KmMatrix centroids (k_, _centroids.cols()); + int cols = _centroids.cols(); + size_t k = k_; + + min_indices.set_name ("min_indices"); + std::cout << min_indices << std::endl; + + thrust::copy_if( + thrust::device, + _centroids.dev_ptr(), _centroids.dev_ptr() + _centroids.size(), + centroids.dev_ptr(), + [=] __device__ (int idx) { + size_t row = idx / cols; + for (size_t i = 0; i < k; ++i) { + if (row == min_indices_ptr[i]) + return true; + } + return false; + }); + + return centroids; } template @@ -415,19 +458,14 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { M_ERROR("Not implemented."); } - KmMatrix weights = weight_centroids(centroids); - weights.set_name ("weights"); - std::cout << weights << std::endl; - - // FIXME: re-cluster - // kmeans_plus_plus(centroids); + centroids = recluster(centroids); return centroids; } #define INSTANTIATE(T) \ template KmMatrix KmeansLlInit::operator()( \ KmMatrix& _data, size_t _k); \ - template KmMatrix 
KmeansLlInit::weight_centroids( \ + template KmMatrix KmeansLlInit::recluster( \ KmMatrix& centroids); \ template KmMatrix KmeansLlInit::probability( \ KmMatrix& data, KmMatrix& centroids); \ diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index 68f7a64c3..6805217da 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -61,14 +61,14 @@ struct KmeansLlInit : public KmeansInitBase { // store distances between each data point and centroids KmMatrix distance_pairs_; - KmMatrix weight_centroids(KmMatrix& centroids); KmMatrix probability(KmMatrix& data, KmMatrix& centroids); public: - // sample_centroids should not be part of the interface, but following error - // is generated when put in private section: + // sample_centroids/recluster should not be part of the interface, but + // following error is generated when put in private section: // The enclosing parent function ("sample_centroids") for an extended // __device__ lambda cannot have private or protected access within its class KmMatrix sample_centroids(KmMatrix& data, KmMatrix& centroids); + KmMatrix recluster(KmMatrix& centroids); /* * Initialize KmeansLlInit algorithm, with default: diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu index ceab19603..e86ccf3b4 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -53,7 +53,7 @@ TEST(KmeansLL, KmeansLLInit) { H2O4GPU::KMeans::KmMatrix h_data (_h_data, 4, 4); - auto result = kmeans_ll_init(h_data, 1.0f); + auto result = kmeans_ll_init(h_data, 2); result.set_name("kmeans with mock"); std::cout << result << std::endl; } From 09635e0666f760c2539337b86df6d6fb286d96de Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 24 Jul 2018 14:42:57 +0800 Subject: [PATCH 24/49] Factor out basic arith ops, add tests for them. 
--- src/gpu/kmeans/KmMatrix/Arith.hpp | 182 +++++++++++++++++++++++++++ src/gpu/kmeans/kmeans_init.cu | 171 +------------------------ tests/cpp/gpu/KmMatrix/test_arith.cu | 129 +++++++++++++++++++ 3 files changed, 315 insertions(+), 167 deletions(-) create mode 100644 src/gpu/kmeans/KmMatrix/Arith.hpp create mode 100644 tests/cpp/gpu/KmMatrix/test_arith.cu diff --git a/src/gpu/kmeans/KmMatrix/Arith.hpp b/src/gpu/kmeans/KmMatrix/Arith.hpp new file mode 100644 index 000000000..98e717d4c --- /dev/null +++ b/src/gpu/kmeans/KmMatrix/Arith.hpp @@ -0,0 +1,182 @@ +#ifndef M_ARITH_HPP_ +#define M_ARITH_HPP_ + +#include "KmMatrix.hpp" +#include "blas.cuh" +#include "utils.cuh" + +namespace H2O4GPU { +namespace KMeans { + +namespace kernel { + +/* + * Compute min value for each row. + * @tparam T Numeric type of the data + * @param _res The output matrix with shape m x 1 + * @param _val The input matrix with shape m x n + */ +template +__global__ void row_min_sequential(kParam _res, kParam _val) { + + size_t idx = global_thread_idx(); + size_t stride = grid_stride_x(); + + size_t cols = _val.cols; + + for (size_t i = idx; i < _val.rows; i += stride) { + T min = std::numeric_limits::max(); + printf("cols outer: %u\n", cols); + for (size_t j = 0; j < cols; ++j) { + T tmp = _val.ptr[i+j]; + printf("i: %u, j: %u, tmp: %f, cols: %u\n", i, j, tmp, cols); + if (tmp < min) + min = tmp; + } + printf("i: %u, min: %f\n", i, min); + _res.ptr[idx] = min; + } +} + +template +__global__ void row_argmin_sequential(kParam _res, kParam _val) { + + size_t idx = global_thread_idx(); + size_t stride = grid_stride_x () * _val.cols; + + for (size_t i = idx; i < _val.size(); i += stride) { + T min = std::numeric_limits::max(); + int min_idx = -1; + + for (size_t j = 0; j < _val.cols; ++j) { + T tmp = _val.ptr[i+j]; + if (tmp < min) { + min_idx = i; + min = tmp; + } + } + + _res.ptr[idx] = min_idx; + } +} + +} // namespace kernel + +// FIXME: Using struct for operations is just keeping the 
possibility of +// creating an unified operations for KmMatrix. For example, let KmMatrix +// inherit those left associative ops, or create an inferface for elementwise +// operations. + +// FIXME: Use return value instead. +template +struct DotOp { + void dot(KmMatrix& _res, KmMatrix& _val) { + this->dot(_res, _val, _val); + } + void dot(KmMatrix& _res, KmMatrix& _lhs, + KmMatrix& _rhs) { + constexpr T alpha = 1.0; + constexpr T beta = 1.0; + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + Blas::gemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, // FIXME + _lhs.rows(), _rhs.cols(), _lhs.cols(), + &alpha, + _lhs.dev_ptr(), _lhs.cols(), + _rhs.dev_ptr(), _rhs.cols(), + &beta, + _res.dev_ptr(), _res.cols()); + } +}; + +template +struct VecBatchDotOp { + void dot(KmMatrix& _res, KmMatrix& _val) { + this->dot(_res, _val, _val); + } + void dot(KmMatrix& _res, KmMatrix& _lhs, KmMatrix& _rhs) { + constexpr T alpha = 1.0; + constexpr T beta = 1.0; + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + Blas::gemm_strided_batched( + handle, + CUBLAS_OP_N, CUBLAS_OP_T, + 1, 1, _rhs.cols(), // m, n, k + &alpha, + _lhs.dev_ptr(), 1, _lhs.cols(), + _rhs.dev_ptr(), 1, _rhs.cols(), + &beta, + _res.dev_ptr(), _res.cols(), 1, // c should be columun vector + _lhs.rows()); + } +}; + +template +struct SumOp { + T sum(KmMatrix& _val) { + T* raw_ptr = _val.dev_ptr(); + thrust::device_ptr ptr (raw_ptr); + T res = thrust::reduce(ptr, ptr + _val.size(), (T)0, thrust::plus()); + return res; + } +}; + +template +struct MulOp { + void mul(KmMatrix& _res, KmMatrix& _lhs, T _rhs) { + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + Blas::axpy( + handle, _lhs.size(), // handle, n + &_rhs, // alpha + _lhs.dev_ptr(), 1, + _res.dev_ptr(), 1); + } +}; + + +template +struct MeanOp { + T mean(KmMatrix& _val) { + T res = SumOp().sum(_val); + res = res / _val.size(); + return res; + } +}; + +template +struct ArgMinOp { + KmMatrix argmin(KmMatrix& _val, KmMatrixDim _dim) { + size_t blocks = 
GpuInfo::ins().blocks(32); + if (_dim == KmMatrixDim::ROW) { + KmMatrix _res(_val.rows(), 1); + kernel::row_argmin_sequential<<>>( + _res.k_param(), _val.k_param()); + return _res; + } else { + // FIXME + M_ERROR("Not implemented"); + } + } +}; + +template +struct MinOp { + + KmMatrix min(KmMatrix& _val, KmMatrixDim _dim) { + size_t blocks = GpuInfo::ins().blocks(32); + if (_dim == KmMatrixDim::ROW) { + KmMatrix _res(_val.rows(), 1); + kernel::row_min_sequential<<>>(_res.k_param(), + _val.k_param()); + return _res; + } else { + // FIXME + M_ERROR("Not implemented"); + } + } +}; + +} // namespace KMenas +} // namespace H204GPU + +#endif // M_ARITH_HPP_ diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 4bc827080..e7ff565a2 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -4,7 +4,6 @@ */ #include -#include #include #include @@ -18,176 +17,14 @@ #include "kmeans_init.cuh" #include "KmMatrix/KmMatrix.hpp" +#include "KmMatrix/Arith.hpp" #include "KmMatrix/utils.cuh" #include "KmMatrix/GpuInfo.cuh" #include "KmMatrix/blas.cuh" - namespace H2O4GPU { namespace KMeans { -namespace kernel { - -/* - * Compute min value for each row. 
- * @tparam T Numeric type of the data - * @param _res The output matrix with shape m x 1 - * @param _val The input matrix with shape m x n - */ -template -__global__ void row_min_sequential(kParam _res, kParam _val) { - - size_t idx = global_thread_idx(); - size_t stride = grid_stride_x () * _val.cols; - - for (size_t i = idx; i < _val.size(); i += stride) { - T min = std::numeric_limits::max(); - - for (size_t j = 0; j < _val.cols; ++j) { - T tmp = _val.ptr[i+j]; - if (tmp < min) - min = tmp; - } - - _res.ptr[idx] = min; - } -} - -template -__global__ void row_argmin_sequential(kParam _res, kParam _val) { - - size_t idx = global_thread_idx(); - size_t stride = grid_stride_x () * _val.cols; - - for (size_t i = idx; i < _val.size(); i += stride) { - T min = std::numeric_limits::max(); - int min_idx = -1; - - for (size_t j = 0; j < _val.cols; ++j) { - T tmp = _val.ptr[i+j]; - if (tmp < min) { - min_idx = i; - min = tmp; - } - } - - _res.ptr[idx] = min_idx; - } -} - -} // namespace kernel - - -template -struct DotOp { - void dot(KmMatrix& _res, KmMatrix& _val) { - this->dot(_res, _val, _val); - } - void dot(KmMatrix& _res, KmMatrix& _lhs, - KmMatrix& _rhs) { - constexpr T alpha = 1.0; - constexpr T beta = 1.0; - cublasHandle_t handle = GpuInfo::ins().cublas_handle(); - Blas::gemm(handle, - CUBLAS_OP_T, CUBLAS_OP_N, // FIXME - _lhs.rows(), _rhs.cols(), _lhs.cols(), - &alpha, - _lhs.dev_ptr(), _lhs.cols(), - _rhs.dev_ptr(), _rhs.cols(), - &beta, - _res.dev_ptr(), _res.cols()); - } -}; - -template -struct VecBatchDotOp { - void dot(KmMatrix& _res, KmMatrix& _val) { - this->dot(_res, _val, _val); - } - void dot(KmMatrix& _res, KmMatrix& _lhs, KmMatrix& _rhs) { - constexpr T alpha = 1.0; - constexpr T beta = 1.0; - cublasHandle_t handle = GpuInfo::ins().cublas_handle(); - Blas::gemm_strided_batched( - handle, - // k-means use row major, so transpose the second vector. 
- CUBLAS_OP_N, CUBLAS_OP_T, - 1, 1, _rhs.cols(), // m, n, k - &alpha, - _lhs.dev_ptr(), 1, _lhs.cols(), - _rhs.dev_ptr(), 1, _rhs.cols(), - &beta, - _res.dev_ptr(), _res.cols(), 1, // c should be columun vector - _lhs.rows()); - } -}; - -// FIXME: Using struct for operations is just keeping the possibility of -// creating an unified operations for KmMatrix. For example, let KmMatrix -// inherit those left associative ops, or create an inferface for elementwise -// operations. -template -struct SumOp { - T sum(KmMatrix& _val) { - T* raw_ptr = _val.dev_ptr(); - thrust::device_ptr ptr (raw_ptr); - T res = thrust::reduce(ptr, ptr + _val.size(), (T)0, thrust::plus()); - return res; - } -}; - -template -struct MeanOp { - T mean(KmMatrix& _val) { - T res = SumOp().sum(_val); - return res; - } -}; - -template -struct MulOp { - void mul(KmMatrix& _res, KmMatrix& _lhs, T _rhs) { - cublasHandle_t handle = GpuInfo::ins().cublas_handle(); - Blas::axpy( - handle, _lhs.size(), // handle, n - &_rhs, // alpha - _lhs.dev_ptr(), 1, - _res.dev_ptr(), 1); - } -}; - -template -struct ArgMinOp { - KmMatrix argmin(KmMatrix& _val, KmMatrixDim _dim) { - size_t blocks = GpuInfo::ins().blocks(32); - if (_dim == KmMatrixDim::ROW) { - KmMatrix _res(_val.rows(), 1); - kernel::row_argmin_sequential<<>>( - _res.k_param(), _val.k_param()); - return _res; - } else { - // FIXME - M_ERROR("Not implemented"); - } - } -}; - -template -struct MinOp { - - KmMatrix min(KmMatrix& _val, KmMatrixDim _dim) { - size_t blocks = GpuInfo::ins().blocks(32); - if (_dim == KmMatrixDim::ROW) { - KmMatrix _res(_val.rows(), 1); - kernel::row_min_sequential<<>>( - _res.k_param(), _val.k_param()); - return _res; - } else { - // FIXME - M_ERROR("Not implemented"); - } - } -}; namespace kernel { // X^2 + Y^2, here only calculates the + operation. @@ -277,9 +114,9 @@ struct PairWiseDistanceOp { // highest probability. // FIXME: -// Operations performed in K-Means|| loop leads to a-approximate. 
+// Operations performed in K-Means|| loop leads to a-approximation. // Intuitively, choosing those centroids with highest probability should not -// break this property. But I haven't make the proof. +// break this property. But I haven't made the proof. // And benchmarking should be performed to check the result. template KmMatrix KmeansLlInit::recluster(KmMatrix& _centroids) { @@ -359,7 +196,7 @@ KmMatrix KmeansLlInit::probability( T cost = SumOp().sum(min_distances); KmMatrix prob (min_distances.rows(), 1); - MulOp().mul(prob, min_distances, over_sample_ / cost); + MulOp().mul(prob, min_distances, over_sample_ * k_ / cost); return prob; } diff --git a/tests/cpp/gpu/KmMatrix/test_arith.cu b/tests/cpp/gpu/KmMatrix/test_arith.cu new file mode 100644 index 000000000..9d72a3731 --- /dev/null +++ b/tests/cpp/gpu/KmMatrix/test_arith.cu @@ -0,0 +1,129 @@ +#include +#include +#include + +#include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp" +#include "../../../../src/gpu/kmeans/KmMatrix/Arith.hpp" + +#include + +using namespace H2O4GPU::KMeans; + +constexpr float esp = 0.001f; + +TEST(KmMatrix, ArithDot) { + thrust::host_vector h_data (9); + for (size_t i = 0; i < 9; ++i) { + h_data[i] = (float) i * i; + } + KmMatrix mat (h_data, 3, 3); + KmMatrix res (3, 3); + + DotOp().dot(res, mat); + + std::vector answer_vec + { + 153.0f, 212.0f, 281.0f, + 1044.0f, 1490.0f, 2036.0f, + 2745.0f, 3956.0f, 5465.0f + }; + + KmMatrix answer (answer_vec, 3, 3); + + ASSERT_TRUE(answer == res); +} + +TEST(KmMatrix, ArithVecBatchDot) { + thrust::host_vector h_data (20); + for (size_t i = 0; i < 20; ++i) { + h_data[i] = (float) i * i; + } + + KmMatrix data (h_data, 4, 5); + + KmMatrix res (4, 1); + VecBatchDotOp().dot(res, data); + + thrust::host_vector h_sol (4); + h_sol[0] = 354; + h_sol[1] = 14979; + h_sol[2] = 112354; + h_sol[3] = 434979; + KmMatrix sol (h_sol, 4, 1); + + ASSERT_TRUE(res == sol); +} + +TEST(KmMatrix, ArithSum) { + thrust::host_vector h_data (16); + for (size_t i = 0; 
i < 16; ++i) { + h_data[i] = (float) i * i; + } + KmMatrix mat (h_data, 4, 4); + float res = SumOp().sum(mat); + EXPECT_NEAR(res, 1240.0f, esp); +} + +TEST(KmMatrix, ArithMean) { + thrust::host_vector h_data (16); + for (size_t i = 0; i < 16; ++i) { + h_data[i] = (float) i * i; + } + KmMatrix mat (h_data, 4, 4); + float res = MeanOp().mean(mat); + EXPECT_NEAR(res, 77.5, esp); +} + +TEST(KmMatrix, ArithMul) { + thrust::host_vector h_data (16); + for (size_t i = 0; i < 16; ++i) { + h_data[i] = (float) i * i; + } + KmMatrix mat (h_data, 4, 4); + KmMatrix res (4, 4); + MulOp().mul(res, mat, 2.0f); + + thrust::host_vector h_sol(16); + for (size_t i = 0; i < 16; ++i) { + h_sol[i] = h_data[i] * 2.0f; + } + KmMatrix solution {h_sol, 4, 4}; + + ASSERT_TRUE(res == solution); +} + +TEST(KmMatrix, ArithArgMin) { + std::vector h_data + { + 1.0f, 3.0f, 2.0f, 0.0f, + 3.0f, 1.0f, 0.0f, 2.0f, + 1.0f, 1.0f, 1.0f, 1.0f + }; + + KmMatrix mat (h_data, 3, 4); + KmMatrix res = ArgMinOp().argmin(mat, KmMatrixDim::ROW); + + // std::cout << res << std::endl; + + std::vector solution_vec {3, 2, 0}; + KmMatrix solution (solution_vec, 3, 1); + + ASSERT_TRUE(res == solution); +} + +TEST(KmMatrix, ArithMin) { + std::vector h_data + { + 1.0f, 3.0f, 2.0f, 0.0f, + 3.0f, 1.0f, 0.0f, 2.0f, + 1.0f, 1.0f, 1.0f, 1.0f + }; + KmMatrix mat (h_data, 3, 4); + + KmMatrix res = MinOp().min(mat, KmMatrixDim::ROW); + + std::vector solution_vec {0.0f, 0.0f, 1.0f}; + KmMatrix solution (solution_vec, 3, 1); + + ASSERT_TRUE(res == solution); +} \ No newline at end of file From 62694bf1db250f88bd4d4b8e142bc6e3203bdef7 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 24 Jul 2018 16:59:59 +0800 Subject: [PATCH 25/49] Pass all tests for arith. 
--- src/gpu/kmeans/KmMatrix/Arith.hpp | 40 +++++++++++----------------- tests/cpp/gpu/KmMatrix/test_arith.cu | 2 +- 2 files changed, 16 insertions(+), 26 deletions(-) diff --git a/src/gpu/kmeans/KmMatrix/Arith.hpp b/src/gpu/kmeans/KmMatrix/Arith.hpp index 98e717d4c..5c3a488fe 100644 --- a/src/gpu/kmeans/KmMatrix/Arith.hpp +++ b/src/gpu/kmeans/KmMatrix/Arith.hpp @@ -20,20 +20,14 @@ template __global__ void row_min_sequential(kParam _res, kParam _val) { size_t idx = global_thread_idx(); - size_t stride = grid_stride_x(); - - size_t cols = _val.cols; - - for (size_t i = idx; i < _val.rows; i += stride) { + if (idx < _val.rows) { T min = std::numeric_limits::max(); - printf("cols outer: %u\n", cols); - for (size_t j = 0; j < cols; ++j) { - T tmp = _val.ptr[i+j]; - printf("i: %u, j: %u, tmp: %f, cols: %u\n", i, j, tmp, cols); - if (tmp < min) - min = tmp; + for (size_t i = 0; i < _val.cols; ++i) { + T value = _val.ptr[idx * _val.cols + i]; + if (value < min) { + min = value; + } } - printf("i: %u, min: %f\n", i, min); _res.ptr[idx] = min; } } @@ -42,20 +36,16 @@ template __global__ void row_argmin_sequential(kParam _res, kParam _val) { size_t idx = global_thread_idx(); - size_t stride = grid_stride_x () * _val.cols; - - for (size_t i = idx; i < _val.size(); i += stride) { + if (idx < _val.rows) { T min = std::numeric_limits::max(); int min_idx = -1; - - for (size_t j = 0; j < _val.cols; ++j) { - T tmp = _val.ptr[i+j]; - if (tmp < min) { + for (size_t i = 0; i < _val.cols; ++i) { + T value = _val.ptr[idx * _val.cols + i]; + if (value < min) { + min = value; min_idx = i; - min = tmp; } } - _res.ptr[idx] = min_idx; } } @@ -145,11 +135,11 @@ struct MeanOp { template struct ArgMinOp { + KmMatrix argmin(KmMatrix& _val, KmMatrixDim _dim) { - size_t blocks = GpuInfo::ins().blocks(32); if (_dim == KmMatrixDim::ROW) { KmMatrix _res(_val.rows(), 1); - kernel::row_argmin_sequential<<>>( + kernel::row_argmin_sequential<<>>( _res.k_param(), _val.k_param()); return _res; } else { @@ 
-166,8 +156,8 @@ struct MinOp { size_t blocks = GpuInfo::ins().blocks(32); if (_dim == KmMatrixDim::ROW) { KmMatrix _res(_val.rows(), 1); - kernel::row_min_sequential<<>>(_res.k_param(), - _val.k_param()); + kernel::row_min_sequential<<>>( + _res.k_param(), _val.k_param()); return _res; } else { // FIXME diff --git a/tests/cpp/gpu/KmMatrix/test_arith.cu b/tests/cpp/gpu/KmMatrix/test_arith.cu index 9d72a3731..a388b0e00 100644 --- a/tests/cpp/gpu/KmMatrix/test_arith.cu +++ b/tests/cpp/gpu/KmMatrix/test_arith.cu @@ -122,7 +122,7 @@ TEST(KmMatrix, ArithMin) { KmMatrix res = MinOp().min(mat, KmMatrixDim::ROW); - std::vector solution_vec {0.0f, 0.0f, 1.0f}; + std::vector solution_vec {0.0f, 0.0f, 1.0f}; KmMatrix solution (solution_vec, 3, 1); ASSERT_TRUE(res == solution); From ec6197483b5651f9268af4022b0922b2d4a6be21 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 24 Jul 2018 18:18:05 +0800 Subject: [PATCH 26/49] Add tests for distance_pairs. --- src/gpu/kmeans/kmeans_init.cu | 131 ++++++++++++++--------- src/gpu/kmeans/kmeans_init.cuh | 26 +++++ tests/cpp/gpu/KmMatrix/test_arith.cu | 2 - tests/cpp/gpu/kmeans/test_kmeans_init.cu | 59 +++++++++- 4 files changed, 159 insertions(+), 59 deletions(-) diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index e7ff565a2..2e783d57d 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -41,7 +41,7 @@ __global__ void construct_distance_pairs_kernel( size_t stride_x = grid_stride_x () * _data_dots.cols; // strides only for data. for (size_t i = idx; i < _data_dots.rows; i += stride_x) { - if (i < _data_dots.rows && idy < _centroids_dots.rows ) { + if (idy < _centroids_dots.rows ) { // i + idy: x^2 + y^2 between i^th data (a.k.a x) and idy^th // centroid (a.k.a y) _distance_pairs.ptr[i + idy] = @@ -52,60 +52,64 @@ __global__ void construct_distance_pairs_kernel( } // namespace kernel -// Extracted as an independent Op for k-means use. 
+namespace detail { template -struct PairWiseDistanceOp { - KmMatrix data_dot_; - KmMatrix centroids_dot_; - KmMatrix distance_pairs_; - - bool initialized_; - - void initialize(size_t _n_data, size_t k, size_t _dim) { - // FIXME - } - - PairWiseDistanceOp () : initialized_(false) {} - - PairWiseDistanceOp (KmMatrix& _data_dot, KmMatrix& _centroids_dot, - KmMatrix& _distance_pairs) : - data_dot_(_data_dot), centroids_dot_(_centroids_dot), - distance_pairs_(_distance_pairs), initialized_(true) { - data_dot_.set_name ("data dot"); - centroids_dot_.set_name ("centroids_dot"); - distance_pairs_.set_name ("distance pairs"); - } - - KmMatrix operator()(KmMatrix& _data, KmMatrix& _centroids) { - - kernel::construct_distance_pairs_kernel<<< - dim3(GpuInfo::ins().blocks(32), div_roundup(_centroids.rows(), 16)), - dim3(32, 16)>>>( // FIXME: Tune this. - distance_pairs_.k_param(), - data_dot_.k_param(), - centroids_dot_.k_param()); +void PairWiseDistanceOp::initialize(KmMatrix& _data_dot, + KmMatrix& _centroids_dot, + KmMatrix& _distance_pairs) { + _data_dot = _data_dot; + centroids_dot_ = _centroids_dot; + distance_pairs_ = _distance_pairs; + initialized_ = true; +} - CUDA_CHECK(cudaGetLastError()); +template +PairWiseDistanceOp::PairWiseDistanceOp (KmMatrix& _data_dot, + KmMatrix& _centroids_dot, + KmMatrix& _distance_pairs) : + data_dot_(_data_dot), centroids_dot_(_centroids_dot), + distance_pairs_(_distance_pairs), initialized_(true) {} - cublasHandle_t handle = GpuInfo::ins().cublas_handle(); +template +KmMatrix PairWiseDistanceOp::operator()(KmMatrix& _data, + KmMatrix& _centroids) { + + kernel::construct_distance_pairs_kernel<<< + dim3(GpuInfo::ins().blocks(32), div_roundup(_centroids.rows(), 16)), + dim3(32, 16)>>>( // FIXME: Tune this. 
+ distance_pairs_.k_param(), + data_dot_.k_param(), + centroids_dot_.k_param()); + + CUDA_CHECK(cudaGetLastError()); + + distance_pairs_.set_name ("distance pairs"); + std::cout << distance_pairs_ << std::endl; + + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + + T alpha = -2.0; + T beta = 1.0; + + Blas::gemm( + handle, + CUBLAS_OP_T, CUBLAS_OP_N, + // n, d, d/k + _data.rows(), _data.cols(), _data.cols(), + &alpha, + _data.dev_ptr(), _data.rows(), + _centroids.dev_ptr(), _centroids.cols(), + &beta, + distance_pairs_.dev_ptr(), distance_pairs_.rows()); + + return distance_pairs_; +} - T alpha = -2.0; - T beta = 1.0; +} // namespace detail - Blas::gemm( - handle, - CUBLAS_OP_T, CUBLAS_OP_N, - // n, d, d/k - _data.rows(), _data.cols(), _data.cols(), - &alpha, - _data.dev_ptr(), _data.rows(), - _centroids.dev_ptr(), _centroids.cols(), - &beta, - distance_pairs_.dev_ptr(), distance_pairs_.rows()); - return distance_pairs_; - } -}; + +/* ============== Class member functions ============== */ // We use counting to construct the weight as described in the paper. Counting // is performed by histogram algorithm. 
@@ -159,9 +163,6 @@ KmMatrix KmeansLlInit::recluster(KmMatrix& _centroids) { int cols = _centroids.cols(); size_t k = k_; - min_indices.set_name ("min_indices"); - std::cout << min_indices << std::endl; - thrust::copy_if( thrust::device, _centroids.dev_ptr(), _centroids.dev_ptr() + _centroids.size(), @@ -188,7 +189,8 @@ KmMatrix KmeansLlInit::probability( // FIXME: Time this distance_pairs_ = KmMatrix(_data.rows(), _centroids.rows()); - PairWiseDistanceOp distance_op (data_dot_, centroids_dot, distance_pairs_); + detail::PairWiseDistanceOp distance_op ( + data_dot_, centroids_dot, distance_pairs_); distance_pairs_ = distance_op(_data, _centroids); KmMatrix min_distances = MinOp().min(distance_pairs_, KmMatrixDim::ROW); @@ -311,6 +313,29 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { INSTANTIATE(float) INSTANTIATE(double) + +#undef INSTANTIATE + +namespace detail { + +#define INSTANTIATE(T) \ + template PairWiseDistanceOp::PairWiseDistanceOp ( \ + KmMatrix& _data_dot, \ + KmMatrix& _centroids_dot, \ + KmMatrix& _distance_pairs); \ + template void PairWiseDistanceOp::initialize( \ + KmMatrix& _data_dot, \ + KmMatrix& _centroids_dot, \ + KmMatrix& _distance_pairs); \ + template KmMatrix PairWiseDistanceOp::operator()( \ + KmMatrix& _data, \ + KmMatrix& _centroids); + +INSTANTIATE(float) +INSTANTIATE(double) + +#undef INSTANTIATE +} // FIXME: int is not supported due to random kernel } // namespace Kmeans diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index 6805217da..f7274d427 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -19,6 +19,32 @@ namespace H2O4GPU{ namespace KMeans { +namespace detail { + +// Extracted as an independent Op for k-means use. 
+template +struct PairWiseDistanceOp { + private: + KmMatrix data_dot_; + KmMatrix centroids_dot_; + KmMatrix distance_pairs_; + + bool initialized_; + + public: + void initialize (KmMatrix& _data_dot, KmMatrix& _centroids_dot, + KmMatrix& _distance_pairs); + + PairWiseDistanceOp () : initialized_(false) {} + + PairWiseDistanceOp (KmMatrix& _data_dot, KmMatrix& _centroids_dot, + KmMatrix& _distance_pairs); + + KmMatrix operator()(KmMatrix& _data, KmMatrix& _centroids); +}; + +} // namespace detail + /* * Base class used for all K-Means initialization algorithms. */ diff --git a/tests/cpp/gpu/KmMatrix/test_arith.cu b/tests/cpp/gpu/KmMatrix/test_arith.cu index a388b0e00..fda0db018 100644 --- a/tests/cpp/gpu/KmMatrix/test_arith.cu +++ b/tests/cpp/gpu/KmMatrix/test_arith.cu @@ -103,8 +103,6 @@ TEST(KmMatrix, ArithArgMin) { KmMatrix mat (h_data, 3, 4); KmMatrix res = ArgMinOp().argmin(mat, KmMatrixDim::ROW); - // std::cout << res << std::endl; - std::vector solution_vec {3, 2, 0}; KmMatrix solution (solution_vec, 3, 1); diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu index e86ccf3b4..4b9af3193 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -6,7 +6,8 @@ #include #include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp" -#include "../../../../src/gpu/kmeans/KmMatrix/Generator.hpp"" +#include "../../../../src/gpu/kmeans/KmMatrix/Generator.hpp" +#include "../../../../src/gpu/kmeans/KmMatrix/Arith.hpp" #include "../../../../src/gpu/kmeans/kmeans_init.cuh" #include @@ -30,6 +31,56 @@ struct GeneratorMock : GeneratorBase { } }; +TEST(KmeansLL, PairWiseDistance) { + + thrust::host_vector h_data (20); + for (size_t i = 0; i < 20; ++i) { + h_data[i] = i * 2; + } + KmMatrix data (h_data, 4, 5); + data.set_name ("data"); + + thrust::host_vector h_centroids(10); + for (size_t i = 0; i < 10; ++i) { + h_centroids[i] = i; + } + KmMatrix centroids (h_centroids, 2, 5); + 
centroids.set_name ("centroids"); + + KmMatrix data_dot (4, 1); + data_dot.set_name ("data_dot"); + VecBatchDotOp().dot(data_dot, data); + std::cout << data_dot << std::endl; + + KmMatrix centroids_dot (2, 1); + centroids_dot.set_name ("centroids dot"); + VecBatchDotOp().dot(centroids_dot, centroids); + std::cout << centroids_dot << std::endl; + + + thrust::host_vector h_pairs (8); + for (size_t i = 0; i < 8; ++i) { + h_pairs[i] = 1; + } + KmMatrix distance_pairs (h_pairs, 4, 2); + + KmMatrix res = detail::PairWiseDistanceOp( + data_dot, centroids_dot, distance_pairs)(data, centroids); + res.set_name ("pw res"); + std::cout << res << std::endl; + + std::vector h_sol + { + 151, 376, + 1051, 1276, + 2951, 3176, + 5851, 6076 + }; + KmMatrix sol (h_sol, 4, 2); + + ASSERT_TRUE(sol == res); +} + TEST(KmeansLL, KmeansLLInit) { int k = 2; std::unique_ptr> mock_ptr (new GeneratorMock); @@ -53,7 +104,7 @@ TEST(KmeansLL, KmeansLLInit) { H2O4GPU::KMeans::KmMatrix h_data (_h_data, 4, 4); - auto result = kmeans_ll_init(h_data, 2); - result.set_name("kmeans with mock"); - std::cout << result << std::endl; + // auto result = kmeans_ll_init(h_data, 2); + // result.set_name("kmeans with mock"); + // std::cout << result << std::endl; } From c5d8bfe2ae20ab9c0763187b304bf1441e242a9d Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 25 Jul 2018 01:20:29 +0800 Subject: [PATCH 27/49] Pass pairwise distance test. --- src/gpu/kmeans/kmeans_init.cu | 18 +++++++--------- tests/cpp/gpu/kmeans/test_kmeans_init.cu | 27 +++++++++--------------- 2 files changed, 18 insertions(+), 27 deletions(-) diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 2e783d57d..2ead2ff0a 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -38,13 +38,13 @@ __global__ void construct_distance_pairs_kernel( // FIXME: Is using shared memory necessary? - size_t stride_x = grid_stride_x () * _data_dots.cols; + size_t stride_x = grid_stride_x (); // strides only for data. 
for (size_t i = idx; i < _data_dots.rows; i += stride_x) { if (idy < _centroids_dots.rows ) { // i + idy: x^2 + y^2 between i^th data (a.k.a x) and idy^th // centroid (a.k.a y) - _distance_pairs.ptr[i + idy] = + _distance_pairs.ptr[i*_centroids_dots.rows + idy] = _data_dots.ptr[idx] + _centroids_dots.ptr[idy]; } } @@ -53,11 +53,12 @@ __global__ void construct_distance_pairs_kernel( } // namespace kernel namespace detail { + template void PairWiseDistanceOp::initialize(KmMatrix& _data_dot, KmMatrix& _centroids_dot, KmMatrix& _distance_pairs) { - _data_dot = _data_dot; + data_dot_ = _data_dot; centroids_dot_ = _centroids_dot; distance_pairs_ = _distance_pairs; initialized_ = true; @@ -83,9 +84,6 @@ KmMatrix PairWiseDistanceOp::operator()(KmMatrix& _data, CUDA_CHECK(cudaGetLastError()); - distance_pairs_.set_name ("distance pairs"); - std::cout << distance_pairs_ << std::endl; - cublasHandle_t handle = GpuInfo::ins().cublas_handle(); T alpha = -2.0; @@ -95,12 +93,12 @@ KmMatrix PairWiseDistanceOp::operator()(KmMatrix& _data, handle, CUBLAS_OP_T, CUBLAS_OP_N, // n, d, d/k - _data.rows(), _data.cols(), _data.cols(), + _centroids.rows(), _data.rows(), _data.cols(), &alpha, - _data.dev_ptr(), _data.rows(), _centroids.dev_ptr(), _centroids.cols(), + _data.dev_ptr(), _data.cols(), &beta, - distance_pairs_.dev_ptr(), distance_pairs_.rows()); + distance_pairs_.dev_ptr(), _centroids.rows()); return distance_pairs_; } @@ -120,7 +118,7 @@ KmMatrix PairWiseDistanceOp::operator()(KmMatrix& _data, // FIXME: // Operations performed in K-Means|| loop leads to a-approximation. // Intuitively, choosing those centroids with highest probability should not -// break this property. But I haven't made the proof. +// break this property. But I haven't made the argument. // And benchmarking should be performed to check the result. 
template KmMatrix KmeansLlInit::recluster(KmMatrix& _centroids) { diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu index 4b9af3193..1e8559ac8 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -31,6 +31,7 @@ struct GeneratorMock : GeneratorBase { } }; +// r --gtest_filter=KmeansLL.PairWiseDistance TEST(KmeansLL, PairWiseDistance) { thrust::host_vector h_data (20); @@ -38,49 +39,41 @@ TEST(KmeansLL, PairWiseDistance) { h_data[i] = i * 2; } KmMatrix data (h_data, 4, 5); - data.set_name ("data"); thrust::host_vector h_centroids(10); for (size_t i = 0; i < 10; ++i) { h_centroids[i] = i; } KmMatrix centroids (h_centroids, 2, 5); - centroids.set_name ("centroids"); KmMatrix data_dot (4, 1); - data_dot.set_name ("data_dot"); VecBatchDotOp().dot(data_dot, data); - std::cout << data_dot << std::endl; KmMatrix centroids_dot (2, 1); - centroids_dot.set_name ("centroids dot"); VecBatchDotOp().dot(centroids_dot, centroids); - std::cout << centroids_dot << std::endl; - thrust::host_vector h_pairs (8); for (size_t i = 0; i < 8; ++i) { - h_pairs[i] = 1; + h_pairs[i] = 0; } KmMatrix distance_pairs (h_pairs, 4, 2); KmMatrix res = detail::PairWiseDistanceOp( data_dot, centroids_dot, distance_pairs)(data, centroids); - res.set_name ("pw res"); - std::cout << res << std::endl; std::vector h_sol { - 151, 376, - 1051, 1276, - 2951, 3176, - 5851, 6076 + 30., 55., + 730., 255., + 2430., 1455., + 5130., 3655., }; KmMatrix sol (h_sol, 4, 2); ASSERT_TRUE(sol == res); } +// r --gtest_filter=KmeansLL.KmeansLLInit TEST(KmeansLL, KmeansLLInit) { int k = 2; std::unique_ptr> mock_ptr (new GeneratorMock); @@ -104,7 +97,7 @@ TEST(KmeansLL, KmeansLLInit) { H2O4GPU::KMeans::KmMatrix h_data (_h_data, 4, 4); - // auto result = kmeans_ll_init(h_data, 2); - // result.set_name("kmeans with mock"); - // std::cout << result << std::endl; + auto result = kmeans_ll_init(h_data, 2); + result.set_name("kmeans with 
mock"); + std::cout << result << std::endl; } From 940d20e41228682837af01fe385247fef238df87 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 25 Jul 2018 03:14:59 +0800 Subject: [PATCH 28/49] Add test for GreedyRecluster. * Re-clustering is now a policy class, for easier testing and flexibility. --- src/gpu/kmeans/kmeans_init.cu | 123 ++++++++++++++++------- src/gpu/kmeans/kmeans_init.cuh | 16 ++- tests/cpp/gpu/kmeans/test_kmeans_init.cu | 30 ++++++ 3 files changed, 128 insertions(+), 41 deletions(-) diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 2ead2ff0a..825f1e946 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -50,6 +50,24 @@ __global__ void construct_distance_pairs_kernel( } } +template +__global__ void self_row_argmin_sequential(kParam _res, kParam _val) { + + size_t idx = global_thread_idx(); + if (idx < _val.rows) { + T min = std::numeric_limits::max(); + int min_idx = -1; + for (size_t i = 0; i < _val.cols; ++i) { + T value = _val.ptr[idx * _val.cols + i]; + if (value < min && value != 0) { + min = value; + min_idx = i; + } + } + _res.ptr[idx] = min_idx; + } +} + } // namespace kernel namespace detail { @@ -103,48 +121,72 @@ KmMatrix PairWiseDistanceOp::operator()(KmMatrix& _data, return distance_pairs_; } -} // namespace detail - +// ArgMin operation that exclude 0. Used when dealing with distance_pairs +// in recluster where distance between points with itself is calculated, +// hence the name. +// FIXME: Maybe generalize it to selection algorithm. 
+template +struct SelfArgMinOp { + + KmMatrix argmin(KmMatrix& _val, KmMatrixDim _dim) { + if (_dim == KmMatrixDim::ROW) { + KmMatrix _res(_val.rows(), 1); + kernel::self_row_argmin_sequential<<< + div_roundup(_val.rows(), 256), 256>>>(_res.k_param(), + _val.k_param()); + return _res; + } else { + // FIXME + M_ERROR("Not implemented"); + } + } - -/* ============== Class member functions ============== */ +}; // We use counting to construct the weight as described in the paper. Counting // is performed by histogram algorithm. // For re-cluster, the paper suggests using K-Means++, but that will require // copying data back to host. So we simply use those selected centroids with // highest probability. - -// FIXME: -// Operations performed in K-Means|| loop leads to a-approximation. -// Intuitively, choosing those centroids with highest probability should not -// break this property. But I haven't made the argument. -// And benchmarking should be performed to check the result. template -KmMatrix KmeansLlInit::recluster(KmMatrix& _centroids) { - KmMatrix min_indices = ArgMinOp().argmin(_centroids, KmMatrixDim::ROW); +KmMatrix GreedyRecluster::recluster(KmMatrix& _centroids, size_t _k) { + // Get the distance pairs for centroids + KmMatrix centroids_dot (_centroids.rows(), 1); + VecBatchDotOp().dot(centroids_dot, _centroids); + KmMatrix distance_pairs (_centroids.rows(), _centroids.rows()); + PairWiseDistanceOp centroids_distance_op( + centroids_dot, centroids_dot, distance_pairs); + distance_pairs = centroids_distance_op(_centroids, _centroids); + + // get the closest x_j for each x_i in centroids. 
+ KmMatrix min_indices = SelfArgMinOp().argmin(distance_pairs, + KmMatrixDim::ROW); + + // use historgram to get counting for weights KmMatrix weights (1, _centroids.rows()); size_t temp_storage_bytes = 0; void *d_temp_storage = NULL; // determine the temp_storage_bytes - cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, - min_indices.dev_ptr(), - weights.dev_ptr(), - _centroids.rows(), - (T)0.0, - (T)_centroids.rows(), - (int)_centroids.rows()); + CUDA_CHECK(cub::DeviceHistogram::HistogramEven( + d_temp_storage, temp_storage_bytes, + min_indices.dev_ptr(), + weights.dev_ptr(), + min_indices.rows() + 1, + (T)0.0, + (T)min_indices.rows(), + (int)_centroids.rows())); CUDA_CHECK(cudaMalloc((void**)&d_temp_storage, temp_storage_bytes)); - cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, - min_indices.dev_ptr(), - weights.dev_ptr(), - _centroids.rows(), - (T)0.0, - (T)_centroids.rows(), - (int)_centroids.rows()); + CUDA_CHECK(cub::DeviceHistogram::HistogramEven( + d_temp_storage, temp_storage_bytes, + min_indices.dev_ptr(), // d_samples + weights.dev_ptr(), // d_histogram + min_indices.rows() + 1, // num_levels + (T)0.0, // lower_level + (T)min_indices.rows(), // upper_level + (int)_centroids.rows())); // num_samples CUDA_CHECK(cudaFree(d_temp_storage)); // Sort the indices by weights in ascending order, then use those at front @@ -157,9 +199,8 @@ KmMatrix KmeansLlInit::recluster(KmMatrix& _centroids) { int * min_indices_ptr = min_indices.dev_ptr(); - KmMatrix centroids (k_, _centroids.cols()); + KmMatrix centroids (_k, _centroids.cols()); int cols = _centroids.cols(); - size_t k = k_; thrust::copy_if( thrust::device, @@ -167,7 +208,7 @@ KmMatrix KmeansLlInit::recluster(KmMatrix& _centroids) { centroids.dev_ptr(), [=] __device__ (int idx) { size_t row = idx / cols; - for (size_t i = 0; i < k; ++i) { + for (size_t i = 0; i < _k; ++i) { if (row == min_indices_ptr[i]) return true; } @@ -177,8 +218,13 @@ KmMatrix 
KmeansLlInit::recluster(KmMatrix& _centroids) { return centroids; } -template -KmMatrix KmeansLlInit::probability( +} // namespace detail + + + +/* ============== KmeansLlInit Class member functions ============== */ +template class ReclusterPolicy > +KmMatrix KmeansLlInit::probability( KmMatrix& _data, KmMatrix& _centroids) { KmMatrix centroids_dot (_centroids.rows(), 1); @@ -202,8 +248,8 @@ KmMatrix KmeansLlInit::probability( } -template -KmMatrix KmeansLlInit::sample_centroids( +template class ReclusterPolicy > +KmMatrix KmeansLlInit::sample_centroids( KmMatrix& _data, KmMatrix& _prob) { KmMatrix thresholds = generator_->generate(_data.rows()); @@ -247,9 +293,9 @@ KmMatrix KmeansLlInit::sample_centroids( return new_centroids; } -template +template class ReclusterPolicy> KmMatrix -KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { +KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { if (_k > _data.size()) { char err_msg[128]; @@ -295,15 +341,14 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { M_ERROR("Not implemented."); } - centroids = recluster(centroids); + std::cout << centroids << std::endl; + centroids = ReclusterPolicy::recluster(centroids, k_); return centroids; } #define INSTANTIATE(T) \ template KmMatrix KmeansLlInit::operator()( \ KmMatrix& _data, size_t _k); \ - template KmMatrix KmeansLlInit::recluster( \ - KmMatrix& centroids); \ template KmMatrix KmeansLlInit::probability( \ KmMatrix& data, KmMatrix& centroids); \ template KmMatrix KmeansLlInit::sample_centroids( \ @@ -327,7 +372,7 @@ namespace detail { KmMatrix& _distance_pairs); \ template KmMatrix PairWiseDistanceOp::operator()( \ KmMatrix& _data, \ - KmMatrix& _centroids); + KmMatrix& _centroids); \ INSTANTIATE(float) INSTANTIATE(double) diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index f7274d427..458244f20 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -21,6 +21,16 @@ namespace KMeans { namespace detail { +// 
FIXME: +// Operations performed in K-Means|| loop leads to a-approximation. +// Intuitively, choosing those centroids with highest probability should not +// break this property. But I haven't made the argument yet. +// And benchmarking should be performed to check the result. +template +struct GreedyRecluster { + static KmMatrix recluster(KmMatrix& _centroids, size_t _k); +}; + // Extracted as an independent Op for k-means use. template struct PairWiseDistanceOp { @@ -72,7 +82,10 @@ class KmeansInitBase { * * @tparam Data type, supported types are float and double. */ -template +template < + typename T, + template + class ReclusterPolicy = detail::GreedyRecluster> struct KmeansLlInit : public KmeansInitBase { private: T over_sample_; @@ -94,7 +107,6 @@ struct KmeansLlInit : public KmeansInitBase { // The enclosing parent function ("sample_centroids") for an extended // __device__ lambda cannot have private or protected access within its class KmMatrix sample_centroids(KmMatrix& data, KmMatrix& centroids); - KmMatrix recluster(KmMatrix& centroids); /* * Initialize KmeansLlInit algorithm, with default: diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu index 1e8559ac8..77cc9a09a 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -73,6 +73,36 @@ TEST(KmeansLL, PairWiseDistance) { ASSERT_TRUE(sol == res); } +// r --gtest_filter=KmeansLL.GreedyRecluster +TEST(KmeansLL, GreedyRecluster) { + thrust::host_vector h_centroids (20); + // close points + for (size_t i = 0; i < 5; ++i) { + h_centroids[i] = i + 4; + } + for (size_t i = 5; i < 10; ++i) { + h_centroids[i] = i; + } + for (size_t i = 10; i < 15; ++i) { + h_centroids[i] = i - 4; + } + // satelite + for (size_t i = 15; i < 20; ++i) { + h_centroids[i] = i; + } + + KmMatrix centroids (h_centroids, 4, 5); + KmMatrix res = detail::GreedyRecluster::recluster(centroids, + 2); + std::vector h_sol = + { + 4, 5, 6, 7, 8, + 5, 6, 
7, 8, 9, + }; + KmMatrix sol (h_sol, 2, 5); + ASSERT_TRUE(res == sol); +} + // r --gtest_filter=KmeansLL.KmeansLLInit TEST(KmeansLL, KmeansLLInit) { int k = 2; From 2e47f71f607c6a3a525b99ae4f7ba15251f76eb1 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 25 Jul 2018 10:45:02 +0800 Subject: [PATCH 29/49] Fix zero distance prob, and re-cluster with large centroids. --- src/gpu/kmeans/kmeans_init.cu | 73 +++++++++++++++++++++++++++------- src/gpu/kmeans/kmeans_init.cuh | 7 +++- 2 files changed, 65 insertions(+), 15 deletions(-) diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 825f1e946..9fb1f9d81 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -50,6 +50,7 @@ __global__ void construct_distance_pairs_kernel( } } +// See SelfArgMinOp template __global__ void self_row_argmin_sequential(kParam _res, kParam _val) { @@ -68,6 +69,22 @@ __global__ void self_row_argmin_sequential(kParam _res, kParam _val) { } } +template +__global__ void self_row_min_sequential(kParam _res, kParam _val) { + + size_t idx = global_thread_idx(); + if (idx < _val.rows) { + T min = std::numeric_limits::max(); + for (size_t i = 0; i < _val.cols; ++i) { + T value = _val.ptr[idx * _val.cols + i]; + if (value < min && value != 0) { + min = value; + } + } + _res.ptr[idx] = min; + } +} + } // namespace kernel namespace detail { @@ -143,6 +160,22 @@ struct SelfArgMinOp { }; +template +struct SelfMinOp { + KmMatrix min(KmMatrix& _val, KmMatrixDim _dim) { + size_t blocks = GpuInfo::ins().blocks(32); + if (_dim == KmMatrixDim::ROW) { + KmMatrix _res(_val.rows(), 1); + kernel::self_row_min_sequential<<>>( + _res.k_param(), _val.k_param()); + return _res; + } else { + // FIXME + M_ERROR("Not implemented"); + } + } +}; + // We use counting to construct the weight as described in the paper. Counting // is performed by histogram algorithm. 
// For re-cluster, the paper suggests using K-Means++, but that will require @@ -199,23 +232,30 @@ KmMatrix GreedyRecluster::recluster(KmMatrix& _centroids, size_t _k) { int * min_indices_ptr = min_indices.dev_ptr(); - KmMatrix centroids (_k, _centroids.cols()); + KmMatrix new_centroids (_k, _centroids.cols()); + T * new_centroids_ptr = new_centroids.dev_ptr(); int cols = _centroids.cols(); - thrust::copy_if( + T * old_centroids_ptr = _centroids.dev_ptr(); + + auto k_iter = thrust::make_counting_iterator(0); + thrust::for_each( thrust::device, - _centroids.dev_ptr(), _centroids.dev_ptr() + _centroids.size(), - centroids.dev_ptr(), + k_iter, k_iter + _k, [=] __device__ (int idx) { - size_t row = idx / cols; - for (size_t i = 0; i < _k; ++i) { - if (row == min_indices_ptr[i]) - return true; + size_t index = min_indices_ptr[idx]; + + size_t in_begin = index * cols; + size_t in_end = (index + 1) * cols; + + size_t res_begin = idx * cols; + size_t res_end = (idx + 1) * cols; + for (size_t i = in_begin, j = res_begin; i < in_end; ++i, ++j) { + new_centroids_ptr[j] = old_centroids_ptr[i]; } - return false; }); - return centroids; + return new_centroids; } } // namespace detail @@ -223,12 +263,12 @@ KmMatrix GreedyRecluster::recluster(KmMatrix& _centroids, size_t _k) { /* ============== KmeansLlInit Class member functions ============== */ + template class ReclusterPolicy > KmMatrix KmeansLlInit::probability( KmMatrix& _data, KmMatrix& _centroids) { KmMatrix centroids_dot (_centroids.rows(), 1); - VecBatchDotOp().dot(centroids_dot, _centroids); // FIXME: Time this @@ -237,7 +277,8 @@ KmMatrix KmeansLlInit::probability( data_dot_, centroids_dot, distance_pairs_); distance_pairs_ = distance_op(_data, _centroids); - KmMatrix min_distances = MinOp().min(distance_pairs_, KmMatrixDim::ROW); + KmMatrix min_distances = detail::SelfMinOp().min(distance_pairs_, + KmMatrixDim::ROW); T cost = SumOp().sum(min_distances); @@ -321,6 +362,8 @@ KmeansLlInit::operator()(KmMatrix& _data, 
size_t _k) { // Calculate X^2 (point-wise) data_dot_ = KmMatrix(_data.rows(), 1); VecBatchDotOp().dot(data_dot_, _data); + data_dot_.set_name("data dot"); + std::cout << data_dot_ << std::endl; // First centroid KmMatrix centroids = _data.row(idx); @@ -329,7 +372,8 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { T cost = SumOp().sum(prob); - for (size_t i = 0; i < std::log(cost); ++i) { + size_t max_iter = std::max(T(MAX_ITER), std::log(cost)); + for (size_t i = 0; i < max_iter; ++i) { prob = probability(_data, centroids); KmMatrix new_centroids = sample_centroids(_data, prob); centroids = stack(centroids, new_centroids, KmMatrixDim::ROW); @@ -341,8 +385,9 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { M_ERROR("Not implemented."); } - std::cout << centroids << std::endl; centroids = ReclusterPolicy::recluster(centroids, k_); + std::cout << centroids << std::endl; + return centroids; } diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index 458244f20..4ac367500 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -16,7 +16,9 @@ #include "KmMatrix/Generator.cuh" #include "KmMatrix/GpuInfo.cuh" -namespace H2O4GPU{ +constexpr double ESP = 1e-8; + +namespace H2O4GPU { namespace KMeans { namespace detail { @@ -92,6 +94,9 @@ struct KmeansLlInit : public KmeansInitBase { int seed_; int k_; + // Suggested in original paper, 8 is usually enough. + constexpr static float MAX_ITER = 8; + std::unique_ptr> generator_; // Buffer like variables From b45825aaecbf327f9feef202b3e324ef6c62cff4 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 25 Jul 2018 17:03:32 +0800 Subject: [PATCH 30/49] Extract Arith operation definitions to cuda file. * During the extraction, blas operations now convert int to float. 
--- src/gpu/kmeans/KmMatrix/Arith.cu | 163 ++++++++++++++++++++++++++++++ src/gpu/kmeans/KmMatrix/Arith.hpp | 132 ++---------------------- src/gpu/kmeans/KmMatrix/blas.cuh | 96 ++++++++++++++++-- 3 files changed, 260 insertions(+), 131 deletions(-) create mode 100644 src/gpu/kmeans/KmMatrix/Arith.cu diff --git a/src/gpu/kmeans/KmMatrix/Arith.cu b/src/gpu/kmeans/KmMatrix/Arith.cu new file mode 100644 index 000000000..88bc9d920 --- /dev/null +++ b/src/gpu/kmeans/KmMatrix/Arith.cu @@ -0,0 +1,163 @@ +#include "Arith.hpp" +namespace H2O4GPU { +namespace KMeans { + +namespace kernel { + +/* + * Compute min value for each row. + * @tparam T Numeric type of the data + * @param _res The output matrix with shape m x 1 + * @param _val The input matrix with shape m x n + */ +template +__global__ void row_min_sequential(kParam _res, kParam _val) { + + size_t idx = global_thread_idx(); + if (idx < _val.rows) { + T min = std::numeric_limits::max(); + for (size_t i = 0; i < _val.cols; ++i) { + T value = _val.ptr[idx * _val.cols + i]; + if (value < min) { + min = value; + } + } + _res.ptr[idx] = min; + } +} + +template +__global__ void row_argmin_sequential(kParam _res, kParam _val) { + + size_t idx = global_thread_idx(); + if (idx < _val.rows) { + T min = std::numeric_limits::max(); + int min_idx = -1; + for (size_t i = 0; i < _val.cols; ++i) { + T value = _val.ptr[idx * _val.cols + i]; + if (value < min) { + min = value; + min_idx = i; + } + } + _res.ptr[idx] = min_idx; + } +} + +} // namespace kernel + +template +void DotOp::dot(KmMatrix& _res, KmMatrix& _val) { + this->dot(_res, _val, _val); +} +template +void DotOp::dot(KmMatrix& _res, KmMatrix& _lhs, + KmMatrix& _rhs) { + constexpr T alpha = 1.0; + constexpr T beta = 1.0; + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + Blas::gemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, // FIXME + _lhs.rows(), _rhs.cols(), _lhs.cols(), + &alpha, + _lhs.dev_ptr(), _lhs.cols(), + _rhs.dev_ptr(), _rhs.cols(), + &beta, + _res.dev_ptr(), 
_res.cols()); +} + +template +void VecBatchDotOp::dot(KmMatrix& _res, KmMatrix& _val) { + this->dot(_res, _val, _val); +} +template +void VecBatchDotOp::dot(KmMatrix& _res, KmMatrix& _lhs, KmMatrix& _rhs) { + constexpr T alpha = 1.0; + constexpr T beta = 1.0; + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + Blas::gemm_strided_batched( + handle, + CUBLAS_OP_N, CUBLAS_OP_T, + 1, 1, _rhs.cols(), // m, n, k + &alpha, + _lhs.dev_ptr(), 1, _lhs.cols(), + _rhs.dev_ptr(), 1, _rhs.cols(), + &beta, + _res.dev_ptr(), _res.cols(), 1, // c should be columun vector + _lhs.rows()); +} + +template +T SumOp::sum(KmMatrix& _val) { + T* raw_ptr = _val.dev_ptr(); + thrust::device_ptr ptr (raw_ptr); + T res = thrust::reduce(ptr, ptr + _val.size(), (T)0, thrust::plus()); + return res; +} + +template +void MulOp::mul(KmMatrix& _res, KmMatrix& _lhs, T _rhs) { + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + Blas::axpy( + handle, _lhs.size(), // handle, n + &_rhs, // alpha + _lhs.dev_ptr(), 1, + _res.dev_ptr(), 1); +} + +template +T MeanOp::mean(KmMatrix& _val) { + T res = SumOp().sum(_val); + res = res / _val.size(); + return res; +} + +template +KmMatrix ArgMinOp::argmin(KmMatrix& _val, KmMatrixDim _dim) { + if (_dim == KmMatrixDim::ROW) { + KmMatrix _res(_val.rows(), 1); + kernel::row_argmin_sequential<<>>( + _res.k_param(), _val.k_param()); + return _res; + } else { + // FIXME + M_ERROR("Not implemented"); + } +} + +template +KmMatrix MinOp::min(KmMatrix& _val, KmMatrixDim _dim) { + size_t blocks = GpuInfo::ins().blocks(32); + if (_dim == KmMatrixDim::ROW) { + KmMatrix _res(_val.rows(), 1); + kernel::row_min_sequential<<>>( + _res.k_param(), _val.k_param()); + return _res; + } else { + // FIXME + M_ERROR("Not implemented"); + } +} + +#define INSTANTIATE(T) \ + template void DotOp::dot(KmMatrix& _res, KmMatrix& _val); \ + template void DotOp::dot(KmMatrix& _res, KmMatrix& _lhs, \ + KmMatrix& _rhs); \ + template void VecBatchDotOp::dot( \ + KmMatrix& _res, KmMatrix& 
_val); \ + template void VecBatchDotOp::dot( \ + KmMatrix& _res, KmMatrix& _lhs, KmMatrix& _rhs); \ + template T SumOp::sum(KmMatrix& _val); \ + template void MulOp::mul(KmMatrix& _res, KmMatrix& _lhs, T _rhs); \ + template T MeanOp::mean(KmMatrix& _val); \ + template KmMatrix ArgMinOp::argmin( \ + KmMatrix& _val, KmMatrixDim _dim); \ + template KmMatrix MinOp::min(KmMatrix& _val, KmMatrixDim _dim); \ + + +INSTANTIATE(double) +INSTANTIATE(float) +INSTANTIATE(int) + +} // namespace KMenas +} // namespace H204GPU \ No newline at end of file diff --git a/src/gpu/kmeans/KmMatrix/Arith.hpp b/src/gpu/kmeans/KmMatrix/Arith.hpp index 5c3a488fe..3ae493455 100644 --- a/src/gpu/kmeans/KmMatrix/Arith.hpp +++ b/src/gpu/kmeans/KmMatrix/Arith.hpp @@ -8,50 +8,6 @@ namespace H2O4GPU { namespace KMeans { -namespace kernel { - -/* - * Compute min value for each row. - * @tparam T Numeric type of the data - * @param _res The output matrix with shape m x 1 - * @param _val The input matrix with shape m x n - */ -template -__global__ void row_min_sequential(kParam _res, kParam _val) { - - size_t idx = global_thread_idx(); - if (idx < _val.rows) { - T min = std::numeric_limits::max(); - for (size_t i = 0; i < _val.cols; ++i) { - T value = _val.ptr[idx * _val.cols + i]; - if (value < min) { - min = value; - } - } - _res.ptr[idx] = min; - } -} - -template -__global__ void row_argmin_sequential(kParam _res, kParam _val) { - - size_t idx = global_thread_idx(); - if (idx < _val.rows) { - T min = std::numeric_limits::max(); - int min_idx = -1; - for (size_t i = 0; i < _val.cols; ++i) { - T value = _val.ptr[idx * _val.cols + i]; - if (value < min) { - min = value; - min_idx = i; - } - } - _res.ptr[idx] = min_idx; - } -} - -} // namespace kernel - // FIXME: Using struct for operations is just keeping the possibility of // creating an unified operations for KmMatrix. 
For example, let KmMatrix // inherit those left associative ops, or create an inferface for elementwise @@ -60,110 +16,40 @@ __global__ void row_argmin_sequential(kParam _res, kParam _val) { // FIXME: Use return value instead. template struct DotOp { - void dot(KmMatrix& _res, KmMatrix& _val) { - this->dot(_res, _val, _val); - } - void dot(KmMatrix& _res, KmMatrix& _lhs, - KmMatrix& _rhs) { - constexpr T alpha = 1.0; - constexpr T beta = 1.0; - cublasHandle_t handle = GpuInfo::ins().cublas_handle(); - Blas::gemm(handle, - CUBLAS_OP_N, CUBLAS_OP_N, // FIXME - _lhs.rows(), _rhs.cols(), _lhs.cols(), - &alpha, - _lhs.dev_ptr(), _lhs.cols(), - _rhs.dev_ptr(), _rhs.cols(), - &beta, - _res.dev_ptr(), _res.cols()); - } + void dot(KmMatrix& _res, KmMatrix& _val); + void dot(KmMatrix& _res, KmMatrix& _lhs, KmMatrix& _rhs); }; template struct VecBatchDotOp { - void dot(KmMatrix& _res, KmMatrix& _val) { - this->dot(_res, _val, _val); - } - void dot(KmMatrix& _res, KmMatrix& _lhs, KmMatrix& _rhs) { - constexpr T alpha = 1.0; - constexpr T beta = 1.0; - cublasHandle_t handle = GpuInfo::ins().cublas_handle(); - Blas::gemm_strided_batched( - handle, - CUBLAS_OP_N, CUBLAS_OP_T, - 1, 1, _rhs.cols(), // m, n, k - &alpha, - _lhs.dev_ptr(), 1, _lhs.cols(), - _rhs.dev_ptr(), 1, _rhs.cols(), - &beta, - _res.dev_ptr(), _res.cols(), 1, // c should be columun vector - _lhs.rows()); - } + void dot(KmMatrix& _res, KmMatrix& _val); + void dot(KmMatrix& _res, KmMatrix& _lhs, KmMatrix& _rhs); }; template struct SumOp { - T sum(KmMatrix& _val) { - T* raw_ptr = _val.dev_ptr(); - thrust::device_ptr ptr (raw_ptr); - T res = thrust::reduce(ptr, ptr + _val.size(), (T)0, thrust::plus()); - return res; - } + T sum(KmMatrix& _val); }; template struct MulOp { - void mul(KmMatrix& _res, KmMatrix& _lhs, T _rhs) { - cublasHandle_t handle = GpuInfo::ins().cublas_handle(); - Blas::axpy( - handle, _lhs.size(), // handle, n - &_rhs, // alpha - _lhs.dev_ptr(), 1, - _res.dev_ptr(), 1); - } + void mul(KmMatrix& 
_res, KmMatrix& _lhs, T _rhs); }; template struct MeanOp { - T mean(KmMatrix& _val) { - T res = SumOp().sum(_val); - res = res / _val.size(); - return res; - } + T mean(KmMatrix& _val); }; template struct ArgMinOp { - - KmMatrix argmin(KmMatrix& _val, KmMatrixDim _dim) { - if (_dim == KmMatrixDim::ROW) { - KmMatrix _res(_val.rows(), 1); - kernel::row_argmin_sequential<<>>( - _res.k_param(), _val.k_param()); - return _res; - } else { - // FIXME - M_ERROR("Not implemented"); - } - } + KmMatrix argmin(KmMatrix& _val, KmMatrixDim _dim); }; template struct MinOp { - - KmMatrix min(KmMatrix& _val, KmMatrixDim _dim) { - size_t blocks = GpuInfo::ins().blocks(32); - if (_dim == KmMatrixDim::ROW) { - KmMatrix _res(_val.rows(), 1); - kernel::row_min_sequential<<>>( - _res.k_param(), _val.k_param()); - return _res; - } else { - // FIXME - M_ERROR("Not implemented"); - } - } + KmMatrix min(KmMatrix& _val, KmMatrixDim _dim); }; } // namespace KMenas diff --git a/src/gpu/kmeans/KmMatrix/blas.cuh b/src/gpu/kmeans/KmMatrix/blas.cuh index fcbcd6339..20cf73151 100644 --- a/src/gpu/kmeans/KmMatrix/blas.cuh +++ b/src/gpu/kmeans/KmMatrix/blas.cuh @@ -17,23 +17,31 @@ namespace KMeans { namespace Blas { // LEVEL 1 inline void axpy(cublasHandle_t handle, int n, - const float *alpha, - const float *x, int incx, - float *y, int incy) { - CUBLAS_CHECK(cublasSaxpy(handle, n, + const double *alpha, + const double *x, int incx, + double *y, int incy) { + CUBLAS_CHECK(cublasDaxpy(handle, n, alpha, x, incx, y, incy));} inline void axpy(cublasHandle_t handle, int n, - const double *alpha, - const double *x, int incx, - double *y, int incy) { - CUBLAS_CHECK(cublasDaxpy(handle, n, + const float *alpha, + const float *x, int incx, + float *y, int incy) { + CUBLAS_CHECK(cublasSaxpy(handle, n, alpha, x, incx, y, incy));} +inline void axpy(cublasHandle_t handle, int n, + const int *alpha, + const int *x, int incx, + int *y, int incy) { + CUBLAS_CHECK(cublasSaxpy(handle, n, + (const float *)alpha, + 
(const float *)x, incx, + (float *)y, incy));} // LEVEL 3 inline void gemm(cublasHandle_t handle, cublasOperation_t transa, @@ -87,6 +95,30 @@ inline void gemm(cublasHandle_t handle, C, ldc));} +inline void gemm(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const int *alpha, /* host or device pointer */ + const int *A, + int lda, + const int *B, + int ldb, + const int *beta, /* host or device pointer */ + int *C, + int ldc) { + CUBLAS_CHECK(cublasSgemm(handle, + transa, transb, + m, n, k, + (const float*)alpha, /* host or device pointer */ + (const float*)A, lda, + (const float*)B, ldb, + (const float*)beta, /* host or device pointer */ + (float*)C, ldc));} + +/* -- gemm_batched --*/ inline void gemm_batched(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, @@ -131,6 +163,29 @@ inline void gemm_batched(cublasHandle_t handle, batchCount)); } +inline void gemm_batched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, int k, + const int *alpha, + const int *Aarray[], int lda, + const int *Barray[], int ldb, + const int *beta, + float *Carray[], int ldc, + int batchCount) { + CUBLAS_CHECK(cublasSgemmBatched(handle, + transa, + transb, + m, n, k, + (const float *)alpha, + (const float * const *)Aarray, lda, + (const float * const *)Barray, ldb, + (const float *)beta, + (float * const *)Carray, ldc, + batchCount)); +} + +/* -- gemm_strided_batched -- */ inline void gemm_strided_batched( cublasHandle_t handle, cublasOperation_t transA, cublasOperation_t transB, @@ -181,6 +236,31 @@ inline void gemm_strided_batched( batchCount)); } +inline void gemm_strided_batched( + cublasHandle_t handle, + cublasOperation_t transA, cublasOperation_t transB, + int M, int N, int K, + const int* alpha, + const int* A, int ldA, int strideA, + const int* B, int ldB, int strideB, + const int* beta, + int* C, int ldC, int strideC, + int batchCount) { + 
CUBLAS_CHECK(cublasSgemmStridedBatched(handle, + transA, + transB, + M, N, K, + (const float*)alpha, + (const float*)A, ldA, + strideA, + (const float*)B, ldB, + strideB, + (const float*)beta, + (float*)C, ldC, + strideC, + batchCount)); +} + } // Blas } // KMeans } // H2O4GPU From c579d266f1a4fd64eca4a4747728b76918184505 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 25 Jul 2018 18:46:19 +0800 Subject: [PATCH 31/49] Improve printing matrix. --- src/gpu/kmeans/KmMatrix/KmMatrix.cpp | 14 +++++++++----- src/gpu/kmeans/KmMatrix/KmMatrix.hpp | 5 +++++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp index 9fe8008b0..f62a88e13 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp @@ -250,16 +250,20 @@ KmMatrix KmMatrix::stack(KmMatrix &_second, template std::ostream& operator<<(std::ostream& os, KmMatrix& m) { - std::cout << "\nmatrix: " << m.name() << std::endl << "---" << std::endl; + std::cout << "\nmatrix: " << m.name() << std::endl; + std::cout << "shape: (" << m.rows() << ", " << m.cols() << ")\n" << "["; T * ptr = m.host_ptr(); kParam param = m.k_param(); for (size_t i = 0; i < param.rows; ++i) { + if (i == 0) std::cout << "["; + else std::cout << " ["; for (size_t j = 0; j < param.cols; ++j) { std::cout << std::setw(5) << ptr[i*param.cols + j] << ','; } - std::cout << std::endl; + std::cout << " ]"; + if (i != param.rows - 1) std::cout << "," << std::endl; } - std::cout << "---" << std::endl; + std::cout << "]\n" << std::endl; return os; } @@ -307,5 +311,5 @@ INSTANTIATE(double) INSTANTIATE(int) #undef INSTANTIATE -} -} // H2O4GPU +} // namespace KMeans +} // namepsace H2O4GPU diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp index 579156854..fd1595de0 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp @@ -141,6 +141,11 @@ KmMatrix stack(KmMatrix& _first, 
KmMatrix& _second, KmMatrixDim _dim); +/* + * Print the matrix. + * The printed format is mimicing numpy, one can just copy the output to Python + * shell and let numpy to verify it. + */ template std::ostream& operator<<(std::ostream& os, KmMatrix& m); From ca412a4242621907682fd9a109b57d49d665d827 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 25 Jul 2018 19:19:50 +0800 Subject: [PATCH 32/49] Finish test for kmeans||. --- src/gpu/kmeans/kmeans_init.cu | 53 +++++++++--------------- src/gpu/kmeans/kmeans_init.cuh | 2 +- tests/cpp/gpu/kmeans/test_kmeans_init.cu | 51 +++++++++++++++-------- 3 files changed, 55 insertions(+), 51 deletions(-) diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 9fb1f9d81..51bc1f9ff 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -25,7 +25,6 @@ namespace H2O4GPU { namespace KMeans { - namespace kernel { // X^2 + Y^2, here only calculates the + operation. template @@ -77,10 +76,11 @@ __global__ void self_row_min_sequential(kParam _res, kParam _val) { T min = std::numeric_limits::max(); for (size_t i = 0; i < _val.cols; ++i) { T value = _val.ptr[idx * _val.cols + i]; - if (value < min && value != 0) { + if (value < min) { min = value; } } + min += ESP; _res.ptr[idx] = min; } } @@ -100,9 +100,9 @@ void PairWiseDistanceOp::initialize(KmMatrix& _data_dot, } template -PairWiseDistanceOp::PairWiseDistanceOp (KmMatrix& _data_dot, - KmMatrix& _centroids_dot, - KmMatrix& _distance_pairs) : +PairWiseDistanceOp::PairWiseDistanceOp(KmMatrix& _data_dot, + KmMatrix& _centroids_dot, + KmMatrix& _distance_pairs) : data_dot_(_data_dot), centroids_dot_(_centroids_dot), distance_pairs_(_distance_pairs), initialized_(true) {} @@ -138,41 +138,30 @@ KmMatrix PairWiseDistanceOp::operator()(KmMatrix& _data, return distance_pairs_; } -// ArgMin operation that exclude 0. Used when dealing with distance_pairs +// ArgMin operation that excludes 0. 
Used when dealing with distance_pairs // in recluster where distance between points with itself is calculated, // hence the name. // FIXME: Maybe generalize it to selection algorithm. template struct SelfArgMinOp { - - KmMatrix argmin(KmMatrix& _val, KmMatrixDim _dim) { - if (_dim == KmMatrixDim::ROW) { - KmMatrix _res(_val.rows(), 1); - kernel::self_row_argmin_sequential<<< - div_roundup(_val.rows(), 256), 256>>>(_res.k_param(), - _val.k_param()); - return _res; - } else { - // FIXME - M_ERROR("Not implemented"); - } + KmMatrix argmin(KmMatrix& _val) { + KmMatrix _res(_val.rows(), 1); + kernel::self_row_argmin_sequential<<< + div_roundup(_val.rows(), 256), 256>>>(_res.k_param(), + _val.k_param()); + return _res; } - }; +// MinOp that adds ESP to 0 value. template struct SelfMinOp { KmMatrix min(KmMatrix& _val, KmMatrixDim _dim) { size_t blocks = GpuInfo::ins().blocks(32); - if (_dim == KmMatrixDim::ROW) { - KmMatrix _res(_val.rows(), 1); - kernel::self_row_min_sequential<<>>( - _res.k_param(), _val.k_param()); - return _res; - } else { - // FIXME - M_ERROR("Not implemented"); - } + KmMatrix _res(_val.rows(), 1); + kernel::self_row_min_sequential<<>>( + _res.k_param(), _val.k_param()); + return _res; } }; @@ -183,17 +172,18 @@ struct SelfMinOp { // highest probability. template KmMatrix GreedyRecluster::recluster(KmMatrix& _centroids, size_t _k) { + // Get the distance pairs for centroids KmMatrix centroids_dot (_centroids.rows(), 1); VecBatchDotOp().dot(centroids_dot, _centroids); KmMatrix distance_pairs (_centroids.rows(), _centroids.rows()); PairWiseDistanceOp centroids_distance_op( centroids_dot, centroids_dot, distance_pairs); + distance_pairs = centroids_distance_op(_centroids, _centroids); // get the closest x_j for each x_i in centroids. 
- KmMatrix min_indices = SelfArgMinOp().argmin(distance_pairs, - KmMatrixDim::ROW); + KmMatrix min_indices = SelfArgMinOp().argmin(distance_pairs); // use historgram to get counting for weights KmMatrix weights (1, _centroids.rows()); @@ -362,8 +352,6 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { // Calculate X^2 (point-wise) data_dot_ = KmMatrix(_data.rows(), 1); VecBatchDotOp().dot(data_dot_, _data); - data_dot_.set_name("data dot"); - std::cout << data_dot_ << std::endl; // First centroid KmMatrix centroids = _data.row(idx); @@ -386,7 +374,6 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { } centroids = ReclusterPolicy::recluster(centroids, k_); - std::cout << centroids << std::endl; return centroids; } diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index 4ac367500..901c69e55 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -143,7 +143,7 @@ struct KmeansLlInit : public KmeansInitBase { * \f$p_x = over_sample \times \frac{d^2(x, C)}{\Phi_X (C)}\f$ */ KmeansLlInit (int _seed, T _over_sample) : - seed_(_seed), k_(0), + seed_(_seed), over_sample_(_over_sample), k_(0), generator_ (new UniformGenerator(seed_)) {} /* diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu index 77cc9a09a..6d553a06d 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -19,12 +19,17 @@ using namespace H2O4GPU::KMeans; template struct GeneratorMock : GeneratorBase { public: - KmMatrix generate() override {} + KmMatrix generate() override { + M_ERROR("Not implemented"); + } KmMatrix generate(size_t _size) override { thrust::host_vector random_numbers (_size); for (size_t i = 0; i < _size; ++i) { - random_numbers[i] = 1 / _size; + if ( i % 2 == 0) + random_numbers[i] = 0.8; + else + random_numbers[i] = 0.2; } KmMatrix res (random_numbers, 1, _size); return res; @@ -108,26 +113,38 @@ TEST(KmeansLL, KmeansLLInit) { 
int k = 2; std::unique_ptr> mock_ptr (new GeneratorMock); KmeansLlInit kmeans_ll_init (mock_ptr, 2.5); - - thrust::host_vector _h_data (16); - - for (size_t i = 0; i < 4; ++i) { - _h_data[i] = double(i); + thrust::host_vector h_data (30); + // We split the points into two groups, but the result is statistic. + for (size_t i = 0; i < 5; ++i) { + h_data[i] = i + 4; } - for (size_t i = 4; i < 8; ++i) { - _h_data[i] = double(i - 2); + for (size_t i = 5; i < 10; ++i) { + h_data[i] = i; + } + for (size_t i = 10; i < 15; ++i) { + h_data[i] = i - 4; } - for (size_t i = 8; i < 12; ++i) { - _h_data[i] = double(i); + + for (size_t i = 15; i < 20; ++i) { + h_data[i] = i + 4; + } + for (size_t i = 20; i < 25; ++i) { + h_data[i] = i; } - for (size_t i = 12; i < 16; ++i) { - _h_data[i] = double(i + 2); + for (size_t i = 25; i < 30; ++i) { + h_data[i] = i - 4; } - H2O4GPU::KMeans::KmMatrix h_data (_h_data, 4, 4); + H2O4GPU::KMeans::KmMatrix data (h_data, 6, 5); - auto result = kmeans_ll_init(h_data, 2); - result.set_name("kmeans with mock"); - std::cout << result << std::endl; + auto res = kmeans_ll_init(data, 2); + + std::vector h_sol = + { + 19, 20, 21, 22, 23, + 5, 6, 7, 8, 9 + }; + KmMatrix sol (h_sol, 2, 5); + ASSERT_TRUE(sol == res); } From fcfeaf3a2aef0b591e1b494ebb27692ffe2daeb1 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 25 Jul 2018 19:20:10 +0800 Subject: [PATCH 33/49] Remove warning message. --- src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu index 7eaa55c1d..2dc6de606 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu @@ -50,8 +50,6 @@ CudaKmMatrixImpl::CudaKmMatrixImpl( assert (raw_ptr != nullptr && raw_ptr != NULL); - std::cerr << "Warning: Copying data from " << _other.name() - << "." 
<< std::endl; if (_other.on_device()) { raw_ptr = _other.dev_ptr(); thrust::device_ptr ptr (raw_ptr); From ad18cbbd603eec8da7aeebd9f47724273376b2c9 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 25 Jul 2018 20:52:08 +0800 Subject: [PATCH 34/49] Remove Eigen require. --- CMakeLists.txt | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 96e5b8a62..052f3704a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,7 @@ OPTION(DEV_BUILD "Dev build" OFF) SET(CMAKE_CXX_STANDARD 11) SET(CMAKE_CXX_STANDARD_REQUIRED ON) SET(CMAKE_POSITION_INDEPENDENT_CODE ON) -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") SET(GPU_COMPUTE_VER "" CACHE STRING "Semicolon separated list of compute versions to be built against, e.g. -DGPU_COMPUTE_VER='35;61'") @@ -62,7 +62,6 @@ TARGET_LINK_LIBRARIES(cpuh2o4gpu ${BLAS_LIBRARIES}) if(USE_CUDA) FIND_PACKAGE(CUDA 8.0 REQUIRED) FIND_PACKAGE(NVML REQUIRED) - find_package(Eigen3 3.3 REQUIRED NO_MODULE) #============= BUILD GPU LIBRARY ADD_DEFINITIONS( @@ -90,7 +89,7 @@ if(USE_CUDA) FORMAT_GENCODE_FLAGS("${GPU_COMPUTE_VER}" GENCODE_FLAGS) MESSAGE("CUDA architecture flags ${GENCODE_FLAGS}") - SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -std=c++11;--expt-extended-lambda;--expt-relaxed-constexpr;${GENCODE_FLAGS};-lineinfo; -w;") + SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -std=c++11;--expt-extended-lambda;--expt-relaxed-constexpr;${GENCODE_FLAGS};-lineinfo; -Wall;") FILE(GLOB_RECURSE GPU_SOURCES src/*.cu @@ -107,15 +106,13 @@ if(USE_CUDA) SET(NVTX_LIBRARY nvToolsExt) endif() - include_directories(${EIGEN3_INCLUDE_DIR}) TARGET_LINK_LIBRARIES(gpuh2o4gpu ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusolver_LIBRARY} ${CUDA_cusparse_LIBRARY} ${BLAS_LIBRARIES} ${NVTX_LIBRARY} - ${NVML_LIBRARY} - Eigen3::Eigen) + ${NVML_LIBRARY}) #============= BUILD GPU LIBRARY endif() From d358c82d1f28ae7601f15c983206d9518fdbc16a Mon Sep 17 
00:00:00 2001 From: fis Date: Wed, 25 Jul 2018 21:40:43 +0800 Subject: [PATCH 35/49] Fix compiler warnings. --- src/gpu/kmeans/KmMatrix/Generator.hpp | 4 ++-- src/gpu/kmeans/KmMatrix/GpuInfo.cuh | 2 +- src/gpu/kmeans/KmMatrix/KmMatrix.hpp | 2 +- src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu | 4 ++-- src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh | 2 +- src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp | 4 ++-- src/gpu/kmeans/kmeans_init.cu | 6 +++--- tests/cpp/gpu/kmeans/test_kmeans_init.cu | 1 - 8 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/gpu/kmeans/KmMatrix/Generator.hpp b/src/gpu/kmeans/KmMatrix/Generator.hpp index 308a7b040..7200b85ff 100644 --- a/src/gpu/kmeans/KmMatrix/Generator.hpp +++ b/src/gpu/kmeans/KmMatrix/Generator.hpp @@ -14,8 +14,8 @@ namespace KMeans { template class GeneratorBase { public: - virtual KmMatrix generate() {}; - virtual KmMatrix generate(size_t _size) {}; + virtual KmMatrix generate() = 0; + virtual KmMatrix generate(size_t _size) = 0; }; } diff --git a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh index 723f9e557..22fe53e8d 100644 --- a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh +++ b/src/gpu/kmeans/KmMatrix/GpuInfo.cuh @@ -34,7 +34,7 @@ class GpuInfo { } ~GpuInfo () { free (n_sm_); - for (size_t i = 0; i < n_gpu_; ++i) { + for (int i = 0; i < n_gpu_; ++i) { CUBLAS_CHECK(cublasDestroy(handles_[i])); } free (handles_); diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp index fd1595de0..029e121b3 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp @@ -53,7 +53,7 @@ struct kParam { cols = _other.cols; ptr = _other.ptr; } - kParam operator=(const kParam& _other) { + M_HOSTDEV void operator=(const kParam& _other) { rows = _other.rows; cols = _other.cols; ptr = _other.ptr; diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu index 2dc6de606..db20e4110 100644 --- 
a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu @@ -48,10 +48,9 @@ CudaKmMatrixImpl::CudaKmMatrixImpl( T* raw_ptr; - assert (raw_ptr != nullptr && raw_ptr != NULL); - if (_other.on_device()) { raw_ptr = _other.dev_ptr(); + assert (raw_ptr); thrust::device_ptr ptr (raw_ptr); ptr += _start; d_vector_.resize(_size); @@ -59,6 +58,7 @@ CudaKmMatrixImpl::CudaKmMatrixImpl( thrust::copy(ptr, ptr + _size, d_vector_.begin()); } else { raw_ptr = _other.host_ptr(); + assert (raw_ptr); raw_ptr += _start; h_vector_.resize(_size); on_device_ = false; diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh index 0e9d7f520..1c7bbcde8 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh @@ -7,7 +7,7 @@ #define KM_MATRIX_CUDA_CUH_ #include "KmMatrix.hpp" -#include "thrust/device_vector.h"; +#include "thrust/device_vector.h" #include namespace H2O4GPU { diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp b/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp index d7f70e82f..0bdfa9ebf 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp @@ -12,8 +12,8 @@ template KmMatrixProxy::KmMatrixProxy(KmMatrix& _other, size_t _start, size_t _end, size_t _stride, kParam& _param) - : orgi_ (_other), start_(_start), end_(_end), stride_(_stride), - param_(_param) { + : orgi_ (_other), param_(_param), start_(_start), end_(_end), + stride_(_stride) { assert(size() > 0); } diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 51bc1f9ff..ad110af57 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -239,7 +239,7 @@ KmMatrix GreedyRecluster::recluster(KmMatrix& _centroids, size_t _k) { size_t in_end = (index + 1) * cols; size_t res_begin = idx * cols; - size_t res_end = (idx + 1) * cols; + for (size_t i = in_begin, j = res_begin; i < in_end; ++i, ++j) { new_centroids_ptr[j] = 
old_centroids_ptr[i]; } @@ -333,7 +333,7 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { sprintf( err_msg, "k must be less than or equal to the number of data points" - ", k: %u, data points: %u", + ", k: %lu, data points: %lu", _k, _data.rows()); M_USER_ERROR(err_msg); } @@ -367,7 +367,7 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { centroids = stack(centroids, new_centroids, KmMatrixDim::ROW); } - if (centroids.rows() < k_) { + if (centroids.rows() < _k) { // FIXME: When n_centroids < k // Get random selection in? M_ERROR("Not implemented."); diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu index 6d553a06d..f0bf312a0 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -110,7 +110,6 @@ TEST(KmeansLL, GreedyRecluster) { // r --gtest_filter=KmeansLL.KmeansLLInit TEST(KmeansLL, KmeansLLInit) { - int k = 2; std::unique_ptr> mock_ptr (new GeneratorMock); KmeansLlInit kmeans_ll_init (mock_ptr, 2.5); thrust::host_vector h_data (30); From 1b2b3460a4e6c5badfc0de72a1775836870689ae Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 25 Jul 2018 23:50:30 +0800 Subject: [PATCH 36/49] Add rows functions for KmMatrix. --- src/gpu/kmeans/KmMatrix/KmMatrix.cpp | 29 ++++++++++++--- src/gpu/kmeans/KmMatrix/KmMatrix.hpp | 5 +++ src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu | 45 ++++++++++++++++++++++-- src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh | 3 +- 4 files changed, 73 insertions(+), 9 deletions(-) diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp index f62a88e13..b05cd3442 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp @@ -204,15 +204,32 @@ KmMatrixProxy KmMatrix::col(size_t idx) { return KmMatrixProxy(*this, 0, 0, 0); } +template +KmMatrix KmMatrix::rows(KmMatrix& _index) { + KmMatrix res; + if (backend_ == Backend::CUDADense && + _index.backend_ == Backend::CUDADense) { + if (! 
_index.on_device()) { + _index.dev_ptr(); + } + res = impls[(int)Backend::CUDADense]->rows(_index); + } else { + M_ERROR("Not implemented."); + } + return res; +} + +template +KmMatrix KmMatrix::cols(KmMatrix& _index) { + M_ERROR("Not implemented."); + KmMatrix res; + return res; +} + template bool KmMatrix::operator==(KmMatrix& _rhs) { if (_rhs.backend_ == Backend::CUDADense && backend_ == Backend::CUDADense) { - // std::shared_ptr> tmp = - // std::dynamic_pointer_cast>( - // _rhs.impls[(int)Backend::CUDADense]); bool res = impls[(int)Backend::CUDADense]->equal(_rhs); - // bool res = std::dynamic_pointer_cast>( - // impls[(int)Backend::CUDADense])->equal(*tmp); return res; } else { M_ERROR("Not implemented."); @@ -297,6 +314,8 @@ KmMatrix stack(KmMatrix& _first, KmMatrix& _second, template T * KmMatrix::dev_ptr(); \ template bool KmMatrix::on_device() const; \ template KmMatrixProxy KmMatrix::row(size_t idx, bool dev_mem=true); \ + template KmMatrix KmMatrix::rows(KmMatrix& _index); \ + template KmMatrix KmMatrix::cols(KmMatrix& _index); \ template bool KmMatrix::operator==(KmMatrix &_rhs); \ template KmMatrix KmMatrix::stack(KmMatrix &_second, \ H2O4GPU::KMeans::KmMatrixDim _dim); \ diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp index 029e121b3..5b15fecd9 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp @@ -84,6 +84,8 @@ class KmMatrixImpl { virtual KmMatrix stack(KmMatrix&, KmMatrixDim _dim) = 0; virtual bool equal(KmMatrix& _val) = 0; + + virtual KmMatrix rows(KmMatrix& _index) = 0; }; template @@ -133,6 +135,9 @@ class KmMatrix { KmMatrixProxy row(size_t idx, bool dev_mem=true); KmMatrixProxy col(size_t idx); + KmMatrix rows(KmMatrix& index); + KmMatrix cols(KmMatrix& index); + KmMatrix stack(KmMatrix& _second, KmMatrixDim _dim); }; diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu index db20e4110..a2d9a3c29 100644 --- 
a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu @@ -17,12 +17,15 @@ namespace KMeans { template CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par) : - KmMatrixImpl(_par){} + KmMatrixImpl(_par){ + assert(_par); +} template CudaKmMatrixImpl::CudaKmMatrixImpl(const thrust::host_vector& _h_vec, KmMatrix* _par) : on_device_(false), KmMatrixImpl(_par) { + assert(_par); h_vector_.resize(_h_vec.size()); thrust::copy(_h_vec.begin(), _h_vec.end(), h_vector_.begin()); } @@ -30,6 +33,7 @@ CudaKmMatrixImpl::CudaKmMatrixImpl(const thrust::host_vector& _h_vec, template CudaKmMatrixImpl::CudaKmMatrixImpl(size_t _size, KmMatrix * _par) : KmMatrixImpl(_par) { + assert(_par); if (_size == 0) return; d_vector_.resize(_size); @@ -41,6 +45,7 @@ CudaKmMatrixImpl::CudaKmMatrixImpl( KmMatrix& _other, size_t _start, size_t _size, size_t _stride, KmMatrix * _par) : KmMatrixImpl(_par) { + assert(_par); assert (_size > 0); if (_size == 0) @@ -71,6 +76,7 @@ CudaKmMatrixImpl::~CudaKmMatrixImpl() {} template void CudaKmMatrixImpl::set_interface(KmMatrix* _par) { + assert(_par); KmMatrixImpl::matrix_ = _par; } @@ -130,6 +136,38 @@ bool CudaKmMatrixImpl::equal(KmMatrix& _rhs) { return res; } +template +KmMatrix CudaKmMatrixImpl::rows(KmMatrix& _index) { + + KmMatrix out (_index.rows(), KmMatrixImpl::matrix_->cols()); + + T * index_ptr = _index.dev_ptr(); + T * in_ptr = KmMatrixImpl::matrix_->dev_ptr(); + T * out_ptr = out.dev_ptr(); + + auto iter = thrust::make_counting_iterator(0); + + size_t cols = KmMatrixImpl::matrix_->cols(); + + thrust::for_each( + thrust::device, + iter, iter + _index.rows(), + [=] __device__ (int idx) { + size_t index = index_ptr[idx]; + + size_t in_begin = index * cols; + size_t in_end = (index + 1) * cols; + + size_t out_begin = idx * cols; + + for (size_t i = in_begin, j = out_begin; i != in_end; ++i, ++j) { + out_ptr[j] = in_ptr[i]; + } + }); + + return out; +} + template KmMatrix CudaKmMatrixImpl::stack(KmMatrix& _second, 
KmMatrixDim _dim) { @@ -181,9 +219,10 @@ KmMatrix CudaKmMatrixImpl::stack(KmMatrix& _second, template T* CudaKmMatrixImpl::dev_ptr(); \ template T* CudaKmMatrixImpl::host_ptr(); \ template size_t CudaKmMatrixImpl::size() const; \ - template bool CudaKmMatrixImpl::equal(KmMatrix& _rhs); \ + template bool CudaKmMatrixImpl::equal(KmMatrix& _rhs); \ template KmMatrix CudaKmMatrixImpl::stack(KmMatrix& _second, \ - KmMatrixDim _dim); + KmMatrixDim _dim); \ + template KmMatrix CudaKmMatrixImpl::rows(KmMatrix& _index); INSTANTIATE(float) INSTANTIATE(double) diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh index 1c7bbcde8..50500f20e 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh +++ b/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh @@ -74,9 +74,10 @@ class CudaKmMatrixImpl : public KmMatrixImpl { virtual size_t size() const override; - // virtual bool equal(std::shared_ptr>& _rhs); virtual bool equal(KmMatrix& _rhs); + virtual KmMatrix rows(KmMatrix& _index) override; + virtual bool on_device() const override; }; From b1b2d512a42d1f010e2bfff5f2f6623ac754df40 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 26 Jul 2018 00:12:40 +0800 Subject: [PATCH 37/49] Add SizeError. --- src/gpu/kmeans/KmMatrix/KmMatrix.cpp | 7 +++++ src/gpu/kmeans/KmMatrix/KmMatrix.hpp | 9 +++++- tests/cpp/gpu/KmMatrix/test_matrix.cu | 43 +++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp index b05cd3442..2ca79f692 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp @@ -56,6 +56,13 @@ KmMatrix::KmMatrix(thrust::host_vector _vec, size_t _rows, size_t _cols) : param_ (_rows, _cols, nullptr) { init_impls(); + if (_vec.size() != _rows * _cols) { + throw KmMatrixSizeError("Expecting hv::size() == rows * cols. 
" + + std::string("Got hv::size(): ") + + std::to_string(_vec.size()) + + ", rows: " + std::to_string(_rows) + + ", cols: " + std::to_string(_cols)); + } #if USE_CUDA() KmMatrixImpl * ptr = new CudaKmMatrixImpl(_vec, this); impls[(int)Backend::CUDADense].reset(ptr); diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp index 5b15fecd9..a6e02b879 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp @@ -7,10 +7,10 @@ #define KM_MATRIX_HPP_ #include -#include #include #include #include +#include #include "KmConfig.h" @@ -184,6 +184,13 @@ class KmMatrixProxy { friend KmMatrix; }; +struct KmMatrixSizeError: public std::runtime_error +{ + KmMatrixSizeError(std::string const& message) + : std::runtime_error(message) + {} +}; + } // namespace KMeans } // namespace H2O4GPU diff --git a/tests/cpp/gpu/KmMatrix/test_matrix.cu b/tests/cpp/gpu/KmMatrix/test_matrix.cu index af2865800..e6bfb29e8 100644 --- a/tests/cpp/gpu/KmMatrix/test_matrix.cu +++ b/tests/cpp/gpu/KmMatrix/test_matrix.cu @@ -39,6 +39,49 @@ TEST(KmMatrix, KmMatrixAssig) { ASSERT_TRUE(mat1 == mat2); } +TEST(KmMatrix, KmMatrixRows) { + thrust::host_vector vec (12 * 16); + for (size_t i = 0; i < 12 * 16; ++i) { + vec[i] = i; + } + H2O4GPU::KMeans::KmMatrix mat (vec, 12, 16); + + thrust::host_vector h_index (4, 1); + h_index[0] = 0; + h_index[1] = 2; + h_index[2] = 9; + h_index[3] = 1; + H2O4GPU::KMeans::KmMatrix index (h_index, 4, 1); + + H2O4GPU::KMeans::KmMatrix rows = mat.rows(index); + + thrust::host_vector h_sol (4 * 16); + for (size_t i = 0; i < 16; ++i) { + h_sol[i] = vec[i]; + } + for (size_t i = 16; i < 32; ++i) { + h_sol[i] = vec[16 * 2 + (i - 16)]; + } + for (size_t i = 32; i < 48; ++i) { + h_sol[i] = vec[16 * 9 + (i - 32)]; + } + for (size_t i = 48; i < 64; ++i) { + h_sol[i] = vec[16 * 1 + (i - 48)]; + } + + H2O4GPU::KMeans::KmMatrix sol (h_sol, 4, 16); + + ASSERT_TRUE(rows == sol); +} + +TEST(KmMatrix, SizeError) { + 
thrust::host_vector vec (12 * 16); + ASSERT_THROW( + H2O4GPU::KMeans::KmMatrix mat (vec, 12, 4), + std::runtime_error); + +} + TEST(KmMatrix, KmMatrixUtils) { thrust::host_vector vec (12 * 16); H2O4GPU::KMeans::KmMatrix mat (vec, 12, 16); From 93ae39b65887f7dd48c3c990f37799818ec4065d Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 26 Jul 2018 00:55:48 +0800 Subject: [PATCH 38/49] Add int generator, reshape for KmMatrix. --- src/gpu/kmeans/KmMatrix/GeneratorKernels.cu | 13 +++++++++++++ src/gpu/kmeans/KmMatrix/KmMatrix.cpp | 17 +++++++++++++++++ src/gpu/kmeans/KmMatrix/KmMatrix.hpp | 2 ++ tests/cpp/gpu/KmMatrix/test_matrix.cu | 1 - 4 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu b/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu index b1bd799f0..28b521f26 100644 --- a/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu +++ b/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu @@ -45,6 +45,19 @@ __global__ void generate_uniform_kernel(double *_res, } } +__global__ void generate_uniform_kernel(int *_res, + curandState *_state, + int _size) { + int idx = threadIdx.x + blockIdx.x * threadIdx.x; + if (idx < _size) { + int x; + curandState local_state = _state[idx]; + x = (int) curand_uniform_double(&local_state); + _state[idx] = local_state; + _res[idx] = x; + } +} + } // namespace kernel } // namespace KMeans } // namespace H2O4GPU \ No newline at end of file diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp index 2ca79f692..d350bd7e2 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.cpp @@ -160,6 +160,21 @@ size_t KmMatrix::cols() const { return param_.cols; } +template +KmMatrix KmMatrix::reshape(size_t _rows, size_t _cols) const { + if (_rows * _cols != param_.rows * param_.cols) { + throw KmMatrixSizeError("Expecting rows * cols == " + + std::to_string(param_.rows * param_.cols) + + ", get " + + "rows: " + std::to_string(_rows) + + ", cols: " + 
std::to_string(_cols)); + } + KmMatrix res (*this); + res.param_.rows = _rows; + res.param_.cols = _cols; + return res; +} + template kParam KmMatrix::k_param () { T * ptr = dev_ptr(); @@ -316,6 +331,8 @@ KmMatrix stack(KmMatrix& _first, KmMatrix& _second, template size_t KmMatrix::size() const; \ template size_t KmMatrix::rows() const; \ template size_t KmMatrix::cols() const; \ + template KmMatrix KmMatrix::reshape( \ + size_t _rows, size_t _cols) const; \ template kParam KmMatrix::k_param (); \ template T * KmMatrix::host_ptr(); \ template T * KmMatrix::dev_ptr(); \ diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp index a6e02b879..31ad2dcd4 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp +++ b/src/gpu/kmeans/KmMatrix/KmMatrix.hpp @@ -122,6 +122,8 @@ class KmMatrix { size_t rows () const; size_t cols () const; + KmMatrix reshape(size_t _rows, size_t _cols) const; + T* host_ptr(); T* dev_ptr(); diff --git a/tests/cpp/gpu/KmMatrix/test_matrix.cu b/tests/cpp/gpu/KmMatrix/test_matrix.cu index e6bfb29e8..3b36b35fe 100644 --- a/tests/cpp/gpu/KmMatrix/test_matrix.cu +++ b/tests/cpp/gpu/KmMatrix/test_matrix.cu @@ -79,7 +79,6 @@ TEST(KmMatrix, SizeError) { ASSERT_THROW( H2O4GPU::KMeans::KmMatrix mat (vec, 12, 4), std::runtime_error); - } TEST(KmMatrix, KmMatrixUtils) { From c95c9bacd048365566fa542c93a42f098890ae88 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 26 Jul 2018 00:56:14 +0800 Subject: [PATCH 39/49] Add random kmeans init class. 
--- src/gpu/kmeans/kmeans_init.cu | 36 +++++++++++++++++----- src/gpu/kmeans/kmeans_init.cuh | 38 ++++++++++++++++++++++-- tests/cpp/gpu/kmeans/test_kmeans_init.cu | 20 +++++++++++++ 3 files changed, 83 insertions(+), 11 deletions(-) diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index ad110af57..3a93cc138 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -5,7 +5,6 @@ #include -#include #include #include @@ -250,10 +249,28 @@ KmMatrix GreedyRecluster::recluster(KmMatrix& _centroids, size_t _k) { } // namespace detail + +/* ============ KmeansRandomInit Class member functions ============ */ + +template +KmMatrix KmeansRandomInit::operator()(KmMatrix& _data, size_t _k) { + + KmMatrix dist = generator_impl_->generate(_k); + MulOp().mul(dist, dist, (T)_data.rows() - 1); + + dist = dist.reshape(dist.cols(), dist.rows()); + + KmMatrix centroids = _data.rows(dist); + + return centroids; +} /* ============== KmeansLlInit Class member functions ============== */ +// Although the paper suggested calculating the probability independently, +// but due to zero distance between a point and itself, the already selected +// points will have very low probability. template class ReclusterPolicy > KmMatrix KmeansLlInit::probability( KmMatrix& _data, KmMatrix& _centroids) { @@ -360,7 +377,8 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { T cost = SumOp().sum(prob); - size_t max_iter = std::max(T(MAX_ITER), std::log(cost)); + size_t max_iter = std::max((size_t)(MAX_ITER), + (size_t)std::ceil(std::log(cost))); for (size_t i = 0; i < max_iter; ++i) { prob = probability(_data, centroids); KmMatrix new_centroids = sample_centroids(_data, prob); @@ -368,9 +386,9 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { } if (centroids.rows() < _k) { - // FIXME: When n_centroids < k - // Get random selection in? 
- M_ERROR("Not implemented."); + KmMatrix new_centroids = KmeansRandomInit(generator_)(_data, + _k - centroids.rows()); + centroids = stack(centroids, new_centroids, KmMatrixDim::ROW); } centroids = ReclusterPolicy::recluster(centroids, k_); @@ -385,9 +403,12 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { KmMatrix& data, KmMatrix& centroids); \ template KmMatrix KmeansLlInit::sample_centroids( \ KmMatrix& data, KmMatrix& centroids); \ + template KmMatrix KmeansRandomInit::operator()( \ + KmMatrix& _data, size_t _k); INSTANTIATE(float) INSTANTIATE(double) +INSTANTIATE(int) #undef INSTANTIATE @@ -403,15 +424,14 @@ namespace detail { KmMatrix& _centroids_dot, \ KmMatrix& _distance_pairs); \ template KmMatrix PairWiseDistanceOp::operator()( \ - KmMatrix& _data, \ - KmMatrix& _centroids); \ + KmMatrix& _data, KmMatrix& _centroids); \ INSTANTIATE(float) INSTANTIATE(double) +INSTANTIATE(int) #undef INSTANTIATE } -// FIXME: int is not supported due to random kernel } // namespace Kmeans } // namespace H2O4GPU diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index 901c69e55..6f067fc9f 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -73,6 +73,38 @@ class KmeansInitBase { virtual KmMatrix operator()(KmMatrix& data, size_t k) = 0; }; +/* + * Random initialization. + * @tparam Numeric data type. + */ +template +class KmeansRandomInit : public KmeansInitBase { + private: + int seed_; + std::unique_ptr> generator_impl_; + + public: + /* + * @param seed Random seed for generating centroids. + */ + KmeansRandomInit(size_t _seed) : + seed_(_seed), generator_impl_ (new UniformGenerator) {} + + /* + * @param gen Unique pointer to Random generator for generating centroids. + */ + KmeansRandomInit(std::unique_ptr>& _gen) : + generator_impl_(std::move(_gen)) {} + + virtual ~KmeansRandomInit() override {} + + /* + * @param data Data points stored in row major matrix. + * @param k Number of centroids. 
+ */ + virtual KmMatrix operator()(KmMatrix& data, size_t k) override; +}; + /* * Each instance of KmeansLlInit corresponds to one dataset, if a new data set * is used, users need to create a new instance. @@ -82,7 +114,7 @@ class KmeansInitBase { * Scalable K-Means++ * * - * @tparam Data type, supported types are float and double. + * @tparam Numeric data type. */ template < typename T, @@ -107,8 +139,8 @@ struct KmeansLlInit : public KmeansInitBase { KmMatrix probability(KmMatrix& data, KmMatrix& centroids); public: - // sample_centroids/recluster should not be part of the interface, but - // following error is generated when put in private section: + // sample_centroids should not be part of the interface, but following error + // is generated when put in private section: // The enclosing parent function ("sample_centroids") for an extended // __device__ lambda cannot have private or protected access within its class KmMatrix sample_centroids(KmMatrix& data, KmMatrix& centroids); diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu index f0bf312a0..d15b6d4d7 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -36,6 +36,26 @@ struct GeneratorMock : GeneratorBase { } }; +TEST(KmeansRandom, Init) { + thrust::host_vector h_data (20); + for (size_t i = 0; i < 20; ++i) { + h_data[i] = i * 2; + } + KmMatrix data (h_data, 4, 5); + std::unique_ptr> gen (new GeneratorMock()); + KmeansRandomInit init (gen); + + auto res = init(data, 2); + + std::vector h_sol = + { + 30, 32, 34, 36, 38, + 0, 2, 4, 6, 8 + }; + KmMatrix sol (h_sol, 2, 5); + ASSERT_TRUE(sol == res); +} + // r --gtest_filter=KmeansLL.PairWiseDistance TEST(KmeansLL, PairWiseDistance) { From 07c6c3e87039840722263c7b89504c6cf39b9a5e Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 26 Jul 2018 01:51:29 +0800 Subject: [PATCH 40/49] Revert format change for kmeans_h2o4gpu.cu. Revert b9d71f for kmeans_h2o4gpu.cu. 
--- src/gpu/kmeans/kmeans_h2o4gpu.cu | 1653 +++++++++++++++--------------- 1 file changed, 819 insertions(+), 834 deletions(-) diff --git a/src/gpu/kmeans/kmeans_h2o4gpu.cu b/src/gpu/kmeans/kmeans_h2o4gpu.cu index e9a183e56..d05750a86 100644 --- a/src/gpu/kmeans/kmeans_h2o4gpu.cu +++ b/src/gpu/kmeans/kmeans_h2o4gpu.cu @@ -1,37 +1,37 @@ /*! - * copyright 2017-2018 H2O.ai, Inc. + * Copyright 2017-2018 H2O.ai, Inc. * License Apache License Version 2.0 (see LICENSE for details) */ -#include "../../common/utils.h" -#include "cuda.h" -#include "kmeans_general.h" -#include "kmeans_h2o4gpu.h" -#include "kmeans_impl.h" -#include "solver/kmeans.h" -#include -#include -#include -#include -#include -#include -#include #include +#include #include #include #include -#include +#include +#include "cuda.h" +#include #include +#include "solver/kmeans.h" +#include "kmeans_impl.h" +#include "kmeans_general.h" +#include "kmeans_h2o4gpu.h" +#include +#include #include +#include +#include +#include "../../common/utils.h" +#include /** * METHODS FOR DATA COPYING AND GENERATION */ -template +template void random_data(int verbose, thrust::device_vector &array, int m, int n) { thrust::host_vector host_array(m * n); for (int i = 0; i < m * n; i++) { - host_array[i] = (T)rand() / (T)RAND_MAX; + host_array[i] = (T) rand() / (T) RAND_MAX; } array = host_array; } @@ -48,23 +48,22 @@ void random_data(int verbose, thrust::device_vector &array, int m, int n) { * @param npergpu * @param d */ -template -void copy_data(int verbose, const char ord, thrust::device_vector &array, - const T *srcdata, int q, int n, size_t npergpu, int d) { +template +void copy_data(int verbose, const char ord, thrust::device_vector &array, const T *srcdata, + int q, int n, size_t npergpu, int d) { if (ord == 'c') { thrust::host_vector host_array(npergpu * d); log_debug(verbose, "Copy data COL ORDER -> ROW ORDER"); for (size_t i = 0; i < npergpu * d; i++) { - size_t indexi = i % d; // col + size_t indexi = i % d; // col 
size_t indexj = i / d + q * npergpu; // row (shifted by which gpu) host_array[i] = srcdata[indexi * n + indexj]; } array = host_array; } else { log_debug(verbose, "Copy data ROW ORDER not changed"); - thrust::host_vector host_array(srcdata + q * npergpu * d, - srcdata + q * npergpu * d + npergpu * d); + thrust::host_vector host_array(srcdata + q * npergpu * d, srcdata + q * npergpu * d + npergpu * d); array = host_array; } } @@ -82,18 +81,16 @@ void copy_data(int verbose, const char ord, thrust::device_vector &array, * @param npergpu * @param d */ -template -void copy_data_shuffled(int verbose, std::vector v, const char ord, - thrust::device_vector &array, const T *srcdata, - int q, int n, int npergpu, int d) { +template +void copy_data_shuffled(int verbose, std::vector v, const char ord, thrust::device_vector &array, + const T *srcdata, int q, int n, int npergpu, int d) { thrust::host_vector host_array(npergpu * d); if (ord == 'c') { log_debug(verbose, "Copy data shuffle COL ORDER -> ROW ORDER"); for (int i = 0; i < npergpu; i++) { for (size_t j = 0; j < d; j++) { - host_array[i * d + j] = - srcdata[v[q * npergpu + i] + j * n]; // shift by which gpu + host_array[i * d + j] = srcdata[v[q * npergpu + i] + j * n]; // shift by which gpu } } } else { @@ -101,18 +98,16 @@ void copy_data_shuffled(int verbose, std::vector v, const char ord, for (int i = 0; i < npergpu; i++) { for (size_t j = 0; j < d; j++) { - host_array[i * d + j] = - srcdata[v[q * npergpu + i] * d + j]; // shift by which gpu + host_array[i * d + j] = srcdata[v[q * npergpu + i] * d + j]; // shift by which gpu } } } array = host_array; } -template -void copy_centroids_shuffled(int verbose, std::vector v, const char ord, - thrust::device_vector &array, const T *srcdata, - int n, int k, int d) { +template +void copy_centroids_shuffled(int verbose, std::vector v, const char ord, thrust::device_vector &array, + const T *srcdata, int n, int k, int d) { copy_data_shuffled(verbose, v, ord, array, srcdata, 0, n, 
k, d); } @@ -130,34 +125,30 @@ void copy_centroids_shuffled(int verbose, std::vector v, const char ord, * @param d * @param k */ -template +template void random_centroids(int verbose, int seed, const char ord, - thrust::device_vector &array, const T *srcdata, int q, - int n, int npergpu, int d, int k) { + thrust::device_vector &array, const T *srcdata, + int q, int n, int npergpu, int d, int k) { thrust::host_vector host_array(k * d); if (seed < 0) { - std::random_device - rd; // Will be used to obtain a seed for the random number engine + std::random_device rd; //Will be used to obtain a seed for the random number engine seed = rd(); } std::mt19937 gen(seed); - std::uniform_int_distribution<> dis( - 0, - n - 1); // random i in range from 0..n-1 (i.e. only 1 gpu gets centroids) + std::uniform_int_distribution<> dis(0, n - 1); // random i in range from 0..n-1 (i.e. only 1 gpu gets centroids) if (ord == 'c') { log_debug(verbose, "Random centroids COL ORDER -> ROW ORDER"); for (int i = 0; i < k; i++) { // clusters - size_t reali = - dis(gen); // + q*npergpu; // row sampled (called indexj above) + size_t reali = dis(gen); // + q*npergpu; // row sampled (called indexj above) for (size_t j = 0; j < d; j++) { // cols host_array[i * d + j] = srcdata[reali + j * n]; } } } else { log_debug(verbose, "Random centroids ROW ORDER not changed"); - for (int i = 0; i < k; i++) { // rows - size_t reali = dis(gen); // + q*npergpu ; // row sampled + for (int i = 0; i < k; i++) { // rows + size_t reali = dis(gen); // + q*npergpu ; // row sampled for (size_t j = 0; j < d; j++) { // cols host_array[i * d + j] = srcdata[reali * d + j]; } @@ -166,974 +157,968 @@ void random_centroids(int verbose, int seed, const char ord, array = host_array; } - /** - * KMEANS METHODS FIT, PREDICT, TRANSFORM - */ +/** + * KMEANS METHODS FIT, PREDICT, TRANSFORM + */ -#define __HBAR__ \ - "--------------------------------------------------------------------------" \ - "--\n" +#define __HBAR__ \ + 
"----------------------------------------------------------------------------\n" namespace h2o4gpukmeans { -template -int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, size_t rows, - size_t cols, const char ord, int k, int max_iterations, - int init_from_data, T threshold, const T *srcdata, - T **pred_centroids, int **pred_labels); - -template -int pick_point_idx_weighted(int seed, std::vector *data, - thrust::host_vector weights) { - T weighted_sum = 0; - - for (int i = 0; i < weights.size(); i++) { - if (data) { - weighted_sum += (data->data()[i] * weights.data()[i]); - } else { - weighted_sum += weights.data()[i]; +template +int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, + size_t rows, size_t cols, const char ord, + int k, int max_iterations, int init_from_data, + T threshold, + const T *srcdata, T **pred_centroids, int **pred_labels); + + template + int pick_point_idx_weighted( + int seed, + std::vector *data, + thrust::host_vector weights) { + T weighted_sum = 0; + + for(int i = 0; i < weights.size(); i++) { + if(data) { + weighted_sum += (data->data()[i] * weights.data()[i]); + } else { + weighted_sum += weights.data()[i]; + } } - } - T best_prob = 0.0; - int best_prob_idx = 0; + T best_prob = 0.0; + int best_prob_idx = 0; - std::mt19937 mt(seed); - std::uniform_real_distribution<> dist(0.0, 1.0); + std::mt19937 mt(seed); + std::uniform_real_distribution<> dist(0.0, 1.0); - int i = 0; - for (i = 0; i <= weights.size(); i++) { - if (weights.size() == i) { - break; - } + int i = 0; + for(i = 0; i <= weights.size(); i++) { + if(weights.size() == i) { + break; + } - T prob_threshold = (T)dist(mt); + T prob_threshold = (T) dist(mt); - T data_val = weights.data()[i]; - if (data) { - data_val *= data->data()[i]; - } + T data_val = weights.data()[i]; + if (data) { + data_val *= data->data()[i]; + } - T prob_x = (data_val / weighted_sum); + T prob_x = (data_val / weighted_sum); - if (prob_x > prob_threshold) { - break; - } + 
if(prob_x > prob_threshold) { + break; + } - if (prob_x >= best_prob) { - best_prob = prob_x; - best_prob_idx = i; + if (prob_x >= best_prob) { + best_prob = prob_x; + best_prob_idx = i; + } } - } - return weights.size() == i ? best_prob_idx : i; -} - -/** - * Copies cols records, starting at position idx*cols from data to centroids. - * Removes them afterwards from data. Removes record from weights at position - * idx. - * @tparam T - * @param idx - * @param cols - * @param data - * @param weights - * @param centroids - */ -template -void add_centroid(int idx, int cols, thrust::host_vector &data, - thrust::host_vector &weights, std::vector ¢roids) { - for (int i = 0; i < cols; i++) { - centroids.push_back(data[idx * cols + i]); + return weights.size() == i ? best_prob_idx : i; } - weights[idx] = 0; -} -/** - * K-Means++ algorithm - * @tparam T - * @param seed - * @param data - * @param weights - * @param k - * @param cols - * @param centroids - */ -template -void kmeans_plus_plus(int verbose, int seed, thrust::host_vector data, - thrust::host_vector weights, int k, int cols, - thrust::host_vector ¢roids) { + /** + * Copies cols records, starting at position idx*cols from data to centroids. Removes them afterwards from data. + * Removes record from weights at position idx. 
+ * @tparam T + * @param idx + * @param cols + * @param data + * @param weights + * @param centroids + */ + template + void add_centroid(int idx, int cols, + thrust::host_vector &data, + thrust::host_vector &weights, + std::vector ¢roids) { + for (int i = 0; i < cols; i++) { + centroids.push_back(data[idx * cols + i]); + } + weights[idx] = 0; + } - std::vector std_centroids(0); - std_centroids.reserve(k * cols); + /** + * K-Means++ algorithm + * @tparam T + * @param seed + * @param data + * @param weights + * @param k + * @param cols + * @param centroids + */ + template + void kmeans_plus_plus( + int verbose, + int seed, + thrust::host_vector data, + thrust::host_vector weights, + int k, + int cols, + thrust::host_vector ¢roids) { + + std::vector std_centroids(0); + std_centroids.reserve(k * cols); + + int centroid_idx = pick_point_idx_weighted( + seed, + (std::vector *) NULL, + weights + ); - int centroid_idx = - pick_point_idx_weighted(seed, (std::vector *)NULL, weights); + add_centroid(centroid_idx, cols, data, weights, std_centroids); - add_centroid(centroid_idx, cols, data, weights, std_centroids); + std::vector best_pairwise_distances(data.size() / cols); // one for each row in data + std::vector std_data(data.begin(), data.end()); - std::vector best_pairwise_distances(data.size() / - cols); // one for each row in data - std::vector std_data(data.begin(), data.end()); + compute_distances(std_data, + std_centroids, + best_pairwise_distances, + data.size() / cols, cols, 1); - compute_distances(std_data, std_centroids, best_pairwise_distances, - data.size() / cols, cols, 1); + std::vector curr_pairwise_distances( std_data.size() / cols); - std::vector curr_pairwise_distances(std_data.size() / cols); + for (int iter = 0; iter < k - 1; iter++) { + log_verbose(verbose, "KMeans++ - Iteraton %d/%d.", iter, k-1); - for (int iter = 0; iter < k - 1; iter++) { - log_verbose(verbose, "KMeans++ - Iteraton %d/%d.", iter, k - 1); + centroid_idx = pick_point_idx_weighted( + 
seed, + &best_pairwise_distances, + weights + ); - centroid_idx = - pick_point_idx_weighted(seed, &best_pairwise_distances, weights); + add_centroid(centroid_idx, cols, data, weights, std_centroids); - add_centroid(centroid_idx, cols, data, weights, std_centroids); + std::vector most_recent_centroids; + most_recent_centroids.reserve(cols); + add_centroid(centroid_idx, cols, data, weights, most_recent_centroids); - std::vector most_recent_centroids; - most_recent_centroids.reserve(cols); - add_centroid(centroid_idx, cols, data, weights, most_recent_centroids); + best_pairwise_distances[centroid_idx] = 0; - best_pairwise_distances[centroid_idx] = 0; + compute_distances(std_data, + most_recent_centroids, + curr_pairwise_distances, + std_data.size() / cols, cols, 1); - compute_distances(std_data, most_recent_centroids, curr_pairwise_distances, - std_data.size() / cols, cols, 1); + for (int i = 0; i < curr_pairwise_distances.size(); i++) { + best_pairwise_distances[i] = std::min(curr_pairwise_distances[i], best_pairwise_distances[i]); + } - for (int i = 0; i < curr_pairwise_distances.size(); i++) { - best_pairwise_distances[i] = - std::min(curr_pairwise_distances[i], best_pairwise_distances[i]); + std::fill(curr_pairwise_distances.begin(), curr_pairwise_distances.end(), (T)0.0); } - std::fill(curr_pairwise_distances.begin(), curr_pairwise_distances.end(), - (T)0.0); + centroids.assign(std_centroids.begin(), std_centroids.end()); } - centroids.assign(std_centroids.begin(), std_centroids.end()); -} + template + struct min_calc_functor { + T* all_costs_ptr; + T* min_costs_ptr; + T max = std::numeric_limits::max(); + int potential_k_rows; + int rows_per_run; + + min_calc_functor(T* _all_costs_ptr, T* _min_costs_ptr, int _potential_k_rows, int _rows_per_run) { + all_costs_ptr = _all_costs_ptr; + min_costs_ptr = _min_costs_ptr; + potential_k_rows = _potential_k_rows; + rows_per_run = _rows_per_run; + } -template struct min_calc_functor { - T *all_costs_ptr; - T 
*min_costs_ptr; - T max = std::numeric_limits::max(); - int potential_k_rows; - int rows_per_run; - - min_calc_functor(T *_all_costs_ptr, T *_min_costs_ptr, int _potential_k_rows, - int _rows_per_run) { - all_costs_ptr = _all_costs_ptr; - min_costs_ptr = _min_costs_ptr; - potential_k_rows = _potential_k_rows; - rows_per_run = _rows_per_run; - } + __host__ __device__ + void operator()(int idx) const { + T best = max; + for (int j = 0; j < potential_k_rows; j++) { + best = min(best, std::abs(all_costs_ptr[j * rows_per_run + idx])); + } + min_costs_ptr[idx] = min(min_costs_ptr[idx], best); + } + }; - __host__ __device__ void operator()(int idx) const { - T best = max; - for (int j = 0; j < potential_k_rows; j++) { - best = min(best, std::abs(all_costs_ptr[j * rows_per_run + idx])); + /** + * K-Means|| initialization method implementation as described in "Scalable K-Means++". + * + * This is a probabilistic method, which tries to choose points as much spread out as possible as centroids. + * + * In case it finds more than k centroids a K-Means++ algorithm is ran on potential centroids to pick k best suited ones. + * + * http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf + * + * @tparam T + * @param verbose + * @param seed + * @param ord + * @param data + * @param data_dots + * @param centroids + * @param rows + * @param cols + * @param k + * @param num_gpu + * @param threshold + */ + template + thrust::host_vector kmeans_parallel(int verbose, int seed, const char ord, + thrust::device_vector **data, + thrust::device_vector **data_dots, + size_t rows, int cols, int k, int num_gpu, T threshold) { + if (seed < 0) { + std::random_device rd; + int seed = rd(); } - min_costs_ptr[idx] = min(min_costs_ptr[idx], best); - } -}; + size_t rows_per_gpu = rows / num_gpu; -/** - * K-Means|| initialization method implementation as described in "Scalable - * K-Means++". 
- * - * This is a probabilistic method, which tries to choose points as much spread - * out as possible as centroids. - * - * In case it finds more than k centroids a K-Means++ algorithm is ran on - * potential centroids to pick k best suited ones. - * - * http://theory.stanford.edu/~sergei/papers/vldb12-kmpar.pdf - * - * @tparam T - * @param verbose - * @param seed - * @param ord - * @param data - * @param data_dots - * @param centroids - * @param rows - * @param cols - * @param k - * @param num_gpu - * @param threshold - */ -template -thrust::host_vector kmeans_parallel(int verbose, int seed, const char ord, - thrust::device_vector **data, - thrust::device_vector **data_dots, - size_t rows, int cols, int k, - int num_gpu, T threshold) { - if (seed < 0) { - std::random_device rd; - int seed = rd(); - } + std::mt19937 gen(seed); + std::uniform_int_distribution<> dis(0, rows - 1); - size_t rows_per_gpu = rows / num_gpu; + // Find the position (GPU idx and idx on that GPU) of the initial centroid + int first_center = dis(gen); + int first_center_idx = first_center % rows_per_gpu; + int first_center_gpu = first_center / rows_per_gpu; - std::mt19937 gen(seed); - std::uniform_int_distribution<> dis(0, rows - 1); - - // Find the position (GPU idx and idx on that GPU) of the initial centroid - int first_center = dis(gen); - int first_center_gpu = first_center / rows_per_gpu; // gpu id - int first_center_idx = first_center % rows_per_gpu; // id on that gpu - - log_verbose(verbose, "KMeans|| - Initial centroid %d on GPU %d.", - first_center_idx, first_center_gpu); - - // Copies the initial centroid to potential centroids vector. That vector will - // store all potential centroids found in the previous iteration. 
- thrust::host_vector h_potential_centroids(cols); - std::vector> h_potential_centroids_per_gpu(num_gpu); - - CUDACHECK(cudaSetDevice(first_center_gpu)); - - // copy the first center to h_potential_centroids - thrust::copy((*data[first_center_gpu]).begin() + first_center_idx * cols, - (*data[first_center_gpu]).begin() + - (first_center_idx + 1) * cols, - h_potential_centroids.begin()); - - thrust::host_vector h_all_potential_centroids = h_potential_centroids; - - // Initial the cost-to-potential-centroids and - // cost-to-closest-potential-centroid matrices. Initial cost is +infinity - std::vector> d_min_costs(num_gpu); - for (int q = 0; q < num_gpu; q++) { - CUDACHECK(cudaSetDevice(q)); - d_min_costs[q].resize(rows_per_gpu); - thrust::fill(d_min_costs[q].begin(), d_min_costs[q].end(), - std::numeric_limits::max()); - } + log_verbose(verbose, "KMeans|| - Initial centroid %d on GPU %d.", first_center_idx, first_center_gpu); - double t0 = timer(); + // Copies the initial centroid to potential centroids vector. That vector will store all potential centroids found + // in the previous iteration. 
+ thrust::host_vector h_potential_centroids(cols); + std::vector> h_potential_centroids_per_gpu(num_gpu); - // The original white paper claims 8 should be enough - int max_iter = std::min(8, (int)(2 + log(k))); - for (int counter = 0; counter < max_iter; counter++) { - log_verbose(verbose, "KMeans|| - Iteration %d.", counter); - T total_min_cost = 0.0; + CUDACHECK(cudaSetDevice(first_center_gpu)); - int new_potential_centroids = 0; -#pragma omp parallel for - for (int i = 0; i < num_gpu; i++) { - CUDACHECK(cudaSetDevice(i)); - - thrust::device_vector d_potential_centroids = h_potential_centroids; - - int potential_k_rows = d_potential_centroids.size() / cols; - - // Compute all the costs to each potential centroid from previous - // iteration - thrust::device_vector centroid_dots(potential_k_rows); - - kmeans::detail::batch_calculate_distances( - - verbose, 0, rows_per_gpu, cols, potential_k_rows, *data[i], - d_potential_centroids, *data_dots[i], centroid_dots, - - [&](int rows_per_run, size_t offset, - thrust::device_vector &pairwise_distances) { - // Find the closest potential center cost for each row - auto min_cost_counter = thrust::make_counting_iterator(0); - auto all_costs_ptr = - thrust::raw_pointer_cast(pairwise_distances.data()); - auto min_costs_ptr = - thrust::raw_pointer_cast(d_min_costs[i].data() + offset); - thrust::for_each( - min_cost_counter, min_cost_counter + rows_per_run, - // Functor instead of a lambda b/c nvcc is complaining about - // nesting a __device__ lambda inside a regular lambda - min_calc_functor(all_costs_ptr, min_costs_ptr, - potential_k_rows, rows_per_run)); - }); - } + thrust::copy( + (*data[first_center_gpu]).begin() + first_center_idx * cols, + (*data[first_center_gpu]).begin() + (first_center_idx + 1) * cols, + h_potential_centroids.begin() + ); - for (int i = 0; i < num_gpu; i++) { - CUDACHECK(cudaSetDevice(i)); - total_min_cost += - thrust::reduce(d_min_costs[i].begin(), d_min_costs[i].end()); + thrust::host_vector 
h_all_potential_centroids = h_potential_centroids; + + // Initial the cost-to-potential-centroids and cost-to-closest-potential-centroid matrices. Initial cost is +infinity + std::vector> d_min_costs(num_gpu); + for (int q = 0; q < num_gpu; q++) { + CUDACHECK(cudaSetDevice(q)); + d_min_costs[q].resize(rows_per_gpu); + thrust::fill(d_min_costs[q].begin(), d_min_costs[q].end(), std::numeric_limits::max()); } - log_verbose(verbose, "KMeans|| - Total min cost from centers %g.", - total_min_cost); + double t0 = timer(); - if (total_min_cost == (T)0.0) { - continue; - } + // The original white paper claims 8 should be enough + int max_iter = std::min(8, (int)(2 + log(k)) ); + for (int counter = 0; counter < max_iter; counter++) { + log_verbose(verbose, "KMeans|| - Iteration %d.", counter); + T total_min_cost = 0.0; - std::set copy_from_gpus; + int new_potential_centroids = 0; #pragma omp parallel for - for (int i = 0; i < num_gpu; i++) { - CUDACHECK(cudaSetDevice(i)); - - // Count how many potential centroids there are using probabilities - // The further the row is from the closest cluster center the higher the - // probability - auto pot_cent_filter_counter = thrust::make_counting_iterator(0); - auto min_costs_ptr = thrust::raw_pointer_cast(d_min_costs[i].data()); - int pot_cent_num = thrust::count_if( - pot_cent_filter_counter, pot_cent_filter_counter + rows_per_gpu, - [=] __device__(int idx) { - thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution<> dist(0.0, 1.0); - int device; - cudaGetDevice(&device); - rng.discard(idx + device * rows_per_gpu); - T prob_threshold = (T)dist(rng); - - T prob_x = ((2.0 * k * min_costs_ptr[idx]) / total_min_cost); - - return prob_x > prob_threshold; - }); - - log_debug(verbose, "KMeans|| - Potential centroids on GPU %d = %d.", i, - pot_cent_num); - - if (pot_cent_num > 0) { - copy_from_gpus.insert(i); - - // Copy all potential cluster centers - thrust::device_vector d_new_potential_centroids(pot_cent_num * 
cols); - - auto range = thrust::make_counting_iterator(0); - thrust::copy_if( - (*data[i]).begin(), (*data[i]).end(), range, - d_new_potential_centroids.begin(), - - [=] __device__(int idx) { - int row = idx / cols; - thrust::default_random_engine rng(seed); - thrust::uniform_real_distribution<> dist(0.0, 1.0); - int device; - cudaGetDevice(&device); - rng.discard(row + device * rows_per_gpu); - T prob_threshold = (T)dist(rng); - - T prob_x = ((2.0 * k * min_costs_ptr[row]) / total_min_cost); - - return prob_x > prob_threshold; - }); - - h_potential_centroids_per_gpu[i].clear(); - h_potential_centroids_per_gpu[i].resize( - d_new_potential_centroids.size()); - - new_potential_centroids += d_new_potential_centroids.size(); - - thrust::copy(d_new_potential_centroids.begin(), - d_new_potential_centroids.end(), - h_potential_centroids_per_gpu[i].begin()); + for (int i = 0; i < num_gpu; i++) { + CUDACHECK(cudaSetDevice(i)); + + thrust::device_vector d_potential_centroids = h_potential_centroids; + + int potential_k_rows = d_potential_centroids.size() / cols; + + // Compute all the costs to each potential centroid from previous iteration + thrust::device_vector centroid_dots(potential_k_rows); + + kmeans::detail::batch_calculate_distances(verbose, 0, rows_per_gpu, cols, potential_k_rows, + *data[i], d_potential_centroids, *data_dots[i], centroid_dots, + [&](int rows_per_run, size_t offset, thrust::device_vector &pairwise_distances) { + // Find the closest potential center cost for each row + auto min_cost_counter = thrust::make_counting_iterator(0); + auto all_costs_ptr = thrust::raw_pointer_cast(pairwise_distances.data()); + auto min_costs_ptr = thrust::raw_pointer_cast(d_min_costs[i].data() + offset); + thrust::for_each(min_cost_counter, + min_cost_counter + rows_per_run, + // Functor instead of a lambda b/c nvcc is complaining about + // nesting a __device__ lambda inside a regular lambda + min_calc_functor(all_costs_ptr, min_costs_ptr, potential_k_rows, 
rows_per_run)); + } + ); } - } - - log_verbose(verbose, "KMeans|| - New potential centroids %d.", - new_potential_centroids); - // Gather potential cluster centers from all GPUs - if (new_potential_centroids > 0) { - h_potential_centroids.clear(); - h_potential_centroids.resize(new_potential_centroids); - - int old_pot_centroids_size = h_all_potential_centroids.size(); - h_all_potential_centroids.resize(old_pot_centroids_size + - new_potential_centroids); - - int offset = 0; for (int i = 0; i < num_gpu; i++) { - if (copy_from_gpus.find(i) != copy_from_gpus.end()) { - thrust::copy(h_potential_centroids_per_gpu[i].begin(), - h_potential_centroids_per_gpu[i].end(), - h_potential_centroids.begin() + offset); - offset += h_potential_centroids_per_gpu[i].size(); - } + CUDACHECK(cudaSetDevice(i)); + total_min_cost += thrust::reduce( + d_min_costs[i].begin(), + d_min_costs[i].end() + ); } - thrust::copy(h_potential_centroids.begin(), h_potential_centroids.end(), - h_all_potential_centroids.begin() + old_pot_centroids_size); - } - } + log_verbose(verbose, "KMeans|| - Total min cost from centers %g.", total_min_cost); - double timeloop = static_cast(timer() - t0); + if(total_min_cost == (T) 0.0) { + continue; + } - thrust::host_vector final_centroids(0); - int potential_centroids_num = h_all_potential_centroids.size() / cols; + std::set copy_from_gpus; +#pragma omp parallel for + for (int i = 0; i < num_gpu; i++) { + CUDACHECK(cudaSetDevice(i)); + + // Count how many potential centroids there are using probabilities + // The further the row is from the closest cluster center the higher the probability + auto pot_cent_filter_counter = thrust::make_counting_iterator(0); + auto min_costs_ptr = thrust::raw_pointer_cast(d_min_costs[i].data()); + int pot_cent_num = thrust::count_if( + pot_cent_filter_counter, + pot_cent_filter_counter + rows_per_gpu, [=]__device__(int idx){ + thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution<> dist(0.0, 1.0); + int 
device; + cudaGetDevice(&device); + rng.discard(idx + device * rows_per_gpu); + T prob_threshold = (T) dist(rng); + + T prob_x = (( 2.0 * k * min_costs_ptr[idx]) / total_min_cost); + + return prob_x > prob_threshold; + } + ); + + log_debug(verbose, "KMeans|| - Potential centroids on GPU %d = %d.", i, pot_cent_num); + + if (pot_cent_num > 0) { + copy_from_gpus.insert(i); + + // Copy all potential cluster centers + thrust::device_vector d_new_potential_centroids(pot_cent_num * cols); + + auto range = thrust::make_counting_iterator(0); + thrust::copy_if( + (*data[i]).begin(), (*data[i]).end(), range, + d_new_potential_centroids.begin(), [=] __device__(int idx){ + int row = idx / cols; + thrust::default_random_engine rng(seed); + thrust::uniform_real_distribution<> dist(0.0, 1.0); + int device; + cudaGetDevice(&device); + rng.discard(row + device * rows_per_gpu); + T prob_threshold = (T) dist(rng); + + T prob_x = (( 2.0 * k * min_costs_ptr[row]) / total_min_cost); + + return prob_x > prob_threshold; + }); + + h_potential_centroids_per_gpu[i].clear(); + h_potential_centroids_per_gpu[i].resize(d_new_potential_centroids.size()); + + new_potential_centroids += d_new_potential_centroids.size(); + + thrust::copy( + d_new_potential_centroids.begin(), + d_new_potential_centroids.end(), + h_potential_centroids_per_gpu[i].begin() + ); - if (potential_centroids_num <= k) { - final_centroids.resize(k * cols); - thrust::copy(h_all_potential_centroids.begin(), - h_all_potential_centroids.end(), final_centroids.begin()); - // TODO what if potential_centroids_num < k ?? we don't want 0s - } else { - // If we found more than k potential cluster centers we need to take only a - // subset This is done using a weighted k-means++ method, since the set - // should be very small it should converge very fast and is all done on the - // CPU. 
- thrust::host_vector weights(potential_centroids_num); + } - double tc0 = timer(); + } - // Weights correspond to the number of data points assigned to each - // potential cluster center - count_pts_per_centroid(verbose, num_gpu, rows_per_gpu, cols, data, - data_dots, h_all_potential_centroids, weights); + log_verbose(verbose, "KMeans|| - New potential centroids %d.", new_potential_centroids); + + // Gather potential cluster centers from all GPUs + if (new_potential_centroids > 0) { + h_potential_centroids.clear(); + h_potential_centroids.resize(new_potential_centroids); + + int old_pot_centroids_size = h_all_potential_centroids.size(); + h_all_potential_centroids.resize(old_pot_centroids_size + new_potential_centroids); + + int offset = 0; + for (int i = 0; i < num_gpu; i++) { + if(copy_from_gpus.find(i) != copy_from_gpus.end()) { + thrust::copy( + h_potential_centroids_per_gpu[i].begin(), + h_potential_centroids_per_gpu[i].end(), + h_potential_centroids.begin() + offset + ); + offset += h_potential_centroids_per_gpu[i].size(); + } + } - double timecount = static_cast(timer() - tc0); + thrust::copy( + h_potential_centroids.begin(), + h_potential_centroids.end(), + h_all_potential_centroids.begin() + old_pot_centroids_size + ); + } + } - double tkpp = timer(); + double timeloop = static_cast(timer() - t0); - kmeans_plus_plus(verbose, seed, h_all_potential_centroids, weights, k, cols, - final_centroids); + thrust::host_vector final_centroids(0); + int potential_centroids_num = h_all_potential_centroids.size() / cols; - double timekpp = static_cast(timer() - tkpp); + if (potential_centroids_num <= k) { + final_centroids.resize(k * cols); + thrust::copy( + h_all_potential_centroids.begin(), + h_all_potential_centroids.end(), + final_centroids.begin() + ); + // TODO what if potential_centroids_num < k ?? 
we don't want 0s + } else { + // If we found more than k potential cluster centers we need to take only a subset + // This is done using a weighted k-means++ method, since the set should be very small + // it should converge very fast and is all done on the CPU. + thrust::host_vector weights(potential_centroids_num); + + double tc0 = timer(); + + // Weights correspond to the number of data points assigned to each potential cluster center + count_pts_per_centroid( + verbose, num_gpu, + rows_per_gpu, cols, + data, data_dots, + h_all_potential_centroids, + weights + ); + + double timecount = static_cast(timer() - tc0); + + double tkpp = timer(); + + kmeans_plus_plus( + verbose, + seed, + h_all_potential_centroids, + weights, + k, cols, + final_centroids + ); + + double timekpp = static_cast(timer() - tkpp); + + log_verbose(verbose, "KMeans|| - Time loop: %g Time count: %g Time kpp: %g.", timeloop, timecount, timekpp); + } - log_verbose(verbose, - "KMeans|| - Time loop: %g Time count: %g Time kpp: %g.", - timeloop, timecount, timekpp); + return final_centroids; } - return final_centroids; -} - -volatile std::atomic_int flaggpu(0); - -inline void my_function_gpu(int sig) { // can be called asynchronously - fprintf(stderr, "Caught signal %d. Terminating shortly.\n", sig); - flaggpu = 1; -} + volatile std::atomic_int flaggpu(0); -std::vector kmeans_init(int verbose, int *final_n_gpu, int n_gputry, - int gpu_idtry, int rows) { - if (rows > std::numeric_limits::max()) { - fprintf(stderr, "rows > %d not implemented\n", - std::numeric_limits::max()); - fflush(stderr); - exit(0); + inline void my_function_gpu(int sig) { // can be called asynchronously + fprintf(stderr, "Caught signal %d. 
Terminating shortly.\n", sig); + flaggpu = 1; } - std::signal(SIGINT, my_function_gpu); - std::signal(SIGTERM, my_function_gpu); - - // no more gpus than visible gpus - int n_gpuvis; - cudaGetDeviceCount(&n_gpuvis); - int n_gpu = std::min(n_gpuvis, n_gputry); + std::vector kmeans_init(int verbose, int *final_n_gpu, int n_gputry, int gpu_idtry, int rows) { + if (rows > std::numeric_limits::max()) { + fprintf(stderr, "rows > %d not implemented\n", std::numeric_limits::max()); + fflush(stderr); + exit(0); + } - // no more than rows - n_gpu = std::min(n_gpu, rows); + std::signal(SIGINT, my_function_gpu); + std::signal(SIGTERM, my_function_gpu); - if (verbose) { - std::cout << n_gpu << " gpus." << std::endl; - } + // no more gpus than visible gpus + int n_gpuvis; + cudaGetDeviceCount(&n_gpuvis); + int n_gpu = std::min(n_gpuvis, n_gputry); - int gpu_id = gpu_idtry % n_gpuvis; + // no more than rows + n_gpu = std::min(n_gpu, rows); - // setup GPU list to use - std::vector dList(n_gpu); - for (int idx = 0; idx < n_gpu; idx++) { - int device_idx = (gpu_id + idx) % n_gpuvis; - dList[idx] = device_idx; - } + if (verbose) { + std::cout << n_gpu << " gpus." 
<< std::endl; + } - *final_n_gpu = n_gpu; - return dList; -} + int gpu_id = gpu_idtry % n_gpuvis; -template -H2O4GPUKMeans::H2O4GPUKMeans(const T *A, int k, int n, int d) { - _A = A; - _k = k; - _n = n; - _d = d; -} + // setup GPU list to use + std::vector dList(n_gpu); + for (int idx = 0; idx < n_gpu; idx++) { + int device_idx = (gpu_id + idx) % n_gpuvis; + dList[idx] = device_idx; + } -template -int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, size_t rows, - size_t cols, const char ord, int k, int max_iterations, - int init_from_data, T threshold, const T *srcdata, - T **pred_centroids, int **pred_labels) { - // init random seed if use the C function rand() - if (seed >= 0) { - srand(seed); - } else { - srand(unsigned(time(NULL))); + *final_n_gpu = n_gpu; + return dList; } - // no more clusters than rows - if (k > rows) { - k = static_cast(rows); - fprintf(stderr, - "Number of clusters adjusted to be equal to number of rows.\n"); - fflush(stderr); + template + H2O4GPUKMeans::H2O4GPUKMeans(const T *A, int k, int n, int d) { + _A = A; + _k = k; + _n = n; + _d = d; } - int n_gpu; - // device list - std::vector dList = - kmeans_init(verbose, &n_gpu, n_gputry, gpu_idtry, rows); + template + int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, + size_t rows, size_t cols, const char ord, + int k, int max_iterations, int init_from_data, + T threshold, + const T *srcdata, T **pred_centroids, int **pred_labels) { + // init random seed if use the C function rand() + if (seed >= 0) { + srand(seed); + } else { + srand(unsigned(time(NULL))); + } + + // no more clusters than rows + if (k > rows) { + k = static_cast(rows); + fprintf(stderr, "Number of clusters adjusted to be equal to number of rows.\n"); + fflush(stderr); + } - double t0t = timer(); - thrust::device_vector *data[n_gpu]; - thrust::device_vector *labels[n_gpu]; - thrust::device_vector *d_centroids[n_gpu]; - thrust::device_vector *data_dots[n_gpu]; + int n_gpu; + std::vector dList = 
kmeans_init(verbose, &n_gpu, n_gputry, gpu_idtry, rows); + + double t0t = timer(); + thrust::device_vector *data[n_gpu]; + thrust::device_vector *labels[n_gpu]; + thrust::device_vector *d_centroids[n_gpu]; + thrust::device_vector *data_dots[n_gpu]; #pragma omp parallel for - for (int device_idx = 0; device_idx < n_gpu; device_idx++) { - CUDACHECK(cudaSetDevice(dList[device_idx])); - data[device_idx] = new thrust::device_vector(rows / n_gpu * cols); - d_centroids[device_idx] = new thrust::device_vector(k * cols); - data_dots[device_idx] = new thrust::device_vector(rows / n_gpu); + for (int q = 0; q < n_gpu; q++) { + CUDACHECK(cudaSetDevice(dList[q])); + data[q] = new thrust::device_vector(rows / n_gpu * cols); + d_centroids[q] = new thrust::device_vector(k * cols); + data_dots[q] = new thrust::device_vector(rows / n_gpu); - kmeans::detail::labels_init(); - } + kmeans::detail::labels_init(); + } - log_debug(verbose, "Number of points: %d", rows); - log_debug(verbose, "Number of dimensions: %d", cols); - log_debug(verbose, "Number of clusters: %d", k); - log_debug(verbose, "Max. number of iterations: %d", max_iterations); - log_debug(verbose, "Stopping threshold: %d", threshold); + log_debug(verbose, "Number of points: %d", rows); + log_debug(verbose, "Number of dimensions: %d", cols); + log_debug(verbose, "Number of clusters: %d", k); + log_debug(verbose, "Max. number of iterations: %d", max_iterations); + log_debug(verbose, "Stopping threshold: %d", threshold); - std::vector v(rows); - std::iota(std::begin(v), std::end(v), 0); // Fill with 0, 1, ..., rows. + std::vector v(rows); + std::iota(std::begin(v), std::end(v), 0); // Fill with 0, 1, ..., rows. 
- if (seed >= 0) { - std::shuffle(v.begin(), v.end(), std::default_random_engine(seed)); - } else { - std::random_shuffle(v.begin(), v.end()); - } + if (seed >= 0) { + std::shuffle(v.begin(), v.end(), std::default_random_engine(seed)); + } else { + std::random_shuffle(v.begin(), v.end()); + } // Copy the data to devices #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - CUDACHECK(cudaSetDevice(dList[q])); - if (verbose) { - std::cout << "Copying data to device: " << dList[q] << std::endl; - } + for (int q = 0; q < n_gpu; q++) { + CUDACHECK(cudaSetDevice(dList[q])); + if (verbose) { std::cout << "Copying data to device: " << dList[q] << std::endl; } - copy_data(verbose, ord, *data[q], &srcdata[0], q, rows, rows / n_gpu, cols); + copy_data(verbose, ord, *data[q], &srcdata[0], q, rows, rows / n_gpu, cols); - // Pre-compute the data matrix norms - kmeans::detail::make_self_dots(rows / n_gpu, cols, *data[q], *data_dots[q]); - } + // Pre-compute the data matrix norms + kmeans::detail::make_self_dots(rows / n_gpu, cols, *data[q], *data_dots[q]); + } - // Get random points as centroids - int bytecount = cols * k * sizeof(T); // all centroids - if (0 == init_from_data) { - log_debug(verbose, "KMeans - Using random initialization."); + // Get random points as centroids + int bytecount = cols * k * sizeof(T); // all centroids + if (0 == init_from_data) { + log_debug(verbose, "KMeans - Using random initialization."); - int masterq = 0; - CUDACHECK(cudaSetDevice(dList[masterq])); - copy_centroids_shuffled(verbose, v, ord, *d_centroids[masterq], &srcdata[0], - rows, k, cols); + int masterq = 0; + CUDACHECK(cudaSetDevice(dList[masterq])); + copy_centroids_shuffled(verbose, v, ord, *d_centroids[masterq], &srcdata[0], rows, k, cols); - // Copy centroids to all devices - std::vector streams; - streams.resize(n_gpu); + // Copy centroids to all devices + std::vector < cudaStream_t * > streams; + streams.resize(n_gpu); #pragma omp parallel for - for (int q = 0; q < n_gpu; 
q++) { - if (q == masterq) - continue; + for (int q = 0; q < n_gpu; q++) { + if (q == masterq) continue; - CUDACHECK(cudaSetDevice(dList[q])); - if (verbose > 0) { - std::cout << "Copying centroid data to device: " << dList[q] - << std::endl; - } + CUDACHECK(cudaSetDevice(dList[q])); + if (verbose > 0) { + std::cout << "Copying centroid data to device: " << dList[q] << std::endl; + } - streams[q] = - reinterpret_cast(malloc(sizeof(cudaStream_t))); - cudaStreamCreate(streams[q]); - cudaMemcpyPeerAsync(thrust::raw_pointer_cast(&(*d_centroids[q])[0]), - dList[q], - thrust::raw_pointer_cast(&(*d_centroids[masterq])[0]), - dList[masterq], bytecount, *(streams[q])); - } - //#pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - if (q == masterq) - continue; - cudaSetDevice(dList[q]); - cudaStreamDestroy(*(streams[q])); -#if (DEBUGKMEANS) - thrust::host_vector h_centroidq = *d_centroids[q]; - for (int ii = 0; ii < k * d; ii++) { - fprintf(stderr, "q=%d initcent[%d]=%g\n", q, ii, h_centroidq[ii]); - fflush(stderr); + streams[q] = reinterpret_cast(malloc(sizeof(cudaStream_t))); + cudaStreamCreate(streams[q]); + cudaMemcpyPeerAsync(thrust::raw_pointer_cast(&(*d_centroids[q])[0]), + dList[q], + thrust::raw_pointer_cast(&(*d_centroids[masterq])[0]), + dList[masterq], + bytecount, + *(streams[q])); } +//#pragma omp parallel for + for (int q = 0; q < n_gpu; q++) { + if (q == masterq) continue; + cudaSetDevice(dList[q]); + cudaStreamDestroy(*(streams[q])); +#if(DEBUGKMEANS) + thrust::host_vector h_centroidq=*d_centroids[q]; + for(int ii=0;ii final_centroids = kmeans_parallel( - verbose, seed, ord, data, data_dots, rows, cols, k, n_gpu, threshold); + thrust::host_vector final_centroids = kmeans_parallel(verbose, seed, ord, data, data_dots, rows, cols, k, n_gpu, threshold); #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - CUDACHECK(cudaSetDevice(dList[q])); - cudaMemcpy(thrust::raw_pointer_cast(&(*d_centroids[q])[0]), - 
thrust::raw_pointer_cast(&final_centroids[0]), bytecount, - cudaMemcpyHostToDevice); + for (int q = 0; q < n_gpu; q++) { + CUDACHECK(cudaSetDevice(dList[q])); + cudaMemcpy( + thrust::raw_pointer_cast(&(*d_centroids[q])[0]), + thrust::raw_pointer_cast(&final_centroids[0]), + bytecount, + cudaMemcpyHostToDevice); + } + } - } #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - CUDACHECK(cudaSetDevice(dList[q])); - labels[q] = new thrust::device_vector(rows / n_gpu); - } + for (int q = 0; q < n_gpu; q++) { + CUDACHECK(cudaSetDevice(dList[q])); + labels[q] = new thrust::device_vector(rows / n_gpu); + } - double timetransfer = static_cast(timer() - t0t); + double timetransfer = static_cast(timer() - t0t); - double t0 = timer(); + double t0 = timer(); - int iter = kmeans::kmeans(verbose, &flaggpu, rows, cols, k, data, labels, - d_centroids, data_dots, dList, n_gpu, - max_iterations, threshold, true); + int iter = kmeans::kmeans(verbose, &flaggpu, rows, cols, k, data, labels, d_centroids, data_dots, + dList, n_gpu, max_iterations, threshold, true); - if (iter < 0) { - log_error(verbose, "KMeans algorithm failed."); - return iter; - } + if (iter < 0) { + log_error(verbose, "KMeans algorithm failed."); + return iter; + } - double timefit = static_cast(timer() - t0); + double timefit = static_cast(timer() - t0); - double t1 = timer(); + double t1 = timer(); // copy result of centroids (sitting entirely on each device) back to host // TODO FIXME: When do delete ctr and h_labels memory??? 
thrust::host_vector *ctr = new thrust::host_vector(*d_centroids[0]); *pred_centroids = ctr->data(); - // copy assigned labels - thrust::host_vector *h_labels = new thrust::host_vector(rows); - //#pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - int offset = labels[q]->size() * q; - h_labels->insert(h_labels->begin() + offset, labels[q]->begin(), - labels[q]->end()); - } + // copy assigned labels + thrust::host_vector *h_labels = new thrust::host_vector(rows); +//#pragma omp parallel for + for (int q = 0; q < n_gpu; q++) { + int offset = labels[q]->size()*q; + h_labels->insert(h_labels->begin() + offset, labels[q]->begin(), labels[q]->end()); + } - *pred_labels = h_labels->data(); + *pred_labels = h_labels->data(); - // debug - if (verbose >= H2O4GPU_LOG_VERBOSE) { - for (unsigned int ii = 0; ii < k; ii++) { - fprintf(stderr, "ii=%d of k=%d ", ii, k); - for (unsigned int jj = 0; jj < cols; jj++) { - fprintf(stderr, "%g ", (*pred_centroids)[cols * ii + jj]); + // debug + if (verbose >= H2O4GPU_LOG_VERBOSE) { + for (unsigned int ii = 0; ii < k; ii++) { + fprintf(stderr, "ii=%d of k=%d ", ii, k); + for (unsigned int jj = 0; jj < cols; jj++) { + fprintf(stderr, "%g ", (*pred_centroids)[cols * ii + jj]); + } + fprintf(stderr, "\n"); + fflush(stderr); } - fprintf(stderr, "\n"); - fflush(stderr); } - } #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - CUDACHECK(cudaSetDevice(dList[q])); - delete (data[q]); - delete (labels[q]); - delete (d_centroids[q]); - delete (data_dots[q]); - kmeans::detail::labels_close(); - } + for (int q = 0; q < n_gpu; q++) { + CUDACHECK(cudaSetDevice(dList[q])); + delete (data[q]); + delete (labels[q]); + delete (d_centroids[q]); + delete (data_dots[q]); + kmeans::detail::labels_close(); + } - double timecleanup = static_cast(timer() - t1); + double timecleanup = static_cast(timer() - t1); - if (verbose) { - std::cout << " Time fit: " << timefit << " s" << std::endl; - fprintf(stderr, "Timetransfer: %g Timefit: %g 
Timecleanup: %g\n", - timetransfer, timefit, timecleanup); - fflush(stderr); - } + if (verbose) { + std::cout << " Time fit: " << timefit << " s" << std::endl; + fprintf(stderr, "Timetransfer: %g Timefit: %g Timecleanup: %g\n", timetransfer, timefit, timecleanup); + fflush(stderr); + } - return 0; -} + return 0; + } -template -int kmeans_predict(int verbose, int gpu_idtry, int n_gputry, size_t rows, - size_t cols, const char ord, int k, const T *srcdata, - const T *centroids, int **pred_labels) { - // Print centroids - if (verbose >= H2O4GPU_LOG_VERBOSE) { - std::cout << std::endl; - for (int i = 0; i < cols * k; i++) { - std::cout << centroids[i] << " "; - if (i % cols == 1) { - std::cout << std::endl; + template + int kmeans_predict(int verbose, int gpu_idtry, int n_gputry, + size_t rows, size_t cols, + const char ord, int k, + const T *srcdata, const T *centroids, int **pred_labels) { + // Print centroids + if (verbose >= H2O4GPU_LOG_VERBOSE) { + std::cout << std::endl; + for (int i = 0; i < cols * k; i++) { + std::cout << centroids[i] << " "; + if (i % cols == 1) { + std::cout << std::endl; + } } } - } - int n_gpu; - std::vector dList = - kmeans_init(verbose, &n_gpu, n_gputry, gpu_idtry, rows); + int n_gpu; + std::vector dList = kmeans_init(verbose, &n_gpu, n_gputry, gpu_idtry, rows); - thrust::device_vector *d_data[n_gpu]; - thrust::device_vector *d_centroids[n_gpu]; - thrust::device_vector *data_dots[n_gpu]; - thrust::device_vector *centroid_dots[n_gpu]; - thrust::host_vector *h_labels = new thrust::host_vector(0); + thrust::device_vector *d_data[n_gpu]; + thrust::device_vector *d_centroids[n_gpu]; + thrust::device_vector *data_dots[n_gpu]; + thrust::device_vector *centroid_dots[n_gpu]; + thrust::host_vector *h_labels = new thrust::host_vector(0); #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - CUDACHECK(cudaSetDevice(dList[q])); - kmeans::detail::labels_init(); + for (int q = 0; q < n_gpu; q++) { + CUDACHECK(cudaSetDevice(dList[q])); + 
kmeans::detail::labels_init(); - data_dots[q] = new thrust::device_vector(rows / n_gpu); - centroid_dots[q] = new thrust::device_vector(k); + data_dots[q] = new thrust::device_vector(rows / n_gpu); + centroid_dots[q] = new thrust::device_vector(k); - d_centroids[q] = new thrust::device_vector(k * cols); - d_data[q] = new thrust::device_vector(rows / n_gpu * cols); + d_centroids[q] = new thrust::device_vector(k * cols); + d_data[q] = new thrust::device_vector(rows / n_gpu * cols); - copy_data(verbose, 'r', *d_centroids[q], ¢roids[0], 0, k, k, cols); + copy_data(verbose, 'r', *d_centroids[q], ¢roids[0], 0, k, k, cols); - copy_data(verbose, ord, *d_data[q], &srcdata[0], q, rows, rows / n_gpu, - cols); + copy_data(verbose, ord, *d_data[q], &srcdata[0], q, rows, rows / n_gpu, cols); - kmeans::detail::make_self_dots(rows / n_gpu, cols, *d_data[q], - *data_dots[q]); + kmeans::detail::make_self_dots(rows / n_gpu, cols, *d_data[q], *data_dots[q]); - thrust::device_vector d_labels(rows / n_gpu); + thrust::device_vector d_labels(rows / n_gpu); - kmeans::detail::batch_calculate_distances( - verbose, q, rows / n_gpu, cols, k, *d_data[q], *d_centroids[q], - *data_dots[q], *centroid_dots[q], - [&](int n, size_t offset, - thrust::device_vector &pairwise_distances) { - kmeans::detail::relabel(n, k, pairwise_distances, d_labels, offset); - }); + kmeans::detail::batch_calculate_distances(verbose, q, rows / n_gpu, cols, k, + *d_data[q], *d_centroids[q], *data_dots[q], *centroid_dots[q], + [&](int n, size_t offset, thrust::device_vector &pairwise_distances) { + kmeans::detail::relabel(n, k, pairwise_distances, d_labels, offset); + } + ); - h_labels->insert(h_labels->end(), d_labels.begin(), d_labels.end()); - } + h_labels->insert(h_labels->end(), d_labels.begin(), d_labels.end()); + } - *pred_labels = h_labels->data(); + *pred_labels = h_labels->data(); #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - safe_cuda(cudaSetDevice(dList[q])); - kmeans::detail::labels_close(); - 
delete (data_dots[q]); - delete (centroid_dots[q]); - delete (d_centroids[q]); - delete (d_data[q]); - } + for (int q = 0; q < n_gpu; q++) { + safe_cuda(cudaSetDevice(dList[q])); + kmeans::detail::labels_close(); + delete (data_dots[q]); + delete (centroid_dots[q]); + delete (d_centroids[q]); + delete (d_data[q]); + } - return 0; -} + return 0; + } -template -int kmeans_transform(int verbose, int gpu_idtry, int n_gputry, size_t rows, - size_t cols, const char ord, int k, const T *srcdata, - const T *centroids, T **preds) { - // Print centroids - if (verbose >= H2O4GPU_LOG_VERBOSE) { - std::cout << std::endl; - for (int i = 0; i < cols * k; i++) { - std::cout << centroids[i] << " "; - if (i % cols == 1) { - std::cout << std::endl; + template + int kmeans_transform(int verbose, + int gpu_idtry, int n_gputry, + size_t rows, size_t cols, const char ord, int k, + const T *srcdata, const T *centroids, + T **preds) { + // Print centroids + if (verbose >= H2O4GPU_LOG_VERBOSE) { + std::cout << std::endl; + for (int i = 0; i < cols * k; i++) { + std::cout << centroids[i] << " "; + if (i % cols == 1) { + std::cout << std::endl; + } } } - } - int n_gpu; - std::vector dList = - kmeans_init(verbose, &n_gpu, n_gputry, gpu_idtry, rows); + int n_gpu; + std::vector dList = kmeans_init(verbose, &n_gpu, n_gputry, gpu_idtry, rows); - thrust::device_vector *d_data[n_gpu]; - thrust::device_vector *d_centroids[n_gpu]; - thrust::device_vector *d_pairwise_distances[n_gpu]; - thrust::device_vector *data_dots[n_gpu]; - thrust::device_vector *centroid_dots[n_gpu]; + thrust::device_vector *d_data[n_gpu]; + thrust::device_vector *d_centroids[n_gpu]; + thrust::device_vector *d_pairwise_distances[n_gpu]; + thrust::device_vector *data_dots[n_gpu]; + thrust::device_vector *centroid_dots[n_gpu]; #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - CUDACHECK(cudaSetDevice(dList[q])); - kmeans::detail::labels_init(); + for (int q = 0; q < n_gpu; q++) { + CUDACHECK(cudaSetDevice(dList[q])); + 
kmeans::detail::labels_init(); - data_dots[q] = new thrust::device_vector(rows / n_gpu); - centroid_dots[q] = new thrust::device_vector(k); - d_pairwise_distances[q] = new thrust::device_vector(rows / n_gpu * k); + data_dots[q] = new thrust::device_vector(rows / n_gpu); + centroid_dots[q] = new thrust::device_vector(k); + d_pairwise_distances[q] = new thrust::device_vector(rows / n_gpu * k); - d_centroids[q] = new thrust::device_vector(k * cols); - d_data[q] = new thrust::device_vector(rows / n_gpu * cols); + d_centroids[q] = new thrust::device_vector(k * cols); + d_data[q] = new thrust::device_vector(rows / n_gpu * cols); - copy_data(verbose, 'r', *d_centroids[q], ¢roids[0], 0, k, k, cols); + copy_data(verbose, 'r', *d_centroids[q], ¢roids[0], 0, k, k, cols); - copy_data(verbose, ord, *d_data[q], &srcdata[0], q, rows, rows / n_gpu, - cols); + copy_data(verbose, ord, *d_data[q], &srcdata[0], q, rows, rows / n_gpu, cols); - kmeans::detail::make_self_dots(rows / n_gpu, cols, *d_data[q], - *data_dots[q]); + kmeans::detail::make_self_dots(rows / n_gpu, cols, *d_data[q], *data_dots[q]); - // TODO batch this - kmeans::detail::calculate_distances( - verbose, q, rows / n_gpu, cols, k, *d_data[q], 0, *d_centroids[q], - *data_dots[q], *centroid_dots[q], *d_pairwise_distances[q]); - } + // TODO batch this + kmeans::detail::calculate_distances(verbose, q, rows / n_gpu, cols, k, + *d_data[q], 0, *d_centroids[q], *data_dots[q], + *centroid_dots[q], *d_pairwise_distances[q]); + } - // Move the resulting labels into host memory from all devices - thrust::host_vector *h_pairwise_distances = new thrust::host_vector(0); + // Move the resulting labels into host memory from all devices + thrust::host_vector *h_pairwise_distances = new thrust::host_vector(0); #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - h_pairwise_distances->insert(h_pairwise_distances->end(), - d_pairwise_distances[q]->begin(), - d_pairwise_distances[q]->end()); - } - *preds = 
h_pairwise_distances->data(); - - // Print centroids - if (verbose >= H2O4GPU_LOG_VERBOSE) { - std::cout << std::endl; - for (int i = 0; i < rows * cols; i++) { - std::cout << h_pairwise_distances->data()[i] << " "; - if (i % cols == 1) { - std::cout << std::endl; + for (int q = 0; q < n_gpu; q++) { + h_pairwise_distances->insert(h_pairwise_distances->end(), + d_pairwise_distances[q]->begin(), + d_pairwise_distances[q]->end()); + } + *preds = h_pairwise_distances->data(); + + // Print centroids + if (verbose >= H2O4GPU_LOG_VERBOSE) { + std::cout << std::endl; + for (int i = 0; i < rows * cols; i++) { + std::cout << h_pairwise_distances->data()[i] << " "; + if (i % cols == 1) { + std::cout << std::endl; + } } } - } #pragma omp parallel for - for (int q = 0; q < n_gpu; q++) { - safe_cuda(cudaSetDevice(dList[q])); - kmeans::detail::labels_close(); - delete (d_pairwise_distances[q]); - delete (data_dots[q]); - delete (centroid_dots[q]); - delete (d_centroids[q]); - delete (d_data[q]); - } + for (int q = 0; q < n_gpu; q++) { + safe_cuda(cudaSetDevice(dList[q])); + kmeans::detail::labels_close(); + delete (d_pairwise_distances[q]); + delete (data_dots[q]); + delete (centroid_dots[q]); + delete (d_centroids[q]); + delete (d_data[q]); + } - return 0; -} + return 0; + } -template -int makePtr_dense(int dopredict, int verbose, int seed, int gpu_idtry, - int n_gputry, size_t rows, size_t cols, const char ord, int k, - int max_iterations, int init_from_data, T threshold, - const T *srcdata, const T *centroids, T **pred_centroids, - int **pred_labels) { - if (dopredict == 0) { - return kmeans_fit(verbose, seed, gpu_idtry, n_gputry, rows, cols, ord, k, - max_iterations, init_from_data, threshold, srcdata, - pred_centroids, pred_labels); - } else { - return kmeans_predict(verbose, gpu_idtry, n_gputry, rows, cols, ord, k, - srcdata, centroids, pred_labels); + template + int makePtr_dense(int dopredict, int verbose, int seed, int gpu_idtry, int n_gputry, size_t rows, size_t cols, + 
const char ord, int k, int max_iterations, int init_from_data, + T threshold, const T *srcdata, const T *centroids, + T **pred_centroids, int **pred_labels) { + if (dopredict == 0) { + return kmeans_fit(verbose, seed, gpu_idtry, n_gputry, rows, cols, + ord, k, max_iterations, init_from_data, threshold, + srcdata, pred_centroids, pred_labels); + } else { + return kmeans_predict(verbose, gpu_idtry, n_gputry, rows, cols, + ord, k, + srcdata, centroids, pred_labels); + } } -} -template int makePtr_dense(int dopredict, int verbose, int seed, - int gpu_id, int n_gpu, size_t rows, - size_t cols, const char ord, int k, - int max_iterations, int init_from_data, - float threshold, const float *srcdata, - const float *centroids, - float **pred_centroids, int **pred_labels); - -template int makePtr_dense(int dopredict, int verbose, int seed, - int gpu_id, int n_gpu, size_t rows, - size_t cols, const char ord, int k, - int max_iterations, int init_from_data, - double threshold, const double *srcdata, - const double *centroids, - double **pred_centroids, int **pred_labels); - -template int kmeans_fit(int verbose, int seed, int gpu_idtry, - int n_gputry, size_t rows, size_t cols, - const char ord, int k, int max_iterations, - int init_from_data, float threshold, - const float *srcdata, float **pred_centroids, - int **pred_labels); - -template int kmeans_fit(int verbose, int seed, int gpu_idtry, - int n_gputry, size_t rows, size_t cols, - const char ord, int k, int max_iterations, - int init_from_data, double threshold, - const double *srcdata, double **pred_centroids, - int **pred_labels); - -template int kmeans_predict(int verbose, int gpu_idtry, int n_gputry, - size_t rows, size_t cols, const char ord, - int k, const float *srcdata, - const float *centroids, int **pred_labels); - -template int kmeans_predict(int verbose, int gpu_idtry, int n_gputry, - size_t rows, size_t cols, const char ord, - int k, const double *srcdata, - const double *centroids, int **pred_labels); - 
-template int kmeans_transform(int verbose, int gpu_id, int n_gpu, - size_t m, size_t n, const char ord, int k, - const float *src_data, - const float *centroids, float **preds); - -template int kmeans_transform(int verbose, int gpu_id, int n_gpu, - size_t m, size_t n, const char ord, int k, - const double *src_data, - const double *centroids, double **preds); - -// Explicit template instantiation. + template int + makePtr_dense(int dopredict, int verbose, int seed, int gpu_id, int n_gpu, size_t rows, size_t cols, + const char ord, int k, int max_iterations, int init_from_data, + float threshold, const float *srcdata, + const float *centroids, float **pred_centroids, int **pred_labels); + + template int + makePtr_dense(int dopredict, int verbose, int seed, int gpu_id, int n_gpu, size_t rows, size_t cols, + const char ord, int k, int max_iterations, int init_from_data, + double threshold, const double *srcdata, + const double *centroids, double **pred_centroids, int **pred_labels); + + template int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, + size_t rows, size_t cols, + const char ord, int k, int max_iterations, + int init_from_data, float threshold, + const float *srcdata, + float **pred_centroids, int **pred_labels); + + template int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, + size_t rows, size_t cols, + const char ord, int k, int max_iterations, + int init_from_data, double threshold, + const double *srcdata, + double **pred_centroids, int **pred_labels); + + template int kmeans_predict(int verbose, int gpu_idtry, int n_gputry, + size_t rows, size_t cols, + const char ord, int k, + const float *srcdata, const float *centroids, int **pred_labels); + + template int kmeans_predict(int verbose, int gpu_idtry, int n_gputry, + size_t rows, size_t cols, + const char ord, int k, + const double *srcdata, const double *centroids, int **pred_labels); + + template int kmeans_transform(int verbose, + int gpu_id, int n_gpu, + size_t m, 
size_t n, const char ord, int k, + const float *src_data, const float *centroids, + float **preds); + + template int kmeans_transform(int verbose, + int gpu_id, int n_gpu, + size_t m, size_t n, const char ord, int k, + const double *src_data, const double *centroids, + double **preds); + + // Explicit template instantiation. #if !defined(H2O4GPU_DOUBLE) || H2O4GPU_DOUBLE == 1 -template class H2O4GPUKMeans; + template + class H2O4GPUKMeans; #endif #if !defined(H2O4GPU_SINGLE) || H2O4GPU_SINGLE == 1 -template class H2O4GPUKMeans; + template + class H2O4GPUKMeans; #endif -} // namespace h2o4gpukmeans +} // namespace h2o4gpukmeans -/* - * Interface for other languages - */ + /* + * Interface for other languages + */ -// Fit and Predict -int make_ptr_float_kmeans(int dopredict, int verbose, int seed, int gpu_id, - int n_gpu, size_t mTrain, size_t n, const char ord, - int k, int max_iterations, int init_from_data, - float threshold, const float *srcdata, - const float *centroids, float **pred_centroids, - int **pred_labels) { - return h2o4gpukmeans::makePtr_dense( - dopredict, verbose, seed, gpu_id, n_gpu, mTrain, n, ord, k, - max_iterations, init_from_data, threshold, srcdata, centroids, - pred_centroids, pred_labels); -} + // Fit and Predict + int make_ptr_float_kmeans(int dopredict, int verbose, int seed, int gpu_id, int n_gpu, size_t mTrain, size_t n, + const char ord, int k, int max_iterations, int init_from_data, + float threshold, const float *srcdata, + const float *centroids, float **pred_centroids, int **pred_labels) { + return h2o4gpukmeans::makePtr_dense(dopredict, verbose, seed, gpu_id, n_gpu, mTrain, n, ord, k, + max_iterations, init_from_data, threshold, + srcdata, centroids, pred_centroids, pred_labels); + } -int make_ptr_double_kmeans(int dopredict, int verbose, int seed, int gpu_id, - int n_gpu, size_t mTrain, size_t n, const char ord, - int k, int max_iterations, int init_from_data, - double threshold, const double *srcdata, - const double *centroids, 
double **pred_centroids, - int **pred_labels) { - return h2o4gpukmeans::makePtr_dense( - dopredict, verbose, seed, gpu_id, n_gpu, mTrain, n, ord, k, - max_iterations, init_from_data, threshold, srcdata, centroids, - pred_centroids, pred_labels); -} + int make_ptr_double_kmeans(int dopredict, int verbose, int seed, int gpu_id, int n_gpu, size_t mTrain, size_t n, + const char ord, int k, int max_iterations, int init_from_data, + double threshold, const double *srcdata, + const double *centroids, double **pred_centroids, int **pred_labels) { + return h2o4gpukmeans::makePtr_dense(dopredict, verbose, seed, gpu_id, n_gpu, mTrain, n, ord, k, + max_iterations, init_from_data, threshold, + srcdata, centroids, pred_centroids, pred_labels); + } -// Transform -int kmeans_transform_float(int verbose, int gpu_id, int n_gpu, size_t m, - size_t n, const char ord, int k, - const float *src_data, const float *centroids, - float **preds) { - return h2o4gpukmeans::kmeans_transform( - verbose, gpu_id, n_gpu, m, n, ord, k, src_data, centroids, preds); -} + // Transform + int kmeans_transform_float(int verbose, + int gpu_id, int n_gpu, + size_t m, size_t n, const char ord, int k, + const float *src_data, const float *centroids, + float **preds) { + return h2o4gpukmeans::kmeans_transform(verbose, gpu_id, n_gpu, m, n, ord, k, src_data, centroids, preds); + } -int kmeans_transform_double(int verbose, int gpu_id, int n_gpu, size_t m, - size_t n, const char ord, int k, - const double *src_data, const double *centroids, - double **preds) { - return h2o4gpukmeans::kmeans_transform( - verbose, gpu_id, n_gpu, m, n, ord, k, src_data, centroids, preds); -} + int kmeans_transform_double(int verbose, + int gpu_id, int n_gpu, + size_t m, size_t n, const char ord, int k, + const double *src_data, const double *centroids, + double **preds) { + return h2o4gpukmeans::kmeans_transform(verbose, gpu_id, n_gpu, m, n, ord, k, src_data, centroids, preds); + } From adb6ffaabc014ef770cefa9a7ef4f9f8592cd7d1 Mon 
Sep 17 00:00:00 2001 From: fis Date: Thu, 26 Jul 2018 02:12:53 +0800 Subject: [PATCH 41/49] Revert warning flag --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 052f3704a..70e8dd524 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,7 @@ OPTION(DEV_BUILD "Dev build" OFF) SET(CMAKE_CXX_STANDARD 11) SET(CMAKE_CXX_STANDARD_REQUIRED ON) SET(CMAKE_POSITION_INDEPENDENT_CODE ON) -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") SET(GPU_COMPUTE_VER "" CACHE STRING "Semicolon separated list of compute versions to be built against, e.g. -DGPU_COMPUTE_VER='35;61'") @@ -89,7 +89,7 @@ if(USE_CUDA) FORMAT_GENCODE_FLAGS("${GPU_COMPUTE_VER}" GENCODE_FLAGS) MESSAGE("CUDA architecture flags ${GENCODE_FLAGS}") - SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -std=c++11;--expt-extended-lambda;--expt-relaxed-constexpr;${GENCODE_FLAGS};-lineinfo; -Wall;") + SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -std=c++11;--expt-extended-lambda;--expt-relaxed-constexpr;${GENCODE_FLAGS};-lineinfo;") FILE(GLOB_RECURSE GPU_SOURCES src/*.cu From 450051eb4cfd53f195665e3daec4f4ff634acd83 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 26 Jul 2018 02:17:56 +0800 Subject: [PATCH 42/49] Inject new kmeans|| into kmeans algorithm. 
--- src/gpu/kmeans/kmeans_h2o4gpu.cu | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/gpu/kmeans/kmeans_h2o4gpu.cu b/src/gpu/kmeans/kmeans_h2o4gpu.cu index d05750a86..a07fd5087 100644 --- a/src/gpu/kmeans/kmeans_h2o4gpu.cu +++ b/src/gpu/kmeans/kmeans_h2o4gpu.cu @@ -15,6 +15,9 @@ #include "kmeans_impl.h" #include "kmeans_general.h" #include "kmeans_h2o4gpu.h" + +#include "kmeans_init.cuh" + #include #include #include @@ -764,7 +767,21 @@ int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, } else if (1 == init_from_data) { // kmeans|| log_debug(verbose, "KMeans - Using K-Means|| initialization."); - thrust::host_vector final_centroids = kmeans_parallel(verbose, seed, ord, data, data_dots, rows, cols, k, n_gpu, threshold); + thrust::host_vector h_init_data (rows * cols); + // Gather + for (size_t i = 0; i < n_gpu; ++i) { + thrust::copy( + thrust::device, + data[i]->begin(), data[i]->end(), h_init_data.begin()); + } + H2O4GPU::KMeans::KmMatrix init_data(h_init_data, rows, cols); + H2O4GPU::KMeans::KmMatrix final_centroids_matrix = + H2O4GPU::KMeans::KmeansLlInit(seed, 1.5)(init_data, k); + thrust::host_vector final_centroids (final_centroids_matrix.size()); + thrust::copy( + final_centroids_matrix.dev_ptr(), + final_centroids_matrix.dev_ptr() + final_centroids_matrix.size(), + final_centroids.begin()); #pragma omp parallel for for (int q = 0; q < n_gpu; q++) { From 6c8ff4e21fc498a48da54fa9191cddfb67b0004a Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 26 Jul 2018 04:23:51 +0800 Subject: [PATCH 43/49] Re-structure. * Put KmMatrix under gpu/matrix * Use shared utils for both kmeans and KmMatrix. * Rename the namespaces. 
--- CMakeLists.txt | 3 +- src/common/utils.h | 33 ++++++++ src/gpu/kmeans/KmMatrix/KmConfig.h | 81 ------------------- src/gpu/kmeans/KmMatrix/utils.cuh | 69 ---------------- src/gpu/kmeans/kmeans_general.h | 3 +- src/gpu/kmeans/kmeans_h2o4gpu.cu | 6 +- src/gpu/kmeans/kmeans_init.cu | 32 ++++---- src/gpu/kmeans/kmeans_init.cuh | 21 ++--- src/gpu/kmeans/kmeans_labels.h | 37 ++------- src/gpu/{kmeans => matrix}/KmMatrix/Arith.cu | 18 +++-- src/gpu/{kmeans => matrix}/KmMatrix/Arith.hpp | 10 +-- .../{kmeans => matrix}/KmMatrix/Generator.cuh | 20 ++--- .../{kmeans => matrix}/KmMatrix/Generator.hpp | 8 +- .../KmMatrix/GeneratorKernels.cu | 8 +- .../{kmeans => matrix}/KmMatrix/KmMatrix.cpp | 26 +++--- .../{kmeans => matrix}/KmMatrix/KmMatrix.hpp | 15 ++-- .../KmMatrix/KmMatrixCuda.cu | 15 ++-- .../KmMatrix/KmMatrixCuda.cuh | 8 +- .../KmMatrix/KmMatrixProxy.cpp | 8 +- src/gpu/{kmeans => matrix}/KmMatrix/blas.cuh | 34 ++++---- .../{kmeans/KmMatrix => utils}/GpuInfo.cuh | 11 ++- src/gpu/utils/utils.cuh | 75 ++++++++++++----- tests/cpp/gpu/KmMatrix/test_arith.cu | 6 +- tests/cpp/gpu/KmMatrix/test_matrix.cu | 44 +++++----- tests/cpp/gpu/KmMatrix/test_proxy.cu | 14 ++-- tests/cpp/gpu/kmeans/test_kmeans_init.cu | 16 ++-- 26 files changed, 263 insertions(+), 358 deletions(-) delete mode 100644 src/gpu/kmeans/KmMatrix/KmConfig.h delete mode 100644 src/gpu/kmeans/KmMatrix/utils.cuh rename src/gpu/{kmeans => matrix}/KmMatrix/Arith.cu (94%) rename src/gpu/{kmeans => matrix}/KmMatrix/Arith.hpp (90%) rename src/gpu/{kmeans => matrix}/KmMatrix/Generator.cuh (84%) rename src/gpu/{kmeans => matrix}/KmMatrix/Generator.hpp (80%) rename src/gpu/{kmeans => matrix}/KmMatrix/GeneratorKernels.cu (95%) rename src/gpu/{kmeans => matrix}/KmMatrix/KmMatrix.cpp (95%) rename src/gpu/{kmeans => matrix}/KmMatrix/KmMatrix.hpp (94%) rename src/gpu/{kmeans => matrix}/KmMatrix/KmMatrixCuda.cu (96%) rename src/gpu/{kmeans => matrix}/KmMatrix/KmMatrixCuda.cuh (95%) rename src/gpu/{kmeans => 
matrix}/KmMatrix/KmMatrixProxy.cpp (95%) rename src/gpu/{kmeans => matrix}/KmMatrix/blas.cuh (94%) rename src/gpu/{kmeans/KmMatrix => utils}/GpuInfo.cuh (88%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 70e8dd524..c696cad27 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,8 +94,7 @@ if(USE_CUDA) FILE(GLOB_RECURSE GPU_SOURCES src/*.cu src/*.cuh - src/gpu/kmeans/KmMatrix/*.cpp - src/gpu/kmeans/KmMatrix/*.hpp + src/gpu/matrix/*.cpp src/common/*.cpp src/common/*.h) diff --git a/src/common/utils.h b/src/common/utils.h index 10c97ac03..a2fb9ef7b 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -4,8 +4,14 @@ */ #pragma once #include + +#include +#include + #include "cblas/cblas.h" +#define USE_CUDA() 1 + template void self_dot(std::vector array_in, int n, int dim, std::vector& dots); @@ -19,3 +25,30 @@ void compute_distances(std::vector data_in, std::vector centroids_in, std::vector &pairwise_distances, int n, int dim, int k); + +// Matrix host dev +#define HG_HOSTDEV __host__ __device__ +#define HG_DEV __device__ +#define HG_DEVINLINE __device__ __forceinline__ +#define HG_HOSTDEVINLINE __host__ __device__ __forceinline__ + +#define h2o4gpu_error(x) error(x, __FILE__, __LINE__); + +inline void error(const char* e, const char* file, int line) +{ + std::stringstream ss; + ss << e << " - " << file << "(" << line << ")"; + //throw error_text; + std::cerr << ss.str() << std::endl; + exit(-1); +} + +#define h2o4gpu_check(condition, msg) check(condition, msg, __FILE__, __LINE__); + +inline void check(bool val, const char* e, const char* file, int line) +{ + if (!val) + { + error(e, file, line); + } +} diff --git a/src/gpu/kmeans/KmMatrix/KmConfig.h b/src/gpu/kmeans/KmMatrix/KmConfig.h deleted file mode 100644 index fa718a221..000000000 --- a/src/gpu/kmeans/KmMatrix/KmConfig.h +++ /dev/null @@ -1,81 +0,0 @@ -/*! - * Copyright 2018 H2O.ai, Inc. 
- * License Apache License Version 2.0 (see LICENSE for details) - */ - -#ifndef KM_CONFIG_H_ -#define KM_CONFIG_H_ - -#define USE_CUDA() 1 - -#include "stdio.h" - -// Matrix host dev -#define M_HOSTDEV __host__ __device__ -#define M_DEV __device__ -#define M_DEVINLINE __device__ __forceinline__ -#define M_HOSTDEVINLINE __host__ __device__ __forceinline__ - -#define CUDA_CHECK(cmd) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - printf("Cuda failure %s:%d '%s'\n", \ - __FILE__,__LINE__,cudaGetErrorString(e)); \ - fflush( stdout ); \ - exit(EXIT_FAILURE); \ - } \ - } while(0) - -#define CUBLAS_CHECK(cmd) do { \ - cublasStatus_t status = cmd; \ - if ( status != CUBLAS_STATUS_SUCCESS) { \ - const char* errmsg = nullptr; \ - switch(status) { \ - case CUBLAS_STATUS_NOT_INITIALIZED: \ - errmsg = "library not initialized"; \ - break; \ - \ - case CUBLAS_STATUS_ALLOC_FAILED: \ - errmsg = "resource allocation failed"; \ - break; \ - \ - case CUBLAS_STATUS_INVALID_VALUE: \ - errmsg = "an invalid numeric value was used as an argument"; \ - break; \ - \ - case CUBLAS_STATUS_ARCH_MISMATCH: \ - errmsg = "an absent device architectural feature is required"; \ - break; \ - \ - case CUBLAS_STATUS_MAPPING_ERROR: \ - errmsg = "an access to GPU memory space failed"; \ - break; \ - \ - case CUBLAS_STATUS_EXECUTION_FAILED: \ - errmsg = "the GPU program failed to execute"; \ - break; \ - \ - case CUBLAS_STATUS_INTERNAL_ERROR: \ - errmsg = "an internal operation failed"; \ - break; \ - \ - default: \ - errmsg = "unknown error"; \ - break; \ - } \ - printf("%s", errmsg); \ - } \ - } while (false) - -#define M_ERROR(msg) \ - printf("%s\n\t in %s, %u, %s\n", msg, __FILE__, __LINE__, \ - __PRETTY_FUNCTION__); \ - abort(); - -#define M_USER_ERROR(msg) \ - fprintf(stderr, \ - "%s\n\t in %s, %u, %s\n", msg, __FILE__, __LINE__, \ - __PRETTY_FUNCTION__); \ - exit(1) - -#endif // KM_CONFIG_H_ diff --git a/src/gpu/kmeans/KmMatrix/utils.cuh b/src/gpu/kmeans/KmMatrix/utils.cuh deleted 
file mode 100644 index 606359fda..000000000 --- a/src/gpu/kmeans/KmMatrix/utils.cuh +++ /dev/null @@ -1,69 +0,0 @@ -/*! - * Copyright 2018 H2O.ai, Inc. - * License Apache License Version 2.0 (see LICENSE for details) - */ - -#ifndef UTILS_CUH_ -#define UTILS_CUH_ - -#include "GpuInfo.cuh" - -namespace H2O4GPU { -namespace KMeans { - -M_DEVINLINE size_t global_thread_idx () { - return threadIdx.x + blockIdx.x * blockDim.x; -} - -M_DEVINLINE size_t global_thread_idy () { - return threadIdx.y + blockIdx.y * blockDim.y; -} - -M_DEVINLINE size_t grid_stride_x () { - return blockDim.x * gridDim.x; -} - -M_DEVINLINE size_t grid_stride_y () { - return blockDim.y * gridDim.y; -} - -template -T1 M_HOSTDEVINLINE div_roundup(const T1 a, const T2 b) { - return static_cast(ceil(static_cast(a) / b)); -} - - -// Work around for shared memory -// https://stackoverflow.com/questions/20497209/getting-cuda-error-declaration-is-incompatible-with-previous-variable-name -template -struct KmSharedMem; - -template <> -struct KmSharedMem { - __device__ float * ptr() { - extern __shared__ __align__(sizeof(float)) float s_float[]; - return s_float; - } -}; - -template <> -struct KmSharedMem { - __device__ double * ptr() { - extern __shared__ __align__(sizeof(double)) double s_double[]; - return s_double; - } -}; - -template <> -struct KmSharedMem { - __device__ int * ptr() { - extern __shared__ __align__(sizeof(int)) int s_int[]; - return s_int; - } -}; - - -} // KMeans -} // H2O4GPU - -#endif // UTILS_CUH_ \ No newline at end of file diff --git a/src/gpu/kmeans/kmeans_general.h b/src/gpu/kmeans/kmeans_general.h index 9697a48e3..d32dcbce8 100644 --- a/src/gpu/kmeans/kmeans_general.h +++ b/src/gpu/kmeans/kmeans_general.h @@ -4,6 +4,7 @@ */ #pragma once #include "../../common/logger.h" +#include "../utils/utils.cuh" #include "stdio.h" #define MAX_NGPUS 16 @@ -13,8 +14,6 @@ // TODO(pseudotensor): Avoid throw for python exception handling. 
Need to avoid all exit's and return exit code all the way back. #define gpuErrchk(ans) { gpu_assert((ans), __FILE__, __LINE__); } -#define safe_cuda(ans) throw_on_cuda_error((ans), __FILE__, __LINE__); -#define safe_cublas(ans) throw_on_cublas_error((ans), __FILE__, __LINE__); #define CUDACHECK(cmd) do { \ cudaError_t e = cmd; \ diff --git a/src/gpu/kmeans/kmeans_h2o4gpu.cu b/src/gpu/kmeans/kmeans_h2o4gpu.cu index a07fd5087..224070503 100644 --- a/src/gpu/kmeans/kmeans_h2o4gpu.cu +++ b/src/gpu/kmeans/kmeans_h2o4gpu.cu @@ -774,9 +774,9 @@ int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, thrust::device, data[i]->begin(), data[i]->end(), h_init_data.begin()); } - H2O4GPU::KMeans::KmMatrix init_data(h_init_data, rows, cols); - H2O4GPU::KMeans::KmMatrix final_centroids_matrix = - H2O4GPU::KMeans::KmeansLlInit(seed, 1.5)(init_data, k); + h2o4gpu::kMeans::KmMatrix init_data(h_init_data, rows, cols); + h2o4gpu::kMeans::KmMatrix final_centroids_matrix = + h2o4gpu::kMeans::KmeansLlInit(seed, 1.5)(init_data, k); thrust::host_vector final_centroids (final_centroids_matrix.size()); thrust::copy( final_centroids_matrix.dev_ptr(), diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 3a93cc138..091023a9c 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -15,14 +15,16 @@ #include "kmeans_init.cuh" -#include "KmMatrix/KmMatrix.hpp" -#include "KmMatrix/Arith.hpp" -#include "KmMatrix/utils.cuh" -#include "KmMatrix/GpuInfo.cuh" -#include "KmMatrix/blas.cuh" +#include "../matrix/KmMatrix/KmMatrix.hpp" +#include "../matrix/KmMatrix/Arith.hpp" +#include "../matrix/KmMatrix/blas.cuh" +#include "../utils/utils.cuh" +#include "../utils/GpuInfo.cuh" -namespace H2O4GPU { -namespace KMeans { +namespace h2o4gpu { +namespace kMeans { + +using namespace Matrix; namespace kernel { // X^2 + Y^2, here only calculates the + operation. 
@@ -116,7 +118,7 @@ KmMatrix PairWiseDistanceOp::operator()(KmMatrix& _data, data_dot_.k_param(), centroids_dot_.k_param()); - CUDA_CHECK(cudaGetLastError()); + safe_cuda(cudaGetLastError()); cublasHandle_t handle = GpuInfo::ins().cublas_handle(); @@ -191,7 +193,7 @@ KmMatrix GreedyRecluster::recluster(KmMatrix& _centroids, size_t _k) { void *d_temp_storage = NULL; // determine the temp_storage_bytes - CUDA_CHECK(cub::DeviceHistogram::HistogramEven( + safe_cuda(cub::DeviceHistogram::HistogramEven( d_temp_storage, temp_storage_bytes, min_indices.dev_ptr(), weights.dev_ptr(), @@ -200,8 +202,8 @@ KmMatrix GreedyRecluster::recluster(KmMatrix& _centroids, size_t _k) { (T)min_indices.rows(), (int)_centroids.rows())); - CUDA_CHECK(cudaMalloc((void**)&d_temp_storage, temp_storage_bytes)); - CUDA_CHECK(cub::DeviceHistogram::HistogramEven( + safe_cuda(cudaMalloc((void**)&d_temp_storage, temp_storage_bytes)); + safe_cuda(cub::DeviceHistogram::HistogramEven( d_temp_storage, temp_storage_bytes, min_indices.dev_ptr(), // d_samples weights.dev_ptr(), // d_histogram @@ -209,7 +211,7 @@ KmMatrix GreedyRecluster::recluster(KmMatrix& _centroids, size_t _k) { (T)0.0, // lower_level (T)min_indices.rows(), // upper_level (int)_centroids.rows())); // num_samples - CUDA_CHECK(cudaFree(d_temp_storage)); + safe_cuda(cudaFree(d_temp_storage)); // Sort the indices by weights in ascending order, then use those at front // as result. 
@@ -352,7 +354,7 @@ KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { "k must be less than or equal to the number of data points" ", k: %lu, data points: %lu", _k, _data.rows()); - M_USER_ERROR(err_msg); + h2o4gpu_error(err_msg); } if (seed_ < 0) { @@ -433,5 +435,5 @@ INSTANTIATE(int) #undef INSTANTIATE } -} // namespace Kmeans -} // namespace H2O4GPU +} // namespace kMeans +} // namespace h2o4gpu diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index 6f067fc9f..28a055c2a 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -9,17 +9,18 @@ #include -#include "KmMatrix/KmConfig.h" -#include "KmMatrix/KmMatrix.hpp" -#include "KmMatrix/utils.cuh" -#include "KmMatrix/Generator.hpp" -#include "KmMatrix/Generator.cuh" -#include "KmMatrix/GpuInfo.cuh" +#include "../matrix/KmMatrix/KmMatrix.hpp" +#include "../matrix/KmMatrix/Generator.hpp" +#include "../matrix/KmMatrix/Generator.cuh" +#include "../utils/GpuInfo.cuh" +#include "../utils/utils.cuh" constexpr double ESP = 1e-8; -namespace H2O4GPU { -namespace KMeans { +namespace h2o4gpu { +namespace kMeans { + +using namespace Matrix; namespace detail { @@ -202,7 +203,7 @@ struct KmeansLlInit : public KmeansInitBase { // FIXME: Make kmeans++ a derived class of KmeansInitBase -} // namespace Kmeans -} // namespace H2O4GPU +} // namespace kMeans +} // namespace h2o4gpu #endif // KMEANS_INIT_H_ \ No newline at end of file diff --git a/src/gpu/kmeans/kmeans_labels.h b/src/gpu/kmeans/kmeans_labels.h index ada7f286e..87b7b5ed9 100644 --- a/src/gpu/kmeans/kmeans_labels.h +++ b/src/gpu/kmeans/kmeans_labels.h @@ -9,9 +9,13 @@ #include #include #include -#include "kmeans_general.h" #include +#include "kmeans_general.h" +#include "../utils/utils.cuh" + +using namespace h2o4gpu; + inline void gpu_assert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); @@ 
-23,20 +27,6 @@ inline void gpu_assert(cudaError_t code, const char *file, int line, bool abort= } } - -inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file, - int line) { - if (code != cudaSuccess) { - std::stringstream ss; - ss << file << "(" << line << ")"; - std::string file_and_line; - ss >> file_and_line; - thrust::system_error(code, thrust::cuda_category(), file_and_line); - } - - return code; -} - #ifdef CUBLAS_API_H_ // cuBLAS API errors static const char *cudaGetErrorEnum(cublasStatus_t error) @@ -72,23 +62,6 @@ static const char *cudaGetErrorEnum(cublasStatus_t error) } #endif -inline cublasStatus_t throw_on_cublas_error(cublasStatus_t code, const char *file, - int line) { - - - if (code != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr,"cublas error: %s %s %d\n", cudaGetErrorEnum(code), file, line); - std::stringstream ss; - ss << file << "(" << line << ")"; - std::string file_and_line; - ss >> file_and_line; - thrust::system_error(code, thrust::cuda_category(), file_and_line); - } - - return code; -} - - extern cudaStream_t cuda_stream[MAX_NGPUS]; template diff --git a/src/gpu/kmeans/KmMatrix/Arith.cu b/src/gpu/matrix/KmMatrix/Arith.cu similarity index 94% rename from src/gpu/kmeans/KmMatrix/Arith.cu rename to src/gpu/matrix/KmMatrix/Arith.cu index 88bc9d920..2f9c4ccd5 100644 --- a/src/gpu/kmeans/KmMatrix/Arith.cu +++ b/src/gpu/matrix/KmMatrix/Arith.cu @@ -1,6 +1,8 @@ #include "Arith.hpp" -namespace H2O4GPU { -namespace KMeans { +#include "../../utils/GpuInfo.cuh" + +namespace h2o4gpu { +namespace Matrix { namespace kernel { @@ -121,7 +123,9 @@ KmMatrix ArgMinOp::argmin(KmMatrix& _val, KmMatrixDim _dim) { return _res; } else { // FIXME - M_ERROR("Not implemented"); + h2o4gpu_error("Not implemented"); + KmMatrix res; + return res; } } @@ -135,7 +139,9 @@ KmMatrix MinOp::min(KmMatrix& _val, KmMatrixDim _dim) { return _res; } else { // FIXME - M_ERROR("Not implemented"); + h2o4gpu_error("Not implemented"); + KmMatrix res; + return res; } } 
@@ -159,5 +165,5 @@ INSTANTIATE(double) INSTANTIATE(float) INSTANTIATE(int) -} // namespace KMenas -} // namespace H204GPU \ No newline at end of file +} // namespace Matrix +} // namespace h2o4gpu \ No newline at end of file diff --git a/src/gpu/kmeans/KmMatrix/Arith.hpp b/src/gpu/matrix/KmMatrix/Arith.hpp similarity index 90% rename from src/gpu/kmeans/KmMatrix/Arith.hpp rename to src/gpu/matrix/KmMatrix/Arith.hpp index 3ae493455..7760cd6aa 100644 --- a/src/gpu/kmeans/KmMatrix/Arith.hpp +++ b/src/gpu/matrix/KmMatrix/Arith.hpp @@ -3,10 +3,10 @@ #include "KmMatrix.hpp" #include "blas.cuh" -#include "utils.cuh" +#include "../../utils/utils.cuh" -namespace H2O4GPU { -namespace KMeans { +namespace h2o4gpu { +namespace Matrix { // FIXME: Using struct for operations is just keeping the possibility of // creating an unified operations for KmMatrix. For example, let KmMatrix @@ -52,7 +52,7 @@ struct MinOp { KmMatrix min(KmMatrix& _val, KmMatrixDim _dim); }; -} // namespace KMenas -} // namespace H204GPU +} // namespace Matrix +} // namespace h2o4gpu #endif // M_ARITH_HPP_ diff --git a/src/gpu/kmeans/KmMatrix/Generator.cuh b/src/gpu/matrix/KmMatrix/Generator.cuh similarity index 84% rename from src/gpu/kmeans/KmMatrix/Generator.cuh rename to src/gpu/matrix/KmMatrix/Generator.cuh index 890729f3c..061edbe96 100644 --- a/src/gpu/kmeans/KmMatrix/Generator.cuh +++ b/src/gpu/matrix/KmMatrix/Generator.cuh @@ -10,11 +10,11 @@ #include "Generator.hpp" #include "KmMatrix.hpp" -#include "utils.cuh" +#include "../../utils/utils.cuh" -namespace H2O4GPU { -namespace KMeans { +namespace h2o4gpu { +namespace Matrix { namespace kernel { // Split the definition to avoid multiple definition. 
@@ -45,9 +45,9 @@ struct UniformGenerator : public GeneratorBase { random_numbers_ = KmMatrix (1, size_); if (dev_states_ != nullptr) { - CUDA_CHECK(cudaFree(dev_states_)); + safe_cuda(cudaFree(dev_states_)); } - CUDA_CHECK(cudaMalloc((void **)&dev_states_, size_ * sizeof(curandState))); + safe_cuda(cudaMalloc((void **)&dev_states_, size_ * sizeof(curandState))); kernel::setup_random_states<<>>( seed_, dev_states_, size_); } @@ -60,7 +60,7 @@ struct UniformGenerator : public GeneratorBase { UniformGenerator (size_t _size, int _seed) { if (_size == 0) { - M_ERROR("Zero size for generate is not allowed."); + h2o4gpu_error("Zero size for generate is not allowed."); } initialize(_size); } @@ -70,7 +70,7 @@ struct UniformGenerator : public GeneratorBase { ~UniformGenerator () { if (dev_states_ != nullptr) { - CUDA_CHECK(cudaFree(dev_states_)); + safe_cuda(cudaFree(dev_states_)); } } @@ -87,7 +87,7 @@ struct UniformGenerator : public GeneratorBase { KmMatrix generate(size_t _size) override { if (_size == 0) { - M_ERROR("Zero size for generate is not allowed."); + h2o4gpu_error("Zero size for generate is not allowed."); } if (_size != size_) { initialize(_size); @@ -96,5 +96,5 @@ struct UniformGenerator : public GeneratorBase { } }; -} // H2O4GPU -} // KMeans \ No newline at end of file +} // namespace Matrix +} // namespace h2o4gpu \ No newline at end of file diff --git a/src/gpu/kmeans/KmMatrix/Generator.hpp b/src/gpu/matrix/KmMatrix/Generator.hpp similarity index 80% rename from src/gpu/kmeans/KmMatrix/Generator.hpp rename to src/gpu/matrix/KmMatrix/Generator.hpp index 7200b85ff..9ef1ae9ac 100644 --- a/src/gpu/kmeans/KmMatrix/Generator.hpp +++ b/src/gpu/matrix/KmMatrix/Generator.hpp @@ -8,8 +8,8 @@ #include "KmMatrix.hpp" -namespace H2O4GPU { -namespace KMeans { +namespace h2o4gpu { +namespace Matrix { template class GeneratorBase { @@ -18,8 +18,8 @@ class GeneratorBase { virtual KmMatrix generate(size_t _size) = 0; }; -} -} +} // namespace Matrix +} // namespace 
h2o4gpu #endif // GENERATOR_HPP_ diff --git a/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu b/src/gpu/matrix/KmMatrix/GeneratorKernels.cu similarity index 95% rename from src/gpu/kmeans/KmMatrix/GeneratorKernels.cu rename to src/gpu/matrix/KmMatrix/GeneratorKernels.cu index 28b521f26..e81885f12 100644 --- a/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu +++ b/src/gpu/matrix/KmMatrix/GeneratorKernels.cu @@ -6,8 +6,8 @@ #include #include -namespace H2O4GPU { -namespace KMeans { +namespace h2o4gpu { +namespace Matrix { namespace kernel { __global__ void setup_random_states(int _seed, curandState *_state, @@ -59,5 +59,5 @@ __global__ void generate_uniform_kernel(int *_res, } } // namespace kernel -} // namespace KMeans -} // namespace H2O4GPU \ No newline at end of file +} // namespace Matrix +} // namespace h2o4gpu \ No newline at end of file diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp b/src/gpu/matrix/KmMatrix/KmMatrix.cpp similarity index 95% rename from src/gpu/kmeans/KmMatrix/KmMatrix.cpp rename to src/gpu/matrix/KmMatrix/KmMatrix.cpp index d350bd7e2..67d3f3207 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp +++ b/src/gpu/matrix/KmMatrix/KmMatrix.cpp @@ -4,13 +4,13 @@ */ #include "KmMatrix.hpp" -#include "KmConfig.h" + #if USE_CUDA() #include "KmMatrixCuda.cuh" #endif -namespace H2O4GPU { -namespace KMeans { +namespace h2o4gpu { +namespace Matrix { // ============================== // KmMatrixImpl implementation @@ -222,7 +222,7 @@ KmMatrixProxy KmMatrix::row(size_t idx, bool dev_mem) { template KmMatrixProxy KmMatrix::col(size_t idx) { - M_ERROR("Not implemented."); + h2o4gpu_error("Not implemented."); return KmMatrixProxy(*this, 0, 0, 0); } @@ -236,14 +236,14 @@ KmMatrix KmMatrix::rows(KmMatrix& _index) { } res = impls[(int)Backend::CUDADense]->rows(_index); } else { - M_ERROR("Not implemented."); + h2o4gpu_error("Not implemented."); } return res; } template KmMatrix KmMatrix::cols(KmMatrix& _index) { - M_ERROR("Not implemented."); + h2o4gpu_error("Not 
implemented."); KmMatrix res; return res; } @@ -254,7 +254,7 @@ bool KmMatrix::operator==(KmMatrix& _rhs) { bool res = impls[(int)Backend::CUDADense]->equal(_rhs); return res; } else { - M_ERROR("Not implemented."); + h2o4gpu_error("Not implemented."); return false; } } @@ -266,17 +266,17 @@ KmMatrix KmMatrix::stack(KmMatrix &_second, if (_dim == KmMatrixDim::ROW) { if (cols() != _second.cols()) { - M_ERROR("Columns of first is not equal to second."); + h2o4gpu_error("Columns of first is not equal to second."); } if (backend_ == Backend::CUDADense) { res = impls[(int)Backend::CUDADense]->stack(_second, _dim); } else { - M_ERROR("Not implemented."); + h2o4gpu_error("Not implemented."); } } else { - M_ERROR("Not implemented."); + h2o4gpu_error("Not implemented."); } return res; @@ -342,7 +342,7 @@ KmMatrix stack(KmMatrix& _first, KmMatrix& _second, template KmMatrix KmMatrix::cols(KmMatrix& _index); \ template bool KmMatrix::operator==(KmMatrix &_rhs); \ template KmMatrix KmMatrix::stack(KmMatrix &_second, \ - H2O4GPU::KMeans::KmMatrixDim _dim); \ + KmMatrixDim _dim); \ /* Helper functions */ \ template std::ostream& operator<<(std::ostream& os, KmMatrix& m); \ template KmMatrix stack(KmMatrix& _first, KmMatrix& _second, \ @@ -354,5 +354,5 @@ INSTANTIATE(double) INSTANTIATE(int) #undef INSTANTIATE -} // namespace KMeans -} // namepsace H2O4GPU +} // namespace Matrix +} // namespace h2o4gpu diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp b/src/gpu/matrix/KmMatrix/KmMatrix.hpp similarity index 94% rename from src/gpu/kmeans/KmMatrix/KmMatrix.hpp rename to src/gpu/matrix/KmMatrix/KmMatrix.hpp index 31ad2dcd4..dbeb1943e 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp +++ b/src/gpu/matrix/KmMatrix/KmMatrix.hpp @@ -11,15 +11,14 @@ #include #include #include - -#include "KmConfig.h" +#include "../../../common/utils.h" #if USE_CUDA() #include "KmMatrixCuda.cuh" #endif -namespace H2O4GPU { -namespace KMeans { +namespace h2o4gpu { +namespace Matrix { template class 
KmMatrixProxy; @@ -53,13 +52,13 @@ struct kParam { cols = _other.cols; ptr = _other.ptr; } - M_HOSTDEV void operator=(const kParam& _other) { + HG_HOSTDEV void operator=(const kParam& _other) { rows = _other.rows; cols = _other.cols; ptr = _other.ptr; } - M_HOSTDEV size_t size() const { + HG_HOSTDEV size_t size() const { return rows * cols; } }; @@ -193,7 +192,7 @@ struct KmMatrixSizeError: public std::runtime_error {} }; -} // namespace KMeans -} // namespace H2O4GPU +} // namespace Matrix +} // namespace h2o4gpu #endif diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu b/src/gpu/matrix/KmMatrix/KmMatrixCuda.cu similarity index 96% rename from src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu rename to src/gpu/matrix/KmMatrix/KmMatrixCuda.cu index a2d9a3c29..6b6052ce1 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu +++ b/src/gpu/matrix/KmMatrix/KmMatrixCuda.cu @@ -12,8 +12,8 @@ #include "KmMatrixCuda.cuh" #include "KmMatrix.hpp" -namespace H2O4GPU { -namespace KMeans { +namespace h2o4gpu { +namespace Matrix { template CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par) : @@ -173,7 +173,7 @@ KmMatrix CudaKmMatrixImpl::stack(KmMatrix& _second, KmMatrixDim _dim) { if (_dim == KmMatrixDim::ROW) { if (KmMatrixImpl::matrix_->cols() != _second.cols()) { - M_ERROR("Columns of first is not equal to second."); + h2o4gpu_error("Columns of first is not equal to second."); } host_to_device(); @@ -195,7 +195,9 @@ KmMatrix CudaKmMatrixImpl::stack(KmMatrix& _second, return res; } else { // FIXME - M_ERROR("Not implemented."); + h2o4gpu_error("Not implemented."); + KmMatrix res; + return res; } } @@ -229,5 +231,6 @@ INSTANTIATE(double) INSTANTIATE(int) #undef INSTANTIATE -} // namespace H204GPU -} // namespace Array + +} // namespace Matrix +} // namespace h2o4gpu diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh b/src/gpu/matrix/KmMatrix/KmMatrixCuda.cuh similarity index 95% rename from src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh rename to src/gpu/matrix/KmMatrix/KmMatrixCuda.cuh 
index 50500f20e..02f8f6adc 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh +++ b/src/gpu/matrix/KmMatrix/KmMatrixCuda.cuh @@ -10,8 +10,8 @@ #include "thrust/device_vector.h" #include -namespace H2O4GPU { -namespace KMeans { +namespace h2o4gpu { +namespace Matrix { template class KmMatrix; @@ -81,7 +81,7 @@ class CudaKmMatrixImpl : public KmMatrixImpl { virtual bool on_device() const override; }; -} // MkMatrix -} // H204GPU +} // namespace Matrix +} // namespace h2o4gpu #endif diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp b/src/gpu/matrix/KmMatrix/KmMatrixProxy.cpp similarity index 95% rename from src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp rename to src/gpu/matrix/KmMatrix/KmMatrixProxy.cpp index 0bdfa9ebf..9e1d5ffd9 100644 --- a/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp +++ b/src/gpu/matrix/KmMatrix/KmMatrixProxy.cpp @@ -5,8 +5,8 @@ #include "KmMatrix.hpp" -namespace H2O4GPU { -namespace KMeans { +namespace h2o4gpu { +namespace Matrix { template KmMatrixProxy::KmMatrixProxy(KmMatrix& _other, @@ -64,5 +64,5 @@ INSTANTIATE(float) INSTANTIATE(double) INSTANTIATE(int) -} -} +} // namespace Matrix +} // namespace h2o4gpu diff --git a/src/gpu/kmeans/KmMatrix/blas.cuh b/src/gpu/matrix/KmMatrix/blas.cuh similarity index 94% rename from src/gpu/kmeans/KmMatrix/blas.cuh rename to src/gpu/matrix/KmMatrix/blas.cuh index 20cf73151..50b69898b 100644 --- a/src/gpu/kmeans/KmMatrix/blas.cuh +++ b/src/gpu/matrix/KmMatrix/blas.cuh @@ -7,12 +7,12 @@ #define KM_BLAS_CUH_ #include -#include "KmConfig.h" +#include "../../utils/utils.cuh" // C++ Wrappers for cublas -namespace H2O4GPU { -namespace KMeans { +namespace h2o4gpu { +namespace Matrix { namespace Blas { // LEVEL 1 @@ -20,7 +20,7 @@ inline void axpy(cublasHandle_t handle, int n, const double *alpha, const double *x, int incx, double *y, int incy) { - CUBLAS_CHECK(cublasDaxpy(handle, n, + safe_cublas(cublasDaxpy(handle, n, alpha, x, incx, y, incy));} @@ -29,7 +29,7 @@ inline void axpy(cublasHandle_t handle, int n, 
const float *alpha, const float *x, int incx, float *y, int incy) { - CUBLAS_CHECK(cublasSaxpy(handle, n, + safe_cublas(cublasSaxpy(handle, n, alpha, x, incx, y, incy));} @@ -38,7 +38,7 @@ inline void axpy(cublasHandle_t handle, int n, const int *alpha, const int *x, int incx, int *y, int incy) { - CUBLAS_CHECK(cublasSaxpy(handle, n, + safe_cublas(cublasSaxpy(handle, n, (const float *)alpha, (const float *)x, incx, (float *)y, incy));} @@ -57,7 +57,7 @@ inline void gemm(cublasHandle_t handle, const float *beta, /* host or device pointer */ float *C, int ldc) { - CUBLAS_CHECK(cublasSgemm(handle, + safe_cublas(cublasSgemm(handle, transa, transb, m, n, k, alpha, /* host or device pointer */ @@ -80,7 +80,7 @@ inline void gemm(cublasHandle_t handle, const double *beta, /* host or device pointer */ double *C, int ldc) { - CUBLAS_CHECK(cublasDgemm(handle, + safe_cublas(cublasDgemm(handle, transa, transb, m, @@ -109,7 +109,7 @@ inline void gemm(cublasHandle_t handle, const int *beta, /* host or device pointer */ int *C, int ldc) { - CUBLAS_CHECK(cublasSgemm(handle, + safe_cublas(cublasSgemm(handle, transa, transb, m, n, k, (const float*)alpha, /* host or device pointer */ @@ -129,7 +129,7 @@ inline void gemm_batched(cublasHandle_t handle, const double *beta, double *Carray[], int ldc, int batchCount) { - CUBLAS_CHECK(cublasDgemmBatched(handle, + safe_cublas(cublasDgemmBatched(handle, transa, transb, m, n, k, @@ -151,7 +151,7 @@ inline void gemm_batched(cublasHandle_t handle, const float *beta, float *Carray[], int ldc, int batchCount) { - CUBLAS_CHECK(cublasSgemmBatched(handle, + safe_cublas(cublasSgemmBatched(handle, transa, transb, m, n, k, @@ -173,7 +173,7 @@ inline void gemm_batched(cublasHandle_t handle, const int *beta, float *Carray[], int ldc, int batchCount) { - CUBLAS_CHECK(cublasSgemmBatched(handle, + safe_cublas(cublasSgemmBatched(handle, transa, transb, m, n, k, @@ -196,7 +196,7 @@ inline void gemm_strided_batched( const double* beta, double* C, int ldC, int 
strideC, int batchCount) { - CUBLAS_CHECK(cublasDgemmStridedBatched(handle, + safe_cublas(cublasDgemmStridedBatched(handle, transA, transB, M, N, K, @@ -221,7 +221,7 @@ inline void gemm_strided_batched( const float* beta, float* C, int ldC, int strideC, int batchCount) { - CUBLAS_CHECK(cublasSgemmStridedBatched(handle, + safe_cublas(cublasSgemmStridedBatched(handle, transA, transB, M, N, K, @@ -246,7 +246,7 @@ inline void gemm_strided_batched( const int* beta, int* C, int ldC, int strideC, int batchCount) { - CUBLAS_CHECK(cublasSgemmStridedBatched(handle, + safe_cublas(cublasSgemmStridedBatched(handle, transA, transB, M, N, K, @@ -262,7 +262,7 @@ inline void gemm_strided_batched( } } // Blas -} // KMeans -} // H2O4GPU +} // Matrix +} // h2o4gpu #endif // KM_BLAS_CUH_ \ No newline at end of file diff --git a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh b/src/gpu/utils/GpuInfo.cuh similarity index 88% rename from src/gpu/kmeans/KmMatrix/GpuInfo.cuh rename to src/gpu/utils/GpuInfo.cuh index 22fe53e8d..0a57644f7 100644 --- a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh +++ b/src/gpu/utils/GpuInfo.cuh @@ -6,13 +6,14 @@ #ifndef GPU_INFO_HPP_ #define GPU_INFO_HPP_ -#include "KmConfig.h" +#include "utils.cuh" #include #include #include +namespace h2o4gpu { // Singleton class storing gpu info. 
// Call GpuInfo::ins() to use the class; class GpuInfo { @@ -23,19 +24,19 @@ class GpuInfo { public: GpuInfo () { - CUDA_CHECK(cudaGetDeviceCount(&n_gpu_)); + safe_cuda(cudaGetDeviceCount(&n_gpu_)); n_sm_ = (int*) malloc (n_gpu_); handles_ = (cublasHandle_t*) malloc (n_gpu_); for (int i = 0; i < n_gpu_; ++i) { cudaDeviceGetAttribute(&n_sm_[i], cudaDevAttrMultiProcessorCount, i); - CUBLAS_CHECK(cublasCreate(&handles_[i])); + safe_cublas(cublasCreate(&handles_[i])); } } ~GpuInfo () { free (n_sm_); for (int i = 0; i < n_gpu_; ++i) { - CUBLAS_CHECK(cublasDestroy(handles_[i])); + safe_cublas(cublasDestroy(handles_[i])); } free (handles_); } @@ -74,4 +75,6 @@ class GpuInfo { } }; +} // namespace h2o4gpu + #endif // GPU_INFO_HPP_ diff --git a/src/gpu/utils/utils.cuh b/src/gpu/utils/utils.cuh index 3f35f375d..fed9e46e1 100644 --- a/src/gpu/utils/utils.cuh +++ b/src/gpu/utils/utils.cuh @@ -9,29 +9,10 @@ #include #include +#include "../../common/utils.h" + namespace h2o4gpu { -#define h2o4gpu_error(x) error(x, __FILE__, __LINE__); - - inline void error(const char* e, const char* file, int line) - { - std::stringstream ss; - ss << e << " - " << file << "(" << line << ")"; - //throw error_text; - std::cerr << ss.str() << std::endl; - exit(-1); - } - -#define h2o4gpu_check(condition, msg) check(condition, msg, __FILE__, __LINE__); - - inline void check(bool val, const char* e, const char* file, int line) - { - if (!val) - { - error(e, file, line); - } - } - #define safe_cuda(ans) throw_on_cuda_error((ans), __FILE__, __LINE__) @@ -366,4 +347,56 @@ namespace h2o4gpu return idx * col_size; }); } + +HG_DEVINLINE size_t global_thread_idx () { + return threadIdx.x + blockIdx.x * blockDim.x; +} + +HG_DEVINLINE size_t global_thread_idy () { + return threadIdx.y + blockIdx.y * blockDim.y; +} + +HG_DEVINLINE size_t grid_stride_x () { + return blockDim.x * gridDim.x; +} + +HG_DEVINLINE size_t grid_stride_y () { + return blockDim.y * gridDim.y; +} + +template +T1 HG_HOSTDEVINLINE 
div_roundup(const T1 a, const T2 b) { + return static_cast(ceil(static_cast(a) / b)); +} + + +// Work around for shared memory +// https://stackoverflow.com/questions/20497209/getting-cuda-error-declaration-is-incompatible-with-previous-variable-name +template +struct KernelSharedMem; + +template <> +struct KernelSharedMem { + __device__ float * ptr() { + extern __shared__ __align__(sizeof(float)) float s_float[]; + return s_float; + } +}; + +template <> +struct KernelSharedMem { + __device__ double * ptr() { + extern __shared__ __align__(sizeof(double)) double s_double[]; + return s_double; + } +}; + +template <> +struct KernelSharedMem { + __device__ int * ptr() { + extern __shared__ __align__(sizeof(int)) int s_int[]; + return s_int; + } +}; + } diff --git a/tests/cpp/gpu/KmMatrix/test_arith.cu b/tests/cpp/gpu/KmMatrix/test_arith.cu index fda0db018..81bf4101c 100644 --- a/tests/cpp/gpu/KmMatrix/test_arith.cu +++ b/tests/cpp/gpu/KmMatrix/test_arith.cu @@ -2,12 +2,12 @@ #include #include -#include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp" -#include "../../../../src/gpu/kmeans/KmMatrix/Arith.hpp" +#include "../../../../src/gpu/matrix/KmMatrix/KmMatrix.hpp" +#include "../../../../src/gpu/matrix/KmMatrix/Arith.hpp" #include -using namespace H2O4GPU::KMeans; +using namespace h2o4gpu::Matrix; constexpr float esp = 0.001f; diff --git a/tests/cpp/gpu/KmMatrix/test_matrix.cu b/tests/cpp/gpu/KmMatrix/test_matrix.cu index 3b36b35fe..e1aae97f1 100644 --- a/tests/cpp/gpu/KmMatrix/test_matrix.cu +++ b/tests/cpp/gpu/KmMatrix/test_matrix.cu @@ -1,7 +1,7 @@ #include #include -#include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp" +#include "../../../../src/gpu/matrix/KmMatrix/KmMatrix.hpp" #include // r --gtest_filter=KmMatrix.KmMatrixEqual @@ -10,7 +10,7 @@ TEST(KmMatrix, KmMatrixEqual) { for (size_t i = 0; i < 2048 * 1024; ++i) { vec[i] = i; } - H2O4GPU::KMeans::KmMatrix mat (vec, 2048, 1024); + h2o4gpu::Matrix::KmMatrix mat (vec, 2048, 1024); ASSERT_TRUE (mat == 
mat); @@ -18,7 +18,7 @@ TEST(KmMatrix, KmMatrixEqual) { for (size_t i = 0; i < 2048 * 1024; ++i) { vec2[i] = i + i; } - H2O4GPU::KMeans::KmMatrix mat2 (vec2, 2048, 1024); + h2o4gpu::Matrix::KmMatrix mat2 (vec2, 2048, 1024); ASSERT_FALSE(mat == mat2); } @@ -29,9 +29,9 @@ TEST(KmMatrix, KmMatrixAssig) { vec[i] = i; } - H2O4GPU::KMeans::KmMatrix mat0 (vec, 2048, 1024); - H2O4GPU::KMeans::KmMatrix mat1 = mat0; - H2O4GPU::KMeans::KmMatrix mat2; + h2o4gpu::Matrix::KmMatrix mat0 (vec, 2048, 1024); + h2o4gpu::Matrix::KmMatrix mat1 = mat0; + h2o4gpu::Matrix::KmMatrix mat2; mat2 = mat0; @@ -44,16 +44,16 @@ TEST(KmMatrix, KmMatrixRows) { for (size_t i = 0; i < 12 * 16; ++i) { vec[i] = i; } - H2O4GPU::KMeans::KmMatrix mat (vec, 12, 16); + h2o4gpu::Matrix::KmMatrix mat (vec, 12, 16); thrust::host_vector h_index (4, 1); h_index[0] = 0; h_index[1] = 2; h_index[2] = 9; h_index[3] = 1; - H2O4GPU::KMeans::KmMatrix index (h_index, 4, 1); + h2o4gpu::Matrix::KmMatrix index (h_index, 4, 1); - H2O4GPU::KMeans::KmMatrix rows = mat.rows(index); + h2o4gpu::Matrix::KmMatrix rows = mat.rows(index); thrust::host_vector h_sol (4 * 16); for (size_t i = 0; i < 16; ++i) { @@ -69,7 +69,7 @@ TEST(KmMatrix, KmMatrixRows) { h_sol[i] = vec[16 * 1 + (i - 48)]; } - H2O4GPU::KMeans::KmMatrix sol (h_sol, 4, 16); + h2o4gpu::Matrix::KmMatrix sol (h_sol, 4, 16); ASSERT_TRUE(rows == sol); } @@ -77,13 +77,13 @@ TEST(KmMatrix, KmMatrixRows) { TEST(KmMatrix, SizeError) { thrust::host_vector vec (12 * 16); ASSERT_THROW( - H2O4GPU::KMeans::KmMatrix mat (vec, 12, 4), + h2o4gpu::Matrix::KmMatrix mat (vec, 12, 4), std::runtime_error); } TEST(KmMatrix, KmMatrixUtils) { thrust::host_vector vec (12 * 16); - H2O4GPU::KMeans::KmMatrix mat (vec, 12, 16); + h2o4gpu::Matrix::KmMatrix mat (vec, 12, 16); ASSERT_EQ(mat.rows(), 12); ASSERT_EQ(mat.cols(), 16); @@ -93,9 +93,9 @@ TEST(KmMatrix, KmMatrixUtils) { TEST(KmMatrix, KmMatrixKparam) { thrust::host_vector vec (12 * 16); thrust::fill(vec.begin(), vec.end(), 1); - 
H2O4GPU::KMeans::KmMatrix mat (vec, 12, 16); + h2o4gpu::Matrix::KmMatrix mat (vec, 12, 16); - H2O4GPU::KMeans::kParam param = mat.k_param(); + h2o4gpu::Matrix::kParam param = mat.k_param(); ASSERT_EQ(param.ptr, mat.dev_ptr()); ASSERT_EQ(param.rows, 12); ASSERT_EQ(param.cols, 16); @@ -110,11 +110,11 @@ TEST(KmMatrix, KmMatrixCycle) { // Tweak this one to see if memory grows, there should be a better way to // test memory leak. size_t iters = std::pow(16, 1); - H2O4GPU::KMeans::KmMatrix mat0 (vec, rows, cols); + h2o4gpu::Matrix::KmMatrix mat0 (vec, rows, cols); mat0.dev_ptr(); for (size_t i = 0; i < iters; ++i) { - H2O4GPU::KMeans::KmMatrix mat1 = mat0; - H2O4GPU::KMeans::KmMatrix mat2 = mat1; + h2o4gpu::Matrix::KmMatrix mat1 = mat0; + h2o4gpu::Matrix::KmMatrix mat2 = mat1; mat0 = mat2; } } @@ -126,16 +126,16 @@ TEST(KmMatrix, Stack) { for (size_t i = 0; i < rows * cols; ++i) { vec[i] = i; } - H2O4GPU::KMeans::KmMatrix mat(vec, rows, cols); + h2o4gpu::Matrix::KmMatrix mat(vec, rows, cols); thrust::host_vector vec1 (rows * cols); for (size_t i = rows * cols; i < 2 * rows * cols; ++i) { vec1[i - rows * cols] = i; } - H2O4GPU::KMeans::KmMatrix mat1(vec1, rows, cols); + h2o4gpu::Matrix::KmMatrix mat1(vec1, rows, cols); - H2O4GPU::KMeans::KmMatrix calculated = - H2O4GPU::KMeans::stack(mat, mat1, H2O4GPU::KMeans::KmMatrixDim::ROW); + h2o4gpu::Matrix::KmMatrix calculated = + h2o4gpu::Matrix::stack(mat, mat1, h2o4gpu::Matrix::KmMatrixDim::ROW); thrust::host_vector res (2 * rows * cols); for (size_t i = 0; i < rows * cols; ++i) { @@ -145,7 +145,7 @@ TEST(KmMatrix, Stack) { res[i] = i; } - H2O4GPU::KMeans::KmMatrix res_mat (res, 2 * rows, cols); + h2o4gpu::Matrix::KmMatrix res_mat (res, 2 * rows, cols); ASSERT_TRUE(calculated == res_mat); } diff --git a/tests/cpp/gpu/KmMatrix/test_proxy.cu b/tests/cpp/gpu/KmMatrix/test_proxy.cu index 732d27eef..f7574596f 100644 --- a/tests/cpp/gpu/KmMatrix/test_proxy.cu +++ b/tests/cpp/gpu/KmMatrix/test_proxy.cu @@ -1,6 +1,6 @@ #include 
#include -#include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp" +#include "../../../../src/gpu/matrix/KmMatrix/KmMatrix.hpp" // r --gtest_filter=KmMatrix.KmMatrixHostProxy TEST(KmMatrix, KmMatrixProxyHostEqual) { @@ -10,9 +10,9 @@ TEST(KmMatrix, KmMatrixProxyHostEqual) { vec[i] = i; } - H2O4GPU::KMeans::KmMatrix mat (vec, rows, cols); + h2o4gpu::Matrix::KmMatrix mat (vec, rows, cols); - H2O4GPU::KMeans::KmMatrix row = mat.row(1); + h2o4gpu::Matrix::KmMatrix row = mat.row(1); thrust::host_vector res (cols); @@ -20,7 +20,7 @@ TEST(KmMatrix, KmMatrixProxyHostEqual) { res[i] = v; } - H2O4GPU::KMeans::KmMatrix res_mat (res, 1, cols); + h2o4gpu::Matrix::KmMatrix res_mat (res, 1, cols); ASSERT_TRUE(res_mat == row); } @@ -34,12 +34,12 @@ TEST(KmMatrix, KmMatrixProxyDevEqual) { vec[i] = i; } - H2O4GPU::KMeans::KmMatrix mat (vec, rows, cols); + h2o4gpu::Matrix::KmMatrix mat (vec, rows, cols); mat.set_name ("mat"); mat.dev_ptr(); - H2O4GPU::KMeans::KmMatrix row = mat.row(1); + h2o4gpu::Matrix::KmMatrix row = mat.row(1); row.set_name ("row"); thrust::host_vector res (cols); @@ -48,7 +48,7 @@ TEST(KmMatrix, KmMatrixProxyDevEqual) { res[i] = v; } - H2O4GPU::KMeans::KmMatrix res_mat (res, 1, cols); + h2o4gpu::Matrix::KmMatrix res_mat (res, 1, cols); res_mat.set_name("res"); ASSERT_TRUE(res_mat == row); diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu index d15b6d4d7..32efb79d1 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -5,22 +5,26 @@ #include -#include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp" -#include "../../../../src/gpu/kmeans/KmMatrix/Generator.hpp" -#include "../../../../src/gpu/kmeans/KmMatrix/Arith.hpp" +#include "../../../../src/gpu/matrix/KmMatrix/KmMatrix.hpp" +#include "../../../../src/gpu/matrix/KmMatrix/Generator.hpp" +#include "../../../../src/gpu/matrix/KmMatrix/Arith.hpp" #include "../../../../src/gpu/kmeans/kmeans_init.cuh" +#include 
"../../../../src/common/utils.h" #include #include #include -using namespace H2O4GPU::KMeans; +using namespace h2o4gpu::kMeans; +using namespace h2o4gpu::Matrix; template struct GeneratorMock : GeneratorBase { public: KmMatrix generate() override { - M_ERROR("Not implemented"); + h2o4gpu_error("Not implemented"); + KmMatrix res; + return res; } KmMatrix generate(size_t _size) override { @@ -155,7 +159,7 @@ TEST(KmeansLL, KmeansLLInit) { h_data[i] = i - 4; } - H2O4GPU::KMeans::KmMatrix data (h_data, 6, 5); + KmMatrix data (h_data, 6, 5); auto res = kmeans_ll_init(data, 2); From 9afa105642feadde97ef9c13a1df5b41df6fb77f Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 26 Jul 2018 05:09:18 +0800 Subject: [PATCH 44/49] Add config for USE_CUDA. --- CMakeLists.txt | 7 +++++++ src/common/CMakeLists.txt | 1 + src/common/{utils.h => utils.h.in} | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 src/common/CMakeLists.txt rename src/common/{utils.h => utils.h.in} (97%) diff --git a/CMakeLists.txt b/CMakeLists.txt index c696cad27..811e9be2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,6 +59,13 @@ ADD_LIBRARY(cpuh2o4gpu STATIC ${CPU_SOURCES} $) TARGET_LINK_LIBRARIES(cpuh2o4gpu ${BLAS_LIBRARIES}) #============= BUILD CPU LIBRARY +if (USE_CUDA) + SET(HG_USE_CUDA 1) +else() + SET(HG_USE_CUDA 0) +endif(USE_CUDA) +ADD_SUBDIRECTORY(${CMAKE_CURRENT_LIST_DIR}/src/common) + if(USE_CUDA) FIND_PACKAGE(CUDA 8.0 REQUIRED) FIND_PACKAGE(NVML REQUIRED) diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt new file mode 100644 index 000000000..887751962 --- /dev/null +++ b/src/common/CMakeLists.txt @@ -0,0 +1 @@ +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/utils.h.in ${CMAKE_CURRENT_SOURCE_DIR}/utils.h) diff --git a/src/common/utils.h b/src/common/utils.h.in similarity index 97% rename from src/common/utils.h rename to src/common/utils.h.in index a2fb9ef7b..179b5f2ec 100644 --- a/src/common/utils.h +++ b/src/common/utils.h.in @@ -10,7 +10,7 @@ #include 
"cblas/cblas.h" -#define USE_CUDA() 1 +#define USE_CUDA() @HG_USE_CUDA@ template void self_dot(std::vector array_in, int n, int dim, From bf9e479173511c52527d779f17549cbe7ac59154 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 27 Jul 2018 03:10:46 +0800 Subject: [PATCH 45/49] Fix doc for KmeansLlInit. --- src/gpu/kmeans/kmeans_init.cuh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index 28a055c2a..af656f731 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -115,6 +115,8 @@ class KmeansRandomInit : public KmeansInitBase { * Scalable K-Means++ * * + * @tparam ReclusterPolicy Policy for final recluster, default is + * GreedyRecluster * @tparam Numeric data type. */ template < @@ -148,16 +150,14 @@ struct KmeansLlInit : public KmeansInitBase { /* * Initialize KmeansLlInit algorithm, with default: - * over_sample = 1.5, - * seed = 0, + * over_sample = 1.5. */ KmeansLlInit () : over_sample_ (1.5f), seed_ (-1), k_ (0), generator_ (new UniformGenerator) {} /* - * Initialize KmeansLlInit algorithm, with default: - * seed = 0, + * Initialize KmeansLlInit algorithm. * * @param over_sample over_sample rate. * \f$p_x = over_sample \times \frac{d^2(x, C)}{\Phi_X (C)}\f$ @@ -174,6 +174,8 @@ struct KmeansLlInit : public KmeansInitBase { * @param seed Seed used to generate threshold for sampling centroids. * @param over_sample over_sample rate. * \f$p_x = over_sample \times \frac{d^2(x, C)}{\Phi_X (C)}\f$ + * Note that when \f$over_sample != 1\f$, the probability for each data + * point doesn't add to 1. */ KmeansLlInit (int _seed, T _over_sample) : seed_(_seed), over_sample_(_over_sample), k_(0), @@ -186,6 +188,8 @@ struct KmeansLlInit : public KmeansInitBase { * sampling centroids. * @param over_sample over_sample rate. 
* \f$p_x = over_sample \times \frac{d^2(x, C)}{\Phi_X (C)}\f$ + * Note that when \f$over_sample != 1\f$, the probability for each data + * point doesn't add to 1. */ KmeansLlInit (std::unique_ptr>& _gen, T _over_sample) : generator_(std::move(_gen)), over_sample_ (1.5f), seed_ (-1), k_(0) {} From 10d8ac62e75fe7c6f9638a7349139f3f856437d2 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 27 Jul 2018 03:40:34 +0800 Subject: [PATCH 46/49] Remove SelfMinOp. --- src/gpu/kmeans/kmeans_init.cu | 35 +++--------------------- src/gpu/kmeans/kmeans_init.cuh | 4 ++- tests/cpp/gpu/kmeans/test_kmeans_init.cu | 2 +- 3 files changed, 8 insertions(+), 33 deletions(-) diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu index 091023a9c..db11cfc86 100644 --- a/src/gpu/kmeans/kmeans_init.cu +++ b/src/gpu/kmeans/kmeans_init.cu @@ -69,23 +69,6 @@ __global__ void self_row_argmin_sequential(kParam _res, kParam _val) { } } -template -__global__ void self_row_min_sequential(kParam _res, kParam _val) { - - size_t idx = global_thread_idx(); - if (idx < _val.rows) { - T min = std::numeric_limits::max(); - for (size_t i = 0; i < _val.cols; ++i) { - T value = _val.ptr[idx * _val.cols + i]; - if (value < min) { - min = value; - } - } - min += ESP; - _res.ptr[idx] = min; - } -} - } // namespace kernel namespace detail { @@ -154,18 +137,6 @@ struct SelfArgMinOp { } }; -// MinOp that adds ESP to 0 value. -template -struct SelfMinOp { - KmMatrix min(KmMatrix& _val, KmMatrixDim _dim) { - size_t blocks = GpuInfo::ins().blocks(32); - KmMatrix _res(_val.rows(), 1); - kernel::self_row_min_sequential<<>>( - _res.k_param(), _val.k_param()); - return _res; - } -}; - // We use counting to construct the weight as described in the paper. Counting // is performed by histogram algorithm. 
// For re-cluster, the paper suggests using K-Means++, but that will require @@ -202,6 +173,7 @@ KmMatrix GreedyRecluster::recluster(KmMatrix& _centroids, size_t _k) { (T)min_indices.rows(), (int)_centroids.rows())); + // cub has level bound, which deals with -1 returned by SelfArgMinOp safe_cuda(cudaMalloc((void**)&d_temp_storage, temp_storage_bytes)); safe_cuda(cub::DeviceHistogram::HistogramEven( d_temp_storage, temp_storage_bytes, @@ -286,10 +258,11 @@ KmMatrix KmeansLlInit::probability( data_dot_, centroids_dot, distance_pairs_); distance_pairs_ = distance_op(_data, _centroids); - KmMatrix min_distances = detail::SelfMinOp().min(distance_pairs_, - KmMatrixDim::ROW); + KmMatrix min_distances = MinOp().min(distance_pairs_, + KmMatrixDim::ROW); T cost = SumOp().sum(min_distances); + cost += ESP; KmMatrix prob (min_distances.rows(), 1); MulOp().mul(prob, min_distances, over_sample_ * k_ / cost); diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index af656f731..2f7496f1a 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -116,7 +116,9 @@ class KmeansRandomInit : public KmeansInitBase { * * * @tparam ReclusterPolicy Policy for final recluster, default is - * GreedyRecluster + * GreedyRecluster. + * Contract: + * ReclusterPolicy::recluster(KmMatrix& centroids, size_t _k) * @tparam Numeric data type. */ template < diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu index 32efb79d1..7fae8ffdc 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -165,8 +165,8 @@ TEST(KmeansLL, KmeansLLInit) { std::vector h_sol = { + 4, 5, 6, 7, 8, 19, 20, 21, 22, 23, - 5, 6, 7, 8, 9 }; KmMatrix sol (h_sol, 2, 5); ASSERT_TRUE(sol == res); From 7efaf0337d20a0fd56fe800432cbb3e169280ff1 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 27 Jul 2018 06:20:28 +0800 Subject: [PATCH 47/49] Use cub function for calculating min. 
--- src/gpu/matrix/KmMatrix/Arith.cu | 68 +++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/src/gpu/matrix/KmMatrix/Arith.cu b/src/gpu/matrix/KmMatrix/Arith.cu index 2f9c4ccd5..4b3a7f112 100644 --- a/src/gpu/matrix/KmMatrix/Arith.cu +++ b/src/gpu/matrix/KmMatrix/Arith.cu @@ -1,3 +1,4 @@ +#include #include "Arith.hpp" #include "../../utils/GpuInfo.cuh" @@ -6,28 +7,21 @@ namespace Matrix { namespace kernel { +// Compute segment offsets for cub segment function. +template +__global__ void segment_offsets(kParam _res, kParam _val) { + size_t idx = global_thread_idx(); + if (idx < _res.size()) { + _res.ptr[idx] = _val.cols * idx; + } +} + /* * Compute min value for each row. * @tparam T Numeric type of the data * @param _res The output matrix with shape m x 1 * @param _val The input matrix with shape m x n */ -template -__global__ void row_min_sequential(kParam _res, kParam _val) { - - size_t idx = global_thread_idx(); - if (idx < _val.rows) { - T min = std::numeric_limits::max(); - for (size_t i = 0; i < _val.cols; ++i) { - T value = _val.ptr[idx * _val.cols + i]; - if (value < min) { - min = value; - } - } - _res.ptr[idx] = min; - } -} - template __global__ void row_argmin_sequential(kParam _res, kParam _val) { @@ -48,6 +42,7 @@ __global__ void row_argmin_sequential(kParam _res, kParam _val) { } // namespace kernel +// FIXME: The dot function deals with vector, not matrix. 
template void DotOp::dot(KmMatrix& _res, KmMatrix& _val) { this->dot(_res, _val, _val); @@ -73,7 +68,8 @@ void VecBatchDotOp::dot(KmMatrix& _res, KmMatrix& _val) { this->dot(_res, _val, _val); } template -void VecBatchDotOp::dot(KmMatrix& _res, KmMatrix& _lhs, KmMatrix& _rhs) { +void VecBatchDotOp::dot(KmMatrix& _res, + KmMatrix& _lhs, KmMatrix& _rhs) { constexpr T alpha = 1.0; constexpr T beta = 1.0; cublasHandle_t handle = GpuInfo::ins().cublas_handle(); @@ -117,6 +113,9 @@ T MeanOp::mean(KmMatrix& _val) { template KmMatrix ArgMinOp::argmin(KmMatrix& _val, KmMatrixDim _dim) { if (_dim == KmMatrixDim::ROW) { + // FIXME: Didn't use cub function, offsets occupies n * sizeof(T) memory, + // occupies 2 * n * sizeof(T) memory considering memory + // alignment. That would be 3 * n * sizeof(T) in total. KmMatrix _res(_val.rows(), 1); kernel::row_argmin_sequential<<>>( _res.k_param(), _val.k_param()); @@ -131,12 +130,37 @@ KmMatrix ArgMinOp::argmin(KmMatrix& _val, KmMatrixDim _dim) { template KmMatrix MinOp::min(KmMatrix& _val, KmMatrixDim _dim) { - size_t blocks = GpuInfo::ins().blocks(32); if (_dim == KmMatrixDim::ROW) { - KmMatrix _res(_val.rows(), 1); - kernel::row_min_sequential<<>>( - _res.k_param(), _val.k_param()); - return _res; + KmMatrix res (_val.rows(), 1); + KmMatrix offsets (_val.rows() + 1, 1); + + kernel::segment_offsets<<>>( + offsets.k_param(), _val.k_param()); + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + safe_cuda(cub::DeviceSegmentedReduce::Min( + d_temp_storage, + temp_storage_bytes, + _val.dev_ptr(), + res.dev_ptr(), + _val.rows(), + offsets.dev_ptr(), + offsets.dev_ptr() + 1)); + + safe_cuda(cudaMalloc((void**)&d_temp_storage, temp_storage_bytes)); + safe_cuda(cub::DeviceSegmentedReduce::Min( + d_temp_storage, + temp_storage_bytes, + _val.dev_ptr(), + res.dev_ptr(), + _val.rows(), + offsets.dev_ptr(), + offsets.dev_ptr() + 1)); + safe_cuda(cudaFree(d_temp_storage)); + + return res; } else { // FIXME h2o4gpu_error("Not 
implemented"); From 7468a0b443fff100f95797c4765bc231b8e8c8e7 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 27 Jul 2018 06:45:27 +0800 Subject: [PATCH 48/49] Use thrust to generate random numbers. --- src/gpu/matrix/KmMatrix/Generator.cuh | 53 ++++++----------- src/gpu/matrix/KmMatrix/GeneratorKernels.cu | 63 --------------------- 2 files changed, 17 insertions(+), 99 deletions(-) delete mode 100644 src/gpu/matrix/KmMatrix/GeneratorKernels.cu diff --git a/src/gpu/matrix/KmMatrix/Generator.cuh b/src/gpu/matrix/KmMatrix/Generator.cuh index 061edbe96..99abd9b6e 100644 --- a/src/gpu/matrix/KmMatrix/Generator.cuh +++ b/src/gpu/matrix/KmMatrix/Generator.cuh @@ -6,8 +6,6 @@ #include #include -#include - #include "Generator.hpp" #include "KmMatrix.hpp" #include "../../utils/utils.cuh" @@ -16,25 +14,9 @@ namespace h2o4gpu { namespace Matrix { -namespace kernel { -// Split the definition to avoid multiple definition. -__global__ void setup_random_states(int _seed, curandState *_state, - size_t _size); - -__global__ void generate_uniform_kernel(float *_res, - curandState *_state, - int _size); - -__global__ void generate_uniform_kernel(double *_res, - curandState *_state, - int _size); -} - template struct UniformGenerator : public GeneratorBase { private: - // FIXME: Use KmMatrix - curandState *dev_states_; size_t size_; // FIXME: Cache random_numbers_ in a safer way. 
KmMatrix random_numbers_; @@ -43,22 +25,15 @@ struct UniformGenerator : public GeneratorBase { void initialize (size_t _size) { size_ = _size; random_numbers_ = KmMatrix (1, size_); - - if (dev_states_ != nullptr) { - safe_cuda(cudaFree(dev_states_)); - } - safe_cuda(cudaMalloc((void **)&dev_states_, size_ * sizeof(curandState))); - kernel::setup_random_states<<>>( - seed_, dev_states_, size_); } public: - UniformGenerator() : dev_states_ (nullptr), size_ (0) { + UniformGenerator() : size_ (0) { std::random_device rd; seed_ = rd(); } - UniformGenerator (size_t _size, int _seed) { + UniformGenerator (size_t _size, int _seed) : seed_(_seed) { if (_size == 0) { h2o4gpu_error("Zero size for generate is not allowed."); } @@ -66,13 +41,9 @@ struct UniformGenerator : public GeneratorBase { } UniformGenerator(int _seed) : - seed_(_seed), dev_states_(nullptr), size_ (0) {} + seed_(_seed), size_ (0) {} - ~UniformGenerator () { - if (dev_states_ != nullptr) { - safe_cuda(cudaFree(dev_states_)); - } - } + ~UniformGenerator () {} UniformGenerator(const UniformGenerator& _rhs) = delete; UniformGenerator(UniformGenerator&& _rhs) = delete; @@ -80,8 +51,18 @@ struct UniformGenerator : public GeneratorBase { void operator=(UniformGenerator&& _rhs) = delete; KmMatrix generate() override { - kernel::generate_uniform_kernel<<>> - (random_numbers_.k_param().ptr, dev_states_, size_); + thrust::device_ptr rn_ptr (random_numbers_.dev_ptr()); + thrust::transform( + thrust::make_counting_iterator((size_t)0), + thrust::make_counting_iterator(size_), + rn_ptr, + [=] __device__ (int idx) { + thrust::default_random_engine rng(seed_); + thrust::uniform_real_distribution dist; + rng.discard(idx); + return dist(rng); + }); + return random_numbers_; } @@ -95,6 +76,6 @@ struct UniformGenerator : public GeneratorBase { return generate(); } }; - + } // namespace h2o4gpu } // namespace Matrix \ No newline at end of file diff --git a/src/gpu/matrix/KmMatrix/GeneratorKernels.cu 
b/src/gpu/matrix/KmMatrix/GeneratorKernels.cu deleted file mode 100644 index e81885f12..000000000 --- a/src/gpu/matrix/KmMatrix/GeneratorKernels.cu +++ /dev/null @@ -1,63 +0,0 @@ -/*! - * Copyright 2018 H2O.ai, Inc. - * License Apache License Version 2.0 (see LICENSE for details) - */ - -#include -#include - -namespace h2o4gpu { -namespace Matrix { -namespace kernel { - -__global__ void setup_random_states(int _seed, curandState *_state, - size_t _size) { - int id = threadIdx.x + blockIdx.x * threadIdx.x; - /* Each thread gets same seed, a different sequence - number, no offset */ - if (id < _size) - curand_init(_seed, id, 0, &_state[id]); -} - -__global__ void generate_uniform_kernel(float *_res, - curandState *_state, - int _size) { - int idx = threadIdx.x + blockIdx.x * threadIdx.x; - if (idx < _size) { - float x; - curandState local_state = _state[idx]; - x = curand_uniform(&local_state); - _state[idx] = local_state; - _res[idx] = x; - } -} - -__global__ void generate_uniform_kernel(double *_res, - curandState *_state, - int _size) { - int idx = threadIdx.x + blockIdx.x * threadIdx.x; - if (idx < _size) { - double x; - curandState local_state = _state[idx]; - x = curand_uniform_double(&local_state); - _state[idx] = local_state; - _res[idx] = x; - } -} - -__global__ void generate_uniform_kernel(int *_res, - curandState *_state, - int _size) { - int idx = threadIdx.x + blockIdx.x * threadIdx.x; - if (idx < _size) { - int x; - curandState local_state = _state[idx]; - x = (int) curand_uniform_double(&local_state); - _state[idx] = local_state; - _res[idx] = x; - } -} - -} // namespace kernel -} // namespace Matrix -} // namespace h2o4gpu \ No newline at end of file From f29707449ab7f55816b05ccb4325b69ad4a847c6 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 27 Jul 2018 07:00:26 +0800 Subject: [PATCH 49/49] Rename Generator to RandomGenerator. 
--- src/gpu/kmeans/kmeans_init.cuh | 19 ++++++++++--------- src/gpu/matrix/KmMatrix/Generator.cuh | 19 +++++++++---------- src/gpu/matrix/KmMatrix/Generator.hpp | 2 +- tests/cpp/gpu/kmeans/test_kmeans_init.cu | 6 +++--- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh index 2f7496f1a..e04f32f7c 100644 --- a/src/gpu/kmeans/kmeans_init.cuh +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -82,19 +82,19 @@ template class KmeansRandomInit : public KmeansInitBase { private: int seed_; - std::unique_ptr> generator_impl_; + std::unique_ptr> generator_impl_; public: /* * @param seed Random seed for generating centroids. */ KmeansRandomInit(size_t _seed) : - seed_(_seed), generator_impl_ (new UniformGenerator) {} + seed_(_seed), generator_impl_ (new UniformRandomGenerator) {} /* * @param gen Unique pointer to Random generator for generating centroids. */ - KmeansRandomInit(std::unique_ptr>& _gen) : + KmeansRandomInit(std::unique_ptr>& _gen) : generator_impl_(std::move(_gen)) {} virtual ~KmeansRandomInit() override {} @@ -134,7 +134,7 @@ struct KmeansLlInit : public KmeansInitBase { // Suggested in original paper, 8 is usually enough. constexpr static float MAX_ITER = 8; - std::unique_ptr> generator_; + std::unique_ptr> generator_; // Buffer like variables // store the self dot product of each data point @@ -156,7 +156,7 @@ struct KmeansLlInit : public KmeansInitBase { */ KmeansLlInit () : over_sample_ (1.5f), seed_ (-1), k_ (0), - generator_ (new UniformGenerator) {} + generator_ (new UniformRandomGenerator) {} /* * Initialize KmeansLlInit algorithm. @@ -168,7 +168,7 @@ struct KmeansLlInit : public KmeansInitBase { */ KmeansLlInit (T _over_sample) : over_sample_ (_over_sample), seed_ (-1), k_ (0), - generator_ (new UniformGenerator) {} + generator_ (new UniformRandomGenerator) {} /* * Initialize KmeansLlInit algorithm. 
@@ -181,7 +181,7 @@ struct KmeansLlInit : public KmeansInitBase { */ KmeansLlInit (int _seed, T _over_sample) : seed_(_seed), over_sample_(_over_sample), k_(0), - generator_ (new UniformGenerator(seed_)) {} + generator_ (new UniformRandomGenerator(seed_)) {} /* * Initialize KmeansLlInit algorithm. @@ -193,8 +193,9 @@ struct KmeansLlInit : public KmeansInitBase { * Note that when \f$over_sample != 1\f$, the probability for each data * point doesn't add to 1. */ - KmeansLlInit (std::unique_ptr>& _gen, T _over_sample) : - generator_(std::move(_gen)), over_sample_ (1.5f), seed_ (-1), k_(0) {} + KmeansLlInit (std::unique_ptr>& _gen, T _over_sample) : + generator_(std::move(_gen)), over_sample_ (_over_sample), seed_ (-1), + k_(0) {} virtual ~KmeansLlInit () override {} diff --git a/src/gpu/matrix/KmMatrix/Generator.cuh b/src/gpu/matrix/KmMatrix/Generator.cuh index 99abd9b6e..316c42393 100644 --- a/src/gpu/matrix/KmMatrix/Generator.cuh +++ b/src/gpu/matrix/KmMatrix/Generator.cuh @@ -10,12 +10,11 @@ #include "KmMatrix.hpp" #include "../../utils/utils.cuh" - namespace h2o4gpu { namespace Matrix { template -struct UniformGenerator : public GeneratorBase { +struct UniformRandomGenerator : public RandomGeneratorBase { private: size_t size_; // FIXME: Cache random_numbers_ in a safer way. 
@@ -28,27 +27,27 @@ struct UniformGenerator : public GeneratorBase { } public: - UniformGenerator() : size_ (0) { + UniformRandomGenerator() : size_ (0) { std::random_device rd; seed_ = rd(); } - UniformGenerator (size_t _size, int _seed) : seed_(_seed) { + UniformRandomGenerator (size_t _size, int _seed) : seed_(_seed) { if (_size == 0) { h2o4gpu_error("Zero size for generate is not allowed."); } initialize(_size); } - UniformGenerator(int _seed) : + UniformRandomGenerator(int _seed) : seed_(_seed), size_ (0) {} - ~UniformGenerator () {} + ~UniformRandomGenerator () {} - UniformGenerator(const UniformGenerator& _rhs) = delete; - UniformGenerator(UniformGenerator&& _rhs) = delete; - void operator=(const UniformGenerator& _rhs) = delete; - void operator=(UniformGenerator&& _rhs) = delete; + UniformRandomGenerator(const UniformRandomGenerator& _rhs) = delete; + UniformRandomGenerator(UniformRandomGenerator&& _rhs) = delete; + void operator=(const UniformRandomGenerator& _rhs) = delete; + void operator=(UniformRandomGenerator&& _rhs) = delete; KmMatrix generate() override { thrust::device_ptr rn_ptr (random_numbers_.dev_ptr()); diff --git a/src/gpu/matrix/KmMatrix/Generator.hpp b/src/gpu/matrix/KmMatrix/Generator.hpp index 9ef1ae9ac..9d4362c8c 100644 --- a/src/gpu/matrix/KmMatrix/Generator.hpp +++ b/src/gpu/matrix/KmMatrix/Generator.hpp @@ -12,7 +12,7 @@ namespace h2o4gpu { namespace Matrix { template -class GeneratorBase { +class RandomGeneratorBase { public: virtual KmMatrix generate() = 0; virtual KmMatrix generate(size_t _size) = 0; diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu index 7fae8ffdc..da2ddaf99 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -19,7 +19,7 @@ using namespace h2o4gpu::kMeans; using namespace h2o4gpu::Matrix; template -struct GeneratorMock : GeneratorBase { +struct GeneratorMock : RandomGeneratorBase { public: KmMatrix generate() 
override { h2o4gpu_error("Not implemented"); @@ -46,7 +46,7 @@ TEST(KmeansRandom, Init) { h_data[i] = i * 2; } KmMatrix data (h_data, 4, 5); - std::unique_ptr> gen (new GeneratorMock()); + std::unique_ptr> gen (new GeneratorMock()); KmeansRandomInit init (gen); auto res = init(data, 2); @@ -134,7 +134,7 @@ TEST(KmeansLL, GreedyRecluster) { // r --gtest_filter=KmeansLL.KmeansLLInit TEST(KmeansLL, KmeansLLInit) { - std::unique_ptr> mock_ptr (new GeneratorMock); + std::unique_ptr> mock_ptr (new GeneratorMock); KmeansLlInit kmeans_ll_init (mock_ptr, 2.5); thrust::host_vector h_data (30); // We split the points into two groups, but the result is statistic.