diff --git a/CMakeLists.txt b/CMakeLists.txt index c88dd0502..811e9be2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,151 +7,143 @@ SET_DEFAULT_CONFIGURATION_RELEASE() FIND_PACKAGE(OpenMP) FIND_PACKAGE(BLAS REQUIRED) -FIND_PACKAGE(SWIG REQUIRED) -FIND_PACKAGE(PythonLibs REQUIRED) # SWIG - -INCLUDE(${SWIG_USE_FILE}) #============= OPTIONS & SETTINGS OPTION(USE_CUDA "Build with GPU acceleration" ON) +OPTION(USE_SWIG "Use swig to generate language bindings." ON) +OPTION(BUILD_TESTS "Build tests." ON) +OPTION(USE_SYSTEM_GTEST "Use system google tests." ON) OPTION(DEV_BUILD "Dev build" OFF) # Compiler flags SET(CMAKE_CXX_STANDARD 11) SET(CMAKE_CXX_STANDARD_REQUIRED ON) SET(CMAKE_POSITION_INDEPENDENT_CODE ON) -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w") - -# PythonLibs' PYTHON_INCLUDE_PATH doesn't take into account virtualenv etc. -# Open to suggestions how to do this better. -EXECUTE_PROCESS(COMMAND python -c "import numpy; print(numpy.get_include())" - OUTPUT_VARIABLE PYTHON_INCLUDE_PATH_CUST - OUTPUT_STRIP_TRAILING_WHITESPACE) +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +SET(GPU_COMPUTE_VER "" CACHE STRING + "Semicolon separated list of compute versions to be built against, e.g. -DGPU_COMPUTE_VER='35;61'") if(OpenMP_CXX_FOUND OR OPENMP_FOUND) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif() #============= OPTIONS & SETTINGS # TODO probably to be removed after POGS is out in favor of XGboost GLM ADD_DEFINITIONS( - -D_GITHASH_=0 - -DH2O4GPU_DOUBLE - -DH2O4GPU_SINGLE -) + -D_GITHASH_=0 + -DH2O4GPU_DOUBLE + -DH2O4GPU_SINGLE) #============= BUILD COMMON CPU/GPU CODE FILE(GLOB_RECURSE COMMON_SOURCES - src/common/*.cpp - src/common/*.h - src/interface_c/*.cpp - src/interface_c/*.h - ) + src/common/*.cpp + src/common/*.h + src/interface_c/*.cpp + src/interface_c/*.h) INCLUDE_DIRECTORIES( - src/include - src/cpu/include - # Here and not in target_include_directories b/c cmake < 3.7 which we use in Dockerfiles does not support it - src/gpu/include - ${PYTHON_INCLUDE_PATH} - ${PYTHON_INCLUDE_PATH_CUST} -) + src/include + src/cpu/include + # Here and not in target_include_directories b/c cmake < 3.7 which we use in Dockerfiles does not support it + src/gpu/include) ADD_LIBRARY(commonh2o4gpu OBJECT ${COMMON_SOURCES}) #============= BUILD COMMON CPU/GPU CODE #============= BUILD CPU LIBRARY FILE(GLOB_RECURSE CPU_SOURCES - src/cpu/*.cpp - src/cpu/*.h - ) + src/cpu/*.cpp + src/cpu/*.h) ADD_LIBRARY(cpuh2o4gpu STATIC ${CPU_SOURCES} $) TARGET_LINK_LIBRARIES(cpuh2o4gpu ${BLAS_LIBRARIES}) #============= BUILD CPU LIBRARY -#============= SWIG -SET(CMAKE_SWIG_FLAGS -Werror) -#============= SWIG - -#============= CPU SWIG -SET_SOURCE_FILES_PROPERTIES(src/swig/ch2o4gpu_cpu.i PROPERTIES CPLUSPLUS ON) - -if (${CMAKE_VERSION} VERSION_LESS "3.8.0") - SWIG_ADD_MODULE(ch2o4gpu_cpu python src/swig/ch2o4gpu_cpu.i) +if (USE_CUDA) + SET(HG_USE_CUDA 1) else() - SWIG_ADD_LIBRARY(ch2o4gpu_cpu LANGUAGE python SOURCES src/swig/ch2o4gpu_cpu.i) -endif() - -SWIG_LINK_LIBRARIES(ch2o4gpu_cpu cpuh2o4gpu ${PYTHON_LIBRARIES}) - -SET_TARGET_PROPERTIES(${SWIG_MODULE_ch2o4gpu_cpu_REAL_NAME} PROPERTIES - LINK_FLAGS ${OpenMP_CXX_FLAGS}) -#============= CPU SWIG + SET(HG_USE_CUDA 0) +endif(USE_CUDA) +ADD_SUBDIRECTORY(${CMAKE_CURRENT_LIST_DIR}/src/common) if(USE_CUDA) - FIND_PACKAGE(CUDA 8.0 REQUIRED) - FIND_PACKAGE(NVML REQUIRED) - - #============= BUILD GPU LIBRARY - ADD_DEFINITIONS( - -DCUDA_MAJOR=${CUDA_VERSION_MAJOR} - -DHAVECUDA - ) - - if(DEV_BUILD) - MESSAGE(STATUS "Building DEVELOPER compute capability version.") - SET(GPU_COMPUTE_VER 61 CACHE STRING - "Space separated list of compute versions to be built against") - else() - MESSAGE(STATUS "Building RELEASE compute capability version.") - SET(GPU_COMPUTE_VER 35;50;52;60;61 CACHE STRING - "Space separated list of compute versions to be built against") - endif() - - if(((CUDA_VERSION_MAJOR EQUAL 9) OR (CUDA_VERSION_MAJOR GREATER 9)) AND NOT DEV_BUILD) - MESSAGE(STATUS "CUDA 9.0 detected, adding Volta compute capability (7.0).") - SET(GPU_COMPUTE_VER "${GPU_COMPUTE_VER};70") - endif() - - SET(GENCODE_FLAGS "") - FORMAT_GENCODE_FLAGS("${GPU_COMPUTE_VER}" GENCODE_FLAGS) - SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -std=c++11;--expt-extended-lambda;--expt-relaxed-constexpr;${GENCODE_FLAGS};-lineinfo; -w;") - - FILE(GLOB_RECURSE GPU_SOURCES - src/*.cu - src/*.cuh - src/common/*.cpp - src/common/*.h - ) - - CUDA_ADD_LIBRARY(gpuh2o4gpu ${GPU_SOURCES} $ STATIC) - - if($ENV{USENVTX}) - MESSAGE(STATUS "Building with NVTX support on.") - SET(NVTX_LIBRARY nvToolsExt) - endif() - - TARGET_LINK_LIBRARIES(gpuh2o4gpu - ${CUDA_CUBLAS_LIBRARIES} - ${CUDA_cusolver_LIBRARY} - ${CUDA_cusparse_LIBRARY} - ${BLAS_LIBRARIES} - ${NVTX_LIBRARY} - ${NVML_LIBRARY}) - #============= BUILD GPU LIBRARY - - #============= GPU SWIG - SET_SOURCE_FILES_PROPERTIES(src/swig/ch2o4gpu_gpu.i PROPERTIES CPLUSPLUS ON) - - if (${CMAKE_VERSION} VERSION_LESS "3.8.0") - SWIG_ADD_MODULE(ch2o4gpu_gpu python src/swig/ch2o4gpu_gpu.i) - else() - SWIG_ADD_LIBRARY(ch2o4gpu_gpu LANGUAGE python SOURCES src/swig/ch2o4gpu_gpu.i) - endif() - SWIG_LINK_LIBRARIES(ch2o4gpu_gpu gpuh2o4gpu ${PYTHON_LIBRARIES}) - - SET_TARGET_PROPERTIES(${SWIG_MODULE_ch2o4gpu_gpu_REAL_NAME} PROPERTIES - LINK_FLAGS ${OpenMP_CXX_FLAGS}) - #============= GPU SWIG + FIND_PACKAGE(CUDA 8.0 REQUIRED) + FIND_PACKAGE(NVML REQUIRED) + + #============= BUILD GPU LIBRARY + ADD_DEFINITIONS( + -DCUDA_MAJOR=${CUDA_VERSION_MAJOR} + -DHAVECUDA + ) + + if(DEV_BUILD AND NOT GPU_COMPUTE_VER) + MESSAGE(STATUS "Building DEVELOPER compute capability version.") + SET(GPU_COMPUTE_VER 61) + elseif(NOT GPU_COMPUTE_VER) + MESSAGE(STATUS "Building RELEASE compute capability version.") + SET(GPU_COMPUTE_VER 35;50;52;60;61) + endif() + + if(((CUDA_VERSION_MAJOR EQUAL 9) + OR (CUDA_VERSION_MAJOR GREATER 9)) + AND NOT DEV_BUILD + AND NOT GPU_COMPUTE_VER) + MESSAGE(STATUS "CUDA 9.0 detected, adding Volta compute capability (7.0).") + SET(GPU_COMPUTE_VER "${GPU_COMPUTE_VER};70") + endif() + + SET(GENCODE_FLAGS "") + FORMAT_GENCODE_FLAGS("${GPU_COMPUTE_VER}" GENCODE_FLAGS) + MESSAGE("CUDA architecture flags ${GENCODE_FLAGS}") + + SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};-Xcompiler -fPIC; -std=c++11;--expt-extended-lambda;--expt-relaxed-constexpr;${GENCODE_FLAGS};-lineinfo;") + + FILE(GLOB_RECURSE GPU_SOURCES + src/*.cu + src/*.cuh + src/gpu/matrix/*.cpp + src/common/*.cpp + src/common/*.h) + + CUDA_ADD_LIBRARY(gpuh2o4gpu ${GPU_SOURCES} $ STATIC) + + if($ENV{USENVTX}) + MESSAGE(STATUS "Building with NVTX support on.") + SET(NVTX_LIBRARY nvToolsExt) + endif() + + TARGET_LINK_LIBRARIES(gpuh2o4gpu + ${CUDA_CUBLAS_LIBRARIES} + ${CUDA_cusolver_LIBRARY} + ${CUDA_cusparse_LIBRARY} + ${BLAS_LIBRARIES} + ${NVTX_LIBRARY} + ${NVML_LIBRARY}) + #============= BUILD GPU LIBRARY endif() + +if(USE_SWIG) + ADD_SUBDIRECTORY(${CMAKE_CURRENT_LIST_DIR}/src/swig) +endif(USE_SWIG) + +#============= Tests +if(BUILD_TESTS) + ENABLE_TESTING() + if (USE_SYSTEM_GTEST) + FIND_PACKAGE(GTest REQUIRED) + else () + ADD_SUBDIRECTORY(${CMAKE_CURRENT_LIST_DIR}/tests/googletest) + SET(GTEST_LIBRARIES gtest gtest_main) + endif (USE_SYSTEM_GTEST) + if (USE_CUDA) + FILE(GLOB_RECURSE CUDA_TEST_SOURCES "tests/cpp/*.cu") + CUDA_COMPILE(CUDA_TEST_OBJS ${CUDA_TEST_SOURCES}) + endif(USE_CUDA) + + ADD_EXECUTABLE(test-h2o4gpu ${CUDA_TEST_OBJS}) + TARGET_LINK_LIBRARIES(test-h2o4gpu gpuh2o4gpu) + TARGET_LINK_LIBRARIES(test-h2o4gpu ${GTEST_LIBRARIES}) + + ADD_TEST(TestH2O4GPU test-h2o4gpu) +endif(BUILD_TESTS) +#============= Tests diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt new file mode 100644 index 000000000..887751962 --- /dev/null +++ b/src/common/CMakeLists.txt @@ -0,0 +1 @@ +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/utils.h.in ${CMAKE_CURRENT_SOURCE_DIR}/utils.h) diff --git a/src/common/utils.h b/src/common/utils.h deleted file mode 100644 index 77b15827d..000000000 --- a/src/common/utils.h +++ /dev/null @@ -1,21 +0,0 @@ -/*! - * Copyright 2017-2018 H2O.ai, Inc. - * License Apache License Version 2.0 (see LICENSE for details) - */ -#pragma once -#include -#include "cblas.h" - -template -void self_dot(std::vector array_in, int n, int dim, - std::vector& dots); - -void compute_distances(std::vector data_in, - std::vector centroids_in, - std::vector &pairwise_distances, - int n, int dim, int k); - -void compute_distances(std::vector data_in, - std::vector centroids_in, - std::vector &pairwise_distances, - int n, int dim, int k); \ No newline at end of file diff --git a/src/common/utils.h.in b/src/common/utils.h.in new file mode 100644 index 000000000..179b5f2ec --- /dev/null +++ b/src/common/utils.h.in @@ -0,0 +1,54 @@ +/*! + * Copyright 2017-2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ +#pragma once +#include + +#include +#include + +#include "cblas/cblas.h" + +#define USE_CUDA() @HG_USE_CUDA@ + +template +void self_dot(std::vector array_in, int n, int dim, + std::vector& dots); + +void compute_distances(std::vector data_in, + std::vector centroids_in, + std::vector &pairwise_distances, + int n, int dim, int k); + +void compute_distances(std::vector data_in, + std::vector centroids_in, + std::vector &pairwise_distances, + int n, int dim, int k); + +// Matrix host dev +#define HG_HOSTDEV __host__ __device__ +#define HG_DEV __device__ +#define HG_DEVINLINE __device__ __forceinline__ +#define HG_HOSTDEVINLINE __host__ __device__ __forceinline__ + +#define h2o4gpu_error(x) error(x, __FILE__, __LINE__); + +inline void error(const char* e, const char* file, int line) +{ + std::stringstream ss; + ss << e << " - " << file << "(" << line << ")"; + //throw error_text; + std::cerr << ss.str() << std::endl; + exit(-1); +} + +#define h2o4gpu_check(condition, msg) check(condition, msg, __FILE__, __LINE__); + +inline void check(bool val, const char* e, const char* file, int line) +{ + if (!val) + { + error(e, file, line); + } +} diff --git a/src/cpu/h2o4gpuglm.cpp b/src/cpu/h2o4gpuglm.cpp index 39b664621..953428400 100644 --- a/src/cpu/h2o4gpuglm.cpp +++ b/src/cpu/h2o4gpuglm.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include diff --git a/src/cpu/h2o4gpukmeans.cpp b/src/cpu/h2o4gpukmeans.cpp index acc4c105d..45d3940c9 100644 --- a/src/cpu/h2o4gpukmeans.cpp +++ b/src/cpu/h2o4gpukmeans.cpp @@ -10,7 +10,7 @@ #include #include //#include "mkl.h" -#include "cblas.h" +#include "cblas/cblas.h" #include #include diff --git a/src/gpu/h2o4gpuglm.cu b/src/gpu/h2o4gpuglm.cu index ec16a828e..3eafafac3 100644 --- a/src/gpu/h2o4gpuglm.cu +++ b/src/gpu/h2o4gpuglm.cu @@ -10,6 +10,7 @@ #include #include +#include #include #include diff --git a/src/gpu/kmeans/kmeans_general.h b/src/gpu/kmeans/kmeans_general.h index 6f1a13e91..d32dcbce8 100644 --- a/src/gpu/kmeans/kmeans_general.h +++ b/src/gpu/kmeans/kmeans_general.h @@ -4,6 +4,8 @@ */ #pragma once #include "../../common/logger.h" +#include "../utils/utils.cuh" +#include "stdio.h" #define MAX_NGPUS 16 #define VERBOSE 0 @@ -12,8 +14,6 @@ // TODO(pseudotensor): Avoid throw for python exception handling. Need to avoid all exit's and return exit code all the way back. #define gpuErrchk(ans) { gpu_assert((ans), __FILE__, __LINE__); } -#define safe_cuda(ans) throw_on_cuda_error((ans), __FILE__, __LINE__); -#define safe_cublas(ans) throw_on_cublas_error((ans), __FILE__, __LINE__); #define CUDACHECK(cmd) do { \ cudaError_t e = cmd; \ diff --git a/src/gpu/kmeans/kmeans_h2o4gpu.cu b/src/gpu/kmeans/kmeans_h2o4gpu.cu index d05750a86..224070503 100644 --- a/src/gpu/kmeans/kmeans_h2o4gpu.cu +++ b/src/gpu/kmeans/kmeans_h2o4gpu.cu @@ -15,6 +15,9 @@ #include "kmeans_impl.h" #include "kmeans_general.h" #include "kmeans_h2o4gpu.h" + +#include "kmeans_init.cuh" + #include #include #include @@ -764,7 +767,21 @@ int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry, } else if (1 == init_from_data) { // kmeans|| log_debug(verbose, "KMeans - Using K-Means|| initialization."); - thrust::host_vector final_centroids = kmeans_parallel(verbose, seed, ord, data, data_dots, rows, cols, k, n_gpu, threshold); + thrust::host_vector h_init_data (rows * cols); + // Gather + for (size_t i = 0; i < n_gpu; ++i) { + thrust::copy( + thrust::device, + data[i]->begin(), data[i]->end(), h_init_data.begin()); + } + h2o4gpu::kMeans::KmMatrix init_data(h_init_data, rows, cols); + h2o4gpu::kMeans::KmMatrix final_centroids_matrix = + h2o4gpu::kMeans::KmeansLlInit(seed, 1.5)(init_data, k); + thrust::host_vector final_centroids (final_centroids_matrix.size()); + thrust::copy( + final_centroids_matrix.dev_ptr(), + final_centroids_matrix.dev_ptr() + final_centroids_matrix.size(), + final_centroids.begin()); #pragma omp parallel for for (int q = 0; q < n_gpu; q++) { diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu new file mode 100644 index 000000000..db11cfc86 --- /dev/null +++ b/src/gpu/kmeans/kmeans_init.cu @@ -0,0 +1,412 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#include + +#include + +#include +#include +#include + +#include + +#include "kmeans_init.cuh" + +#include "../matrix/KmMatrix/KmMatrix.hpp" +#include "../matrix/KmMatrix/Arith.hpp" +#include "../matrix/KmMatrix/blas.cuh" +#include "../utils/utils.cuh" +#include "../utils/GpuInfo.cuh" + +namespace h2o4gpu { +namespace kMeans { + +using namespace Matrix; + +namespace kernel { +// X^2 + Y^2, here only calculates the + operation. +template +__global__ void construct_distance_pairs_kernel( + kParam _distance_pairs, + kParam _data_dots, kParam _centroids_dots) { + + size_t idx = global_thread_idx(); // indexing data + size_t idy = global_thread_idy(); // indexing centroids + + // FIXME: Is using shared memory necessary? + + size_t stride_x = grid_stride_x (); + // strides only for data. + for (size_t i = idx; i < _data_dots.rows; i += stride_x) { + if (idy < _centroids_dots.rows ) { + // i + idy: x^2 + y^2 between i^th data (a.k.a x) and idy^th + // centroid (a.k.a y) + _distance_pairs.ptr[i*_centroids_dots.rows + idy] = + _data_dots.ptr[idx] + _centroids_dots.ptr[idy]; + } + } +} + +// See SelfArgMinOp +template +__global__ void self_row_argmin_sequential(kParam _res, kParam _val) { + + size_t idx = global_thread_idx(); + if (idx < _val.rows) { + T min = std::numeric_limits::max(); + int min_idx = -1; + for (size_t i = 0; i < _val.cols; ++i) { + T value = _val.ptr[idx * _val.cols + i]; + if (value < min && value != 0) { + min = value; + min_idx = i; + } + } + _res.ptr[idx] = min_idx; + } +} + +} // namespace kernel + +namespace detail { + +template +void PairWiseDistanceOp::initialize(KmMatrix& _data_dot, + KmMatrix& _centroids_dot, + KmMatrix& _distance_pairs) { + data_dot_ = _data_dot; + centroids_dot_ = _centroids_dot; + distance_pairs_ = _distance_pairs; + initialized_ = true; +} + +template +PairWiseDistanceOp::PairWiseDistanceOp(KmMatrix& _data_dot, + KmMatrix& _centroids_dot, + KmMatrix& _distance_pairs) : + data_dot_(_data_dot), centroids_dot_(_centroids_dot), + distance_pairs_(_distance_pairs), initialized_(true) {} + +template +KmMatrix PairWiseDistanceOp::operator()(KmMatrix& _data, + KmMatrix& _centroids) { + + kernel::construct_distance_pairs_kernel<<< + dim3(GpuInfo::ins().blocks(32), div_roundup(_centroids.rows(), 16)), + dim3(32, 16)>>>( // FIXME: Tune this. + distance_pairs_.k_param(), + data_dot_.k_param(), + centroids_dot_.k_param()); + + safe_cuda(cudaGetLastError()); + + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + + T alpha = -2.0; + T beta = 1.0; + + Blas::gemm( + handle, + CUBLAS_OP_T, CUBLAS_OP_N, + // n, d, d/k + _centroids.rows(), _data.rows(), _data.cols(), + &alpha, + _centroids.dev_ptr(), _centroids.cols(), + _data.dev_ptr(), _data.cols(), + &beta, + distance_pairs_.dev_ptr(), _centroids.rows()); + + return distance_pairs_; +} + +// ArgMin operation that excludes 0. Used when dealing with distance_pairs +// in recluster where distance between points with itself is calculated, +// hence the name. +// FIXME: Maybe generalize it to selection algorithm. +template +struct SelfArgMinOp { + KmMatrix argmin(KmMatrix& _val) { + KmMatrix _res(_val.rows(), 1); + kernel::self_row_argmin_sequential<<< + div_roundup(_val.rows(), 256), 256>>>(_res.k_param(), + _val.k_param()); + return _res; + } +}; + +// We use counting to construct the weight as described in the paper. Counting +// is performed by histogram algorithm. +// For re-cluster, the paper suggests using K-Means++, but that will require +// copying data back to host. So we simply use those selected centroids with +// highest probability. +template +KmMatrix GreedyRecluster::recluster(KmMatrix& _centroids, size_t _k) { + + // Get the distance pairs for centroids + KmMatrix centroids_dot (_centroids.rows(), 1); + VecBatchDotOp().dot(centroids_dot, _centroids); + KmMatrix distance_pairs (_centroids.rows(), _centroids.rows()); + PairWiseDistanceOp centroids_distance_op( + centroids_dot, centroids_dot, distance_pairs); + + distance_pairs = centroids_distance_op(_centroids, _centroids); + + // get the closest x_j for each x_i in centroids. + KmMatrix min_indices = SelfArgMinOp().argmin(distance_pairs); + + // use historgram to get counting for weights + KmMatrix weights (1, _centroids.rows()); + + size_t temp_storage_bytes = 0; + void *d_temp_storage = NULL; + + // determine the temp_storage_bytes + safe_cuda(cub::DeviceHistogram::HistogramEven( + d_temp_storage, temp_storage_bytes, + min_indices.dev_ptr(), + weights.dev_ptr(), + min_indices.rows() + 1, + (T)0.0, + (T)min_indices.rows(), + (int)_centroids.rows())); + + // cub has level bound, which deals with -1 returned by SelfArgMinOp + safe_cuda(cudaMalloc((void**)&d_temp_storage, temp_storage_bytes)); + safe_cuda(cub::DeviceHistogram::HistogramEven( + d_temp_storage, temp_storage_bytes, + min_indices.dev_ptr(), // d_samples + weights.dev_ptr(), // d_histogram + min_indices.rows() + 1, // num_levels + (T)0.0, // lower_level + (T)min_indices.rows(), // upper_level + (int)_centroids.rows())); // num_samples + safe_cuda(cudaFree(d_temp_storage)); + + // Sort the indices by weights in ascending order, then use those at front + // as result. + thrust::sort_by_key(thrust::device, + weights.dev_ptr(), + weights.dev_ptr() + weights.size(), + min_indices.dev_ptr(), + thrust::greater()); + + int * min_indices_ptr = min_indices.dev_ptr(); + + KmMatrix new_centroids (_k, _centroids.cols()); + T * new_centroids_ptr = new_centroids.dev_ptr(); + int cols = _centroids.cols(); + + T * old_centroids_ptr = _centroids.dev_ptr(); + + auto k_iter = thrust::make_counting_iterator(0); + thrust::for_each( + thrust::device, + k_iter, k_iter + _k, + [=] __device__ (int idx) { + size_t index = min_indices_ptr[idx]; + + size_t in_begin = index * cols; + size_t in_end = (index + 1) * cols; + + size_t res_begin = idx * cols; + + for (size_t i = in_begin, j = res_begin; i < in_end; ++i, ++j) { + new_centroids_ptr[j] = old_centroids_ptr[i]; + } + }); + + return new_centroids; +} + +} // namespace detail + + +/* ============ KmeansRandomInit Class member functions ============ */ + +template +KmMatrix KmeansRandomInit::operator()(KmMatrix& _data, size_t _k) { + + KmMatrix dist = generator_impl_->generate(_k); + MulOp().mul(dist, dist, (T)_data.rows() - 1); + + dist = dist.reshape(dist.cols(), dist.rows()); + + KmMatrix centroids = _data.rows(dist); + + return centroids; +} + + +/* ============== KmeansLlInit Class member functions ============== */ + +// Although the paper suggested calculating the probability independently, +// but due to zero distance between a point and itself, the already selected +// points will have very low probability. +template class ReclusterPolicy > +KmMatrix KmeansLlInit::probability( + KmMatrix& _data, KmMatrix& _centroids) { + + KmMatrix centroids_dot (_centroids.rows(), 1); + VecBatchDotOp().dot(centroids_dot, _centroids); + + // FIXME: Time this + distance_pairs_ = KmMatrix(_data.rows(), _centroids.rows()); + detail::PairWiseDistanceOp distance_op ( + data_dot_, centroids_dot, distance_pairs_); + distance_pairs_ = distance_op(_data, _centroids); + + KmMatrix min_distances = MinOp().min(distance_pairs_, + KmMatrixDim::ROW); + + T cost = SumOp().sum(min_distances); + cost += ESP; + + KmMatrix prob (min_distances.rows(), 1); + MulOp().mul(prob, min_distances, over_sample_ * k_ / cost); + + return prob; +} + + +template class ReclusterPolicy > +KmMatrix KmeansLlInit::sample_centroids( + KmMatrix& _data, KmMatrix& _prob) { + + KmMatrix thresholds = generator_->generate(_data.rows()); + + T * thresholds_ptr = thresholds.dev_ptr(); + + // If use kParam, nvcc complains: + // identifier "H2O4GPU::KMeans::kParam ::kParam" is undefined in + // device code. + T* prob_ptr = _prob.dev_ptr(); + + auto prob_iter = thrust::make_counting_iterator(0); + size_t n_new_centroids = thrust::count_if( + thrust::device, prob_iter, + prob_iter + _prob.size(), + [=] __device__ (int idx) { + float thresh = thresholds_ptr[idx]; + T prob_x = prob_ptr[idx]; + return prob_x > thresh; + }); + + KmMatrix new_centroids(n_new_centroids, _data.cols()); + thrust::device_ptr new_centroids_ptr (new_centroids.dev_ptr()); + + thrust::device_ptr data_ptr (_data.dev_ptr()); + + size_t cols = _data.cols(); + // renew iterator + prob_iter = thrust::make_counting_iterator(0); + thrust::copy_if( + thrust::device, + data_ptr, data_ptr + _data.size(), prob_iter, + new_centroids_ptr, + [=] __device__(int idx) { + size_t row = idx / cols; + T thresh = thresholds_ptr[row]; + T prob_x = prob_ptr[row]; + return prob_x > thresh; + }); + + return new_centroids; +} + +template class ReclusterPolicy> +KmMatrix +KmeansLlInit::operator()(KmMatrix& _data, size_t _k) { + + if (_k > _data.size()) { + char err_msg[128]; + sprintf( + err_msg, + "k must be less than or equal to the number of data points" + ", k: %lu, data points: %lu", + _k, _data.rows()); + h2o4gpu_error(err_msg); + } + + if (seed_ < 0) { + std::random_device rd; + seed_ = rd(); + } + k_ = _k; + + std::mt19937 generator(0); + + std::uniform_int_distribution<> distribution(0, _data.rows()); + size_t idx = distribution(generator); + + // Calculate X^2 (point-wise) + data_dot_ = KmMatrix(_data.rows(), 1); + VecBatchDotOp().dot(data_dot_, _data); + + // First centroid + KmMatrix centroids = _data.row(idx); + + KmMatrix prob = probability(_data, centroids); + + T cost = SumOp().sum(prob); + + size_t max_iter = std::max((size_t)(MAX_ITER), + (size_t)std::ceil(std::log(cost))); + for (size_t i = 0; i < max_iter; ++i) { + prob = probability(_data, centroids); + KmMatrix new_centroids = sample_centroids(_data, prob); + centroids = stack(centroids, new_centroids, KmMatrixDim::ROW); + } + + if (centroids.rows() < _k) { + KmMatrix new_centroids = KmeansRandomInit(generator_)(_data, + _k - centroids.rows()); + centroids = stack(centroids, new_centroids, KmMatrixDim::ROW); + } + + centroids = ReclusterPolicy::recluster(centroids, k_); + + return centroids; +} + +#define INSTANTIATE(T) \ + template KmMatrix KmeansLlInit::operator()( \ + KmMatrix& _data, size_t _k); \ + template KmMatrix KmeansLlInit::probability( \ + KmMatrix& data, KmMatrix& centroids); \ + template KmMatrix KmeansLlInit::sample_centroids( \ + KmMatrix& data, KmMatrix& centroids); \ + template KmMatrix KmeansRandomInit::operator()( \ + KmMatrix& _data, size_t _k); + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) + +#undef INSTANTIATE + +namespace detail { + +#define INSTANTIATE(T) \ + template PairWiseDistanceOp::PairWiseDistanceOp ( \ + KmMatrix& _data_dot, \ + KmMatrix& _centroids_dot, \ + KmMatrix& _distance_pairs); \ + template void PairWiseDistanceOp::initialize( \ + KmMatrix& _data_dot, \ + KmMatrix& _centroids_dot, \ + KmMatrix& _distance_pairs); \ + template KmMatrix PairWiseDistanceOp::operator()( \ + KmMatrix& _data, KmMatrix& _centroids); \ + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) + +#undef INSTANTIATE +} + +} // namespace kMeans +} // namespace h2o4gpu diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh new file mode 100644 index 000000000..e04f32f7c --- /dev/null +++ b/src/gpu/kmeans/kmeans_init.cuh @@ -0,0 +1,216 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#ifndef KMEANS_INIT_H_ +#define KMEANS_INIT_H_ + + +#include + +#include "../matrix/KmMatrix/KmMatrix.hpp" +#include "../matrix/KmMatrix/Generator.hpp" +#include "../matrix/KmMatrix/Generator.cuh" +#include "../utils/GpuInfo.cuh" +#include "../utils/utils.cuh" + +constexpr double ESP = 1e-8; + +namespace h2o4gpu { +namespace kMeans { + +using namespace Matrix; + +namespace detail { + +// FIXME: +// Operations performed in K-Means|| loop leads to a-approximation. +// Intuitively, choosing those centroids with highest probability should not +// break this property. But I haven't made the argument yet. +// And benchmarking should be performed to check the result. +template +struct GreedyRecluster { + static KmMatrix recluster(KmMatrix& _centroids, size_t _k); +}; + +// Extracted as an independent Op for k-means use. +template +struct PairWiseDistanceOp { + private: + KmMatrix data_dot_; + KmMatrix centroids_dot_; + KmMatrix distance_pairs_; + + bool initialized_; + + public: + void initialize (KmMatrix& _data_dot, KmMatrix& _centroids_dot, + KmMatrix& _distance_pairs); + + PairWiseDistanceOp () : initialized_(false) {} + + PairWiseDistanceOp (KmMatrix& _data_dot, KmMatrix& _centroids_dot, + KmMatrix& _distance_pairs); + + KmMatrix operator()(KmMatrix& _data, KmMatrix& _centroids); +}; + +} // namespace detail + +/* + * Base class used for all K-Means initialization algorithms. + */ +template +class KmeansInitBase { + public: + virtual ~KmeansInitBase() {} + /* + * Select k centroids from data. + * + * @param data data points stored in row major matrix. + * @param k number of centroids. + */ + virtual KmMatrix operator()(KmMatrix& data, size_t k) = 0; +}; + +/* + * Random initialization. + * @tparam Numeric data type. + */ +template +class KmeansRandomInit : public KmeansInitBase { + private: + int seed_; + std::unique_ptr> generator_impl_; + + public: + /* + * @param seed Random seed for generating centroids. + */ + KmeansRandomInit(size_t _seed) : + seed_(_seed), generator_impl_ (new UniformRandomGenerator) {} + + /* + * @param gen Unique pointer to Random generator for generating centroids. + */ + KmeansRandomInit(std::unique_ptr>& _gen) : + generator_impl_(std::move(_gen)) {} + + virtual ~KmeansRandomInit() override {} + + /* + * @param data Data points stored in row major matrix. + * @param k Number of centroids. + */ + virtual KmMatrix operator()(KmMatrix& data, size_t k) override; +}; + +/* + * Each instance of KmeansLlInit corresponds to one dataset, if a new data set + * is used, users need to create a new instance. + * + * k-means|| algorithm based on the paper: + * + * Scalable K-Means++ + * + * + * @tparam ReclusterPolicy Policy for final recluster, default is + * GreedyRecluster. + * Contract: + * ReclusterPolicy::recluster(KmMatrix& centroids, size_t _k) + * @tparam Numeric data type. + */ +template < + typename T, + template + class ReclusterPolicy = detail::GreedyRecluster> +struct KmeansLlInit : public KmeansInitBase { + private: + T over_sample_; + int seed_; + int k_; + + // Suggested in original paper, 8 is usually enough. + constexpr static float MAX_ITER = 8; + + std::unique_ptr> generator_; + + // Buffer like variables + // store the self dot product of each data point + KmMatrix data_dot_; + // store distances between each data point and centroids + KmMatrix distance_pairs_; + + KmMatrix probability(KmMatrix& data, KmMatrix& centroids); + public: + // sample_centroids should not be part of the interface, but following error + // is generated when put in private section: + // The enclosing parent function ("sample_centroids") for an extended + // __device__ lambda cannot have private or protected access within its class + KmMatrix sample_centroids(KmMatrix& data, KmMatrix& centroids); + + /* + * Initialize KmeansLlInit algorithm, with default: + * over_sample = 1.5. + */ + KmeansLlInit () : + over_sample_ (1.5f), seed_ (-1), k_ (0), + generator_ (new UniformRandomGenerator) {} + + /* + * Initialize KmeansLlInit algorithm. + * + * @param over_sample over_sample rate. + * \f$p_x = over_sample \times \frac{d^2(x, C)}{\Phi_X (C)}\f$ + * Note that when \f$over_sample != 1\f$, the probability for each data + * point doesn't add to 1. + */ + KmeansLlInit (T _over_sample) : + over_sample_ (_over_sample), seed_ (-1), k_ (0), + generator_ (new UniformRandomGenerator) {} + + /* + * Initialize KmeansLlInit algorithm. + * + * @param seed Seed used to generate threshold for sampling centroids. + * @param over_sample over_sample rate. + * \f$p_x = over_sample \times \frac{d^2(x, C)}{\Phi_X (C)}\f$ + * Note that when \f$over_sample != 1\f$, the probability for each data + * point doesn't add to 1. + */ + KmeansLlInit (int _seed, T _over_sample) : + seed_(_seed), over_sample_(_over_sample), k_(0), + generator_ (new UniformRandomGenerator(seed_)) {} + + /* + * Initialize KmeansLlInit algorithm. + * + * @param gen Unique pointer to a generator used to generate threshold for + * sampling centroids. + * @param over_sample over_sample rate. + * \f$p_x = over_sample \times \frac{d^2(x, C)}{\Phi_X (C)}\f$ + * Note that when \f$over_sample != 1\f$, the probability for each data + * point doesn't add to 1. + */ + KmeansLlInit (std::unique_ptr>& _gen, T _over_sample) : + generator_(std::move(_gen)), over_sample_ (_over_sample), seed_ (-1), + k_(0) {} + + virtual ~KmeansLlInit () override {} + + /* + * Select k centroids from data. + * @param data data points stored in row major matrix. + * @param k number of centroids. + */ + KmMatrix operator()(KmMatrix& data, size_t k) override; +}; + + +// FIXME: Make kmeans++ a derived class of KmeansInitBase + +} // namespace kMeans +} // namespace h2o4gpu + +#endif // KMEANS_INIT_H_ \ No newline at end of file diff --git a/src/gpu/kmeans/kmeans_labels.h b/src/gpu/kmeans/kmeans_labels.h index ada7f286e..87b7b5ed9 100644 --- a/src/gpu/kmeans/kmeans_labels.h +++ b/src/gpu/kmeans/kmeans_labels.h @@ -9,9 +9,13 @@ #include #include #include -#include "kmeans_general.h" #include +#include "kmeans_general.h" +#include "../utils/utils.cuh" + +using namespace h2o4gpu; + inline void gpu_assert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); @@ -23,20 +27,6 @@ inline void gpu_assert(cudaError_t code, const char *file, int line, bool abort= } } - -inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file, - int line) { - if (code != cudaSuccess) { - std::stringstream ss; - ss << file << "(" << line << ")"; - std::string file_and_line; - ss >> file_and_line; - thrust::system_error(code, thrust::cuda_category(), file_and_line); - } - - return code; -} - #ifdef CUBLAS_API_H_ // cuBLAS API errors static const char *cudaGetErrorEnum(cublasStatus_t error) @@ -72,23 +62,6 @@ static const char *cudaGetErrorEnum(cublasStatus_t error) } #endif -inline cublasStatus_t throw_on_cublas_error(cublasStatus_t code, const char *file, - int line) { - - - if (code != CUBLAS_STATUS_SUCCESS) { - fprintf(stderr,"cublas error: %s %s %d\n", cudaGetErrorEnum(code), file, line); - std::stringstream ss; - ss << file << "(" << line << ")"; - std::string file_and_line; - ss >> file_and_line; - thrust::system_error(code, thrust::cuda_category(), file_and_line); - } - - return code; -} - - extern cudaStream_t cuda_stream[MAX_NGPUS]; template diff --git a/src/gpu/matrix/KmMatrix/Arith.cu b/src/gpu/matrix/KmMatrix/Arith.cu new file mode 100644 index 000000000..4b3a7f112 --- /dev/null +++ b/src/gpu/matrix/KmMatrix/Arith.cu @@ -0,0 +1,193 @@ +#include +#include "Arith.hpp" +#include "../../utils/GpuInfo.cuh" + +namespace h2o4gpu { +namespace Matrix { + +namespace kernel { + +// Compute segment offsets for cub segment funtion. +template +__global__ void segment_offsets(kParam _res, kParam _val) { + size_t idx = global_thread_idx(); + if (idx < _res.size()) { + _res.ptr[idx] = _val.cols * idx; + } +} + +/* + * Compute min value for each row. + * @tparam T Numeric type of the data + * @param _res The output matrix with shape m x 1 + * @param _val The input matrix with shape m x n + */ +template +__global__ void row_argmin_sequential(kParam _res, kParam _val) { + + size_t idx = global_thread_idx(); + if (idx < _val.rows) { + T min = std::numeric_limits::max(); + int min_idx = -1; + for (size_t i = 0; i < _val.cols; ++i) { + T value = _val.ptr[idx * _val.cols + i]; + if (value < min) { + min = value; + min_idx = i; + } + } + _res.ptr[idx] = min_idx; + } +} + +} // namespace kernel + +// FIXME: The dot function deals with vector, not matrix. +template +void DotOp::dot(KmMatrix& _res, KmMatrix& _val) { + this->dot(_res, _val, _val); +} +template +void DotOp::dot(KmMatrix& _res, KmMatrix& _lhs, + KmMatrix& _rhs) { + constexpr T alpha = 1.0; + constexpr T beta = 1.0; + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + Blas::gemm(handle, + CUBLAS_OP_N, CUBLAS_OP_N, // FIXME + _lhs.rows(), _rhs.cols(), _lhs.cols(), + &alpha, + _lhs.dev_ptr(), _lhs.cols(), + _rhs.dev_ptr(), _rhs.cols(), + &beta, + _res.dev_ptr(), _res.cols()); +} + +template +void VecBatchDotOp::dot(KmMatrix& _res, KmMatrix& _val) { + this->dot(_res, _val, _val); +} +template +void VecBatchDotOp::dot(KmMatrix& _res, + KmMatrix& _lhs, KmMatrix& _rhs) { + constexpr T alpha = 1.0; + constexpr T beta = 1.0; + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + Blas::gemm_strided_batched( + handle, + CUBLAS_OP_N, CUBLAS_OP_T, + 1, 1, _rhs.cols(), // m, n, k + &alpha, + _lhs.dev_ptr(), 1, _lhs.cols(), + _rhs.dev_ptr(), 1, _rhs.cols(), + &beta, + _res.dev_ptr(), _res.cols(), 1, // c should be columun vector + _lhs.rows()); +} + +template +T SumOp::sum(KmMatrix& _val) { + T* raw_ptr = _val.dev_ptr(); + thrust::device_ptr ptr (raw_ptr); + T res = thrust::reduce(ptr, ptr + _val.size(), (T)0, thrust::plus()); + return res; +} + +template +void MulOp::mul(KmMatrix& _res, KmMatrix& _lhs, T _rhs) { + cublasHandle_t handle = GpuInfo::ins().cublas_handle(); + Blas::axpy( + handle, _lhs.size(), // handle, n + &_rhs, // alpha + _lhs.dev_ptr(), 1, + _res.dev_ptr(), 1); +} + +template +T MeanOp::mean(KmMatrix& _val) { + T res = SumOp().sum(_val); + res = res / _val.size(); + return res; +} + +template +KmMatrix ArgMinOp::argmin(KmMatrix& _val, KmMatrixDim _dim) { + if (_dim == KmMatrixDim::ROW) { + // FIXME: Didn't use cub function, offsets occupies n * sizeof(T) memory, + // occupies 2 * n * sizeof(T) memory considering memory + // alignment. That would be 3 * n * sizeof(T) in total. + KmMatrix _res(_val.rows(), 1); + kernel::row_argmin_sequential<<>>( + _res.k_param(), _val.k_param()); + return _res; + } else { + // FIXME + h2o4gpu_error("Not implemented"); + KmMatrix res; + return res; + } +} + +template +KmMatrix MinOp::min(KmMatrix& _val, KmMatrixDim _dim) { + if (_dim == KmMatrixDim::ROW) { + KmMatrix res (_val.rows(), 1); + KmMatrix offsets (_val.rows() + 1, 1); + + kernel::segment_offsets<<>>( + offsets.k_param(), _val.k_param()); + + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + + safe_cuda(cub::DeviceSegmentedReduce::Min( + d_temp_storage, + temp_storage_bytes, + _val.dev_ptr(), + res.dev_ptr(), + _val.rows(), + offsets.dev_ptr(), + offsets.dev_ptr() + 1)); + + safe_cuda(cudaMalloc((void**)&d_temp_storage, temp_storage_bytes)); + safe_cuda(cub::DeviceSegmentedReduce::Min( + d_temp_storage, + temp_storage_bytes, + _val.dev_ptr(), + res.dev_ptr(), + _val.rows(), + offsets.dev_ptr(), + offsets.dev_ptr() + 1)); + safe_cuda(cudaFree(d_temp_storage)); + + return res; + } else { + // FIXME + h2o4gpu_error("Not implemented"); + KmMatrix res; + return res; + } +} + +#define INSTANTIATE(T) \ + template void DotOp::dot(KmMatrix& _res, KmMatrix& _val); \ + template void DotOp::dot(KmMatrix& _res, KmMatrix& _lhs, \ + KmMatrix& _rhs); \ + template void VecBatchDotOp::dot( \ + KmMatrix& _res, KmMatrix& _val); \ + template void VecBatchDotOp::dot( \ + KmMatrix& _res, KmMatrix& _lhs, KmMatrix& _rhs); \ + template T SumOp::sum(KmMatrix& _val); \ + template void MulOp::mul(KmMatrix& _res, KmMatrix& _lhs, T _rhs); \ + template T MeanOp::mean(KmMatrix& _val); \ + template KmMatrix ArgMinOp::argmin( \ + KmMatrix& _val, KmMatrixDim _dim); \ + template KmMatrix MinOp::min(KmMatrix& _val, KmMatrixDim _dim); \ + + +INSTANTIATE(double) +INSTANTIATE(float) +INSTANTIATE(int) + +} // namespace Matrix +} // namespace h2o4gpu \ No newline at end of file diff --git a/src/gpu/matrix/KmMatrix/Arith.hpp b/src/gpu/matrix/KmMatrix/Arith.hpp new file mode 100644 index 000000000..7760cd6aa --- /dev/null +++ b/src/gpu/matrix/KmMatrix/Arith.hpp @@ -0,0 +1,58 @@ +#ifndef M_ARITH_HPP_ +#define M_ARITH_HPP_ + +#include "KmMatrix.hpp" +#include "blas.cuh" +#include "../../utils/utils.cuh" + +namespace h2o4gpu { +namespace Matrix { + +// FIXME: Using struct for operations is just keeping the possibility of +// creating an unified operations for KmMatrix. For example, let KmMatrix +// inherit those left associative ops, or create an inferface for elementwise +// operations. + +// FIXME: Use return value instead. +template +struct DotOp { + void dot(KmMatrix& _res, KmMatrix& _val); + void dot(KmMatrix& _res, KmMatrix& _lhs, KmMatrix& _rhs); +}; + +template +struct VecBatchDotOp { + void dot(KmMatrix& _res, KmMatrix& _val); + void dot(KmMatrix& _res, KmMatrix& _lhs, KmMatrix& _rhs); +}; + +template +struct SumOp { + T sum(KmMatrix& _val); +}; + +template +struct MulOp { + void mul(KmMatrix& _res, KmMatrix& _lhs, T _rhs); +}; + + +template +struct MeanOp { + T mean(KmMatrix& _val); +}; + +template +struct ArgMinOp { + KmMatrix argmin(KmMatrix& _val, KmMatrixDim _dim); +}; + +template +struct MinOp { + KmMatrix min(KmMatrix& _val, KmMatrixDim _dim); +}; + +} // namespace Matrix +} // namespace h2o4gpu + +#endif // M_ARITH_HPP_ diff --git a/src/gpu/matrix/KmMatrix/Generator.cuh b/src/gpu/matrix/KmMatrix/Generator.cuh new file mode 100644 index 000000000..316c42393 --- /dev/null +++ b/src/gpu/matrix/KmMatrix/Generator.cuh @@ -0,0 +1,80 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#include +#include + +#include "Generator.hpp" +#include "KmMatrix.hpp" +#include "../../utils/utils.cuh" + +namespace h2o4gpu { +namespace Matrix { + +template +struct UniformRandomGenerator : public RandomGeneratorBase { + private: + size_t size_; + // FIXME: Cache random_numbers_ in a safer way. + KmMatrix random_numbers_; + int seed_; + + void initialize (size_t _size) { + size_ = _size; + random_numbers_ = KmMatrix (1, size_); + } + + public: + UniformRandomGenerator() : size_ (0) { + std::random_device rd; + seed_ = rd(); + } + + UniformRandomGenerator (size_t _size, int _seed) : seed_(_seed) { + if (_size == 0) { + h2o4gpu_error("Zero size for generate is not allowed."); + } + initialize(_size); + } + + UniformRandomGenerator(int _seed) : + seed_(_seed), size_ (0) {} + + ~UniformRandomGenerator () {} + + UniformRandomGenerator(const UniformRandomGenerator& _rhs) = delete; + UniformRandomGenerator(UniformRandomGenerator&& _rhs) = delete; + void operator=(const UniformRandomGenerator& _rhs) = delete; + void operator=(UniformRandomGenerator&& _rhs) = delete; + + KmMatrix generate() override { + thrust::device_ptr rn_ptr (random_numbers_.dev_ptr()); + thrust::transform( + thrust::make_counting_iterator((size_t)0), + thrust::make_counting_iterator(size_), + rn_ptr, + [=] __device__ (int idx) { + thrust::default_random_engine rng(seed_); + thrust::uniform_real_distribution dist; + rng.discard(idx); + return dist(rng); + }); + + return random_numbers_; + } + + KmMatrix generate(size_t _size) override { + if (_size == 0) { + h2o4gpu_error("Zero size for generate is not allowed."); + } + if (_size != size_) { + initialize(_size); + } + return generate(); + } +}; + +} // namespace h2o4gpu +} // namespace Matrix \ No newline at end of file diff --git a/src/gpu/matrix/KmMatrix/Generator.hpp b/src/gpu/matrix/KmMatrix/Generator.hpp new file mode 100644 index 000000000..9d4362c8c --- /dev/null +++ b/src/gpu/matrix/KmMatrix/Generator.hpp @@ -0,0 +1,25 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#ifndef GENERATOR_HPP_ +#define GENERATOR_HPP_ + +#include "KmMatrix.hpp" + +namespace h2o4gpu { +namespace Matrix { + +template +class RandomGeneratorBase { + public: + virtual KmMatrix generate() = 0; + virtual KmMatrix generate(size_t _size) = 0; +}; + +} // namespace Matrix +} // namespace h2o4gpu + + +#endif // GENERATOR_HPP_ diff --git a/src/gpu/matrix/KmMatrix/KmMatrix.cpp b/src/gpu/matrix/KmMatrix/KmMatrix.cpp new file mode 100644 index 000000000..67d3f3207 --- /dev/null +++ b/src/gpu/matrix/KmMatrix/KmMatrix.cpp @@ -0,0 +1,358 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#include "KmMatrix.hpp" + +#if USE_CUDA() +#include "KmMatrixCuda.cuh" +#endif + +namespace h2o4gpu { +namespace Matrix { + +// ============================== +// KmMatrixImpl implementation +// ============================== + +template +KmMatrixImpl::KmMatrixImpl(KmMatrix *_matrix) + : matrix_(_matrix){} + + +// ============================== +// KmMatrix implementation +// ============================== + +template +KmMatrix::KmMatrix() : + param_ (0, 0, nullptr) { + init_impls(); +#if USE_CUDA() + KmMatrixImpl * ptr = new CudaKmMatrixImpl(this); + impls[(int)Backend::CUDADense].reset(ptr); + backend_ = Backend::CUDADense; +#elif + backend_ = Backend::CPUDense; +#endif +} + +template +KmMatrix::KmMatrix(size_t _rows, size_t _cols) : + param_ (_rows, _cols, nullptr) { + init_impls(); +#if USE_CUDA() + KmMatrixImpl * ptr = new CudaKmMatrixImpl(_rows * _cols, this); + impls[(int)Backend::CUDADense].reset(ptr); + backend_ = Backend::CUDADense; +#elif + backend_ = Backend::CPUDense; +#endif +} + +template +KmMatrix::KmMatrix(thrust::host_vector _vec, + size_t _rows, size_t _cols) : + param_ (_rows, _cols, nullptr) { + init_impls(); + if (_vec.size() != _rows * _cols) { + throw KmMatrixSizeError("Expecting hv::size() == rows * cols. " + + std::string("Got hv::size(): ") + + std::to_string(_vec.size()) + + ", rows: " + std::to_string(_rows) + + ", cols: " + std::to_string(_cols)); + } +#if USE_CUDA() + KmMatrixImpl * ptr = new CudaKmMatrixImpl(_vec, this); + impls[(int)Backend::CUDADense].reset(ptr); + backend_ = Backend::CUDADense; +#elif + backend_ = Backend::CPUDense; +#endif +} + +template +KmMatrix::KmMatrix(const KmMatrixProxy& _other) : + param_ (_other.param_){ + init_impls(); + name_ = _other.orgi_.name_ + "(" + std::to_string(_other.start()) + "," + + std::to_string(_other.end()) + ")"; +#if USE_CUDA() + KmMatrixImpl * ptr = new CudaKmMatrixImpl( + _other.orgi_, _other.start(), _other.size(), _other.stride(), this); + impls[(int)Backend::CUDADense].reset(ptr); + backend_ = Backend::CUDADense; +#elif + backend_ = Backend::CPUDense; +#endif +} + +template +KmMatrix::KmMatrix(const KmMatrix& _other) : + param_(_other.param_) { + copy_impls(_other.impls); + backend_ = _other.backend_; + name_ = _other.name_ + "(copied)"; +} + +template +KmMatrix::KmMatrix(KmMatrix&& _other) : + param_(_other.param_) { + copy_impls(_other.impls); + backend_ = _other.backend_; + name_ = _other.name_ + "(copied [in move])"; +} + +template +void KmMatrix::copy_impls(const std::shared_ptr>* _impls) { + for (size_t i = 0; i < 4; ++i) { + if (_impls[i].get() != nullptr) { + impls[i] = _impls[i]; + impls[i]->set_interface(this); + } + } +} + +template +void KmMatrix::operator=(const KmMatrix& _other) { + copy_impls(_other.impls); + param_ = _other.param_; + backend_ = _other.backend_; + name_ = _other.name_ + "(copied)"; +} + +template +void KmMatrix::operator=(KmMatrix&& _other) { + for (size_t i = 0; i < 4; ++i) { + impls[i] = std::move(_other.impls[i]); + if (impls[i] != nullptr) + impls[i]->set_interface(this); + } + param_ = _other.param_; + backend_ = _other.backend_; + name_ = _other.name_ + "(copied [in move])"; +} + +template +void KmMatrix::init_impls() { + for (size_t i = 0; i < 4; ++i) { + impls[i] = nullptr; + } +} + +template +KmMatrix::~KmMatrix() {} + + +template +size_t KmMatrix::size() const { + return param_.rows * param_.cols; +} + +template +size_t KmMatrix::rows() const { + return param_.rows; +} + +template +size_t KmMatrix::cols() const { + return param_.cols; +} + +template +KmMatrix KmMatrix::reshape(size_t _rows, size_t _cols) const { + if (_rows * _cols != param_.rows * param_.cols) { + throw KmMatrixSizeError("Expecting rows * cols == " + + std::to_string(param_.rows * param_.cols) + + ", get " + + "rows: " + std::to_string(_rows) + + ", cols: " + std::to_string(_cols)); + } + KmMatrix res (*this); + res.param_.rows = _rows; + res.param_.cols = _cols; + return res; +} + +template +kParam KmMatrix::k_param () { + T * ptr = dev_ptr(); + kParam param (param_); + param.ptr = ptr; + return param; +} + +template +T* KmMatrix::host_ptr() { + if (backend_ == Backend::CUDADense) { + return impls[0]->host_ptr(); + } else { + // FIXME + return nullptr; + } +} + +template +T* KmMatrix::dev_ptr() { + if (backend_ == Backend::CUDADense) { + return impls[(int)Backend::CUDADense]->dev_ptr(); + } else { + return nullptr; + } +} + +template +bool KmMatrix::on_device() const { + if (backend_ == Backend::CUDADense) { + return impls[(int)Backend::CUDADense]->on_device(); + } else { + return false; + } +} + +template +KmMatrixProxy KmMatrix::row(size_t idx, bool dev_mem) { + size_t start = param_.cols * idx; + size_t stride = 1; + size_t end = param_.cols * (idx + 1); + kParam param(1, param_.cols, nullptr); + return KmMatrixProxy(*this, start, end, stride, param); +} + +template +KmMatrixProxy KmMatrix::col(size_t idx) { + h2o4gpu_error("Not implemented."); + return KmMatrixProxy(*this, 0, 0, 0); +} + +template +KmMatrix KmMatrix::rows(KmMatrix& _index) { + KmMatrix res; + if (backend_ == Backend::CUDADense && + _index.backend_ == Backend::CUDADense) { + if (! _index.on_device()) { + _index.dev_ptr(); + } + res = impls[(int)Backend::CUDADense]->rows(_index); + } else { + h2o4gpu_error("Not implemented."); + } + return res; +} + +template +KmMatrix KmMatrix::cols(KmMatrix& _index) { + h2o4gpu_error("Not implemented."); + KmMatrix res; + return res; +} + +template +bool KmMatrix::operator==(KmMatrix& _rhs) { + if (_rhs.backend_ == Backend::CUDADense && backend_ == Backend::CUDADense) { + bool res = impls[(int)Backend::CUDADense]->equal(_rhs); + return res; + } else { + h2o4gpu_error("Not implemented."); + return false; + } +} + +template +KmMatrix KmMatrix::stack(KmMatrix &_second, + KmMatrixDim _dim) { + KmMatrix res; + + if (_dim == KmMatrixDim::ROW) { + if (cols() != _second.cols()) { + h2o4gpu_error("Columns of first is not equal to second."); + } + + if (backend_ == Backend::CUDADense) { + res = impls[(int)Backend::CUDADense]->stack(_second, _dim); + } else { + h2o4gpu_error("Not implemented."); + } + + } else { + h2o4gpu_error("Not implemented."); + } + + return res; +} + + +// ============================== +// Helper functions +// ============================== + +template +std::ostream& operator<<(std::ostream& os, KmMatrix& m) { + std::cout << "\nmatrix: " << m.name() << std::endl; + std::cout << "shape: (" << m.rows() << ", " << m.cols() << ")\n" << "["; + T * ptr = m.host_ptr(); + kParam param = m.k_param(); + for (size_t i = 0; i < param.rows; ++i) { + if (i == 0) std::cout << "["; + else std::cout << " ["; + for (size_t j = 0; j < param.cols; ++j) { + std::cout << std::setw(5) << ptr[i*param.cols + j] << ','; + } + std::cout << " ]"; + if (i != param.rows - 1) std::cout << "," << std::endl; + } + std::cout << "]\n" << std::endl; + return os; +} + +template +KmMatrix stack(KmMatrix& _first, KmMatrix& _second, + KmMatrixDim _dim) { + return _first.stack(_second, _dim); +} + +#define INSTANTIATE(T) \ + /* Standard con(de)structors*/ \ + template KmMatrixImpl::KmMatrixImpl(KmMatrix *_matrix); \ + template KmMatrix::KmMatrix(); \ + template KmMatrix::KmMatrix(size_t _rows, size_t _cols); \ + template KmMatrix::KmMatrix(thrust::host_vector _other, \ + size_t _rows, size_t _cols); \ + template KmMatrix::KmMatrix(const KmMatrix& _other); \ + template KmMatrix::KmMatrix(KmMatrix&& _other); \ + template void KmMatrix::copy_impls( \ + const std::shared_ptr>* _impls); \ + template void KmMatrix::operator=(const KmMatrix& _other); \ + template void KmMatrix::operator=(KmMatrix&& _other); \ + template KmMatrix::KmMatrix(const KmMatrixProxy& _other); \ + template KmMatrix::~KmMatrix(); \ + /* Methods */ \ + template size_t KmMatrix::size() const; \ + template size_t KmMatrix::rows() const; \ + template size_t KmMatrix::cols() const; \ + template KmMatrix KmMatrix::reshape( \ + size_t _rows, size_t _cols) const; \ + template kParam KmMatrix::k_param (); \ + template T * KmMatrix::host_ptr(); \ + template T * KmMatrix::dev_ptr(); \ + template bool KmMatrix::on_device() const; \ + template KmMatrixProxy KmMatrix::row(size_t idx, bool dev_mem=true); \ + template KmMatrix KmMatrix::rows(KmMatrix& _index); \ + template KmMatrix KmMatrix::cols(KmMatrix& _index); \ + template bool KmMatrix::operator==(KmMatrix &_rhs); \ + template KmMatrix KmMatrix::stack(KmMatrix &_second, \ + KmMatrixDim _dim); \ + /* Helper functions */ \ + template std::ostream& operator<<(std::ostream& os, KmMatrix& m); \ + template KmMatrix stack(KmMatrix& _first, KmMatrix& _second, \ + KmMatrixDim _dim); + + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) + +#undef INSTANTIATE +} // namespace Matrix +} // namepsace h2o4gpu diff --git a/src/gpu/matrix/KmMatrix/KmMatrix.hpp b/src/gpu/matrix/KmMatrix/KmMatrix.hpp new file mode 100644 index 000000000..dbeb1943e --- /dev/null +++ b/src/gpu/matrix/KmMatrix/KmMatrix.hpp @@ -0,0 +1,198 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#ifndef KM_MATRIX_HPP_ +#define KM_MATRIX_HPP_ + +#include +#include +#include +#include +#include +#include "../../../common/utils.h" + +#if USE_CUDA() +#include "KmMatrixCuda.cuh" +#endif + +namespace h2o4gpu { +namespace Matrix { + +template +class KmMatrixProxy; + +template +class KmMatrix; + +enum class Backend { + CUDADense = 0, + CUDASparse = 1, + CPUDense = 2, + CPUSparse = 3 +}; + +enum class KmMatrixDim { + ROW, + COL +}; + +// Kernel parameter +template +struct kParam { + size_t rows; + size_t cols; + T *ptr; + + kParam(size_t _rows, size_t _cols, T *_ptr) + : rows (_rows), cols(_cols), ptr (_ptr) {} + kParam(const kParam& _other) { + rows = _other.rows; + cols = _other.cols; + ptr = _other.ptr; + } + HG_HOSTDEV void operator=(const kParam& _other) { + rows = _other.rows; + cols = _other.cols; + ptr = _other.ptr; + } + + HG_HOSTDEV size_t size() const { + return rows * cols; + } +}; + +template +class KmMatrixImpl { + protected: + KmMatrix * matrix_; + public: + KmMatrixImpl(KmMatrix *_matrix); + virtual ~KmMatrixImpl () {} + + // FIXME + // Used in KmMatrix constructors to deal with temp return value. + // Maybe better solution. + virtual void set_interface(KmMatrix* _par) = 0; + + virtual T* host_ptr() = 0; + virtual T* dev_ptr() = 0; + virtual size_t size() const = 0; + virtual bool on_device() const = 0; + + virtual KmMatrix stack(KmMatrix&, KmMatrixDim _dim) = 0; + virtual bool equal(KmMatrix& _val) = 0; + + virtual KmMatrix rows(KmMatrix& _index) = 0; +}; + +template +class KmMatrix { + private: + + Backend backend_; + + std::shared_ptr> impls[4]; + kParam param_; + + std::string name_; + + void init_impls(); + void copy_impls(const std::shared_ptr>* _impls); + + public: + explicit KmMatrix(); + KmMatrix(size_t _rows, size_t _cols); + KmMatrix(thrust::host_vector _other, size_t _rows, size_t _cols); + KmMatrix(const KmMatrixProxy& _other); + + KmMatrix(const KmMatrix& _other); + KmMatrix(KmMatrix&& _other); + + void operator=(const KmMatrix& _other); + void operator=(KmMatrix&& _other); + + bool operator==(KmMatrix& _rhs); + + virtual ~KmMatrix(); + + size_t size () const; + size_t rows () const; + size_t cols () const; + + KmMatrix reshape(size_t _rows, size_t _cols) const; + + T* host_ptr(); + T* dev_ptr(); + + bool on_device() const; + + kParam k_param (); + + std::string name() const { return name_; } + void set_name (std::string _name) {name_ = _name;} + + KmMatrixProxy row(size_t idx, bool dev_mem=true); + KmMatrixProxy col(size_t idx); + + KmMatrix rows(KmMatrix& index); + KmMatrix cols(KmMatrix& index); + + KmMatrix stack(KmMatrix& _second, KmMatrixDim _dim); +}; + +template +KmMatrix stack(KmMatrix& _first, + KmMatrix& _second, + KmMatrixDim _dim); + +/* + * Print the matrix. + * The printed format is mimicing numpy, one can just copy the output to Python + * shell and let numpy to verify it. + */ +template +std::ostream& operator<<(std::ostream& os, KmMatrix& m); + +template +class KmMatrixProxy { + private: + KmMatrix& orgi_; + + kParam param_; + + size_t start_; + size_t end_; + size_t stride_; + + size_t start() const; + size_t end() const; + size_t stride() const; + + public: + + size_t size() const; + bool on_device() const; + + KmMatrixProxy(KmMatrix& _other, + size_t _start, size_t _end, size_t _stride, kParam& _param); + + bool operator==(const KmMatrix& _rhs); + bool operator==(const KmMatrixProxy& _rhs); + + void operator=(KmMatrix& _other); + friend KmMatrix; +}; + +struct KmMatrixSizeError: public std::runtime_error +{ + KmMatrixSizeError(std::string const& message) + : std::runtime_error(message) + {} +}; + +} // namespace Matrix +} // namespace h2o4gpu + +#endif diff --git a/src/gpu/matrix/KmMatrix/KmMatrixCuda.cu b/src/gpu/matrix/KmMatrix/KmMatrixCuda.cu new file mode 100644 index 000000000..6b6052ce1 --- /dev/null +++ b/src/gpu/matrix/KmMatrix/KmMatrixCuda.cu @@ -0,0 +1,236 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#include +#include +#include + +#include + +#include "KmMatrixCuda.cuh" +#include "KmMatrix.hpp" + +namespace h2o4gpu { +namespace Matrix { + +template +CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par) : + KmMatrixImpl(_par){ + assert(_par); +} + +template +CudaKmMatrixImpl::CudaKmMatrixImpl(const thrust::host_vector& _h_vec, + KmMatrix* _par) + : on_device_(false), KmMatrixImpl(_par) { + assert(_par); + h_vector_.resize(_h_vec.size()); + thrust::copy(_h_vec.begin(), _h_vec.end(), h_vector_.begin()); +} + +template +CudaKmMatrixImpl::CudaKmMatrixImpl(size_t _size, KmMatrix * _par) : + KmMatrixImpl(_par) { + assert(_par); + if (_size == 0) return; + + d_vector_.resize(_size); + on_device_ = true; +} + +template +CudaKmMatrixImpl::CudaKmMatrixImpl( + KmMatrix& _other, size_t _start, size_t _size, size_t _stride, + KmMatrix * _par) : + KmMatrixImpl(_par) { + assert(_par); + assert (_size > 0); + + if (_size == 0) + return; + + T* raw_ptr; + + if (_other.on_device()) { + raw_ptr = _other.dev_ptr(); + assert (raw_ptr); + thrust::device_ptr ptr (raw_ptr); + ptr += _start; + d_vector_.resize(_size); + on_device_ = true; + thrust::copy(ptr, ptr + _size, d_vector_.begin()); + } else { + raw_ptr = _other.host_ptr(); + assert (raw_ptr); + raw_ptr += _start; + h_vector_.resize(_size); + on_device_ = false; + thrust::copy(raw_ptr, raw_ptr + _size, h_vector_.begin()); + } +} + +template +CudaKmMatrixImpl::~CudaKmMatrixImpl() {} + +template +void CudaKmMatrixImpl::set_interface(KmMatrix* _par) { + assert(_par); + KmMatrixImpl::matrix_ = _par; +} + +template +T* CudaKmMatrixImpl::host_ptr() { + device_to_host(); + return thrust::raw_pointer_cast(h_vector_.data()); +} + +template +T* CudaKmMatrixImpl::dev_ptr() { + host_to_device(); + T* ptr = thrust::raw_pointer_cast(d_vector_.data()); + return ptr; +} + +template +void CudaKmMatrixImpl::host_to_device() { + if (on_device_) + return; + d_vector_.resize(h_vector_.size()); + thrust::copy(h_vector_.begin(), h_vector_.end(), d_vector_.begin()); + on_device_ = true; +} + +template +void CudaKmMatrixImpl::device_to_host() { + if (!on_device_) + return; + h_vector_.resize(d_vector_.size()); + thrust::copy(d_vector_.begin(), d_vector_.end(), h_vector_.begin()); + on_device_ = false; +} + +template +bool CudaKmMatrixImpl::on_device() const { + return on_device_; +} + +template +size_t CudaKmMatrixImpl::size() const { + if (on_device_) { + return d_vector_.size(); + } else { + return h_vector_.size(); + } +} + +template +bool CudaKmMatrixImpl::equal(KmMatrix& _rhs) { + T* rhs_raw_ptr = _rhs.dev_ptr(); + host_to_device(); + thrust::device_ptr rhs_ptr (rhs_raw_ptr); + // FIXME, Is it floating compatible? + bool res = thrust::equal(d_vector_.begin(), d_vector_.end(), + rhs_ptr); + return res; +} + +template +KmMatrix CudaKmMatrixImpl::rows(KmMatrix& _index) { + + KmMatrix out (_index.rows(), KmMatrixImpl::matrix_->cols()); + + T * index_ptr = _index.dev_ptr(); + T * in_ptr = KmMatrixImpl::matrix_->dev_ptr(); + T * out_ptr = out.dev_ptr(); + + auto iter = thrust::make_counting_iterator(0); + + size_t cols = KmMatrixImpl::matrix_->cols(); + + thrust::for_each( + thrust::device, + iter, iter + _index.rows(), + [=] __device__ (int idx) { + size_t index = index_ptr[idx]; + + size_t in_begin = index * cols; + size_t in_end = (index + 1) * cols; + + size_t out_begin = idx * cols; + + for (size_t i = in_begin, j = out_begin; i != in_end; ++i, ++j) { + out_ptr[j] = in_ptr[i]; + } + }); + + return out; +} + +template +KmMatrix CudaKmMatrixImpl::stack(KmMatrix& _second, + KmMatrixDim _dim) { + if (_dim == KmMatrixDim::ROW) { + if (KmMatrixImpl::matrix_->cols() != _second.cols()) { + h2o4gpu_error("Columns of first is not equal to second."); + } + host_to_device(); + + T * sec_raw_ptr = _second.dev_ptr(); + thrust::device_ptr self_ptr = d_vector_.data(); + + thrust::device_ptr sec_ptr (sec_raw_ptr); + + KmMatrix res (KmMatrixImpl::matrix_->rows() + _second.rows(), + KmMatrixImpl::matrix_->cols()); + + T * res_raw_ptr = res.dev_ptr(); + thrust::device_ptr res_ptr (res_raw_ptr); + + thrust::copy(self_ptr, self_ptr + size(), res_ptr); + res_ptr = thrust::device_ptr(res_raw_ptr) + size(); + thrust::copy(sec_ptr, sec_ptr + _second.size(), res_ptr); + + return res; + } else { + // FIXME + h2o4gpu_error("Not implemented."); + KmMatrix res; + return res; + } +} + + +#define INSTANTIATE(T) \ + /* Standard con(de)structors*/ \ + template CudaKmMatrixImpl::CudaKmMatrixImpl( \ + KmMatrix& _other, size_t _start, size_t _size, size_t _stride, \ + KmMatrix * _par); \ + template CudaKmMatrixImpl::CudaKmMatrixImpl( \ + const thrust::host_vector& _h_vec, KmMatrix* _par); \ + template CudaKmMatrixImpl::CudaKmMatrixImpl(KmMatrix * _par); \ + template CudaKmMatrixImpl::CudaKmMatrixImpl(size_t _size, \ + KmMatrix * _par); \ + template CudaKmMatrixImpl::~CudaKmMatrixImpl(); \ + template void CudaKmMatrixImpl::set_interface(KmMatrix* _par); \ + /* Member functions */ \ + template bool CudaKmMatrixImpl::on_device() const; \ + template void CudaKmMatrixImpl::device_to_host(); \ + template void CudaKmMatrixImpl::host_to_device(); \ + template T* CudaKmMatrixImpl::dev_ptr(); \ + template T* CudaKmMatrixImpl::host_ptr(); \ + template size_t CudaKmMatrixImpl::size() const; \ + template bool CudaKmMatrixImpl::equal(KmMatrix& _rhs); \ + template KmMatrix CudaKmMatrixImpl::stack(KmMatrix& _second, \ + KmMatrixDim _dim); \ + template KmMatrix CudaKmMatrixImpl::rows(KmMatrix& _index); + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) + +#undef INSTANTIATE + +} // namespace Matrix +} // namespace h2o4gpu diff --git a/src/gpu/matrix/KmMatrix/KmMatrixCuda.cuh b/src/gpu/matrix/KmMatrix/KmMatrixCuda.cuh new file mode 100644 index 000000000..02f8f6adc --- /dev/null +++ b/src/gpu/matrix/KmMatrix/KmMatrixCuda.cuh @@ -0,0 +1,87 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#ifndef KM_MATRIX_CUDA_CUH_ +#define KM_MATRIX_CUDA_CUH_ + +#include "KmMatrix.hpp" +#include "thrust/device_vector.h" +#include + +namespace h2o4gpu { +namespace Matrix { + +template +class KmMatrix; + +template +class KmMatrixImpl; + +template +class KmMatrixProxy; + +enum class KmMatrixDim; + +struct CudaInfo { + int n_devices; + int * _devices; + + CudaInfo (int _n_devices) + : n_devices(_n_devices) { + _devices = new int[_n_devices]; + } + ~CudaInfo () { + delete [] _devices; + } +}; + +template +class CudaKmMatrixImpl : public KmMatrixImpl { + private: + thrust::device_vector d_vector_; + thrust::host_vector h_vector_; + + bool on_device_; + KmMatrix* matrix_; + + void host_to_device(); + void device_to_host(); + + public: + CudaKmMatrixImpl(KmMatrix * _par); + CudaKmMatrixImpl(const thrust::host_vector& _h_vec, KmMatrix* _par); + CudaKmMatrixImpl(size_t _size, KmMatrix * _par); + CudaKmMatrixImpl(KmMatrix& _other, + size_t _start, size_t _size, size_t _stride, + KmMatrix * _par); + + CudaKmMatrixImpl(const CudaKmMatrixImpl&) = delete; + CudaKmMatrixImpl(CudaKmMatrixImpl&&) = delete; + + virtual ~CudaKmMatrixImpl(); + + virtual void set_interface(KmMatrix* _par) override; + + void operator=(const CudaKmMatrixImpl&) = delete; + void operator=(CudaKmMatrixImpl&&) = delete; + + KmMatrix stack(KmMatrix& _second, KmMatrixDim _dim) override; + + virtual T* host_ptr() override; + virtual T* dev_ptr() override; + + virtual size_t size() const override; + + virtual bool equal(KmMatrix& _rhs); + + virtual KmMatrix rows(KmMatrix& _index) override; + + virtual bool on_device() const override; +}; + +} // namespace Matrix +} // namespace h2o4gpu + +#endif diff --git a/src/gpu/matrix/KmMatrix/KmMatrixProxy.cpp b/src/gpu/matrix/KmMatrix/KmMatrixProxy.cpp new file mode 100644 index 000000000..9e1d5ffd9 --- /dev/null +++ b/src/gpu/matrix/KmMatrix/KmMatrixProxy.cpp @@ -0,0 +1,68 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#include "KmMatrix.hpp" + +namespace h2o4gpu { +namespace Matrix { + +template +KmMatrixProxy::KmMatrixProxy(KmMatrix& _other, + size_t _start, size_t _end, size_t _stride, + kParam& _param) + : orgi_ (_other), param_(_param), start_(_start), end_(_end), + stride_(_stride) { + assert(size() > 0); +} + +template +bool KmMatrixProxy::on_device() const { + return orgi_.on_device(); +} + +template +size_t KmMatrixProxy::start() const { + return start_; +} + +template +size_t KmMatrixProxy::end() const { + return end_; +} + +template +size_t KmMatrixProxy::stride() const { + return stride_; +} + +template +size_t KmMatrixProxy::size() const { + return (end_ - start_) / stride_; +} + +template +void KmMatrixProxy::operator=(KmMatrix &_other) { + // FIXME + assert(false); +} + +#define INSTANTIATE(T) \ + template KmMatrixProxy::KmMatrixProxy(KmMatrix& _other, \ + size_t _start, size_t _end, \ + size_t _stride, \ + kParam& _param); \ + template bool KmMatrixProxy::on_device() const; \ + template size_t KmMatrixProxy::start() const; \ + template size_t KmMatrixProxy::end() const; \ + template size_t KmMatrixProxy::stride() const; \ + template size_t KmMatrixProxy::size() const; \ + template void KmMatrixProxy::operator=(KmMatrix &_other); + +INSTANTIATE(float) +INSTANTIATE(double) +INSTANTIATE(int) + +} // namespace Matrix +} // namespace h2o4gpu diff --git a/src/gpu/matrix/KmMatrix/blas.cuh b/src/gpu/matrix/KmMatrix/blas.cuh new file mode 100644 index 000000000..50b69898b --- /dev/null +++ b/src/gpu/matrix/KmMatrix/blas.cuh @@ -0,0 +1,268 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#ifndef KM_BLAS_CUH_ +#define KM_BLAS_CUH_ + +#include +#include "../../utils/utils.cuh" + +// C++ Wrappers for cublas + +namespace h2o4gpu { +namespace Matrix { + +namespace Blas { +// LEVEL 1 +inline void axpy(cublasHandle_t handle, int n, + const double *alpha, + const double *x, int incx, + double *y, int incy) { + safe_cublas(cublasDaxpy(handle, n, + alpha, + x, incx, + y, incy));} + +inline void axpy(cublasHandle_t handle, int n, + const float *alpha, + const float *x, int incx, + float *y, int incy) { + safe_cublas(cublasSaxpy(handle, n, + alpha, + x, incx, + y, incy));} + +inline void axpy(cublasHandle_t handle, int n, + const int *alpha, + const int *x, int incx, + int *y, int incy) { + safe_cublas(cublasSaxpy(handle, n, + (const float *)alpha, + (const float *)x, incx, + (float *)y, incy));} +// LEVEL 3 +inline void gemm(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *B, + int ldb, + const float *beta, /* host or device pointer */ + float *C, + int ldc) { + safe_cublas(cublasSgemm(handle, + transa, transb, + m, n, k, + alpha, /* host or device pointer */ + A, lda, + B, ldb, + beta, /* host or device pointer */ + C, ldc));} + +inline void gemm(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *B, + int ldb, + const double *beta, /* host or device pointer */ + double *C, + int ldc) { + safe_cublas(cublasDgemm(handle, + transa, + transb, + m, + n, + k, + alpha, /* host or device pointer */ + A, + lda, + B, + ldb, + beta, /* host or device pointer */ + C, + ldc));} + +inline void gemm(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const int *alpha, /* host or device pointer */ + const int *A, + int lda, + const int *B, + int ldb, + const int *beta, /* host or device pointer */ + int *C, + int ldc) { + safe_cublas(cublasSgemm(handle, + transa, transb, + m, n, k, + (const float*)alpha, /* host or device pointer */ + (const float*)A, lda, + (const float*)B, ldb, + (const float*)beta, /* host or device pointer */ + (float*)C, ldc));} + +/* -- gemm_batched --*/ +inline void gemm_batched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, int k, + const double *alpha, + const double *Aarray[], int lda, + const double *Barray[], int ldb, + const double *beta, + double *Carray[], int ldc, + int batchCount) { + safe_cublas(cublasDgemmBatched(handle, + transa, + transb, + m, n, k, + alpha, + Aarray, lda, + Barray, ldb, + beta, + Carray, ldc, + batchCount)); +} + +inline void gemm_batched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, int k, + const float *alpha, + const float *Aarray[], int lda, + const float *Barray[], int ldb, + const float *beta, + float *Carray[], int ldc, + int batchCount) { + safe_cublas(cublasSgemmBatched(handle, + transa, + transb, + m, n, k, + alpha, + Aarray, lda, + Barray, ldb, + beta, + Carray, ldc, + batchCount)); +} + +inline void gemm_batched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, int n, int k, + const int *alpha, + const int *Aarray[], int lda, + const int *Barray[], int ldb, + const int *beta, + float *Carray[], int ldc, + int batchCount) { + safe_cublas(cublasSgemmBatched(handle, + transa, + transb, + m, n, k, + (const float *)alpha, + (const float * const *)Aarray, lda, + (const float * const *)Barray, ldb, + (const float *)beta, + (float * const *)Carray, ldc, + batchCount)); +} + +/* -- gemm_strided_batched -- */ +inline void gemm_strided_batched( + cublasHandle_t handle, + cublasOperation_t transA, cublasOperation_t transB, + int M, int N, int K, + const double* alpha, + const double* A, int ldA, int strideA, + const double* B, int ldB, int strideB, + const double* beta, + double* C, int ldC, int strideC, + int batchCount) { + safe_cublas(cublasDgemmStridedBatched(handle, + transA, + transB, + M, N, K, + alpha, + A, ldA, + strideA, + B, ldB, + strideB, + beta, + C, ldC, + strideC, + batchCount)); +} + +inline void gemm_strided_batched( + cublasHandle_t handle, + cublasOperation_t transA, cublasOperation_t transB, + int M, int N, int K, + const float* alpha, + const float* A, int ldA, int strideA, + const float* B, int ldB, int strideB, + const float* beta, + float* C, int ldC, int strideC, + int batchCount) { + safe_cublas(cublasSgemmStridedBatched(handle, + transA, + transB, + M, N, K, + alpha, + A, ldA, + strideA, + B, ldB, + strideB, + beta, + C, ldC, + strideC, + batchCount)); +} + +inline void gemm_strided_batched( + cublasHandle_t handle, + cublasOperation_t transA, cublasOperation_t transB, + int M, int N, int K, + const int* alpha, + const int* A, int ldA, int strideA, + const int* B, int ldB, int strideB, + const int* beta, + int* C, int ldC, int strideC, + int batchCount) { + safe_cublas(cublasSgemmStridedBatched(handle, + transA, + transB, + M, N, K, + (const float*)alpha, + (const float*)A, ldA, + strideA, + (const float*)B, ldB, + strideB, + (const float*)beta, + (float*)C, ldC, + strideC, + batchCount)); +} + +} // Blas +} // Matrix +} // h2o4gpu + +#endif // KM_BLAS_CUH_ \ No newline at end of file diff --git a/src/gpu/utils/GpuInfo.cuh b/src/gpu/utils/GpuInfo.cuh new file mode 100644 index 000000000..0a57644f7 --- /dev/null +++ b/src/gpu/utils/GpuInfo.cuh @@ -0,0 +1,80 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#ifndef GPU_INFO_HPP_ +#define GPU_INFO_HPP_ + +#include "utils.cuh" + +#include + +#include +#include + +namespace h2o4gpu { +// Singleton class storing gpu info. +// Call GpuInfo::ins() to use the class; +class GpuInfo { + private: + int n_gpu_; + int* n_sm_; // number of gpu processors for each device + cublasHandle_t* handles_; // handle for each device + + public: + GpuInfo () { + safe_cuda(cudaGetDeviceCount(&n_gpu_)); + n_sm_ = (int*) malloc (n_gpu_); + handles_ = (cublasHandle_t*) malloc (n_gpu_); + + for (int i = 0; i < n_gpu_; ++i) { + cudaDeviceGetAttribute(&n_sm_[i], cudaDevAttrMultiProcessorCount, i); + safe_cublas(cublasCreate(&handles_[i])); + } + } + ~GpuInfo () { + free (n_sm_); + for (int i = 0; i < n_gpu_; ++i) { + safe_cublas(cublasDestroy(handles_[i])); + } + free (handles_); + } + + static GpuInfo& ins() { + static GpuInfo obj; + return obj; + } + + // Call the following methods with GpuInfo::ins(). For example: + // GpuInfo::ins().blocks(32) + + // Get number of blocks for grid strided loop kernel. + // returns _mul * MultiProcessorCount[device]. + // FIXME, get active device + size_t blocks (size_t _mul, int _device=0) { + if (has_device(_device)) { + return _mul * n_sm_[_device]; + } else { + fprintf(stderr, "Doesn't have device: %d\n", _device); + abort(); + } + } + // FIXME, ditto + cublasHandle_t cublas_handle(int _device=0) { + if (has_device(_device)) { + return handles_[_device]; + } else { + fprintf(stderr, "Doesn't have device: %d\n", _device); + abort(); + } + } + + bool has_device(int _device) { + return _device < n_gpu_ && _device >= 0; + } +}; + +} // namespace h2o4gpu + +#endif // GPU_INFO_HPP_ diff --git a/src/gpu/utils/utils.cuh b/src/gpu/utils/utils.cuh index 3f35f375d..fed9e46e1 100644 --- a/src/gpu/utils/utils.cuh +++ b/src/gpu/utils/utils.cuh @@ -9,29 +9,10 @@ #include #include +#include "../../common/utils.h" + namespace h2o4gpu { -#define h2o4gpu_error(x) error(x, __FILE__, __LINE__); - - inline void error(const char* e, const char* file, int line) - { - std::stringstream ss; - ss << e << " - " << file << "(" << line << ")"; - //throw error_text; - std::cerr << ss.str() << std::endl; - exit(-1); - } - -#define h2o4gpu_check(condition, msg) check(condition, msg, __FILE__, __LINE__); - - inline void check(bool val, const char* e, const char* file, int line) - { - if (!val) - { - error(e, file, line); - } - } - #define safe_cuda(ans) throw_on_cuda_error((ans), __FILE__, __LINE__) @@ -366,4 +347,56 @@ namespace h2o4gpu return idx * col_size; }); } + +HG_DEVINLINE size_t global_thread_idx () { + return threadIdx.x + blockIdx.x * blockDim.x; +} + +HG_DEVINLINE size_t global_thread_idy () { + return threadIdx.y + blockIdx.y * blockDim.y; +} + +HG_DEVINLINE size_t grid_stride_x () { + return blockDim.x * gridDim.x; +} + +HG_DEVINLINE size_t grid_stride_y () { + return blockDim.y * gridDim.y; +} + +template +T1 HG_HOSTDEVINLINE div_roundup(const T1 a, const T2 b) { + return static_cast(ceil(static_cast(a) / b)); +} + + +// Work around for shared memory +// https://stackoverflow.com/questions/20497209/getting-cuda-error-declaration-is-incompatible-with-previous-variable-name +template +struct KernelSharedMem; + +template <> +struct KernelSharedMem { + __device__ float * ptr() { + extern __shared__ __align__(sizeof(float)) float s_float[]; + return s_float; + } +}; + +template <> +struct KernelSharedMem { + __device__ double * ptr() { + extern __shared__ __align__(sizeof(double)) double s_double[]; + return s_double; + } +}; + +template <> +struct KernelSharedMem { + __device__ int * ptr() { + extern __shared__ __align__(sizeof(int)) int s_int[]; + return s_int; + } +}; + } diff --git a/src/swig/CMakeLists.txt b/src/swig/CMakeLists.txt new file mode 100644 index 000000000..3ee50c0c4 --- /dev/null +++ b/src/swig/CMakeLists.txt @@ -0,0 +1,48 @@ +FIND_PACKAGE(SWIG REQUIRED) +FIND_PACKAGE(PythonLibs REQUIRED) + +INCLUDE(${SWIG_USE_FILE}) +# PythonLibs' PYTHON_INCLUDE_PATH doesn't take into account virtualenv etc. +# Open to suggestions how to do this better. +include_directories( + ${PYTHON_INCLUDE_PATH} + ${PYTHON_INCLUDE_PATH_CUST}) + +EXECUTE_PROCESS(COMMAND python -c "import numpy; print(numpy.get_include())" + OUTPUT_VARIABLE PYTHON_INCLUDE_PATH_CUST + OUTPUT_STRIP_TRAILING_WHITESPACE) + +#============= SWIG +SET(CMAKE_SWIG_FLAGS -Werror) +#============= SWIG + +#============= CPU SWIG +SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_LIST_DIR}/ch2o4gpu_cpu.i PROPERTIES CPLUSPLUS ON) + +if (${CMAKE_VERSION} VERSION_LESS "3.8.0") + SWIG_ADD_MODULE(ch2o4gpu_cpu python ${CMAKE_CURRENT_LIST_DIR}/ch2o4gpu_cpu.i) +else() + SWIG_ADD_LIBRARY(ch2o4gpu_cpu LANGUAGE python SOURCES ${CMAKE_CURRENT_LIST_DIR}/ch2o4gpu_cpu.i) +endif() + +SWIG_LINK_LIBRARIES(ch2o4gpu_cpu cpuh2o4gpu ${PYTHON_LIBRARIES}) + +SET_TARGET_PROPERTIES(${SWIG_MODULE_ch2o4gpu_cpu_REAL_NAME} PROPERTIES + LINK_FLAGS ${OpenMP_CXX_FLAGS}) +#============= CPU SWIG + +#============= GPU SWIG +if(USE_CUDA) + SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_LIST_DIR}/ch2o4gpu_gpu.i PROPERTIES CPLUSPLUS ON) + + if (${CMAKE_VERSION} VERSION_LESS "3.8.0") + SWIG_ADD_MODULE(ch2o4gpu_gpu python ${CMAKE_CURRENT_LIST_DIR}/ch2o4gpu_gpu.i) + else() + SWIG_ADD_LIBRARY(ch2o4gpu_gpu LANGUAGE python SOURCES ${CMAKE_CURRENT_LIST_DIR}/ch2o4gpu_gpu.i) + endif() + SWIG_LINK_LIBRARIES(ch2o4gpu_gpu gpuh2o4gpu ${PYTHON_LIBRARIES}) + + SET_TARGET_PROPERTIES(${SWIG_MODULE_ch2o4gpu_gpu_REAL_NAME} PROPERTIES + LINK_FLAGS ${OpenMP_CXX_FLAGS}) +endif(USE_CUDA) +#============= GPU SWIG diff --git a/tests/cpp/gpu/KmMatrix/test_arith.cu b/tests/cpp/gpu/KmMatrix/test_arith.cu new file mode 100644 index 000000000..81bf4101c --- /dev/null +++ b/tests/cpp/gpu/KmMatrix/test_arith.cu @@ -0,0 +1,127 @@ +#include +#include +#include + +#include "../../../../src/gpu/matrix/KmMatrix/KmMatrix.hpp" +#include "../../../../src/gpu/matrix/KmMatrix/Arith.hpp" + +#include + +using namespace h2o4gpu::Matrix; + +constexpr float esp = 0.001f; + +TEST(KmMatrix, ArithDot) { + thrust::host_vector h_data (9); + for (size_t i = 0; i < 9; ++i) { + h_data[i] = (float) i * i; + } + KmMatrix mat (h_data, 3, 3); + KmMatrix res (3, 3); + + DotOp().dot(res, mat); + + std::vector answer_vec + { + 153.0f, 212.0f, 281.0f, + 1044.0f, 1490.0f, 2036.0f, + 2745.0f, 3956.0f, 5465.0f + }; + + KmMatrix answer (answer_vec, 3, 3); + + ASSERT_TRUE(answer == res); +} + +TEST(KmMatrix, ArithVecBatchDot) { + thrust::host_vector h_data (20); + for (size_t i = 0; i < 20; ++i) { + h_data[i] = (float) i * i; + } + + KmMatrix data (h_data, 4, 5); + + KmMatrix res (4, 1); + VecBatchDotOp().dot(res, data); + + thrust::host_vector h_sol (4); + h_sol[0] = 354; + h_sol[1] = 14979; + h_sol[2] = 112354; + h_sol[3] = 434979; + KmMatrix sol (h_sol, 4, 1); + + ASSERT_TRUE(res == sol); +} + +TEST(KmMatrix, ArithSum) { + thrust::host_vector h_data (16); + for (size_t i = 0; i < 16; ++i) { + h_data[i] = (float) i * i; + } + KmMatrix mat (h_data, 4, 4); + float res = SumOp().sum(mat); + EXPECT_NEAR(res, 1240.0f, esp); +} + +TEST(KmMatrix, ArithMean) { + thrust::host_vector h_data (16); + for (size_t i = 0; i < 16; ++i) { + h_data[i] = (float) i * i; + } + KmMatrix mat (h_data, 4, 4); + float res = MeanOp().mean(mat); + EXPECT_NEAR(res, 77.5, esp); +} + +TEST(KmMatrix, ArithMul) { + thrust::host_vector h_data (16); + for (size_t i = 0; i < 16; ++i) { + h_data[i] = (float) i * i; + } + KmMatrix mat (h_data, 4, 4); + KmMatrix res (4, 4); + MulOp().mul(res, mat, 2.0f); + + thrust::host_vector h_sol(16); + for (size_t i = 0; i < 16; ++i) { + h_sol[i] = h_data[i] * 2.0f; + } + KmMatrix solution {h_sol, 4, 4}; + + ASSERT_TRUE(res == solution); +} + +TEST(KmMatrix, ArithArgMin) { + std::vector h_data + { + 1.0f, 3.0f, 2.0f, 0.0f, + 3.0f, 1.0f, 0.0f, 2.0f, + 1.0f, 1.0f, 1.0f, 1.0f + }; + + KmMatrix mat (h_data, 3, 4); + KmMatrix res = ArgMinOp().argmin(mat, KmMatrixDim::ROW); + + std::vector solution_vec {3, 2, 0}; + KmMatrix solution (solution_vec, 3, 1); + + ASSERT_TRUE(res == solution); +} + +TEST(KmMatrix, ArithMin) { + std::vector h_data + { + 1.0f, 3.0f, 2.0f, 0.0f, + 3.0f, 1.0f, 0.0f, 2.0f, + 1.0f, 1.0f, 1.0f, 1.0f + }; + KmMatrix mat (h_data, 3, 4); + + KmMatrix res = MinOp().min(mat, KmMatrixDim::ROW); + + std::vector solution_vec {0.0f, 0.0f, 1.0f}; + KmMatrix solution (solution_vec, 3, 1); + + ASSERT_TRUE(res == solution); +} \ No newline at end of file diff --git a/tests/cpp/gpu/KmMatrix/test_matrix.cu b/tests/cpp/gpu/KmMatrix/test_matrix.cu new file mode 100644 index 000000000..e1aae97f1 --- /dev/null +++ b/tests/cpp/gpu/KmMatrix/test_matrix.cu @@ -0,0 +1,151 @@ +#include +#include + +#include "../../../../src/gpu/matrix/KmMatrix/KmMatrix.hpp" +#include + +// r --gtest_filter=KmMatrix.KmMatrixEqual +TEST(KmMatrix, KmMatrixEqual) { + thrust::host_vector vec (2048 * 1024); + for (size_t i = 0; i < 2048 * 1024; ++i) { + vec[i] = i; + } + h2o4gpu::Matrix::KmMatrix mat (vec, 2048, 1024); + + ASSERT_TRUE (mat == mat); + + thrust::host_vector vec2 (2048 * 1024); + for (size_t i = 0; i < 2048 * 1024; ++i) { + vec2[i] = i + i; + } + h2o4gpu::Matrix::KmMatrix mat2 (vec2, 2048, 1024); + + ASSERT_FALSE(mat == mat2); +} + +TEST(KmMatrix, KmMatrixAssig) { + thrust::host_vector vec (2048 * 1024); + for (size_t i = 0; i < 2048 * 1024; ++i) { + vec[i] = i; + } + + h2o4gpu::Matrix::KmMatrix mat0 (vec, 2048, 1024); + h2o4gpu::Matrix::KmMatrix mat1 = mat0; + h2o4gpu::Matrix::KmMatrix mat2; + + mat2 = mat0; + + ASSERT_TRUE(mat0 == mat1); + ASSERT_TRUE(mat1 == mat2); +} + +TEST(KmMatrix, KmMatrixRows) { + thrust::host_vector vec (12 * 16); + for (size_t i = 0; i < 12 * 16; ++i) { + vec[i] = i; + } + h2o4gpu::Matrix::KmMatrix mat (vec, 12, 16); + + thrust::host_vector h_index (4, 1); + h_index[0] = 0; + h_index[1] = 2; + h_index[2] = 9; + h_index[3] = 1; + h2o4gpu::Matrix::KmMatrix index (h_index, 4, 1); + + h2o4gpu::Matrix::KmMatrix rows = mat.rows(index); + + thrust::host_vector h_sol (4 * 16); + for (size_t i = 0; i < 16; ++i) { + h_sol[i] = vec[i]; + } + for (size_t i = 16; i < 32; ++i) { + h_sol[i] = vec[16 * 2 + (i - 16)]; + } + for (size_t i = 32; i < 48; ++i) { + h_sol[i] = vec[16 * 9 + (i - 32)]; + } + for (size_t i = 48; i < 64; ++i) { + h_sol[i] = vec[16 * 1 + (i - 48)]; + } + + h2o4gpu::Matrix::KmMatrix sol (h_sol, 4, 16); + + ASSERT_TRUE(rows == sol); +} + +TEST(KmMatrix, SizeError) { + thrust::host_vector vec (12 * 16); + ASSERT_THROW( + h2o4gpu::Matrix::KmMatrix mat (vec, 12, 4), + std::runtime_error); +} + +TEST(KmMatrix, KmMatrixUtils) { + thrust::host_vector vec (12 * 16); + h2o4gpu::Matrix::KmMatrix mat (vec, 12, 16); + + ASSERT_EQ(mat.rows(), 12); + ASSERT_EQ(mat.cols(), 16); + ASSERT_EQ(mat.size(), 12 * 16); +} + +TEST(KmMatrix, KmMatrixKparam) { + thrust::host_vector vec (12 * 16); + thrust::fill(vec.begin(), vec.end(), 1); + h2o4gpu::Matrix::KmMatrix mat (vec, 12, 16); + + h2o4gpu::Matrix::kParam param = mat.k_param(); + ASSERT_EQ(param.ptr, mat.dev_ptr()); + ASSERT_EQ(param.rows, 12); + ASSERT_EQ(param.cols, 16); +} + +TEST(KmMatrix, KmMatrixCycle) { + size_t rows = 2048, cols = 1024; + thrust::host_vector vec (rows * cols); + for (size_t i = 0; i < rows * cols; ++i) { + vec[i] = i; + } + // Tweak this one to see if memory grows, there should be a better way to + // test memory leak. + size_t iters = std::pow(16, 1); + h2o4gpu::Matrix::KmMatrix mat0 (vec, rows, cols); + mat0.dev_ptr(); + for (size_t i = 0; i < iters; ++i) { + h2o4gpu::Matrix::KmMatrix mat1 = mat0; + h2o4gpu::Matrix::KmMatrix mat2 = mat1; + mat0 = mat2; + } +} + +// r --gtest_filter=KmMatrix.Stack +TEST(KmMatrix, Stack) { + constexpr size_t rows = 16, cols = 16; + thrust::host_vector vec (rows * cols); + for (size_t i = 0; i < rows * cols; ++i) { + vec[i] = i; + } + h2o4gpu::Matrix::KmMatrix mat(vec, rows, cols); + + thrust::host_vector vec1 (rows * cols); + for (size_t i = rows * cols; i < 2 * rows * cols; ++i) { + vec1[i - rows * cols] = i; + } + h2o4gpu::Matrix::KmMatrix mat1(vec1, rows, cols); + + h2o4gpu::Matrix::KmMatrix calculated = + h2o4gpu::Matrix::stack(mat, mat1, h2o4gpu::Matrix::KmMatrixDim::ROW); + + thrust::host_vector res (2 * rows * cols); + for (size_t i = 0; i < rows * cols; ++i) { + res[i] = i; + } + for (size_t i = rows * cols; i < 2 * rows * cols; ++i) { + res[i] = i; + } + + h2o4gpu::Matrix::KmMatrix res_mat (res, 2 * rows, cols); + + ASSERT_TRUE(calculated == res_mat); +} diff --git a/tests/cpp/gpu/KmMatrix/test_proxy.cu b/tests/cpp/gpu/KmMatrix/test_proxy.cu new file mode 100644 index 000000000..f7574596f --- /dev/null +++ b/tests/cpp/gpu/KmMatrix/test_proxy.cu @@ -0,0 +1,55 @@ +#include +#include +#include "../../../../src/gpu/matrix/KmMatrix/KmMatrix.hpp" + +// r --gtest_filter=KmMatrix.KmMatrixHostProxy +TEST(KmMatrix, KmMatrixProxyHostEqual) { + size_t rows = 12, cols = 16; + thrust::host_vector vec (rows * cols); + for (size_t i = 0; i < rows * cols; ++i) { + vec[i] = i; + } + + h2o4gpu::Matrix::KmMatrix mat (vec, rows, cols); + + h2o4gpu::Matrix::KmMatrix row = mat.row(1); + + thrust::host_vector res (cols); + + for (size_t i = 0, v = 16; v < 32; ++i, ++v) { + res[i] = v; + } + + h2o4gpu::Matrix::KmMatrix res_mat (res, 1, cols); + + ASSERT_TRUE(res_mat == row); +} + +// r --gtest_filter=KmMatrix.KmMatrixDevProxy +// FIXME +TEST(KmMatrix, KmMatrixProxyDevEqual) { + size_t rows = 12, cols = 16; + thrust::host_vector vec (rows * cols); + for (size_t i = 0; i < rows * cols; ++i) { + vec[i] = i; + } + + h2o4gpu::Matrix::KmMatrix mat (vec, rows, cols); + mat.set_name ("mat"); + + mat.dev_ptr(); + + h2o4gpu::Matrix::KmMatrix row = mat.row(1); + row.set_name ("row"); + + thrust::host_vector res (cols); + + for (size_t i = 0, v = 16; v < 16 + cols; ++i, ++v) { + res[i] = v; + } + + h2o4gpu::Matrix::KmMatrix res_mat (res, 1, cols); + res_mat.set_name("res"); + + ASSERT_TRUE(res_mat == row); +} diff --git a/tests/cpp/gpu/kmeans/test_kmeans_centroids.cu b/tests/cpp/gpu/kmeans/test_kmeans_centroids.cu index c8bdc53fd..2a1bad0bf 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_centroids.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_centroids.cu @@ -1,6 +1,6 @@ #include "gtest/gtest.h" -#include "../src/gpu/kmeans/kmeans_centroids.h" +#include "../../../../src/gpu/kmeans/kmeans_centroids.h" #include TEST(KMeansCentroids, CalculateCentroids) { diff --git a/tests/cpp/gpu/kmeans/test_kmeans_h2o4gpu.cu b/tests/cpp/gpu/kmeans/test_kmeans_h2o4gpu.cu index a02eb00c0..ddc30d525 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_h2o4gpu.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_h2o4gpu.cu @@ -1,7 +1,7 @@ #include "gtest/gtest.h" -#include "../src/gpu/kmeans/kmeans_h2o4gpu.h" -#include "../src/gpu/kmeans/kmeans_labels.h" +#include "../../../../src/gpu/kmeans/kmeans_h2o4gpu.h" +#include "../../../../src/gpu/kmeans/kmeans_labels.h" #include TEST(KMeans, CountsPerCentroids) { diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu new file mode 100644 index 000000000..da2ddaf99 --- /dev/null +++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu @@ -0,0 +1,173 @@ +/*! + * Copyright 2018 H2O.ai, Inc. + * License Apache License Version 2.0 (see LICENSE for details) + */ + +#include + +#include "../../../../src/gpu/matrix/KmMatrix/KmMatrix.hpp" +#include "../../../../src/gpu/matrix/KmMatrix/Generator.hpp" +#include "../../../../src/gpu/matrix/KmMatrix/Arith.hpp" +#include "../../../../src/gpu/kmeans/kmeans_init.cuh" +#include "../../../../src/common/utils.h" + +#include +#include +#include + +using namespace h2o4gpu::kMeans; +using namespace h2o4gpu::Matrix; + +template +struct GeneratorMock : RandomGeneratorBase { + public: + KmMatrix generate() override { + h2o4gpu_error("Not implemented"); + KmMatrix res; + return res; + } + + KmMatrix generate(size_t _size) override { + thrust::host_vector random_numbers (_size); + for (size_t i = 0; i < _size; ++i) { + if ( i % 2 == 0) + random_numbers[i] = 0.8; + else + random_numbers[i] = 0.2; + } + KmMatrix res (random_numbers, 1, _size); + return res; + } +}; + +TEST(KmeansRandom, Init) { + thrust::host_vector h_data (20); + for (size_t i = 0; i < 20; ++i) { + h_data[i] = i * 2; + } + KmMatrix data (h_data, 4, 5); + std::unique_ptr> gen (new GeneratorMock()); + KmeansRandomInit init (gen); + + auto res = init(data, 2); + + std::vector h_sol = + { + 30, 32, 34, 36, 38, + 0, 2, 4, 6, 8 + }; + KmMatrix sol (h_sol, 2, 5); + ASSERT_TRUE(sol == res); +} + +// r --gtest_filter=KmeansLL.PairWiseDistance +TEST(KmeansLL, PairWiseDistance) { + + thrust::host_vector h_data (20); + for (size_t i = 0; i < 20; ++i) { + h_data[i] = i * 2; + } + KmMatrix data (h_data, 4, 5); + + thrust::host_vector h_centroids(10); + for (size_t i = 0; i < 10; ++i) { + h_centroids[i] = i; + } + KmMatrix centroids (h_centroids, 2, 5); + + KmMatrix data_dot (4, 1); + VecBatchDotOp().dot(data_dot, data); + + KmMatrix centroids_dot (2, 1); + VecBatchDotOp().dot(centroids_dot, centroids); + + thrust::host_vector h_pairs (8); + for (size_t i = 0; i < 8; ++i) { + h_pairs[i] = 0; + } + KmMatrix distance_pairs (h_pairs, 4, 2); + + KmMatrix res = detail::PairWiseDistanceOp( + data_dot, centroids_dot, distance_pairs)(data, centroids); + + std::vector h_sol + { + 30., 55., + 730., 255., + 2430., 1455., + 5130., 3655., + }; + KmMatrix sol (h_sol, 4, 2); + + ASSERT_TRUE(sol == res); +} + +// r --gtest_filter=KmeansLL.GreedyRecluster +TEST(KmeansLL, GreedyRecluster) { + thrust::host_vector h_centroids (20); + // close points + for (size_t i = 0; i < 5; ++i) { + h_centroids[i] = i + 4; + } + for (size_t i = 5; i < 10; ++i) { + h_centroids[i] = i; + } + for (size_t i = 10; i < 15; ++i) { + h_centroids[i] = i - 4; + } + // satelite + for (size_t i = 15; i < 20; ++i) { + h_centroids[i] = i; + } + + KmMatrix centroids (h_centroids, 4, 5); + KmMatrix res = detail::GreedyRecluster::recluster(centroids, + 2); + std::vector h_sol = + { + 4, 5, 6, 7, 8, + 5, 6, 7, 8, 9, + }; + KmMatrix sol (h_sol, 2, 5); + ASSERT_TRUE(res == sol); +} + +// r --gtest_filter=KmeansLL.KmeansLLInit +TEST(KmeansLL, KmeansLLInit) { + std::unique_ptr> mock_ptr (new GeneratorMock); + KmeansLlInit kmeans_ll_init (mock_ptr, 2.5); + thrust::host_vector h_data (30); + // We split the points into two groups, but the result is statistic. + for (size_t i = 0; i < 5; ++i) { + h_data[i] = i + 4; + } + for (size_t i = 5; i < 10; ++i) { + h_data[i] = i; + } + for (size_t i = 10; i < 15; ++i) { + h_data[i] = i - 4; + } + + + for (size_t i = 15; i < 20; ++i) { + h_data[i] = i + 4; + } + for (size_t i = 20; i < 25; ++i) { + h_data[i] = i; + } + for (size_t i = 25; i < 30; ++i) { + h_data[i] = i - 4; + } + + KmMatrix data (h_data, 6, 5); + + auto res = kmeans_ll_init(data, 2); + + std::vector h_sol = + { + 4, 5, 6, 7, 8, + 19, 20, 21, 22, 23, + }; + KmMatrix sol (h_sol, 2, 5); + ASSERT_TRUE(sol == res); +} diff --git a/tests/cpp/gpu/kmeans/test_kmeans_labels.cu b/tests/cpp/gpu/kmeans/test_kmeans_labels.cu index f702681bc..aa54c8f1c 100644 --- a/tests/cpp/gpu/kmeans/test_kmeans_labels.cu +++ b/tests/cpp/gpu/kmeans/test_kmeans_labels.cu @@ -1,6 +1,6 @@ #include "gtest/gtest.h" -#include "../src/gpu/kmeans/kmeans_labels.h" +#include "../../../../src/gpu/kmeans/kmeans_labels.h" #include #include