diff --git a/examples/algorithms/CMakeLists.txt b/examples/algorithms/CMakeLists.txt index 927b2c1a8..ccd2874d0 100644 --- a/examples/algorithms/CMakeLists.txt +++ b/examples/algorithms/CMakeLists.txt @@ -8,6 +8,7 @@ set(subdirs lir lra kmeans + randomforest nn ) diff --git a/examples/algorithms/randomforest/CMakeLists.txt b/examples/algorithms/randomforest/CMakeLists.txt new file mode 100644 index 000000000..5d3432589 --- /dev/null +++ b/examples/algorithms/randomforest/CMakeLists.txt @@ -0,0 +1,36 @@ +# Copyright (c) 2017 Hartmut Kaiser +# +# Distributed under the Boost Software License, Version 1.0. (See accompanying +# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +set(example_programs + randomforest + simple_randomforest + ) + +foreach(example_program ${example_programs}) + + set(${example_program}_FLAGS DEPENDENCIES iostreams_component) + + set(sources ${example_program}.cpp) + + source_group("Source Files" FILES ${sources}) + + # add example executable + add_phylanx_executable(${example_program} + SOURCES ${sources} + ${${example_program}_FLAGS} + FOLDER "Examples/Algorithms") + + # add a custom target for this example + add_phylanx_pseudo_target(examples.algorithms_.${example_program}) + + # make pseudo-targets depend on master pseudo-target + add_phylanx_pseudo_dependencies(examples.algorithms_ + examples.algorithms_.${example_program}) + + # add dependencies to pseudo-target + add_phylanx_pseudo_dependencies(examples.algorithms_.${example_program} + ${example_program}_exe) +endforeach() + diff --git a/examples/algorithms/randomforest/phylanx_randomforest.py b/examples/algorithms/randomforest/phylanx_randomforest.py new file mode 100644 index 000000000..f57b25162 --- /dev/null +++ b/examples/algorithms/randomforest/phylanx_randomforest.py @@ -0,0 +1,244 @@ +# Copyright (c) 2018 Christopher Taylor +# +# Distributed under the Boost Software License, Version 1.0. 
(See accompanying +# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +# +# significant re-working of the algorithm implementation found on this site: +# +# https://machinelearningmastery.com/implement-random-forest-scratch-python/ +# + +from numpy import floor, argsort, sqrt +from numpy import float64, int64, zeros +from numpy import argmax, inf, genfromtxt +from numpy import vstack, iinfo, finfo, unique +from numpy.random import randint, rand +from phylanx import Phylanx + + +def test_split(idx, val, dataset): + left, right = list(), list() + for i in range(dataset.shape[0]): + row = dataset[i, :] + if row[idx] < val: + left.append(row) + else: + right.append(row) + + if len(left) < 1 and len(right) > 0: + return (zeros((0,)), vstack(right)) + elif len(left) > 0 and len(right) < 0: + return (vstack(left), zeros((0,))) + + return (vstack(left), vstack(right)) + + +def gini_index(groups, classes): + groups_len = list([len(x) for x in groups]) + n_instances = float64(sum(groups_len)) + gini = 0.0 + + p = zeros(len(classes), dtype=float64) + for (group, group_len) in filter( + lambda x: x[1] > 0, zip(groups, groups_len) + ): + for row in group: + p[classes[int64(row[-1])]] += 1.0 + score = sum(((p / float64(group_len)) ** 2.0)) + gini += (1.0 - score) * float64(group_len / n_instances) + p[:] = 0.0 + + return gini + + +def get_split(dataset, n_features, classes): + cls_values = zeros(len(classes), dtype=int64) + for i, v in enumerate(classes): + cls_values[v] = i + + b_idx = iinfo(int64).max + b_val = finfo(float64).max + b_score = finfo(float64).max + b_groups = (list(), list()) + idx_w = randint(0, dataset.shape[1] - 1, size=dataset.shape[1] - 1) + idx = zeros(dataset.shape[1] - 1, dtype=int64) + + for i in range(dataset.shape[1] - 1): + idx[i] = i + + features = idx[argsort(idx_w)][:n_features] + for feature in features: + for r in range(dataset.shape[0]): + groups = test_split(feature, dataset[r, feature], dataset) + gini = gini_index(groups, 
cls_values) + if gini < b_score: + b_idx = feature + b_val = dataset[r, feature] + b_score = gini + b_groups = groups + + return {'index': b_idx, + 'value': b_val, + 'groups': b_groups, + 'lw': inf, + 'rw': inf} + + +def to_terminal(group, classes): + outcome_hist = zeros(len(classes), dtype=int64) + for g in group: + k = int64(g[-1]) + outcome_hist[classes[k]] += 1 + + return argmax(outcome_hist) + + +def split(node, max_depth, min_sz, n_features, depth, classes): + GRP, LFT, RHT, LW, RW = 'groups', 'left', 'right', 'lw', 'rw' + + (left, right) = node[GRP] + del(node[GRP]) + + if left.shape == (0,) or right.shape == (0,): + if left.shape == (0,): + term = to_terminal(right, classes) + else: + term = to_terminal(left, classes) + + node[LW] = term + node[RW] = term + return + + if depth >= max_depth: + lterm = to_terminal(left, classes) + rterm = to_terminal(right, classes) + node[LW] = lterm + node[RW] = rterm + return + + if len(left) <= min_sz: + node[LW] = to_terminal(left, classes) + else: + node[LFT] = get_split(left, n_features, classes) + split(node[LFT], max_depth, min_sz, n_features, depth + 1, classes) + + if len(right) <= min_sz: + node[RW] = to_terminal(right, classes) + else: + node[RHT] = get_split(right, n_features, classes) + split(node[RHT], max_depth, min_sz, n_features, depth + 1, classes) + + +def build_tree(train, max_depth, min_sz, n_features, classes): + root = get_split(train, n_features, classes) + split(root, max_depth, min_sz, n_features, 1, classes) + return root + + +def node_predict(node, r): + if r[node['index']] < node['value']: + if node['lw'] == inf: + return node_predict(node['left'], r) + else: + return node['lw'] + else: + if node['rw'] == inf: + return node_predict(node['right'], r) + else: + return node['rw'] + + +def subsample(dataset, ratio): + n_sample = int64(floor(len(dataset) * ratio)) + idx_w = list(map(lambda x: rand(), range(dataset.shape[0]))) + idx_s = argsort(idx_w) + sample = vstack(map(lambda x: dataset[idx_s[x], 
:], range(n_sample))) + return sample + + +def bagging_predict(trees, row, classes): + predictions = list(map(lambda tree: node_predict(tree, row), trees)) + # parallel + # + # predictions = + # list(map(lambda tree: + # node_predict(trees[tree], row), + # prange(len(trees))) + # + classes_vec = zeros(len(classes), dtype=int64) + for p in predictions: + classes_vec[classes[p]] += 1 + + idx = argmax(classes_vec) + for (k, v) in classes.items(): + if v == idx: + return k + return inf + + +@Phylanx +def random_forest(train, max_depth, min_sz, sample_sz, n_trees): + cls = unique(train[:, -1]) + classes = dict() + print(cls.shape) + for c in range(cls.shape[0]): + classes[int64(cls[c])] = c + + n_features = int64(floor(sqrt(dataset.shape[0]))) + trees = list( + map(lambda i: + build_tree( + subsample(train, sample_sz), + max_depth, + min_sz, + n_features, + classes + ), + range(n_trees)) + ) + + # parallel + # + # trees = + # list(map(lambda i: + # build_tree(subsample(train, sample_sz) + # , max_depth, min_sz, n_features) + # , prange(n_trees))) + # + return {'trees': trees, 'classes': classes} + + +@Phylanx +def predict(randomforest, test): + trees, classes = randomforest['trees'], randomforest['classes'] + predictions = list( + map(lambda row: + bagging_predict( + trees, + test[row, :], + classes), + range(len(test))) + ) + + return predictions + + +if __name__ == "__main__": + file_name = "../datasets/breast_cancer.csv" + dataset = genfromtxt(file_name, skip_header=1, delimiter=",") + max_depth = 10 + min_size = 1 + sample_size = 1.0 + n_trees = [1, 5, 10] + train = int64(dataset.shape[0] / 2) + trees = random_forest( + dataset[:train, :], + max_depth, + min_size, + sample_size, + n_trees[1] + ) + + print('predict') + predict = predict(trees, dataset[train:, :]) + print(predict) diff --git a/examples/algorithms/randomforest/randomforest.cpp b/examples/algorithms/randomforest/randomforest.cpp new file mode 100644 index 000000000..3d3e1f4f0 --- /dev/null +++ 
b/examples/algorithms/randomforest/randomforest.cpp @@ -0,0 +1,109 @@ +// Copyright (c) 2018 Christopher Taylor +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "impl/randomforest.hpp" + +using namespace phylanx::algorithms::impl; + +int hpx_main(boost::program_options::variables_map& vm) +{ + // evaluate generated execution tree + auto ntrees = vm["trees"].as(); + auto mnsize = vm["minsize"].as(); + auto mxdepth = vm["maxdepth"].as(); + auto samplesize = vm["samples"].as(); + + using namespace phylanx::algorithms::impl; + + blaze::DynamicMatrix train{ { 1.0, 1.0, 1.0, 1.0, 0.0 } + , { 1.0, 1.0, 1.0, 1.0, 0.0 } + , { 1.0, 1.0, 1.0, 1.0, 1.0 } + , { 1.0, 1.0, 1.0, 1.0, 1.0 } + }; + + blaze::DynamicVector labels { 1.0, 1.0, 1.0, 1.0 }; + + auto const train_submat_data = blaze::submatrix( train, 0UL + , 0UL, train.rows(), train.columns()-1UL ); + + randomforest_impl rf(ntrees); + + // Measure execution time + hpx::util::high_resolution_timer traintimer; + + rf.fit(train, labels, mxdepth, mnsize, samplesize); + + auto trainelapsed = traintimer.elapsed(); + + blaze::DynamicVector results(train.rows()); + + // Make sure all counters are properly initialized, + // don't reset current counter values + hpx::reinit_active_counters(false); + + hpx::util::high_resolution_timer predicttimer; + + rf.predict(train, results); + + auto predictelapsed = predicttimer.elapsed(); + + // Make sure all counters are properly initialized, don't reset current + // counter values + hpx::reinit_active_counters(false); + + std::cout << "fit lapsed\t" << trainelapsed << std::endl; + std::cout << "predict lapsed\t" << predictelapsed << std::endl; + + for(auto & 
r : results) { + std::cout << r << std::endl; + } + + return hpx::finalize(); +} + +int main(int argc, char* argv[]) +{ + // command line handling + boost::program_options::options_description desc( + "usage: randomforest [options]"); + + desc.add_options()("trees,t", + boost::program_options::value()->default_value(5), + "number of trees (default: 5)")("samples,s", + boost::program_options::value()->default_value(1.0), + "ratio of sample size (default: 1.0)")("minsize,m", + boost::program_options::value()->default_value(1), + "min size (default: 1)")("maxdepth,d", + boost::program_options::value()->default_value(10), + "max depth (default: 10)"); + + return hpx::init(desc, argc, argv); +} diff --git a/examples/algorithms/randomforest/randomforest.py b/examples/algorithms/randomforest/randomforest.py new file mode 100644 index 000000000..e198d72ab --- /dev/null +++ b/examples/algorithms/randomforest/randomforest.py @@ -0,0 +1,243 @@ +# Copyright (c) 2018 Christopher Taylor +# +# Distributed under the Boost Software License, Version 1.0. 
(See accompanying +# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +# +# significant re-working of the algorithm implementation found on this site: +# +# https://machinelearningmastery.com/implement-random-forest-scratch-python/ +# +from numpy import floor, argsort, sqrt +from numpy import float64, int64, zeros +from numpy import argmax, inf, genfromtxt +from numpy import vstack, unique +from numpy.random import randint, rand + +from phylanx import Phylanx + + +def test_split(idx, val, dataset): + left, right = list(), list() + for i in range(dataset.shape[0]): + row = dataset[i, :] + if row[idx] < val: + left.append(row) + else: + right.append(row) + + if len(left) < 1 and len(right) > 0: + return (zeros((0,)), vstack(right)) + elif len(left) > 0 and len(right) < 0: + return (vstack(left), zeros((0,))) + + return (vstack(left), vstack(right)) + + +def gini_index(groups, classes): + groups_len = list([len(x) for x in groups]) + n_instances = float64(sum(groups_len)) + gini = 0.0 + + p = zeros(len(classes), dtype=float64) + for (group, group_len) in filter( + lambda x: x[1] > 0, zip(groups, groups_len) + ): + for row in group: + p[classes[int64(row[-1])]] += 1.0 + score = sum(((p / float64(group_len)) ** 2.0)) + gini += (1.0 - score) * float64(group_len / n_instances) + p[:] = 0.0 + + return gini + + +def get_split(dataset, n_features, classes): + cls_values = zeros(len(classes), dtype=int64) + for i, v in enumerate(classes): + cls_values[v] = i + + b_idx = 0 + b_val = 0 + b_score = 0 + b_groups = (list(), list()) + idx_w = randint(0, dataset.shape[1] - 1, size=dataset.shape[1] - 1) + idx = zeros(dataset.shape[1] - 1, dtype=int64) + + for i in range(dataset.shape[1] - 1): + idx[i] = i + + features = idx[argsort(idx_w)][:n_features] + for feature in features: + for r in range(dataset.shape[0]): + groups = test_split(feature, dataset[r, feature], dataset) + gini = gini_index(groups, cls_values) + if gini < b_score: + b_idx = feature + b_val = 
dataset[r, feature] + b_score = gini + b_groups = groups + + return {'index': b_idx, + 'value': b_val, + 'groups': b_groups, + 'lw': inf, + 'rw': inf} + + +def to_terminal(group, classes): + outcome_hist = zeros(len(classes), dtype=int64) + for g in group: + k = int64(g[-1]) + outcome_hist[classes[k]] += 1 + + return argmax(outcome_hist) + + +def split(node, max_depth, min_sz, n_features, depth, classes): + GRP, LFT, RHT, LW, RW = 'groups', 'left', 'right', 'lw', 'rw' + + (left, right) = node[GRP] + del(node[GRP]) + + if left.shape == (0,) or right.shape == (0,): + if left.shape == (0,): + term = to_terminal(right, classes) + else: + term = to_terminal(left, classes) + + node[LW] = term + node[RW] = term + return + + if depth >= max_depth: + lterm = to_terminal(left, classes) + rterm = to_terminal(right, classes) + node[LW] = lterm + node[RW] = rterm + return + + if len(left) <= min_sz: + node[LW] = to_terminal(left, classes) + else: + node[LFT] = get_split(left, n_features, classes) + split(node[LFT], max_depth, min_sz, n_features, depth + 1, classes) + + if len(right) <= min_sz: + node[RW] = to_terminal(right, classes) + else: + node[RHT] = get_split(right, n_features, classes) + split(node[RHT], max_depth, min_sz, n_features, depth + 1, classes) + + +def build_tree(train, max_depth, min_sz, n_features, classes): + root = get_split(train, n_features, classes) + split(root, max_depth, min_sz, n_features, 1, classes) + return root + + +def node_predict(node, r): + if r[node['index']] < node['value']: + if node['lw'] == inf: + return node_predict(node['left'], r) + else: + return node['lw'] + else: + if node['rw'] == inf: + return node_predict(node['right'], r) + else: + return node['rw'] + + +def subsample(dataset, ratio): + n_sample = int64(floor(len(dataset) * ratio)) + idx_w = list(map(lambda x: rand(), range(dataset.shape[0]))) + idx_s = argsort(idx_w) + sample = vstack(map(lambda x: dataset[idx_s[x], :], range(n_sample))) + return sample + + +def 
bagging_predict(trees, row, classes): + predictions = list(map(lambda tree: node_predict(tree, row), trees)) + # parallel + # + # predictions = + # list(map(lambda tree: + # node_predict(trees[tree], row), + # prange(len(trees))) + # + classes_vec = zeros(len(classes), dtype=int64) + for p in predictions: + classes_vec[classes[p]] += 1 + + idx = argmax(classes_vec) + for (k, v) in classes.items(): + if v == idx: + return k + return inf + + +@Phylanx +def random_forest(train, max_depth, min_sz, sample_sz, n_trees): + cls = unique(train[:, -1]) + classes = dict() + for c in range(cls.shape[0]): + classes[int64(cls[c])] = c + + n_features = int64(floor(sqrt(dataset.shape[0]))) + trees = list( + map(lambda i: + build_tree( + subsample(train, sample_sz), + max_depth, + min_sz, + n_features, + classes + ), + range(n_trees)) + ) + + # parallel + # + # trees = + # list(map(lambda i: + # build_tree(subsample(train, sample_sz) + # , max_depth, min_sz, n_features) + # , prange(n_trees))) + # + return {'trees': trees, 'classes': classes} + + +def predict(randomforest, test): + trees, classes = randomforest['trees'], randomforest['classes'] + predictions = list( + map(lambda row: + bagging_predict( + trees, + test[row, :], + classes), + range(len(test))) + ) + + return predictions + + +if __name__ == "__main__": + file_name = "../datasets/breast_cancer.csv" + dataset = genfromtxt(file_name, skip_header=1, delimiter=",") + max_depth = 10 + min_size = 1 + sample_size = 1.0 + n_trees = [1, 5, 10] + train = int64(dataset.shape[0] / 2) + trees = random_forest( + dataset[:train, :], + max_depth, + min_size, + sample_size, + n_trees[1] + ) + + # print('predict') + # predict = predict(trees, dataset[train:, :]) + # + print(predict) diff --git a/examples/algorithms/randomforest/simple_randomforest.cpp b/examples/algorithms/randomforest/simple_randomforest.cpp new file mode 100644 index 000000000..bfad7acf1 --- /dev/null +++ b/examples/algorithms/randomforest/simple_randomforest.cpp @@ 
-0,0 +1,104 @@ +// Copyright (c) 2017 Hartmut Kaiser +// Copyright (c) 2018 Christopher Taylor +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +//#include +#include +#include + +#include +#include +#include + +#include + +/////////////////////////////////////////////////////////////////////////////// +char const* const randomforest_test_code = R"(block( + // + // randomforest algorithm + // + define(randomforest_test, train_data, train_labels, test_data, + block( + define(test_labels, constant(0.0, shape(test_data, 0))), + define(mx_depth, 10), + define(min_size, 1), + define(sample_size, 1.0), + define(n_trees, 10), + define(model, randomforest_fit(train_data, train_labels + , mx_depth, min_size, sample_size, n_trees)), + block( + store(test_labels, randomforest_predict(model, test_data)), + cout(test_data) + ) + ) + ), + randomforest_test +))"; + +int hpx_main(boost::program_options::variables_map& vm) +{ +/* + blaze::DynamicMatrix v1{ + {15.04, 16.74}, {13.82, 24.49}, {12.54, 16.32}, {23.09, 19.83}, + {9.268, 12.87}, {9.676, 13.14}, {12.22, 20.04}, {11.06, 17.12}, + {16.3, 15.7}, {15.46, 23.95}, {11.74, 14.69}, {14.81, 14.7}, + {13.4, 20.52}, {14.58, 13.66}, {15.05, 19.07}, {11.34, 18.61}, + {18.31, 20.58}, {19.89, 20.26}, {12.88, 18.22}, {12.75, 16.7}, + {9.295, 13.9}, {24.63, 21.6}, {11.26, 19.83}, {13.71, 18.68}, + {9.847, 15.68}, {8.571, 13.1}, {13.46, 18.75}, {12.34, 12.27}, + {13.94, 13.17}, {12.07, 13.44}}; + + blaze::DynamicVector v2{1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, + 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1}; +*/ + blaze::DynamicMatrix train{{0.0, 4.0, 0.0, 0.0, 0.0}, + {1.0, 0.0, 4.0, 0.0, 5.0}, {0.0, 0.0, 0.0, 2.0, 0.0}, + {0.0, 8.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 4.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 0.0, 2.0}, + {1.0, 0.0, 0.0, 0.0, 0.0}, {0.0, 0.0, 0.0, 5.0, 0.0}, + {1.0, 0.0, 0.0, 2.0, 0.0}}; 
 + + blaze::DynamicVector labels { 0.0, 0.0, 0.0, 0.0, 0.0 + , 0.0, 1.0, 1.0, 1.0, 1.0 }; + + // evaluate generated execution tree + auto ntrees = vm["trees"].as(); + auto mnsize = vm["minsize"].as(); + auto mxdepth = vm["maxdepth"].as(); + auto samplesize = vm["samples"].as(); + + // compile the given code + phylanx::execution_tree::compiler::function_list snippets; + auto const& code = phylanx::execution_tree::compile(randomforest_test_code + , snippets); + auto randomforest_test = code.run(); + + // evaluate generated execution tree + auto x = phylanx::ir::node_data{train}; + auto y = phylanx::ir::node_data{labels}; + auto z = phylanx::ir::node_data{train}; + + randomforest_test(x, y, z); + return hpx::finalize(); +} + +int main(int argc, char* argv[]) +{ + boost::program_options::options_description desc( + "usage: simple_randomforest [options]"); + + desc.add_options()("trees,t", + boost::program_options::value()->default_value(5), + "number of trees (default: 5)")("samples,s", + boost::program_options::value()->default_value(1.0), + "ratio of sample size (default: 1.0)")("minsize,m", + boost::program_options::value()->default_value(1), + "min size (default: 1)")("maxdepth,d", + boost::program_options::value()->default_value(10), + "max depth (default: 10)"); + + return hpx::init(desc, argc, argv); +} diff --git a/phylanx/plugins/algorithms/algorithms.hpp b/phylanx/plugins/algorithms/algorithms.hpp index 9d3709986..dd918e317 100644 --- a/phylanx/plugins/algorithms/algorithms.hpp +++ b/phylanx/plugins/algorithms/algorithms.hpp @@ -8,5 +8,6 @@ #include #include +#include #endif diff --git a/phylanx/plugins/algorithms/impl/randomforest.hpp b/phylanx/plugins/algorithms/impl/randomforest.hpp new file mode 100644 index 000000000..33dc19eb6 --- /dev/null +++ b/phylanx/plugins/algorithms/impl/randomforest.hpp @@ -0,0 +1,568 @@ +// Copyright (c) 2018 Christopher Taylor +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +#if !defined(__PHYLANX_RANDOMFORESTIMPL_HPP__) +#define __PHYLANX_RANDOMFORESTIMPL_HPP__ + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/////////////////////////////////////////////////////////////////////////////// + +/* + life savers... + + https://www.boost.org/doc/libs/1_68_0/libs/spirit/ + doc/html/spirit/qi/tutorials/mini_xml___asts_.html + https://www.boost.org/doc/libs/1_68_0/libs/spirit/ + example/qi/mini_xml1.cpp + */ +namespace phylanx { namespace algorithms { namespace impl { + +using randomforest_node = + phylanx::execution_tree::primitive_argument_type; + +struct randomforest_impl { + + std::vector trees; + std::int64_t ntrees; + std::unordered_map classes; + + explicit randomforest_impl(const std::int64_t n_trees) + : trees(n_trees) + , ntrees(n_trees) + , classes() { + } + + static void test_split( + std::int64_t const feature + , double const val + , blaze::DynamicMatrix const& dataset + , std::tuple< blaze::DynamicVector< std::int64_t > + , blaze::DynamicVector< std::int64_t > > & groups ) { + + std::vector< std::int64_t > left, right; + auto const rows = dataset.rows(); + + for(auto rowidx = 0; rowidx < rows; ++rowidx) { + if(dataset(rowidx, feature) < val) { + left.push_back(rowidx); + } + else { + right.push_back(rowidx); + } + } + + blaze::DynamicVector< std::int64_t > lvec(left.size()), rvec(right.size()); + std::copy(left.begin(), left.end(), lvec.begin()); + std::copy(right.begin(), right.end(), rvec.begin()); + + groups = std::make_tuple(lvec, rvec); + } + + /////////////////////////////////////////////////////////////////////////// + static double gini_index( + blaze::DynamicMatrix const& dataset + , blaze::DynamicVector const& dataset_labels + , 
std::tuple< blaze::DynamicVector + , blaze::DynamicVector > const& groups + , std::unordered_map & classes) { + + std::vector groups_len{ + static_cast(std::get<0>(groups).size()) + , static_cast(std::get<1>(groups).size())}; + + std::vector< blaze::DynamicVector > groups_vec{std::get<0>(groups) + , std::get<1>(groups)}; + + double const n_instances = static_cast( + std::accumulate(groups_len.begin(), groups_len.end(), 0UL) + ); + double gini = 0.0; + + blaze::DynamicVector p(classes.size()); + + for(auto i = 0; i < groups_len.size(); ++i) { + if(groups_len[i] < 1) { continue; } + double const& group_len = static_cast(groups_len[i]); + auto const& group = groups_vec[i]; + + for(auto const& row_idx : group) { + p[classes[dataset_labels[row_idx]]] += 1.0; + } + + double const score = std::accumulate(p.begin(), p.end(), 0.0 + , [&group_len](double const pval, double const qval) { + return std::pow(pval/group_len, 2.0) + + std::pow(qval/group_len, 2.0); + } + ); + + gini += (1.0 - score) * static_cast(group_len / n_instances); + std::transform(p.begin() + , p.end(), p.begin() + , [](auto const i) { return 0.0; }); + } + + return gini; + } + + /////////////////////////////////////////////////////////////////////////// + template + static void argsort(T const& v, F & idx ) { + if(idx.size() < v.size()) + idx.resize(v.size()); + + std::iota(idx.begin(), idx.end(), 0); + std::sort(idx.begin(), idx.end() + , [&v](auto const i1, auto const i2) { + return v[i1] < v[i2]; + } + ); + } + + /////////////////////////////////////////////////////////////////////////// + static void get_split( + blaze::DynamicMatrix const& dataset + , blaze::DynamicVector const& dataset_labels + , blaze::DynamicVector const& dataset_indices + , std::int64_t const n_features + , std::unordered_map & classes + , randomforest_node & ret) { + + std::int64_t b_idx = (std::numeric_limits::max)(); + double b_val = (std::numeric_limits::max)(); + double b_score = (std::numeric_limits::max)(); + std::tuple< 
blaze::DynamicVector + , blaze::DynamicVector > b_groups; + + std::vector features(n_features); + + { + std::default_random_engine generator; + std::uniform_int_distribution distribution(0 + , dataset.columns()); + auto gen = [&generator, &distribution]() { + return distribution(generator); + }; + + std::vector idx_w(dataset.columns()); + std::generate_n(idx_w.begin(), idx_w.size() + , [&gen]() { return gen(); }); + std::vector idx_w_sort; + argsort(idx_w, idx_w_sort); + + std::vector idx(idx_w_sort.size()); + std::iota(idx.begin(), idx.end(), 0); + + std::transform( + idx_w_sort.begin(), idx_w_sort.begin()+n_features + , features.begin() + , [&idx](auto const i) { + return idx[i]; + } + ); + } + + std::tuple< blaze::DynamicVector< std::int64_t > + , blaze::DynamicVector< std::int64_t > > groups; + + for(std::int64_t feature : features) { + for(std::int64_t r : dataset_indices) { + test_split(feature, dataset(r, feature), dataset, groups); + auto const gini = gini_index(dataset, + dataset_labels, groups, classes); + if(gini < b_score) { + b_idx = feature; + b_val = dataset(r, feature); + b_score = gini; + b_groups = std::make_tuple( std::get<0>(groups) + , std::get<1>(groups) ); + } + } + } + + auto& node_ = phylanx::util::get(ret); + node_[phylanx::execution_tree::primitive_argument_type{std::string("index")}] = + phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data(b_idx) + }; + node_[phylanx::execution_tree::primitive_argument_type{std::string("value")}] = + phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data(b_val) + }; + node_[phylanx::execution_tree::primitive_argument_type{std::string("groups_l")}] = + phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data( std::get<0>(b_groups) ) + }; + node_[phylanx::execution_tree::primitive_argument_type{std::string("groups_r")}] = + phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data( std::get<1>(b_groups) ) + }; + 
node_[phylanx::execution_tree::primitive_argument_type{std::string("lw")}] = + phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data( + (std::numeric_limits::max)() + ) + }; + node_[phylanx::execution_tree::primitive_argument_type{std::string("rw")}] = + phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data( + (std::numeric_limits::max)() + ) + }; + } + + /////////////////////////////////////////////////////////////////////////// + static std::int64_t to_terminal( + blaze::DynamicVector const& train_labels + , blaze::DynamicVector const& group + , std::unordered_map & classes) { + + blaze::DynamicVector outcome_hist(classes.size()); + + for(auto g : group) { + double const k = train_labels[g]; + outcome_hist[classes[k]] += 1.0; + } + + return (outcome_hist.size() < 1) ? 0 : std::distance(outcome_hist.begin() + , std::max_element(outcome_hist.begin(), outcome_hist.end()) + ); + } + + /////////////////////////////////////////////////////////////////////////// + static void split(randomforest_node & node + , blaze::DynamicMatrix const& train + , blaze::DynamicVector const& train_labels + , std::int64_t max_depth + , std::int64_t min_size + , std::int64_t n_features + , std::int64_t depth + , std::unordered_map & classes) { + + auto& node_ = phylanx::util::get(node); + + blaze::DynamicVector left = + phylanx::util::get>( + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("groups_l")}].get().variant() + ).vector(); + + blaze::DynamicVector right = + phylanx::util::get>( + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("groups_r")}].get().variant() + ).vector(); + + + node_.erase(phylanx::execution_tree::primitive_argument_type{ + std::string("groups_l")}); + node_.erase(phylanx::execution_tree::primitive_argument_type{ + std::string("groups_r")}); + + if(left.size() == 0 || right.size() == 0) { + blaze::DynamicVector joint{left}; + auto jsz = joint.size(); + joint.reserve(jsz + 
right.size()); + + std::for_each(right.begin(), right.end() + , [&joint, &jsz](auto rval) { joint[jsz] = rval; ++jsz; }); + + auto term = to_terminal(train_labels, joint, classes); + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("lw")}] = + phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data{term}}; + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("rw")}] = + phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data{term}}; + return; + } + + if(depth >= max_depth) { + auto lterm = to_terminal(train_labels, left, classes); + auto rterm = to_terminal(train_labels, right, classes); + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("lw")}] = + phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data{lterm}}; + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("rw")}] = + phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data{rterm}}; + return; + } + + if(left.size() <= min_size) { + auto lterm = to_terminal(train_labels, left, classes); + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("lw")}] = + phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data{lterm}}; + } + else { + randomforest_node left_node{}; + get_split(train, train_labels, left, n_features, classes, left_node); + split(left_node, train, train_labels, max_depth, min_size, n_features + , depth + 1, classes); + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("left")}] = std::move(left_node); + } + + if(right.size() <= min_size) { + auto rterm = to_terminal(train_labels, right, classes); + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("rw")}] = + phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data{rterm}}; + } + else { + randomforest_node right_node{}; + get_split(train, train_labels, right, n_features, classes, right_node); + 
split(right_node, train, train_labels, max_depth, min_size, n_features + , depth + 1, classes); + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("right")}] = std::move(right_node); + } + } + + /////////////////////////////////////////////////////////////////////////// + static void build_tree( + randomforest_node & tree_root + , blaze::DynamicMatrix const& train + , blaze::DynamicVector const& train_labels + , blaze::DynamicVector const& sample_indices + , std::int64_t max_depth + , std::int64_t min_size + , std::int64_t n_features + , std::unordered_map & classes) { + + get_split(train, train_labels, sample_indices, n_features, classes + , tree_root); + split(tree_root, train, train_labels, max_depth, min_size, n_features + , 1, classes); + } + + /////////////////////////////////////////////////////////////////////////// + static std::int64_t node_predict( + randomforest_node & node + , blaze::DynamicMatrix const& r + , std::int64_t const i) { + + auto& node_ = phylanx::util::get(node); + + std::int64_t index = phylanx::util::get>( + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("index")}].get().variant() + ).scalar(); + + double value = phylanx::util::get>( + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("value")}].get().variant() + ).scalar(); + + if(r(i, index) < value) { + std::int64_t lw = phylanx::util::get>( + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("lw")}].get().variant() + ).scalar(); + + if( lw == (std::numeric_limits::max)()) { + auto& left = + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("left")}].get(); + return node_predict(left, r, i); + } + else { + return lw; + } + } + + auto rw = phylanx::util::get>( + node_[phylanx::execution_tree::primitive_argument_type{ + std::string("rw")}].get().variant() + ).scalar(); + + if(rw == (std::numeric_limits::max)()) { + auto& right = + 
node_[phylanx::execution_tree::primitive_argument_type{ + std::string("right")}].get(); + return node_predict(right, r, i); + } + + return rw; + } + + /////////////////////////////////////////////////////////////////////////// + static void subsample( + blaze::DynamicMatrix const& dataset + , blaze::DynamicVector & idx_w_sort + , std::int64_t const ratio) { + + std::default_random_engine generator; + std::uniform_int_distribution distribution(0, dataset.rows()); + auto gen = [&generator, &distribution] { + return static_cast(distribution(generator)); + }; + + std::vector idx_w(dataset.rows()); + std::generate_n(idx_w.begin(), idx_w.size(), gen); + argsort(idx_w, idx_w_sort); + } + + /////////////////////////////////////////////////////////////////////////// + static std::int64_t bagging_predict( + std::vector & trees + , blaze::DynamicMatrix const& dataset + , std::int64_t const row + , std::unordered_map & classes) { + + std::vector predictions(trees.size()); + + hpx::parallel::transform( + hpx::parallel::execution::par + , trees.begin() + , trees.end() + , predictions.begin() + , [&dataset, &row](auto & tree) { + return node_predict(tree, dataset, row); + } + ); + + std::vector< blaze::DynamicVector > prediction_histograms( + predictions.size() + ); + auto prediction_indices = boost::irange(0UL + , predictions.size()); + + hpx::parallel::for_each( + hpx::parallel::execution::par + , std::begin(prediction_indices) + , std::end(prediction_indices) + , [&classes, &predictions, &prediction_histograms](auto const i) { + prediction_histograms[i].resize(classes.size()); + prediction_histograms[i][predictions[i]] += 1UL; + } + ); + + blaze::DynamicVector fhistogram(classes.size()); + + fhistogram = hpx::parallel::reduce( + hpx::parallel::execution::par + , prediction_histograms.begin() + , prediction_histograms.end() + , fhistogram + , std::plus>() + ); + + auto itr = std::max_element(fhistogram.begin(), fhistogram.end()); + return std::distance(fhistogram.begin(), itr); 
+ } + + void fit( + blaze::DynamicMatrix const& train + , blaze::DynamicVector const& train_labels + , std::int64_t const max_depth + , std::int64_t const min_size + , std::int64_t const sample_size) { + + std::vector labels(train_labels.size()); + std::copy(train_labels.begin(), train_labels.end(), labels.begin()); + std::sort(labels.begin(), labels.end()); + auto last = std::unique(labels.begin(), labels.end()); + labels.erase(last, labels.end()); + + std::int64_t i = 0; + for(auto itr = labels.begin(); itr != labels.end(); ++itr, ++i) { + classes[(*itr)] = i; + } + + std::int64_t const n_features = static_cast( + std::floor(std::sqrt(train.rows())) + ); + + std::vector< blaze::DynamicVector > subsample_indices(ntrees); + auto tree_indices = boost::irange(0, ntrees); + + std::for_each(std::begin(tree_indices) + , std::end(tree_indices) + , [this](std::int64_t const idx) { + trees[idx] = + std::move(phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::dictionary() + }); + }); + + hpx::parallel::for_each( + hpx::parallel::execution::par + , std::begin(tree_indices) + , std::end(tree_indices) + , [this, &train, &train_labels, &subsample_indices, &max_depth, &min_size + , &sample_size, &n_features] + (std::int64_t const idx) { + subsample(train, subsample_indices[idx], sample_size); + build_tree( trees[idx] + , train, train_labels, subsample_indices[idx] + , max_depth, min_size, n_features, classes); + } + ); + } + + void predict( + blaze::DynamicMatrix const& test + , blaze::DynamicVector & y) { + + if(y.size() < 1UL) { + y.resize(test.rows()); + } + + blaze::DynamicVector labels(classes.size()); + for(auto & cls : classes) { + labels[cls.second] = cls.first; + } + + auto indices = boost::irange(0, test.rows()); + + hpx::parallel::for_each( + hpx::parallel::execution::par + , std::begin(indices) + , std::end(indices) + , [this, &test, &y, &labels](auto const& idx) { + y[idx] = labels[bagging_predict(trees, test, idx, classes)]; + } + ); + } + +}; + +}}} 
// end namespace + +#endif diff --git a/phylanx/plugins/algorithms/randomforest.hpp b/phylanx/plugins/algorithms/randomforest.hpp new file mode 100644 index 000000000..4f569f89a --- /dev/null +++ b/phylanx/plugins/algorithms/randomforest.hpp @@ -0,0 +1,114 @@ +// Copyright (c) 2018 Christopher Taylor +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#if !defined(PHYLANX_MAKE_LIST_NOV_10_2018_1848AM) +#define PHYLANX_MAKE_LIST_NOV_10_2018_1848AM + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace phylanx { namespace execution_tree { namespace primitives +{ + class randomforest_fit + : public primitive_component_base + , public std::enable_shared_from_this + { + protected: + hpx::future eval( + primitive_arguments_type const& operands, + primitive_arguments_type const& args) const; + + public: + static match_pattern_type const match_data; + + randomforest_fit() = default; + + /// + /// Creates a primitive executing the RandomForest algorithm on + /// the given input data + /// + /// \param args Is a (possibly empty) list of any values to be + /// concatenated into a PhySL list in order. 
+ /// + randomforest_fit(primitive_arguments_type&& operands, + std::string const& name, std::string const& codename); + + hpx::future eval( + primitive_arguments_type const& operands, + primitive_arguments_type const& args, + eval_context ctx) const override; + + protected: + primitive_argument_type calculate( + primitive_arguments_type&& args) const; + }; + + inline primitive create_randomforest_fit(hpx::id_type const& locality, + primitive_arguments_type&& operands, + std::string const& name = "", std::string const& codename = "") + { + return create_primitive_component( + locality, "randomforest_fit", std::move(operands), name, codename); + } + + class randomforest_predict + : public primitive_component_base + , public std::enable_shared_from_this + { + protected: + hpx::future eval( + primitive_arguments_type const& operands, + primitive_arguments_type const& args) const; + + public: + static match_pattern_type const match_data; + + randomforest_predict() = default; + + /// + /// Creates a primitive executing the RandomForest algorithm on + /// the given input data + /// + /// \param args Is a (possibly empty) list of any values to be + /// concatenated into a PhySL list in order. 
+ /// + randomforest_predict(primitive_arguments_type&& operands, + std::string const& name, std::string const& codename); + + hpx::future eval( + primitive_arguments_type const& operands, + primitive_arguments_type const& args, + eval_context ctx) const override; + + protected: + primitive_argument_type calculate( + primitive_arguments_type&& args) const; + }; + + inline primitive create_randomforest_predict(hpx::id_type const& locality, + primitive_arguments_type&& operands, + std::string const& name = "", std::string const& codename = "") + { + return create_primitive_component( + locality, "randomforest_predict", std::move(operands), name, codename); + } + +}}} + +#endif diff --git a/python/phylanx/ast/physl.py b/python/phylanx/ast/physl.py index b5cb84618..32b1f811d 100644 --- a/python/phylanx/ast/physl.py +++ b/python/phylanx/ast/physl.py @@ -29,7 +29,6 @@ def define(i, idx): iterators = tuple(define(i, idx) for idx, i in enumerate(targets)) return (fmap, iterators) - mapped_methods = { "add": "__add", "array": "hstack", @@ -638,7 +637,7 @@ def _Assign(self, node): raise Exception("Phylanx does not support chain assignments.") if isinstance(node.targets[0], ast.Tuple): raise Exception( - "Phylanx does not support multi-target assignments.") + "Phylanx does not support multi-target assignments. Line #%d." % node.lineno ) symbol = self.apply_rule(node.targets[0]) # if lhs is not indexed. 
diff --git a/src/plugins/algorithms/algorithms.cpp b/src/plugins/algorithms/algorithms.cpp index 9eb68e88e..d61ef7f69 100644 --- a/src/plugins/algorithms/algorithms.cpp +++ b/src/plugins/algorithms/algorithms.cpp @@ -15,3 +15,7 @@ PHYLANX_REGISTER_PLUGIN_FACTORY(als_plugin, phylanx::execution_tree::primitives::als::match_data); PHYLANX_REGISTER_PLUGIN_FACTORY(lra_plugin, phylanx::execution_tree::primitives::lra::match_data); +PHYLANX_REGISTER_PLUGIN_FACTORY(randomforest_fit_plugin, + phylanx::execution_tree::primitives::randomforest_fit::match_data); +PHYLANX_REGISTER_PLUGIN_FACTORY(randomforest_predict_plugin, + phylanx::execution_tree::primitives::randomforest_predict::match_data); diff --git a/src/plugins/algorithms/randomforest.cpp b/src/plugins/algorithms/randomforest.cpp new file mode 100644 index 000000000..bd9751c01 --- /dev/null +++ b/src/plugins/algorithms/randomforest.cpp @@ -0,0 +1,344 @@ +// Copyright (c) 2018 Hartmut Kaiser +// Copyright (c) 2018 Christopher Taylor +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace phylanx::algorithms::impl; + +/////////////////////////////////////////////////////////////////////////////// +namespace phylanx { namespace execution_tree { namespace primitives +{ + /////////////////////////////////////////////////////////////////////////// + match_pattern_type const randomforest_fit::match_data = + { + hpx::util::make_tuple("randomforest_fit", + std::vector{ + "randomforest_fit(_1, _2, _3, _4, _5, _6, _7)", + }, + &create_randomforest_fit, &create_primitive, + "training, training_labels, max_depth, min_size, sample_size, n_trees, test\n" + "Args:\n" + "\n" + " training (matrix) : training features\n" + " training_labels (vector) : training labels\n" + "the data\n" + " max_depth (int) : depth of tree splits\n" + " min_size (int) : min size\n" + " sample_size (int) : number of samples per tree\n" + " n_trees (int) : number of trees to train\n" + " test (matrix) : testing data to predict upon\n" + "\n" + "Returns:\n" + "\n" + "The Calculated weights" + ) + }; + + /////////////////////////////////////////////////////////////////////////// + randomforest_fit::randomforest_fit(primitive_arguments_type&& operands, + std::string const& name, std::string const& codename) + : primitive_component_base(std::move(operands), name, codename) + {} + + /////////////////////////////////////////////////////////////////////////// + primitive_argument_type randomforest_fit::calculate( + primitive_arguments_type && args) const + { + // extract arguments + auto arg1 = extract_numeric_value(args[0], name_, codename_); + if (arg1.num_dimensions() != 2) + { + HPX_THROW_EXCEPTION(hpx::bad_parameter, "randomforest::eval", + generate_error_message( + "the 
randomforest algorithm primitive requires for the first " + "argument ('training_data') to represent a matrix")); + } + auto training_data = arg1.matrix(); + + auto arg2 = extract_numeric_value(args[1], name_, codename_); + if (arg2.num_dimensions() != 1) + { + HPX_THROW_EXCEPTION(hpx::bad_parameter, "randomforest::eval", + generate_error_message( + "the randomforest algorithm primitive requires for the second " + "argument ('training_labels') to represent a vector")); + } + auto training_labels = arg2.vector(); + + // verify correctness of argument dimensions + if (training_data.rows() != training_labels.size()) + { + HPX_THROW_EXCEPTION(hpx::bad_parameter, "randomforest::eval", + generate_error_message( + "the randomforest algorithm primitive requires for the number " + "of rows in 'training_data' to be equal to the size of " + "'training_labels'")); + } + + auto max_depth = static_cast( + extract_scalar_integer_value(args[2], name_, codename_)); + auto min_size = static_cast( + extract_scalar_integer_value(args[3], name_, codename_)); + auto sample_size = static_cast( + extract_scalar_integer_value(args[4], name_, codename_)); + auto n_trees = static_cast( + extract_scalar_integer_value(args[5], name_, codename_)); + + randomforest_impl rf(n_trees); + rf.fit(training_data + , training_labels + , max_depth + , min_size + , sample_size); + + phylanx::ir::dictionary trees; + for(std::int64_t i = 0; i < rf.trees.size(); ++i) { + trees[phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data(i) + }] = + phylanx::execution_tree::primitive_argument_type{ + rf.trees[i] + }; + } + + phylanx::ir::dictionary classes; + for(auto & cls : rf.classes) { + trees[phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data(std::get<0>(cls)) + }] = + phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data(std::get<1>(cls)) + }; + } + + phylanx::ir::dictionary model; + 
model[phylanx::execution_tree::primitive_argument_type{std::string("forest")}] = + phylanx::execution_tree::primitive_argument_type{trees}; + model[phylanx::execution_tree::primitive_argument_type{std::string("n_trees")}] = + phylanx::execution_tree::primitive_argument_type{rf.ntrees}; + model[phylanx::execution_tree::primitive_argument_type{std::string("classes")}] = + phylanx::execution_tree::primitive_argument_type{classes}; + + return primitive_argument_type{std::move(model)}; //std::move(forest)}; + } + + /////////////////////////////////////////////////////////////////////////// + hpx::future randomforest_fit::eval( + primitive_arguments_type const& operands, + primitive_arguments_type const& args, eval_context ctx) const + { + if (operands.size() != 7) + { + HPX_THROW_EXCEPTION(hpx::bad_parameter, + "randomforest::eval", + generate_error_message( + "the randomforest algorithm primitive requires exactly seven " + "operands")); + } + + bool arguments_valid = true; + for (std::size_t i = 0; i != operands.size(); ++i) + { + if (!valid(operands[i])) + { + arguments_valid = false; + } + } + + if (!arguments_valid) + { + HPX_THROW_EXCEPTION(hpx::bad_parameter, + "randomforest::eval", + generate_error_message( + "the randomforest algorithm primitive requires that the " + "arguments given by the operands array are valid")); + } + + auto this_ = this->shared_from_this(); + return hpx::dataflow(hpx::launch::sync, hpx::util::unwrapping( + [this_ = std::move(this_)](primitive_arguments_type && args) + -> primitive_argument_type + { + return this_->calculate(std::move(args)); + }), + detail::map_operands( + operands, functional::value_operand{}, args, + name_, codename_, std::move(ctx))); + } + + /////////////////////////////////////////////////////////////////////////// + static blaze::DynamicVector randomforest_predict_fn( + phylanx::ir::dictionary & tree + , blaze::DynamicMatrix const& test) { + + blaze::DynamicVector test_labels(test.rows()); + + std::int64_t const 
ntree = + phylanx::util::get>( + tree[phylanx::execution_tree::primitive_argument_type{ + std::string("n_trees") + }].get().variant()).scalar(); + + randomforest_impl rf(ntree); + + phylanx::ir::dictionary model = phylanx::util::get( + tree[phylanx::execution_tree::primitive_argument_type{ + std::string("forest") + }].get().variant()); + + for(std::int64_t i = 0; i < ntree; ++i) { + rf.trees.push_back( + phylanx::util::get( + tree[phylanx::execution_tree::primitive_argument_type{ + phylanx::ir::node_data(i) + }].get().variant())); + } + + phylanx::ir::dictionary classes = phylanx::util::get( + tree[phylanx::execution_tree::primitive_argument_type{ + std::string("classes") + }].get().variant()); + + for(auto & cls : classes) { + double const k = phylanx::util::get>( + std::get<0>(cls).get().variant() + ).scalar(); + std::int64_t const v = phylanx::util::get< + phylanx::ir::node_data>( + std::get<1>(cls).get().variant() + ).scalar(); + rf.classes[k] = v; + } + + rf.predict(test, test_labels); + + return std::move(test_labels); + } + + /////////////////////////////////////////////////////////////////////////// + match_pattern_type const randomforest_predict::match_data = + { + hpx::util::make_tuple("randomforest_predict", + std::vector{ + "randomforest_predict(_1, _2)", + }, + &create_randomforest_predict, &create_primitive, + "training, training_labels, max_depth, min_size, sample_size, n_trees, test\n" + "Args:\n" + "\n" + " model (randomforest_fit) : trained randomforest model\n" + " data (matrix) : features to predict\n" + "\n" + "Returns:\n" + "\n" + "The Calculated labels" + ) + }; + + /////////////////////////////////////////////////////////////////////////// + randomforest_predict::randomforest_predict(primitive_arguments_type&& operands, + std::string const& name, std::string const& codename) + : primitive_component_base(std::move(operands), name, codename) + {} + + /////////////////////////////////////////////////////////////////////////// + 
primitive_argument_type randomforest_predict::calculate( + primitive_arguments_type && args) const + { + // extract arguments + auto model = extract_dictionary_value(args[0], name_, codename_); + if (!is_dictionary_operand(model)) + { + HPX_THROW_EXCEPTION(hpx::bad_parameter, "randomforest::eval", + generate_error_message( + "the randomforest algorithm primitive requires for the first " + "argument ('model') to represent a phylanx::ir::dictionary")); + } + + auto arg2 = extract_numeric_value(args[1], name_, codename_); + if (arg2.num_dimensions() != 2) + { + HPX_THROW_EXCEPTION(hpx::bad_parameter, "randomforest::eval", + generate_error_message( + "the randomforest algorithm primitive requires for the second " + "argument ('data') to represent a matrix")); + } + auto data = arg2.matrix(); + + // perform calculations + auto predicted_labels = randomforest_predict_fn(model, data); + + return primitive_argument_type{std::move(predicted_labels)}; + } + + /////////////////////////////////////////////////////////////////////////// + hpx::future randomforest_predict::eval( + primitive_arguments_type const& operands, + primitive_arguments_type const& args, eval_context ctx) const + { + if (operands.size() != 2) + { + HPX_THROW_EXCEPTION(hpx::bad_parameter, + "randomforest_predict::eval", + generate_error_message( + "the randomforest algorithm primitive requires exactly two " + "operands")); + } + + bool arguments_valid = true; + for (std::size_t i = 0; i != operands.size(); ++i) + { + if (!valid(operands[i])) + { + arguments_valid = false; + } + } + + if (!arguments_valid) + { + HPX_THROW_EXCEPTION(hpx::bad_parameter, + "randomforest_predict::eval", + generate_error_message( + "the randomforest algorithm primitive requires that the " + "arguments given by the operands array are valid")); + } + + auto this_ = this->shared_from_this(); + return hpx::dataflow(hpx::launch::sync, hpx::util::unwrapping( + [this_ = std::move(this_)](primitive_arguments_type && args) + -> 
primitive_argument_type + { + return this_->calculate(std::move(args)); + }), + detail::map_operands( + operands, functional::value_operand{}, args, + name_, codename_, std::move(ctx))); + } + +}}} diff --git a/tests/unit/algorithm/CMakeLists.txt b/tests/unit/algorithm/CMakeLists.txt index acab485ff..4b981ab2a 100644 --- a/tests/unit/algorithm/CMakeLists.txt +++ b/tests/unit/algorithm/CMakeLists.txt @@ -6,9 +6,11 @@ set(tests simple_als simple_lra + simple_randomforest ) set(simple_lra_FLAGS COMPONENT_DEPENDENCIES iostreams) +set(simple_randomforest_FLAGS COMPONENT_DEPENDENCIES iostreams) foreach(test ${tests}) set(sources ${test}.cpp) diff --git a/tests/unit/algorithm/simple_randomforest.cpp b/tests/unit/algorithm/simple_randomforest.cpp new file mode 100644 index 000000000..032440a39 --- /dev/null +++ b/tests/unit/algorithm/simple_randomforest.cpp @@ -0,0 +1,73 @@ +// Copyright (c) 2017 Hartmut Kaiser +// Copyright (c) 2018 Christopher Taylor +// +// Distributed under the Boost Software License, Version 1.0. 
(See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include + +#include +#include + +#include + +/////////////////////////////////////////////////////////////////////////////// +char const* const randomforest_test_code = R"(block( + // + // randomforest algorithm + // + define(randomforest_test, train_data, train_labels, test_data, + block( + define(test_labels, constant(0.0, shape(test_data, 0))), + define(mx_depth, 10), + define(min_size, 1), + define(sample_size, 1.0), + define(n_trees, 10), + define(model, randomforest_fit(train_data, + train_labels, mx_depth, min_size, sample_size, n_trees)), + block( + store(test_labels, randomforest_predict(model, test_data)), + cout(test_data) + ) + ) + ), + randomforest_test +))"; + +void test_randomforest() +{ + blaze::DynamicMatrix v1{ + {15.04, 16.74}, {13.82, 24.49}, {12.54, 16.32}, {23.09, 19.83}, + {9.268, 12.87}, {9.676, 13.14}, {12.22, 20.04}, {11.06, 17.12}, + {16.3, 15.7}, {15.46, 23.95}, {11.74, 14.69}, {14.81, 14.7}, + {13.4, 20.52}, {14.58, 13.66}, {15.05, 19.07}, {11.34, 18.61}, + {18.31, 20.58}, {19.89, 20.26}, {12.88, 18.22}, {12.75, 16.7}, + {9.295, 13.9}, {24.63, 21.6}, {11.26, 19.83}, {13.71, 18.68}, + {9.847, 15.68}, {8.571, 13.1}, {13.46, 18.75}, {12.34, 12.27}, + {13.94, 13.17}, {12.07, 13.44}}; + + blaze::DynamicVector v2{1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, + 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1}; + + // compile the given code + phylanx::execution_tree::compiler::function_list snippets; + auto const& code = phylanx::execution_tree::compile(randomforest_test_code, snippets); + auto randomforest_test = code.run(); + + // evaluate generated execution tree + auto x = phylanx::ir::node_data{v1}; + auto y = phylanx::ir::node_data{v2}; + auto z = phylanx::ir::node_data{v1}; + + randomforest_test(x, y, z); +} + +int main(int argc, char* argv[]) +{ + test_randomforest(); + return hpx::util::report_errors(); +} +