Loading [MathJax]/extensions/tex2jax.js
cuML C++ API  23.12
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
decision_forest_builder.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2023, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
17 #include <algorithm>
18 #include <cmath>
19 #include <cstddef>
29 #include <numeric>
30 #include <optional>
31 #include <stdint.h>
32 #include <vector>
33 
34 namespace ML {
35 namespace experimental {
36 namespace fil {
37 namespace detail {
38 
39 /*
40  * Exception indicating that FIL model could not be built from given input
41  */
42 struct model_builder_error : std::exception {
43  model_builder_error() : model_builder_error("Error while building model") {}
44  model_builder_error(char const* msg) : msg_{msg} {}
45  virtual char const* what() const noexcept { return msg_; }
46 
47  private:
48  char const* msg_;
49 };
50 
51 /*
52  * Struct used to build FIL forests
53  */
54 template <typename decision_forest_t>
56  /* The type for nodes in the given decision_forest type */
57  using node_type = typename decision_forest_t::node_type;
58 
59  /* Add a root node, indicating the beginning of a new tree */
61  {
62  if (root_node_indexes_.empty()) {
63  root_node_indexes_.emplace_back();
64  } else {
65  max_tree_size_ = std::max(cur_tree_size_, max_tree_size_);
66  if (alignment_ != index_type{}) {
67  if (cur_tree_size_ % alignment_ != index_type{}) {
68  auto padding = (alignment_ - cur_tree_size_ % alignment_);
69  for (auto i = index_type{}; i < padding; ++i) {
70  add_node(typename node_type::threshold_type{}, std::nullopt);
71  }
72  }
73  }
74  root_node_indexes_.push_back(root_node_indexes_.back() + cur_tree_size_);
75  cur_tree_size_ = index_type{};
76  }
77  }
78 
79  /* Add a node with a categorical split */
80  template <typename iter_t>
82  iter_t vec_begin,
83  iter_t vec_end,
84  std::optional<int> tl_node_id = std::nullopt,
85  bool default_to_distant_child = false,
86  typename node_type::metadata_storage_type feature = typename node_type::metadata_storage_type{},
87  typename node_type::offset_type offset = typename node_type::offset_type{})
88  {
89  auto constexpr const bin_width = index_type(sizeof(typename node_type::index_type) * 8);
90  auto node_value = typename node_type::index_type{};
91  auto set_storage = &node_value;
92  auto max_node_categories = *std::max_element(vec_begin, vec_end) + 1;
93  if (max_num_categories_ > bin_width) {
94  // TODO(wphicks): Check for overflow here
95  node_value = categorical_storage_.size();
96  auto bins_required = raft_proto::ceildiv(max_node_categories, bin_width);
97  categorical_storage_.push_back(max_node_categories);
98  categorical_storage_.resize(categorical_storage_.size() + bins_required);
99  set_storage = &(categorical_storage_[node_value + 1]);
100  }
101  auto set = bitset{set_storage, max_node_categories};
102  std::for_each(vec_begin, vec_end, [&set](auto&& cat_index) { set.set(cat_index); });
103 
104  add_node(node_value, tl_node_id, false, default_to_distant_child, true, feature, offset, false);
105  }
106 
107  /* Add a leaf node with vector output */
108  template <typename iter_t>
109  void add_leaf_vector_node(iter_t vec_begin,
110  iter_t vec_end,
111  std::optional<int> tl_node_id = std::nullopt)
112  {
113  auto leaf_index = typename node_type::index_type(vector_output_.size() / output_size_);
114  std::copy(vec_begin, vec_end, std::back_inserter(vector_output_));
115  nodes_.emplace_back(leaf_index,
116  true,
117  false,
118  false,
119  typename node_type::metadata_storage_type{},
120  typename node_type::offset_type{});
121  // 0 indicates the lack of ID mapping for a particular node
122  node_id_mapping_.push_back(static_cast<index_type>(tl_node_id.value_or(0)));
123  ++cur_tree_size_;
124  }
125 
126  /* Add a node to the model */
127  template <typename value_t>
128  void add_node(
129  value_t val,
130  std::optional<int> tl_node_id = std::nullopt,
131  bool is_leaf_node = true,
132  bool default_to_distant_child = false,
133  bool is_categorical_node = false,
134  typename node_type::metadata_storage_type feature = typename node_type::metadata_storage_type{},
135  typename node_type::offset_type offset = typename node_type::offset_type{},
136  bool is_inclusive = false)
137  {
138  if (is_inclusive) { val = std::nextafter(val, std::numeric_limits<value_t>::infinity()); }
139  nodes_.emplace_back(
140  val, is_leaf_node, default_to_distant_child, is_categorical_node, feature, offset);
141  // 0 indicates the lack of ID mapping for a particular node
142  node_id_mapping_.push_back(static_cast<index_type>(tl_node_id.value_or(0)));
143  ++cur_tree_size_;
144  }
145 
146  /* Set the element-wise postprocessing operation for this model */
147  void set_element_postproc(element_op val) { element_postproc_ = val; }
148  /* Set the row-wise postprocessing operation for this model */
149  void set_row_postproc(row_op val) { row_postproc_ = val; }
150  /* Set the value to divide by during postprocessing */
151  void set_average_factor(double val) { average_factor_ = val; }
152  /* Set the the bias term to remove during postprocessing */
153  void set_bias(double val) { bias_ = val; }
154  /* Set the the value of the constant used in the postprocessing operation
155  * (if any) */
156  void set_postproc_constant(double val) { postproc_constant_ = val; }
157  /* Set the number of outputs per row for this model */
159  {
160  if (output_size_ != index_type{1} && output_size_ != val) {
161  throw model_import_error("Inconsistent leaf vector size");
162  }
163  output_size_ = val;
164  }
165 
167  index_type align_bytes = index_type{})
168  : cur_tree_size_{},
169  max_num_categories_{max_num_categories},
170  alignment_{std::lcm(align_bytes, index_type(sizeof(node_type)))},
171  output_size_{1},
172  element_postproc_{},
173  average_factor_{},
174  row_postproc_{},
175  bias_{},
176  postproc_constant_{},
177  max_tree_size_{},
178  nodes_{},
179  root_node_indexes_{},
180  vector_output_{}
181  {
182  }
183 
184  /* Return the FIL decision forest built by this builder */
185  auto get_decision_forest(index_type num_feature,
186  index_type num_class,
188  int device = 0,
190  {
191  // Allow narrowing for preprocessing constants. They are stored as doubles
192  // for consistency in the builder but must be converted to the proper types
193  // for the concrete forest model.
194 #pragma GCC diagnostic push
195 #pragma GCC diagnostic ignored "-Wnarrowing"
196  return decision_forest_t{
198  raft_proto::buffer{nodes_.data(), nodes_.size()}, mem_type, device, stream},
199  raft_proto::buffer{raft_proto::buffer{root_node_indexes_.data(), root_node_indexes_.size()},
200  mem_type,
201  device,
202  stream},
203  raft_proto::buffer{raft_proto::buffer{node_id_mapping_.data(), node_id_mapping_.size()},
204  mem_type,
205  device,
206  stream},
207  num_feature,
208  num_class,
209  max_num_categories_ != 0,
210  vector_output_.empty()
211  ? std::nullopt
212  : std::make_optional<raft_proto::buffer<typename node_type::threshold_type>>(
213  raft_proto::buffer{vector_output_.data(), vector_output_.size()},
214  mem_type,
215  device,
216  stream),
217  categorical_storage_.empty()
218  ? std::nullopt
219  : std::make_optional<raft_proto::buffer<typename node_type::index_type>>(
220  raft_proto::buffer{categorical_storage_.data(), categorical_storage_.size()},
221  mem_type,
222  device,
223  stream),
224  output_size_,
225  row_postproc_,
226  element_postproc_,
227  static_cast<typename node_type::threshold_type>(average_factor_),
228  static_cast<typename node_type::threshold_type>(bias_),
229  static_cast<typename node_type::threshold_type>(postproc_constant_)};
230 #pragma GCC diagnostic pop
231  }
232 
233  private:
234  index_type cur_tree_size_;
235  index_type max_num_categories_;
236  index_type alignment_;
237  index_type output_size_;
238  row_op row_postproc_;
239  element_op element_postproc_;
240  double average_factor_;
241  double bias_;
242  double postproc_constant_;
243  index_type max_tree_size_;
244 
245  std::vector<node_type> nodes_;
246  std::vector<index_type> root_node_indexes_;
247  std::vector<typename node_type::threshold_type> vector_output_;
248  std::vector<typename node_type::index_type> categorical_storage_;
249  std::vector<index_type> node_id_mapping_;
250 };
251 
252 } // namespace detail
253 } // namespace fil
254 } // namespace experimental
255 } // namespace ML
math_t max(math_t a, math_t b)
Definition: learning_rate.h:26
element_op
Definition: postproc_ops.hpp:29
uint32_t index_type
Definition: index_type.hpp:21
row_op
Definition: postproc_ops.hpp:22
Definition: dbscan.hpp:27
HOST DEVICE constexpr auto ceildiv(T dividend, U divisor)
Definition: ceildiv.hpp:21
int cuda_stream
Definition: cuda_stream.hpp:25
device_type
Definition: device_type.hpp:18
const_agnostic_same_t< T, U > copy(buffer< T > &dst, buffer< U > const &src, typename buffer< T >::index_type dst_offset, typename buffer< U >::index_type src_offset, typename buffer< T >::index_type size, cuda_stream stream)
Definition: buffer.hpp:325
Definition: decision_forest_builder.hpp:55
void set_element_postproc(element_op val)
Definition: decision_forest_builder.hpp:147
void start_new_tree()
Definition: decision_forest_builder.hpp:60
typename decision_forest_t::node_type node_type
Definition: decision_forest_builder.hpp:57
void set_average_factor(double val)
Definition: decision_forest_builder.hpp:151
void add_leaf_vector_node(iter_t vec_begin, iter_t vec_end, std::optional< int > tl_node_id=std::nullopt)
Definition: decision_forest_builder.hpp:109
void set_output_size(index_type val)
Definition: decision_forest_builder.hpp:158
void set_postproc_constant(double val)
Definition: decision_forest_builder.hpp:156
void set_bias(double val)
Definition: decision_forest_builder.hpp:153
decision_forest_builder(index_type max_num_categories=index_type{}, index_type align_bytes=index_type{})
Definition: decision_forest_builder.hpp:166
void add_categorical_node(iter_t vec_begin, iter_t vec_end, std::optional< int > tl_node_id=std::nullopt, bool default_to_distant_child=false, typename node_type::metadata_storage_type feature=typename node_type::metadata_storage_type{}, typename node_type::offset_type offset=typename node_type::offset_type{})
Definition: decision_forest_builder.hpp:81
auto get_decision_forest(index_type num_feature, index_type num_class, raft_proto::device_type mem_type=raft_proto::device_type::cpu, int device=0, raft_proto::cuda_stream stream=raft_proto::cuda_stream{})
Definition: decision_forest_builder.hpp:185
void set_row_postproc(row_op val)
Definition: decision_forest_builder.hpp:149
void add_node(value_t val, std::optional< int > tl_node_id=std::nullopt, bool is_leaf_node=true, bool default_to_distant_child=false, bool is_categorical_node=false, typename node_type::metadata_storage_type feature=typename node_type::metadata_storage_type{}, typename node_type::offset_type offset=typename node_type::offset_type{}, bool is_inclusive=false)
Definition: decision_forest_builder.hpp:128
Definition: decision_forest_builder.hpp:42
model_builder_error(char const *msg)
Definition: decision_forest_builder.hpp:44
model_builder_error()
Definition: decision_forest_builder.hpp:43
virtual char const * what() const noexcept
Definition: decision_forest_builder.hpp:45
Definition: exceptions.hpp:36
A container which may or may not own its own data on host or device.
Definition: buffer.hpp:39