cpu.hpp
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once
#include <cstddef>
// ... (additional cuML FIL / raft_proto includes elided in this listing) ...
#include <optional>
namespace ML {
namespace experimental {
namespace fil {
namespace detail {
namespace inference {

/* A wrapper around the underlying inference kernels to support dispatching to
 * the right kernel
 *
 * This specialization is used for CPU inference and for requests for GPU
 * inference on non-GPU-enabled builds. An exception will be thrown if a
 * request is made for GPU inference on a non-GPU-enabled build.
 *
 * @tparam D The type of device (CPU/GPU) on which to perform inference.
 * @tparam has_categorical_nodes Whether or not any node in the model has
 * categorical splits.
 * @tparam vector_output_t If non-nullptr_t, the type of vector leaf output
 * @tparam categorical_data_t If non-nullptr_t, the type of non-local
 * categorical data storage
 *
 * @param forest The forest to be used for inference.
 * @param postproc The postprocessor object to be used for postprocessing raw
 * output from the forest.
 * @param output Pointer to the buffer where output should be written.
 * @param input Pointer to the buffer of input rows.
 * @param row_count The number of rows in the input
 * @param col_count The number of columns per row in the input
 * @param output_count The number of output elements per row
 * @param vector_output If non-nullptr, a pointer to storage for vector leaf
 * outputs
 * @param categorical_data If non-nullptr, a pointer to non-local storage for
 * data on categorical splits.
 * @param infer_type Type of inference to perform. Defaults to summing the
 * outputs of all trees and producing one output per row. If set to
 * "per_tree", we will instead output the output of each individual tree. If
 * set to "leaf_id", we will output the integer ID of the leaf node for each
 * tree.
 * @param specified_chunk_size If non-nullopt, the mini-batch size used for
 * processing rows in a batch. For CPU inference, this essentially determines
 * the granularity of parallelism. A larger chunk size means that a single
 * thread will process more rows for its assigned trees before fetching a
 * new batch of rows. In general, so long as the chunk size remains much
 * smaller than the batch size (at minimum, less than the batch size divided
 * by the number of available cores), larger batches see improved performance
 * with larger chunk sizes. Unlike on GPU, any positive value is valid (up to
 * hardware constraints), but it is recommended to test powers of 2 from 1
 * (for individual row inference) to 512 (for very large batch inference). A
 * value of 64 is a generally-useful default.
 */
template <raft_proto::device_type D,
          bool has_categorical_nodes,
          typename forest_t,
          typename vector_output_t    = std::nullptr_t,
          typename categorical_data_t = std::nullptr_t>
std::enable_if_t<std::disjunction_v<std::bool_constant<D == raft_proto::device_type::cpu>,
                                    std::bool_constant<!raft_proto::GPU_ENABLED>>,
                 void>
infer(forest_t const& forest,
      postprocessor<typename forest_t::io_type> const& postproc,
      typename forest_t::io_type* output,
      typename forest_t::io_type* input,
      index_type row_count,
      index_type col_count,
      index_type output_count,
      vector_output_t vector_output                  = nullptr,
      categorical_data_t categorical_data            = nullptr,
      infer_kind infer_type                          = infer_kind::default_kind,
      std::optional<index_type> specified_chunk_size = std::nullopt,
      raft_proto::device_id<D> device                = raft_proto::device_id<D>{},
      raft_proto::cuda_stream                        = raft_proto::cuda_stream{})
{
  if constexpr (D == raft_proto::device_type::gpu) {
    throw raft_proto::gpu_unsupported("Tried to use GPU inference in CPU-only build");
  } else {
    if (infer_type == infer_kind::leaf_id) {
      infer_kernel_cpu<has_categorical_nodes, true>(
        forest,
        postproc,
        output,
        input,
        row_count,
        col_count,
        output_count,
        specified_chunk_size.value_or(hardware_constructive_interference_size),
        hardware_constructive_interference_size,
        vector_output,
        categorical_data,
        infer_type);
    } else {
      infer_kernel_cpu<has_categorical_nodes, false>(
        forest,
        postproc,
        output,
        input,
        row_count,
        col_count,
        output_count,
        specified_chunk_size.value_or(hardware_constructive_interference_size),
        hardware_constructive_interference_size,
        vector_output,
        categorical_data,
        infer_type);
    }
  }
}
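
/* Illustrative call site (a minimal sketch; `my_forest`, `my_postproc`, `in`,
 * and `out` are hypothetical names for a forest, postprocessor, and buffers
 * assumed to be set up elsewhere, for a model with no categorical splits):
 *
 *   infer<raft_proto::device_type::cpu, false, decltype(my_forest)>(
 *     my_forest,
 *     my_postproc,
 *     out,                       // room for row_count * output_count outputs
 *     in,                        // row_count * col_count input features
 *     row_count,
 *     col_count,
 *     output_count,
 *     nullptr,                   // no vector leaf output
 *     nullptr,                   // no non-local categorical data
 *     infer_kind::default_kind,  // sum tree outputs into one output per row
 *     index_type{64});           // chunk size; 64 is the suggested default
 */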

/* This macro is invoked here to declare all standard specializations of this
 * template as extern. This ensures that this (relatively complex) code is
 * compiled as few times as possible. A macro is used because every
 * specialization must be explicitly declared. The final argument to the macro
 * references the 8 specialization variants compiled in standard cuML FIL. */
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 0)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 1)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 2)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 3)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 4)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 5)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 6)
CUML_FIL_INFER_ALL(extern template, raft_proto::device_type::cpu, 7)
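
/* Conceptually (a sketch; the actual macro definition lives in
 * infer_macros.hpp), each invocation above stands in for a family of explicit
 * instantiation declarations along the lines of
 *
 *   extern template void infer<raft_proto::device_type::cpu, false, ...>(...);
 *
 * so the matching definitions are compiled once, in dedicated translation
 * units, rather than in every file that includes this header. */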

}  // namespace inference
}  // namespace detail
}  // namespace fil
}  // namespace experimental
}  // namespace ML