Loading [MathJax]/extensions/tex2jax.js
cuML C++ API  23.12
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
gpu_introspection.hpp
Go to the documentation of this file.
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once
#include <cuda_runtime_api.h>

// NOTE(review): the HTML extraction dropped the project includes that occupied
// original lines 18-21. The code below requires headers declaring index_type,
// raft_proto::cuda_check, raft_proto::device_id, and raft_proto::device_type;
// the paths below follow the cuML 23.12 layout — verify against the repository.
#include <cuml/experimental/fil/detail/index_type.hpp>
#include <cuml/experimental/fil/detail/raft_proto/cuda_check.hpp>
#include <cuml/experimental/fil/detail/raft_proto/device_id.hpp>
#include <cuml/experimental/fil/detail/raft_proto/device_type.hpp>

#include <vector>
23 
namespace ML {
namespace experimental {
namespace fil {
namespace detail {
28 
/**
 * Return the maximum opt-in shared memory per block (in bytes) for the given
 * GPU device.
 *
 * Queries cudaDevAttrMaxSharedMemoryPerBlockOptin for every visible device on
 * first use and caches the results in a thread_local vector, so repeated calls
 * avoid further driver round-trips. Throws via raft_proto::cuda_check if the
 * CUDA runtime reports an error; std::vector::at throws std::out_of_range for
 * an invalid device index.
 *
 * @param device_id  GPU device to query.
 * @return           Opt-in shared memory per block in bytes, as index_type.
 */
inline auto get_max_shared_mem_per_block(
  raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  // Per-thread cache avoids synchronization; an empty vector marks "unfilled".
  auto thread_local cache = std::vector<int>{};
  if (cache.size() == 0) {
    auto device_count = int{};
    raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
    cache.resize(device_count);
    for (auto dev = 0; dev < device_count; ++dev) {
      raft_proto::cuda_check(
        cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerBlockOptin, dev));
    }
  }
  return index_type(cache.at(device_id.value()));
}
44 
/**
 * Return the number of streaming multiprocessors (SMs) on the given GPU.
 *
 * Queries cudaDevAttrMultiProcessorCount for every visible device on first use
 * and caches the results in a thread_local vector. Throws via
 * raft_proto::cuda_check on CUDA runtime errors; std::vector::at throws
 * std::out_of_range for an invalid device index.
 *
 * @param device_id  GPU device to query.
 * @return           SM count as index_type.
 */
inline auto get_sm_count(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  // Per-thread cache avoids synchronization; an empty vector marks "unfilled".
  auto thread_local cache = std::vector<int>{};
  if (cache.size() == 0) {
    auto device_count = int{};
    raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
    cache.resize(device_count);
    for (auto dev = 0; dev < device_count; ++dev) {
      raft_proto::cuda_check(
        cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMultiProcessorCount, dev));
    }
  }
  return index_type(cache.at(device_id.value()));
}
59 
/**
 * Return the maximum number of resident threads per SM for the given GPU.
 *
 * Queries cudaDevAttrMaxThreadsPerMultiProcessor on every call (no cache,
 * unlike the shared-memory/SM-count helpers above). Throws via
 * raft_proto::cuda_check on CUDA runtime errors.
 *
 * @param device_id  GPU device to query.
 * @return           Maximum resident threads per SM as index_type.
 */
inline auto get_max_threads_per_sm(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  auto result = int{};
  raft_proto::cuda_check(
    cudaDeviceGetAttribute(&result, cudaDevAttrMaxThreadsPerMultiProcessor, device_id.value()));
  return index_type(result);
}
67 
/**
 * Return the maximum shared memory per SM (in bytes) for the given GPU.
 *
 * Queries cudaDevAttrMaxSharedMemoryPerMultiprocessor for every visible device
 * on first use and caches the results in a thread_local vector. Throws via
 * raft_proto::cuda_check on CUDA runtime errors; std::vector::at throws
 * std::out_of_range for an invalid device index.
 *
 * @param device_id  GPU device to query.
 * @return           Shared memory per SM in bytes, as index_type.
 */
inline auto get_max_shared_mem_per_sm(
  raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  // Per-thread cache avoids synchronization; an empty vector marks "unfilled".
  auto thread_local cache = std::vector<int>{};
  if (cache.size() == 0) {
    auto device_count = int{};
    raft_proto::cuda_check(cudaGetDeviceCount(&device_count));
    cache.resize(device_count);
    for (auto dev = 0; dev < device_count; ++dev) {
      raft_proto::cuda_check(
        cudaDeviceGetAttribute(&(cache[dev]), cudaDevAttrMaxSharedMemoryPerMultiprocessor, dev));
    }
  }
  return index_type(cache.at(device_id.value()));
}
82 
/**
 * Return the peak memory clock rate (in kHz, per the CUDA runtime's
 * cudaDevAttrMemoryClockRate units) for the given GPU.
 *
 * Queried on every call (no cache). Throws via raft_proto::cuda_check on CUDA
 * runtime errors.
 *
 * @param device_id  GPU device to query.
 * @return           Memory clock rate as index_type.
 */
inline auto get_mem_clock_rate(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  auto result = int{};
  raft_proto::cuda_check(
    cudaDeviceGetAttribute(&result, cudaDevAttrMemoryClockRate, device_id.value()));
  return index_type(result);
}
90 
/**
 * Return the peak SM clock rate (in kHz, per the CUDA runtime's
 * cudaDevAttrClockRate units) for the given GPU.
 *
 * Queried on every call (no cache). Throws via raft_proto::cuda_check on CUDA
 * runtime errors.
 *
 * @param device_id  GPU device to query.
 * @return           Core clock rate as index_type.
 */
inline auto get_core_clock_rate(raft_proto::device_id<raft_proto::device_type::gpu> device_id)
{
  auto result = int{};
  raft_proto::cuda_check(cudaDeviceGetAttribute(&result, cudaDevAttrClockRate, device_id.value()));
  return index_type(result);
}
97 
/* The maximum number of bytes that can be read in a single instruction */
auto constexpr static const MAX_READ_CHUNK        = index_type{128};
/* Upper bound on grid size used when sizing kernel launches */
auto constexpr static const MAX_BLOCKS            = index_type{65536};
/* Threads per warp on all current NVIDIA architectures */
auto constexpr static const WARP_SIZE             = index_type{32};
/* Block size used by FIL kernels */
auto constexpr static const MAX_THREADS_PER_BLOCK = index_type{256};
#ifdef __CUDACC__
/* Architectures capped at 1024 resident threads per SM (CC 7.2, 7.5, 8.6, 8.7);
 * all others in this range support 2048. In host-side passes (__CUDA_ARCH__
 * undefined) the preprocessor evaluates it as 0, selecting the 2048 branch. */
#if __CUDA_ARCH__ == 720 || __CUDA_ARCH__ == 750 || __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870
auto constexpr static const MAX_THREADS_PER_SM = index_type{1024};
#else
auto constexpr static const MAX_THREADS_PER_SM = index_type{2048};
#endif
#else
/* Non-CUDA compilers: assume the common 2048-thread limit */
auto constexpr static const MAX_THREADS_PER_SM = index_type{2048};
#endif

/* Minimum resident blocks per SM implied by the two limits above */
auto constexpr static const MIN_BLOCKS_PER_SM = MAX_THREADS_PER_SM / MAX_THREADS_PER_BLOCK;
114 
}  // namespace detail
}  // namespace fil
}  // namespace experimental
}  // namespace ML
auto get_mem_clock_rate(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:83
auto get_sm_count(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:45
auto get_core_clock_rate(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:91
auto get_max_threads_per_sm(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:60
auto get_max_shared_mem_per_sm(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:68
auto get_max_shared_mem_per_block(raft_proto::device_id< raft_proto::device_type::gpu > device_id)
Definition: gpu_introspection.hpp:29
uint32_t index_type
Definition: index_type.hpp:21
Definition: dbscan.hpp:27
void cuda_check(error_t const &err) noexcept(!GPU_ENABLED)
Definition: cuda_check.hpp:26
detail::device_id< D > device_id
Definition: device_id.hpp:28
Definition: base.hpp:22