RMM: cuda_async_memory_resource.hpp 源文件

 /*

  * Copyright (c) 2021-2025, NVIDIA CORPORATION.

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

 #pragma once


 #include <rmm/cuda_device.hpp>

 #include <rmm/cuda_stream_view.hpp>

 #include <rmm/detail/error.hpp>

 #include <rmm/detail/export.hpp>

 #include <rmm/detail/runtime_async_alloc.hpp>

 #include <rmm/detail/thrust_namespace.h>

 #include <rmm/mr/device/cuda_async_view_memory_resource.hpp>

 #include <rmm/mr/device/device_memory_resource.hpp>


 #include <cuda/std/type_traits>

 #include <cuda_runtime_api.h>


 #include <cstddef>

 #include <optional>


 namespace RMM_NAMESPACE {

 namespace mr {

 class cuda_async_memory_resource final : public device_memory_resource {

  public

  enum class allocation_handle_type {

  none = 0x0,

  posix_file_descriptor = 0x1,

  win32 = 0x2,

  win32_kmt = 0x4,

  fabric = 0x8

  };


  // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)

  cuda_async_memory_resource(std::optional<std::size_t> initial_pool_size = {},

  std::optional<std::size_t> release_threshold = {},

  std::optional<allocation_handle_type> export_handle_type = {})

  {

  // Check if cudaMallocAsync Memory pool supported

  RMM_EXPECTS(rmm::detail::runtime_async_alloc::is_supported(),

  "cudaMallocAsync not supported with this CUDA driver/runtime version");


  // Construct explicit pool

  cudaMemPoolProps pool_props{};

  pool_props.allocType = cudaMemAllocationTypePinned;

  pool_props.handleTypes = static_cast<cudaMemAllocationHandleType>(

  export_handle_type.value_or(allocation_handle_type::none));

  RMM_EXPECTS(

  rmm::detail::runtime_async_alloc::is_export_handle_type_supported(pool_props.handleTypes),

  "Requested IPC memory handle type not supported");

  pool_props.location.type = cudaMemLocationTypeDevice;

  pool_props.location.id = rmm::get_current_cuda_device().value();

  cudaMemPool_t cuda_pool_handle{};

  RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle, &pool_props));

  pool_ = cuda_async_view_memory_resource{cuda_pool_handle};


  // CUDA 驱动程序版本低于 11.5 已知与异步分配器不兼容。

  // If cuda driver < 11.5, we'll disable `cudaMemPoolReuseAllowOpportunistic`.

  // See https://github.com/NVIDIA/spark-rapids/issues/4710.

  int driver_version{};

  RMM_CUDA_TRY(cudaDriverGetVersion(&driver_version));

  constexpr auto min_async_version{11050};

  if (driver_version < min_async_version) {

  int disabled{0};

  RMM_CUDA_TRY(

  cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolReuseAllowOpportunistic, &disabled));

  }


  auto const [free, total] = rmm::available_device_memory();


  // Need an l-value to take address to pass to cudaMemPoolSetAttribute

  uint64_t threshold = release_threshold.value_or(total);

  RMM_CUDA_TRY(

  cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolAttrReleaseThreshold, &threshold));


  // Allocate and immediately deallocate the initial_pool_size to prime the pool with the

  // specified size

  auto const pool_size = initial_pool_size.value_or(free / 2);

  auto* ptr = do_allocate(pool_size, cuda_stream_default);

  do_deallocate(ptr, pool_size, cuda_stream_default);

  }


  [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); }


  ~cuda_async_memory_resource() override

  {

  RMM_ASSERT_CUDA_SUCCESS(cudaMemPoolDestroy(pool_handle()));

  }

  cuda_async_memory_resource(cuda_async_memory_resource const&) = delete;

  cuda_async_memory_resource(cuda_async_memory_resource&&) = delete;

  cuda_async_memory_resource& operator=(cuda_async_memory_resource const&) = delete;

  cuda_async_memory_resource& operator=(cuda_async_memory_resource&&) = delete;


  private

  cuda_async_view_memory_resource pool_{};


  void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override

  {

  void* ptr{nullptr};

  ptr = pool_.allocate(bytes, stream);

  return ptr;

  }


  void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override

  {

  pool_.deallocate(ptr, bytes, stream);

  }


  [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override

  {

  auto const* async_mr = dynamic_cast<cuda_async_memory_resource const*>(&other);

  return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle());

  }

 };

  // 组结束

 } // namespace mr

 } // namespace RMM_NAMESPACE

rmm::cuda_stream_view
CUDA 流的强类型非拥有包装器，带默认构造函数。
定义于： cuda_stream_view.hpp:39

rmm::mr::cuda_async_memory_resource
继承自 `device_memory_resource` 的类，使用 cudaMallocAsync/cudaFreeAsync 进行内存分配/释放...
定义于： cuda_async_memory_resource.hpp:45

rmm::mr::cuda_async_memory_resource::allocation_handle_type
allocation_handle_type
用于指定内存分配句柄类型的标志。
定义于： cuda_async_memory_resource.hpp:58

rmm::mr::cuda_async_memory_resource::cuda_async_memory_resource
cuda_async_memory_resource(std::optional< std::size_t > initial_pool_size={}, std::optional< std::size_t > release_threshold={}, std::optional< allocation_handle_type > export_handle_type={})
构造一个 cuda_async_memory_resource，带可选的初始内存池大小和释放阈值...
定义于： cuda_async_memory_resource.hpp:85

rmm::mr::cuda_async_memory_resource::pool_handle
cudaMemPool_t pool_handle() const noexcept
返回 CUDA 内存池的底层原生句柄。
定义于： cuda_async_memory_resource.hpp:138

rmm::mr::device_memory_resource
所有 librmm 设备内存分配的基类。
定义于： device_memory_resource.hpp:92

cuda_async_view_memory_resource.hpp

cuda_device.hpp

cuda_stream_view.hpp

device_memory_resource.hpp

rmm::available_device_memory
std::pair< std::size_t, std::size_t > available_device_memory()
返回当前设备的可用和总设备内存（字节）。
定义于： cuda_device.hpp:123

rmm::get_current_cuda_device
cuda_device_id get_current_cuda_device()
返回当前设备的 cuda_device_id。
定义于： cuda_device.hpp:99

rmm::cuda_stream_default
static constexpr cuda_stream_view cuda_stream_default
默认流（stream 0）的静态 cuda_stream_view，方便使用。
定义于： cuda_stream_view.hpp:125

rmm::cuda_device_id::value
constexpr value_type value() const noexcept
包装的整数值。
定义于： cuda_device.hpp:57