RMM: pool_memory_resource.hpp 源文件

 /*

  * Copyright (c) 2020-2025, NVIDIA CORPORATION.

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

 #pragma once


 #include <rmm/aligned.hpp>

 #include <rmm/cuda_stream_view.hpp>

 #include <rmm/detail/error.hpp>

 #include <rmm/detail/export.hpp>

 #include <rmm/detail/format.hpp>

 #include <rmm/detail/logging_assert.hpp>

 #include <rmm/detail/thrust_namespace.h>

 #include <rmm/logger.hpp>

 #include <rmm/mr/device/detail/coalescing_free_list.hpp>

 #include <rmm/mr/device/detail/stream_ordered_memory_resource.hpp>

 #include <rmm/mr/device/device_memory_resource.hpp>

 #include <rmm/mr/device/per_device_resource.hpp>

 #include <rmm/resource_ref.hpp>


 #include <cuda/std/type_traits>

 #include <cuda_runtime_api.h>

 #include <thrust/iterator/counting_iterator.h>

 #include <thrust/iterator/transform_iterator.h>


 #include <algorithm>

 #include <cstddef>

 #include <mutex>

 #include <optional>

 #include <set>


 namespace RMM_NAMESPACE {

 namespace mr {

 namespace detail {

 /**
  * @brief Helper base class used to remove a property from a pool resource.
  *
  * This primary template is intentionally empty and contributes nothing to the
  * deriving class. The trailing defaulted parameter exists only so that a partial
  * specialization can be selected through an enable_if-style condition.
  *
  * @tparam PoolResource The pool resource type that derives from this helper.
  * @tparam Upstream     The upstream resource type whose properties are inspected.
  * @tparam Property     The property being considered for removal.
  * @tparam Enable       SFINAE hook; leave at its default of `void`.
  */
 template <typename PoolResource,
           typename Upstream,
           typename Property,
           typename Enable = void>
 struct maybe_remove_property {
 };


 // Partial specialization chosen when `cuda::has_property<Upstream, Property>` is false.
 // It declares a deleted friend `get_property(const PoolResource&, Property)`, so that
 // overload is explicitly deleted for a pool whose upstream lacks the property.
 template <class PoolResource, class Upstream, class Property>

 struct maybe_remove_property<PoolResource,

  Upstream,

  Property,

  cuda::std::enable_if_t<!cuda::has_property<Upstream, Property>>> {

 #if defined(__GNUC__) && !defined(__clang__) // GCC warns about compatibility

  // issues with pre ISO C++ code

 #pragma GCC diagnostic push

 #pragma GCC diagnostic ignored "-Wnon-template-friend"

 #endif // __GNUC__ and not __clang__

  // Deleted friend declaration; the pragmas around it silence GCC's
  // -Wnon-template-friend warning for this line only.
  friend void get_property(const PoolResource&, Property) = delete;

 #if defined(__GNUC__) && !defined(__clang__)

  // Restore the diagnostic state saved by the matching `push` above.
 #pragma GCC diagnostic pop

 #endif // __GNUC__ and not __clang__

 };

 } // namespace detail


 template <typename Upstream>

 class pool_memory_resource final

  : public detail:

  maybe_remove_property<pool_memory_resource<Upstream>, Upstream, cuda::mr::device_accessible>,

  public detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,

  detail::coalescing_free_list>,

  public cuda::forward_property<pool_memory_resource<Upstream>, Upstream> {

  public

  friend class detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,

  detail::coalescing_free_list>;


  explicit pool_memory_resource(device_async_resource_ref upstream_mr,

  std::size_t initial_pool_size,

  std::optional<std::size_t> maximum_pool_size = std::nullopt)

  : upstream_mr_{upstream_mr}

  {

  RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT),

  "Error, Initial pool size required to be a multiple of 256 bytes");

  RMM_EXPECTS(rmm::is_aligned(maximum_pool_size.value_or(0), rmm::CUDA_ALLOCATION_ALIGNMENT),

  "Error, Maximum pool size required to be a multiple of 256 bytes");


  initialize_pool(initial_pool_size, maximum_pool_size);

  }


  explicit pool_memory_resource(Upstream* upstream_mr,

  std::size_t initial_pool_size,

  std::optional<std::size_t> maximum_pool_size = std::nullopt)

  : upstream_mr_{to_device_async_resource_ref_checked(upstream_mr)}

  {

  RMM_EXPECTS(rmm::is_aligned(initial_pool_size, rmm::CUDA_ALLOCATION_ALIGNMENT),

  "Error, Initial pool size required to be a multiple of 256 bytes");

  RMM_EXPECTS(rmm::is_aligned(maximum_pool_size.value_or(0), rmm::CUDA_ALLOCATION_ALIGNMENT),

  "Error, Maximum pool size required to be a multiple of 256 bytes");


  initialize_pool(initial_pool_size, maximum_pool_size);

  }


  template <typename Upstream2 = Upstream,

  cuda::std::enable_if_t<cuda::mr::async_resource<Upstream2>, int> = 0>

  explicit pool_memory_resource(Upstream2& upstream_mr,

  std::size_t initial_pool_size,

  std::optional<std::size_t> maximum_pool_size = std::nullopt)

  : pool_memory_resource(cuda::std::addressof(upstream_mr), initial_pool_size, maximum_pool_size)

  {

  }


  ~pool_memory_resource() override { release(); }


  pool_memory_resource() = delete;

  pool_memory_resource(pool_memory_resource const&) = delete;

  pool_memory_resource(pool_memory_resource&&) = delete;

  pool_memory_resource& operator=(pool_memory_resource const&) = delete;

  pool_memory_resource& operator=(pool_memory_resource&&) = delete;


  [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept

  {

  return upstream_mr_;

  }


  [[nodiscard]] std::size_t pool_size() const noexcept { return current_pool_size_; }


  protected

  using free_list = detail::coalescing_free_list;

  using block_type = free_list::block_type;

  using typename detail::stream_ordered_memory_resource<pool_memory_resource<Upstream>,

  detail::coalescing_free_list>::split_block;

  using lock_guard = std::lock_guard<std::mutex>


  [[nodiscard]] std::size_t get_maximum_allocation_size() const

  {

  return std::numeric_limits<std::size_t>::max();

  }


  block_type try_to_expand(std::size_t try_size, std::size_t min_size, cuda_stream_view stream)

  {

  auto report_error = [&](const char* reason) {

  RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded: %s]",

  rmm::detail::format_stream(stream),

  min_size,

  reason);

  auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +

  rmm::detail::format_bytes(min_size) + std::string("): ") + reason;

  RMM_FAIL(msg.c_str(), rmm::out_of_memory);

  };


  while (try_size >= min_size) {

  try {

  auto block = block_from_upstream(try_size, stream);

  current_pool_size_ += block.size();

  return block;

  } catch (std::exception const& e) {

  if (try_size == min_size) { report_error(e.what()); }

  }

  try_size = std::max(min_size, try_size / 2);

  }


  auto const max_size = maximum_pool_size_.value_or(std::numeric_limits<std::size_t>::max());

  auto const msg = std::string("Not enough room to grow, current/max/try size = ") +

  rmm::detail::format_bytes(pool_size()) + ", " +

  rmm::detail::format_bytes(max_size) + ", " +

  rmm::detail::format_bytes(min_size);

  report_error(msg.c_str());

  return {};

  }


  void initialize_pool(std::size_t initial_size, std::optional<std::size_t> maximum_size)

  {

  current_pool_size_ = 0; // try_to_expand will set this if it succeeds

  maximum_pool_size_ = maximum_size;


  RMM_EXPECTS(

  initial_size <= maximum_pool_size_.value_or(std::numeric_limits<std::size_t>::max()),

  "Initial pool size exceeds the maximum pool size!");


  if (initial_size > 0) {

  auto const block = try_to_expand(initial_size, initial_size, cuda_stream_legacy);

  this->insert_block(block, cuda_stream_legacy);

  }

  }


  block_type expand_pool(std::size_t size, free_list& blocks, cuda_stream_view stream)

  {

  // Strategy: If maximum_pool_size_ is set, then grow geometrically, e.g. by halfway to the

  // limit each time. If it is not set, grow exponentially, e.g. by doubling the pool size each

  // time. Upon failure, attempt to back off exponentially, e.g. by half the attempted size,

  // until either success or the attempt is less than the requested size.


  return try_to_expand(size_to_grow(size), size, stream);

  }


  [[nodiscard]] std::size_t size_to_grow(std::size_t size) const

  {

  if (maximum_pool_size_.has_value()) {

  auto const unaligned_remaining = maximum_pool_size_.value() - pool_size();

  using rmm::align_up;

  auto const remaining = align_up(unaligned_remaining, rmm::CUDA_ALLOCATION_ALIGNMENT);

  auto const aligned_size = align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT);

  return (aligned_size <= remaining) ? std::max(aligned_size, remaining / 2) : 0;

  }

  return std::max(size, pool_size());

  };


  block_type block_from_upstream(std::size_t size, cuda_stream_view stream)

  {

  RMM_LOG_DEBUG("[A][Stream %s][Upstream %zuB]", rmm::detail::format_stream(stream), size);


  if (size == 0) { return {}; }


  void* ptr = get_upstream_resource().allocate_async(size, stream);

  return *upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first;

  }


  split_block allocate_from_block(block_type const& block, std::size_t size)

  {

  block_type const alloc{block.pointer(), size, block.is_head()};

 #ifdef RMM_POOL_TRACK_ALLOCATIONS

  allocated_blocks_.insert(alloc);

 #endif


  auto rest = (block.size() > size)

  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)

  ? block_type{block.pointer() + size, block.size() - size, false}

  : block_type{};

  return {alloc, rest};

  }


  block_type free_block(void* ptr, std::size_t size) noexcept

  {

 #ifdef RMM_POOL_TRACK_ALLOCATIONS

  if (ptr == nullptr) return block_type{};

  auto const iter = allocated_blocks_.find(static_cast<char*>(ptr));

  RMM_LOGGING_ASSERT(iter != allocated_blocks_.end());


  auto block = *iter;

  RMM_LOGGING_ASSERT(block.size() == rmm::align_up(size, allocation_alignment));

  allocated_blocks_.erase(iter);


  return block;

 #else

  auto const iter = upstream_blocks_.find(static_cast<char*>(ptr));

  return block_type{static_cast<char*>(ptr), size, (iter != upstream_blocks_.end())};

 #endif

  }


  void release()

  {

  lock_guard lock(this->get_mutex());


  for (auto block : upstream_blocks_) {

  get_upstream_resource().deallocate(block.pointer(), block.size());

  }

  upstream_blocks_.clear();

 #ifdef RMM_POOL_TRACK_ALLOCATIONS

  allocated_blocks_.clear();

 #endif


  current_pool_size_ = 0;

  }


 #ifdef RMM_DEBUG_PRINT

  void print()

  {

  lock_guard lock(this->get_mutex());


  auto const [free, total] = rmm::available_device_memory();

  std::cout << "GPU free memory: " << free << " total: " << total << "\n";


  std::cout << "upstream_blocks: " << upstream_blocks_.size() << "\n";

  std::size_t upstream_total{0};


  for (auto blocks : upstream_blocks_) {

  blocks.print();

  upstream_total += blocks.size();

  }

  std::cout << "total upstream: " << upstream_total << " B\n";


 #ifdef RMM_POOL_TRACK_ALLOCATIONS

  std::cout << "allocated_blocks: " << allocated_blocks_.size() << "\n";

  for (auto block : allocated_blocks_)

  block.print();

 #endif


  this->print_free_blocks();

  }

 #endif


  std::pair<std::size_t, std::size_t> free_list_summary(free_list const& blocks)

  {

  std::size_t largest{};

  std::size_t total{};

  std::for_each(blocks.cbegin(), blocks.cend(), [&largest, &total](auto const& block) {

  total += block.size();

  largest = std::max(largest, block.size());

  });

  return {largest, total};

  }


  private

  // The "heap" to allocate the pool from

  device_async_resource_ref upstream_mr_;

  std::size_t current_pool_size_{};

  std::optional<std::size_t> maximum_pool_size_{};


 #ifdef RMM_POOL_TRACK_ALLOCATIONS

  std::set<block_type, rmm::mr::detail::compare_blocks<block_type>> allocated_blocks_;

 #endif


  // blocks allocated from upstream

  std::set<block_type, rmm::mr::detail::compare_blocks<block_type>> upstream_blocks_;

 }; // namespace mr

  // end of group

 } // namespace mr

 } // namespace RMM_NAMESPACE

aligned.hpp

rmm::cuda_stream_view
CUDA stream 的强类型、非拥有型包装器，带有默认构造函数。
定义： cuda_stream_view.hpp:39

rmm::mr::pool_memory_resource
一个合并式的最佳适配子分配器，它使用从上游 memory_resource 分配的内存池...
定义： pool_memory_resource.hpp:105

rmm::mr::pool_memory_resource::free_block
block_type free_block(void *ptr, std::size_t size) noexcept
查找、释放并返回与指针 ptr 关联的块。
定义： pool_memory_resource.hpp:401

rmm::mr::pool_memory_resource::initialize_pool
void initialize_pool(std::size_t initial_size, std::optional< std::size_t > maximum_size)
为内存池分配初始内存。
定义： pool_memory_resource.hpp:293

rmm::mr::pool_memory_resource::allocate_from_block
split_block allocate_from_block(block_type const &block, std::size_t size)
如果需要，分割块以返回一个指向 size 字节内存的指针。
定义： pool_memory_resource.hpp:379

rmm::mr::pool_memory_resource::get_upstream_resource
device_async_resource_ref get_upstream_resource() const noexcept
上游资源的 rmm::device_async_resource_ref
定义： pool_memory_resource.hpp:204

rmm::mr::pool_memory_resource::size_to_grow
std::size_t size_to_grow(std::size_t size) const
给定一个最小尺寸，计算一个合适的尺寸来扩展内存池。
定义： pool_memory_resource.hpp:339

rmm::mr::pool_memory_resource::block_type
free_list::block_type block_type
空闲列表返回的块的类型。
定义： pool_memory_resource.hpp:220

rmm::mr::pool_memory_resource::free_list_summary
std::pair< std::size_t, std::size_t > free_list_summary(free_list const &blocks)
获取指定空闲列表中最大可用块大小和总空闲大小。
定义： pool_memory_resource.hpp:479

rmm::mr::pool_memory_resource::get_maximum_allocation_size
std::size_t get_maximum_allocation_size() const
获取此内存资源支持的最大分配大小。
定义： pool_memory_resource.hpp:233

rmm::mr::pool_memory_resource::expand_pool
block_type expand_pool(std::size_t size, free_list &blocks, cuda_stream_view stream)
从上游分配空间以供应子分配池，并返回一个足够大小的块。
定义： pool_memory_resource.hpp:317

rmm::mr::pool_memory_resource::pool_memory_resource
pool_memory_resource(Upstream2 &upstream_mr, std::size_t initial_pool_size, std::optional< std::size_t > maximum_pool_size=std::nullopt)
构造一个 pool_memory_resource 并使用 upstream_mr 分配初始设备内存池。
定义： pool_memory_resource.hpp:182

rmm::mr::pool_memory_resource::release
void release()
释放从上游 memory_resource 分配的所有内存。
定义： pool_memory_resource.hpp:423

rmm::mr::pool_memory_resource::try_to_expand
block_type try_to_expand(std::size_t try_size, std::size_t min_size, cuda_stream_view stream)
尝试通过从上游分配至少 min_size 字节的块来扩展内存池。
定义： pool_memory_resource.hpp:253

rmm::mr::pool_memory_resource::lock_guard
std::lock_guard< std::mutex > lock_guard
用于同步访问的锁类型。
定义： pool_memory_resource.hpp:223

rmm::mr::pool_memory_resource::pool_size
std::size_t pool_size() const noexcept
计算当前内存池的大小。
定义： pool_memory_resource.hpp:216

rmm::mr::pool_memory_resource::~pool_memory_resource
~pool_memory_resource() override
销毁 pool_memory_resource 并使用上游资源释放它分配的所有内存。
定义： pool_memory_resource.hpp:193

rmm::mr::pool_memory_resource::free_list
detail::coalescing_free_list free_list
空闲列表实现。
定义： pool_memory_resource.hpp:219

rmm::mr::pool_memory_resource::pool_memory_resource
pool_memory_resource(Upstream *upstream_mr, std::size_t initial_pool_size, std::optional< std::size_t > maximum_pool_size=std::nullopt)
构造一个 pool_memory_resource 并使用 upstream_mr 分配初始设备内存池。
定义： pool_memory_resource.hpp:152

rmm::mr::pool_memory_resource::block_from_upstream
block_type block_from_upstream(std::size_t size, cuda_stream_view stream)
从上游分配一个块来扩展子分配池。
定义： pool_memory_resource.hpp:359

rmm::out_of_memory
当 RMM 内存不足时抛出的异常。
定义： error.hpp:87

cuda_stream_view.hpp

device_memory_resource.hpp

rmm::available_device_memory
std::pair< std::size_t, std::size_t > available_device_memory()
返回当前设备的可用和总设备内存（字节）。
定义： cuda_device.hpp:123

rmm::cuda_stream_legacy
static const cuda_stream_view cuda_stream_legacy
cudaStreamLegacy 的静态 cuda_stream_view，为了方便。
定义： cuda_stream_view.hpp:131

rmm::device_async_resource_ref
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
cuda::mr::async_resource_ref 的别名，具有属性 cuda::mr::device_accessible。
定义： resource_ref.hpp:40

rmm::to_device_async_resource_ref_checked
device_async_resource_ref to_device_async_resource_ref_checked(Resource *res)
将指向内存资源的指针转换为 device_async_resource_ref，检查是否为 nullptr
定义： resource_ref.hpp:78

rmm::CUDA_ALLOCATION_ALIGNMENT
static constexpr std::size_t CUDA_ALLOCATION_ALIGNMENT
CUDA 内存分配使用的默认对齐方式。
定义： aligned.hpp:43

rmm::is_aligned
constexpr bool is_aligned(std::size_t value, std::size_t alignment) noexcept
检查值是否已对齐到指定的 2 的幂的倍数。
定义： aligned.hpp:105

rmm::align_up
constexpr std::size_t align_up(std::size_t value, std::size_t alignment) noexcept
向上对齐到指定 2 的幂的最近倍数。
定义： aligned.hpp:77

per_device_resource.hpp
每设备 device_memory_resources 的管理。

resource_ref.hpp

rmm::mr::detail::maybe_remove_property
用于移除 device_accessible 属性的辅助类。
定义： pool_memory_resource.hpp:63