RMM: fixed_size_memory_resource.hpp 源文件

 /*

  * Copyright (c) 2020-2025, NVIDIA CORPORATION.

  *

  * Licensed under the Apache License, Version 2.0 (the "License");

  * you may not use this file except in compliance with the License.

  * You may obtain a copy of the License at

  *

  *     http://www.apache.org/licenses/LICENSE-2.0

  *

  * Unless required by applicable law or agreed to in writing, software

  * distributed under the License is distributed on an "AS IS" BASIS,

  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  * See the License for the specific language governing permissions and

  * limitations under the License.

  */

 #pragma once


 #include <rmm/aligned.hpp>

 #include <rmm/cuda_stream_view.hpp>

 #include <rmm/detail/error.hpp>

 #include <rmm/detail/export.hpp>

 #include <rmm/detail/logging_assert.hpp>

 #include <rmm/detail/thrust_namespace.h>

 #include <rmm/mr/device/detail/fixed_size_free_list.hpp>

 #include <rmm/mr/device/detail/stream_ordered_memory_resource.hpp>

 #include <rmm/resource_ref.hpp>


 #include <cuda_runtime_api.h>

 #include <thrust/iterator/counting_iterator.h>

 #include <thrust/iterator/transform_iterator.h>


 #include <algorithm>

 #include <cstddef>

 #include <utility>

 #include <vector>


 namespace RMM_NAMESPACE {

 namespace mr {

 /**
  * @brief A `device_memory_resource` which allocates memory blocks of a single fixed size.
  *
  * Memory is obtained from the upstream resource in chunks of
  * `block_size * blocks_to_preallocate` bytes. Each chunk is carved into
  * `block_size`-byte blocks that are handed to the free-list machinery of the
  * `stream_ordered_memory_resource` base class. Requests larger than the block size
  * are not supported (see `get_maximum_allocation_size()`).
  *
  * @tparam Upstream Type of the upstream resource; only used by the raw-pointer constructor.
  */
 template <typename Upstream>
 class fixed_size_memory_resource
   : public detail::stream_ordered_memory_resource<fixed_size_memory_resource<Upstream>,
                                                   detail::fixed_size_free_list> {
  public:
   // The base class calls the protected customization points declared below.
   friend class detail::stream_ordered_memory_resource<fixed_size_memory_resource<Upstream>,
                                                       detail::fixed_size_free_list>;

   /// Default size in bytes of each allocated block (1 MiB).
   static constexpr std::size_t default_block_size = 1 << 20;

   /// Default number of blocks carved out of every chunk requested from upstream.
   static constexpr std::size_t default_blocks_to_preallocate = 128;

   /**
    * @brief Construct a fixed-size resource and preallocate its first chunk.
    *
    * `block_size` is rounded up to a multiple of `CUDA_ALLOCATION_ALIGNMENT`. One chunk of
    * `blocks_to_preallocate` blocks is allocated from `upstream_mr` on `cuda_stream_legacy`
    * and inserted into the free list immediately.
    *
    * @param upstream_mr The resource from which chunks of blocks are allocated.
    * @param block_size Requested size in bytes of each block.
    * @param blocks_to_preallocate Number of blocks obtained per upstream allocation.
    */
   explicit fixed_size_memory_resource(
     device_async_resource_ref upstream_mr,
     // NOLINTNEXTLINE bugprone-easily-swappable-parameters
     std::size_t block_size            = default_block_size,
     std::size_t blocks_to_preallocate = default_blocks_to_preallocate)
     : upstream_mr_{upstream_mr},
       block_size_{align_up(block_size, CUDA_ALLOCATION_ALIGNMENT)},
       upstream_chunk_size_{block_size_ * blocks_to_preallocate}
   {
     // Allocate the initial blocks and insert them into the free list. The returned
     // free_list is already a temporary, so no std::move is needed.
     this->insert_blocks(blocks_from_upstream(cuda_stream_legacy), cuda_stream_legacy);
   }

   /**
    * @brief Construct a fixed-size resource from a raw pointer to the upstream resource.
    *
    * The pointer is converted with `to_device_async_resource_ref_checked` (presumably
    * rejecting a null pointer -- confirm against its definition) and construction is then
    * delegated to the `device_async_resource_ref` overload so both paths stay identical.
    *
    * @param upstream_mr Pointer to the resource from which chunks of blocks are allocated.
    * @param block_size Requested size in bytes of each block.
    * @param blocks_to_preallocate Number of blocks obtained per upstream allocation.
    */
   explicit fixed_size_memory_resource(
     Upstream* upstream_mr,
     // NOLINTNEXTLINE bugprone-easily-swappable-parameters
     std::size_t block_size            = default_block_size,
     std::size_t blocks_to_preallocate = default_blocks_to_preallocate)
     : fixed_size_memory_resource(
         to_device_async_resource_ref_checked(upstream_mr), block_size, blocks_to_preallocate)
   {
   }

   /**
    * @brief Destroy the resource, returning every upstream chunk via `release()`.
    */
   ~fixed_size_memory_resource() override { release(); }

   fixed_size_memory_resource()                                             = delete;
   fixed_size_memory_resource(fixed_size_memory_resource const&)            = delete;
   fixed_size_memory_resource(fixed_size_memory_resource&&)                 = delete;
   fixed_size_memory_resource& operator=(fixed_size_memory_resource const&) = delete;
   fixed_size_memory_resource& operator=(fixed_size_memory_resource&&)      = delete;

   /**
    * @brief Get a `device_async_resource_ref` to the upstream resource.
    *
    * @return Reference to the resource that chunks are allocated from.
    */
   [[nodiscard]] device_async_resource_ref get_upstream_resource() const noexcept
   {
     return upstream_mr_;
   }

   /**
    * @brief Get the size of the blocks allocated by this memory resource.
    *
    * @return The block size in bytes, after alignment to `CUDA_ALLOCATION_ALIGNMENT`.
    */
   [[nodiscard]] std::size_t get_block_size() const noexcept { return block_size_; }

  protected:
   using free_list  = detail::fixed_size_free_list;  ///< The free list type
   using block_type = free_list::block_type;         ///< The type of block managed by the free list
   using typename detail::stream_ordered_memory_resource<fixed_size_memory_resource<Upstream>,
                                                         detail::fixed_size_free_list>::split_block;
   using lock_guard = std::lock_guard<std::mutex>;  ///< Type of lock used to synchronize access

   /**
    * @brief Get the (fixed) size of allocations supported by this memory resource.
    *
    * @return The block size in bytes; a single allocation can never exceed one block.
    */
   [[nodiscard]] std::size_t get_maximum_allocation_size() const { return get_block_size(); }

   /**
    * @brief Allocate a new chunk from upstream to supply the suballocation pool.
    *
    * The freshly carved blocks are merged into `blocks`, after which one block is taken
    * from that list and returned.
    *
    * @param size Size in bytes of the allocation that triggered the expansion.
    * @param blocks The free list to which the new blocks are added.
    * @param stream The stream on which the upstream allocation is performed.
    * @return A block obtained from `blocks.get_block(size)`.
    */
   block_type expand_pool(std::size_t size, free_list& blocks, cuda_stream_view stream)
   {
     blocks.insert(blocks_from_upstream(stream));
     return blocks.get_block(size);
   }

   /**
    * @brief Allocate one chunk from upstream and split it into fixed-size blocks.
    *
    * The chunk is recorded in `upstream_blocks_` so that `release()` can return it later.
    *
    * @param stream The stream on which the upstream allocation is performed.
    * @return A free list holding `upstream_chunk_size_ / block_size_` blocks.
    */
   free_list blocks_from_upstream(cuda_stream_view stream)
   {
     void* ptr = get_upstream_resource().allocate_async(upstream_chunk_size_, stream);
     block_type block{ptr};
     upstream_blocks_.push_back(block);

     auto num_blocks = upstream_chunk_size_ / block_size_;

     // The index type matches the std::size_t counting iterator below, so the byte
     // offset is computed without a narrowing conversion through int.
     auto block_gen = [ptr, this](std::size_t index) {
       // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
       return block_type{static_cast<char*>(ptr) + index * block_size_};
     };
     auto first =
       thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), block_gen);
     return free_list(first, first + num_blocks);
   }

   /**
    * @brief Produce the block used to satisfy an allocation of `size` bytes.
    *
    * Blocks of this resource are never split: the whole block is handed out and the
    * remainder is always an empty (null) block.
    *
    * @param block The block taken from the free list.
    * @param size Requested size in bytes (unused because every block has the same size).
    * @return A pair of `block` and a null remainder block.
    */
   split_block allocate_from_block(block_type const& block, [[maybe_unused]] std::size_t size)
   {
     return {block, block_type{nullptr}};
   }

   /**
    * @brief Build the block associated with `ptr` so it can be returned to the free list.
    *
    * Deallocating a fixed-size block just inserts it in the free list, which is handled by
    * the parent class.
    *
    * @param ptr Pointer to the memory being freed.
    * @param size Size in bytes of the original allocation; only checked by the assertion.
    * @return The block wrapping `ptr`.
    */
   block_type free_block(void* ptr, [[maybe_unused]] std::size_t size) noexcept
   {
     RMM_LOGGING_ASSERT(align_up(size, CUDA_ALLOCATION_ALIGNMENT) <= block_size_);
     return block_type{ptr};
   }

   /**
    * @brief Free all memory allocated using the upstream resource.
    *
    * Holds the resource's mutex while every recorded chunk is deallocated.
    */
   void release()
   {
     lock_guard lock(this->get_mutex());

     for (auto block : upstream_blocks_) {
       get_upstream_resource().deallocate(block.pointer(), upstream_chunk_size_);
     }
     upstream_blocks_.clear();
   }

 #ifdef RMM_DEBUG_PRINT
   /**
    * @brief Print device memory usage, the upstream chunks and the free blocks to stdout.
    */
   void print()
   {
     lock_guard lock(this->get_mutex());

     auto const [free, total] = rmm::available_device_memory();
     std::cout << "GPU free memory: " << free << " total: " << total << "\n";

     std::cout << "upstream_blocks: " << upstream_blocks_.size() << "\n";
     std::size_t upstream_total{0};

     for (auto blocks : upstream_blocks_) {
       blocks.print();
       upstream_total += upstream_chunk_size_;
     }
     std::cout << "total upstream: " << upstream_total << " B\n";

     this->print_free_blocks();
   }
 #endif

   /**
    * @brief Get the largest available block size and total free size in a free list.
    *
    * @param blocks The free list to summarize.
    * @return `{0, 0}` when `blocks` is empty, otherwise
    *         `{block_size, blocks.size() * block_size}`.
    */
   std::pair<std::size_t, std::size_t> free_list_summary(free_list const& blocks)
   {
     return blocks.is_empty() ? std::make_pair(std::size_t{0}, std::size_t{0})
                              : std::make_pair(block_size_, blocks.size() * block_size_);
   }

  private:
   device_async_resource_ref upstream_mr_;  // The resource from which to allocate new blocks

   std::size_t block_size_;           // size of blocks this MR allocates
   std::size_t upstream_chunk_size_;  // size of chunks allocated from heap MR

   // blocks allocated from heap: so they can be easily freed
   std::vector<block_type> upstream_blocks_;
 };

  // end of group

 } // namespace mr

 } // namespace RMM_NAMESPACE

aligned.hpp

rmm::cuda_stream_view
CUDA stream 的强类型非拥有包装器，带有默认构造函数。
定义： cuda_stream_view.hpp:39

rmm::mr::fixed_size_memory_resource
一种 device_memory_resource，它分配固定大小的内存块。
定义： fixed_size_memory_resource.hpp:53

rmm::mr::fixed_size_memory_resource::free_list_summary
std::pair< std::size_t, std::size_t > free_list_summary(free_list const &blocks)
获取指定空闲列表中最大的可用块大小和总空闲大小。
定义： fixed_size_memory_resource.hpp:268

rmm::mr::fixed_size_memory_resource::free_list
detail::fixed_size_free_list free_list
空闲列表类型。
定义： fixed_size_memory_resource.hpp:140

rmm::mr::fixed_size_memory_resource::free_block
block_type free_block(void *ptr, std::size_t size) noexcept
找到、释放并返回与指针关联的块。
定义： fixed_size_memory_resource.hpp:217

rmm::mr::fixed_size_memory_resource::get_block_size
std::size_t get_block_size() const noexcept
获取此内存资源分配的块大小。
定义： fixed_size_memory_resource.hpp:137

rmm::mr::fixed_size_memory_resource::get_maximum_allocation_size
std::size_t get_maximum_allocation_size() const
获取此内存资源支持的（固定）分配大小。
定义： fixed_size_memory_resource.hpp:152

rmm::mr::fixed_size_memory_resource::block_type
free_list::block_type block_type
空闲列表管理的块类型。
定义： fixed_size_memory_resource.hpp:141

rmm::mr::fixed_size_memory_resource::get_upstream_resource
device_async_resource_ref get_upstream_resource() const noexcept
指向上游资源的 device_async_resource_ref。
定义： fixed_size_memory_resource.hpp:127

rmm::mr::fixed_size_memory_resource::expand_pool
block_type expand_pool(std::size_t size, free_list &blocks, cuda_stream_view stream)
从上游分配一个块，以供子分配池使用。
定义： fixed_size_memory_resource.hpp:165

rmm::mr::fixed_size_memory_resource::blocks_from_upstream
free_list blocks_from_upstream(cuda_stream_view stream)
从上游分配块以扩展子分配池。
定义： fixed_size_memory_resource.hpp:177

rmm::mr::fixed_size_memory_resource::lock_guard
std::lock_guard< std::mutex > lock_guard
用于同步访问的锁类型。
定义： fixed_size_memory_resource.hpp:144

rmm::mr::fixed_size_memory_resource::release
void release()
释放使用上游资源分配的所有内存。
定义： fixed_size_memory_resource.hpp:229

rmm::mr::fixed_size_memory_resource::allocate_from_block
split_block allocate_from_block(block_type const &block, std::size_t size)
如有必要，分割块并返回指向 size 字节内存的指针。
定义： fixed_size_memory_resource.hpp:204

cuda_stream_view.hpp

rmm::available_device_memory
std::pair< std::size_t, std::size_t > available_device_memory()
返回当前设备可用和总设备内存（以字节为单位）。
定义： cuda_device.hpp:123

rmm::device_async_resource_ref
cuda::mr::async_resource_ref< cuda::mr::device_accessible > device_async_resource_ref
带有 cuda::mr::device_accessible 属性的 cuda::mr::async_resource_ref 的别名。
定义： resource_ref.hpp:40

rmm::CUDA_ALLOCATION_ALIGNMENT
static constexpr std::size_t CUDA_ALLOCATION_ALIGNMENT
用于 CUDA 内存分配的默认对齐方式。
定义： aligned.hpp:43

rmm::align_up
constexpr std::size_t align_up(std::size_t value, std::size_t alignment) noexcept
向上对齐到指定 2 的幂的最近倍数。
定义： aligned.hpp:77

resource_ref.hpp