libcudf: types.hpp 源文件

 /*

  * 版权所有 (c) 2019-2025, NVIDIA CORPORATION.

  *

  * 根据 Apache 许可证，版本 2.0（“许可证”）获得许可；

  * 您除非遵守许可证的规定，否则不得使用此文件。

  * 您可以在以下位置获取许可证的副本：

  *

  * http://www.apache.org/licenses/LICENSE-2.0

  *

  * 除非适用法律要求或书面同意，否则

  * 根据许可证分发的软件按“原样”分发，

  * 不附带任何明示或默示的保证或条件。

  * 请参阅许可证了解管理权限和

  * 限制的特定语言。

  */


 #pragma once


 #include <cudf/table/table.hpp>

 #include <cudf/types.hpp>

 #include <cudf/utilities/span.hpp>


 #include <map>

 #include <memory>

 #include <optional>

 #include <string>

 #include <unordered_map>

 #include <utility>

 #include <vector>


 // Forward declarations of the user-implementable I/O interface classes, so
 // that the declarations below can hold pointers to them without needing the
 // full class definitions.
 namespace CUDF_EXPORT cudf {

 namespace io {

 class data_sink;

 class datasource;

 } // namespace io

 } // namespace CUDF_EXPORT cudf


 namespace CUDF_EXPORT cudf {

 namespace io {

 /*!
  * \brief Compression algorithms.
  *
  * The enumerator values are written out explicitly; they are the same values
  * the declaration order would assign implicitly (0 through 11).
  */
 enum class compression_type : int32_t {
   NONE   = 0,   /*!< No compression. */
   AUTO   = 1,   /*!< Automatically detect or select the compression format. */
   SNAPPY = 2,   /*!< SNAPPY format. */
   GZIP   = 3,   /*!< GZIP format. */
   BZIP2  = 4,   /*!< BZIP2 format, using the Burrows-Wheeler transform. */
   BROTLI = 5,   /*!< BROTLI format, using LZ77 + Huffman + 2nd order context modeling. */
   ZIP    = 6,   /*!< ZIP format, using the DEFLATE algorithm. */
   XZ     = 7,   /*!< XZ format, using the LZMA(2) algorithm. */
   ZLIB   = 8,   /*!< ZLIB format. */
   LZ4    = 9,   /*!< LZ4 format. */
   LZO    = 10,  /*!< LZO format. */
   ZSTD   = 11   /*!< ZSTD format. */
 };


 /*!
  * \brief Data source or destination types.
  *
  * The enumerator values are written out explicitly; they are the same values
  * the declaration order would assign implicitly.
  */
 enum class io_type : int32_t {
   FILEPATH         = 0,  /*!< Input/output is a file path. */
   HOST_BUFFER      = 1,  /*!< Input/output is a buffer in host memory. */
   DEVICE_BUFFER    = 2,  /*!< Input/output is a buffer in device memory. */
   VOID             = 3,  /*!< Input/output is nothing. No work is done. Useful for benchmarking. */
   USER_IMPLEMENTED = 4   /*!< Input/output is handled by a custom user class. */
 };


 /*!
  * \brief Behavior when handling quotations in field data.
  *
  * The enumerator values are written out explicitly; they are the same values
  * the declaration order would assign implicitly.
  */
 enum class quote_style : int32_t {
   MINIMAL    = 0,  /*!< Quote only fields which contain special characters. */
   ALL        = 1,  /*!< Quote all fields. */
   NONNUMERIC = 2,  /*!< Quote all non-numeric fields. */
   NONE       = 3   /*!< Presumably no quoting is applied - TODO confirm against the writer. */
 };


 /*!
  * \brief Column statistics granularity type for the parquet/orc writers.
  *
  * This is intentionally an unscoped enum: the enumerators are visible
  * directly in the enclosing namespace.
  */
 enum statistics_freq : int32_t {
   /*! No column statistics. */
   STATISTICS_NONE = 0,
   /*! Per-rowgroup column statistics. */
   STATISTICS_ROWGROUP = 1,
   /*! Per-page column statistics. */
   STATISTICS_PAGE = 2,
   /*! Full column and offset indices. Implies STATISTICS_ROWGROUP. */
   STATISTICS_COLUMN = 3
 };


 /*!
  * \brief Valid encodings for use with `column_in_metadata::set_encoding()`.
  *
  * The enumerator values are written out explicitly; they are the same values
  * the declaration order would assign implicitly after `USE_DEFAULT = -1`.
  */
 enum class column_encoding : int32_t {
   // Common encodings:
   USE_DEFAULT = -1,             /*!< No encoding has been requested, use default encoding. */
   DICTIONARY  = 0,
   // Parquet encodings:
   PLAIN                   = 1,  /*!< Use plain encoding. */
   DELTA_BINARY_PACKED     = 2,  /*!< Use DELTA_BINARY_PACKED encoding (only valid for integer columns). */
   DELTA_LENGTH_BYTE_ARRAY = 3,
   DELTA_BYTE_ARRAY        = 4,
   BYTE_STREAM_SPLIT       = 5,  /*!< Use BYTE_STREAM_SPLIT encoding (valid for all fixed width types). */
   // ORC encodings:
   DIRECT        = 6,
   DIRECT_V2     = 7,
   DICTIONARY_V2 = 8
 };


 class writer_compression_statistics {

  public

  writer_compression_statistics() = default;


  /*!

  * \brief 构造新的压缩统计对象。

  *

  * \param num_compressed_bytes 成功压缩的字节数。

  * \param num_failed_bytes 压缩失败的字节数。

  * \param num_skipped_bytes 未压缩的字节数（例如太小）。

  * \param num_compressed_output_bytes 生成的压缩数据的大小。

  */

  writer_compression_statistics(size_t num_compressed_bytes,

  size_t num_failed_bytes,

  size_t num_skipped_bytes,

  /*!

  * \brief 将另一个对象的压缩统计信息添加到此对象。

  *

  * \param other 要添加统计信息的另一个对象。

  * \return 更新了新统计信息的此对象。

  */

  writer_compression_statistics& operator+=(writer_compression_statistics const& other) noexcept

  {

  _num_compressed_bytes += other._num_compressed_bytes;

  /*!

  * \brief 返回成功压缩的字节数。

  /*!

  * \brief 返回压缩失败的字节数。

  /*!

  * \brief 返回未压缩的字节数。

  /*!

  * \brief 返回考虑进行压缩的总输入字节数。

  */

  [[nodiscard]] auto num_total_input_bytes() const noexcept

  {

  /*!

  * \brief 返回总压缩比，即 `num_compressed_bytes / num_compressed_output_bytes`

  *

  * \note 如果 `num_compressed_output_bytes` 为零，则返回 0.0。

  */

  private

  std::size_t _num_compressed_bytes = 0;

  std::size_t _num_failed_bytes = 0;

  std::size_t _num_skipped_bytes = 0;

  std::size_t _num_compressed_output_bytes = 0;

 };


 /*!

  * \brief 字典编码策略。

  *

  * 这控制何时/是否对字符串列使用字典编码

  * 在写操作期间。

  */

 /*!

  * \brief 列的 Schema 信息。

  *

  * 读操作使用此信息提供 Schema 详细信息，写操作使用此信息

  * 覆盖列数据本身提供的 Schema 详细信息。

  */


  /*!

  * \brief 从列名和可选属性构造新实例。

  *

  * \param _name 列名。

  * \param _is_nullable 列的可选可为空性。

  * \param _is_binary 列的可选二进制类型。

  */

  column_name_info() = default;


  /*!

  * \brief 相等比较运算符。

  *

  * \param rhs 要比较的另一个列信息对象。

  * \return 如果对象相等则返回 True，否则返回 False。

  */

 };


 /*!

  * \brief 表元数据信息。

  */

  /*!< Schema 信息。 */

  /*!< 每个数据源中的行数。 */

  /*!< 任意文件级用户定义键值元数据。 */

  /*!< 每个文件的任意用户定义键值元数据。 */


  // 以下变量目前仅针对 Parquet 读取器计算

  /*!< 读取的总输入行组数（仅限 Parquet）。 */

  std::optional<size_type>

  /*!< 应用统计过滤器后的行组数（仅限 Parquet）。 */

  std::optional<size_type>

  /*!< 应用 Bloom 过滤器后的行组数（仅限 Parquet）。 */

 };


 /*!

  * \brief 带有附加元数据的表。

  */

 };


 /*!

  * \brief 主机可访问内存缓冲区的视图。

  * \deprecated 请改用 `cudf::host_span`。

  */

  host_buffer() = default;

  /*!

  * \brief 从指针和大小构造新实例。

  *

  /*!

  * \brief 检查类型 `T` 是否类似于字节（char, int8_t, unsigned char, uint8_t, std::byte）。

  */

  using non_cv_T = std::remove_cv_t<T>;

  return std::is_same_v<non_cv_T, int8_t> || std::is_same_v<non_cv_T, char> ||

  std::is_same_v<non_cv_T, uint8_t> || std::is_same_v<non_cv_T, unsigned char> ||

  std::is_same_v<non_cv_T, std::byte>;

 }


 /*!

  * \brief 数据源信息。

  */

  /*!

  * \brief 从文件路径列表构造新实例。

  *

  * \param file_paths 文件路径列表。

  */

  /*!

  * \brief 从单个文件路径构造新实例。

  *

  * \param file_path 单个文件路径。

  */

  /*!

  * \brief 从主机缓冲区列表构造新实例。

  *

  * \param host_buffers 主机缓冲区列表。

  */

  explicit source_info(std::vector<host_buffer> const& host_buffers) : _type(io_type::HOST_BUFFER)

  {

  _host_buffers.reserve(host_buffers.size());

  std::transform(host_buffers.begin(),

  host_buffers.end(),

  std::back_inserter(_host_buffers),

  [](auto const hb) {

  /*!

  * \brief 从单个主机缓冲区构造新实例。

  *

  * \param host_data 指向数据的指针。

  * \param size 数据大小（字节）。

  */

  explicit source_info(char const* host_data, size_t size)

  /*!

  * \brief 从主机 span 列表构造新实例。

  *

  * \tparam T 主机 span 的元素类型。必须类似于字节。

  * \param host_buffers 主机 span 列表。

  */

  explicit source_info(cudf::host_span<cudf::host_span<T>> const host_buffers)

  : _type(io_type::HOST_BUFFER)

  {

  if constexpr (not std::is_same_v<std::remove_cv_t<T>, std::byte>) {

  _host_buffers.reserve(host_buffers.size());

  std::transform(host_buffers.begin(),

  host_buffers.end(),

  std::back_inserter(_host_buffers),

  [](auto const s) {

  return cudf::host_span<std::byte const>{

  reinterpret_cast<std::byte const*>(s.data()), s.size()};

  });

  /*!

  * \brief 从单个主机 span 构造新实例。

  *

  * \tparam T 主机 span 的元素类型。必须类似于字节。

  * \param host_data 单个主机 span。

  */

  explicit source_info(cudf::host_span<T> host_data)

  : _type(io_type::HOST_BUFFER),

  /*!

  * \brief 从设备 span 列表构造新实例。

  *

  * \param device_buffers 设备 span 列表。

  */

  /*!

  * \brief 从单个设备缓冲区构造新实例。

  *

  * \param d_buffer 单个设备缓冲区。

  */

  /*!

  * \brief 从用户实现的数据源对象列表构造新实例。

  *

  * \param sources 数据源对象列表。

  */

  /*!

  * \brief 从单个用户实现的数据源对象构造新实例。

  *

  * \param source 单个数据源对象。

  */

  /*!

  /*!

  /*!

  /*!

  /*!

  * \brief 如果类型是 `USER_IMPLEMENTED` ，则返回用户实现的数据源对象列表

  */

  private

  io_type _type = io_type::VOID;

  std::vector<std::string> _filepaths;

  std::vector<cudf::host_span<std::byte const>> _host_buffers;

  std::vector<cudf::device_span<std::byte const>> _device_buffers;

  std::vector<cudf::io::datasource*> _user_sources;

 };

 /*!

  * \brief 数据目标信息。

  /*!

  * \brief 构造具有特定数量目标的新实例。

  /*!

  * \brief 从文件路径列表构造新实例。

  *

  * \param file_paths 文件路径列表。

  */

  /*!

  * \brief 从单个文件路径构造新实例。

  *

  * \param file_path 单个文件路径。

  */

  /*!

  * \brief 从主机缓冲区列表构造新实例。

  *

  * \param buffers 主机缓冲区列表。

  /*!

  * \brief 从单个主机缓冲区构造新实例。

  /*!

  * \brief 从用户实现的数据目标对象列表构造新实例。

  *

  * \param user_sinks 数据目标对象列表。

  */

  : _type(io_type::USER_IMPLEMENTED),

  _num_sinks(user_sinks.size()),

  /*!

  * \brief 从单个用户实现的数据目标对象构造新实例。

  *

  * \param user_sink 单个数据目标对象。

  */

  /*!

  /*!

  /*!

  /*!

  /*!

  * \brief 如果类型是 `USER_IMPLEMENTED` ，则返回用户实现的数据目标对象列表

  */

  private

  io_type _type = io_type::VOID;

  size_t _num_sinks = 1;

  std::vector<std::string> _filepaths;

  std::vector<std::vector<char>*> _buffers;

  std::vector<cudf::io::data_sink*> _user_sinks;

 };


 class table_input_metadata;

 /*!

  * \brief 写操作期间用于覆盖列 Schema 详细信息的元数据。

  */

  std::optional<bool> _nullable;

  bool _list_column_is_map = false;

  bool _use_int96_timestamp = false;

  bool _output_as_binary = false;

  bool _skip_compression = false;

  std::optional<uint8_t> _decimal_precision;

  std::optional<int32_t> _parquet_field_id;

  std::optional<int32_t> _type_length;

  std::vector<column_in_metadata> children;

  column_encoding _encoding = column_encoding::USE_DEFAULT;


  public

  column_in_metadata() = default;

  /*!

  /*!

  * \brief 向元数据添加一个子列。

  *

  * \param child 子列元数据。

  * \return 此对象用于链式 API 调用。

  */

  /*!

  * \brief 设置列名。

  *

  * \param name 列名。

  * \return 此对象用于链式 API 调用。

  */

  /*!

  * \brief 设置列可为空性。

  *

  * \param nullable 如果可为空则为 True，否则为 False。

  * \return 此对象用于链式 API 调用。

  */

  /*!

  * \brief 设置标志，将 list 列视为 map 列。

  *

  * \return 此对象用于链式 API 调用。

  */

  column_in_metadata& set_list_column_as_map() noexcept

  /*!

  * \brief 请求将时间戳输出为 INT96。

  * Applies only to Parquet.

  * \param req 如果需要 INT96 则为 True，否则为 False。

  * \return 此对象用于链式 API 调用。

  */

  /*!

  * \brief 设置列十进制精度。

  * Applies only to Parquet and ORC.

  * \param precision 列的十进制精度。

  * \return 此对象用于链式 API 调用。

  */

  /*!

  * \brief 设置列类型长度。

  * Applies only to Parquet (fixed-width types).

  * \param length 列的类型长度。

  * \return 此对象用于链式 API 调用。

  */

  /*!

  * \brief 设置列 Parquet 字段 ID。

  * Applies only to Parquet.

  * \param field_id 列的 Parquet 字段 ID。

  * \return 此对象用于链式 API 调用。

  */

  /*!

  * \brief 将列输出类型设置为二进制。仅限 Parquet。

  * \param binary 如果输出应为二进制则为 True，否则为 False。

  * \return 此对象用于链式 API 调用。

  */

  column_in_metadata& set_output_as_binary(bool binary) noexcept

  {

  _output_as_binary = binary;

  if (_output_as_binary and children.size() == 1) {

  children.emplace_back();

  } else if (!_output_as_binary and children.size() == 2) {

  /*!

  * \brief 设置列压缩为禁用。

  * Applies only to Parquet and ORC.

  * \param skip 如果应跳过压缩则为 True，否则为 False。

  * \return 此对象用于链式 API 调用。

  */

  /*!

  * \brief 设置列编码。

  * \note 某些编码仅适用于特定文件格式（例如，Parquet，ORC）。

  * \param encoding 用于该列的编码。

  * \return 此对象用于链式 API 调用。

  */

  /*!


  /*!
   * \brief Gets a const reference to a child of this column.
   *
   * \param i Index of the child; it is passed straight to `children[i]` with no bounds check.
   * \return Const reference to the child's metadata.
   */
  [[nodiscard]] column_in_metadata const& child(size_type i) const noexcept
  {
    return children[i];
  }


  /*!
   * \brief Gets the name of this column.
   *
   * \return Const reference to the stored column name.
   */
  [[nodiscard]] std::string const& get_name() const noexcept
  {
    return _name;
  }


  /*!
   * \brief Gets whether nullability has been explicitly set for this column.
   *
   * \return True if the optional nullability holds a value, false otherwise.
   */
  [[nodiscard]] bool is_nullability_defined() const noexcept
  {
    return _nullable.has_value();
  }


  /*!
   * \brief Gets the explicitly set nullability for this column.
   *
   * Not `noexcept`: `std::optional::value()` throws `std::bad_optional_access`
   * when no nullability has been set.
   *
   * \return The stored nullability.
   */
  [[nodiscard]] bool nullable() const
  {
    return _nullable.value();
  }


  /*!
   * \brief If this is the metadata of a list column, returns whether it is to be encoded as a map.
   *
   * \return The stored list-column-is-map flag.
   */
  [[nodiscard]] bool is_map() const noexcept
  {
    return _list_column_is_map;
  }


  /*!
   * \brief Gets whether to encode the timestamp column using the deprecated int96 physical type.
   *
   * \return The stored INT96 timestamp flag.
   */
  [[nodiscard]] bool is_enabled_int96_timestamps() const noexcept
  {
    return _use_int96_timestamp;
  }


  /*!
   * \brief Gets whether a precision has been set for this decimal column.
   *
   * \return True if the optional decimal precision holds a value, false otherwise.
   */
  [[nodiscard]] bool is_decimal_precision_set() const noexcept { return _decimal_precision.has_value(); }


  /*!
   * \brief Gets the decimal precision that was set for this column.
   *
   * Not `noexcept`: `std::optional::value()` throws `std::bad_optional_access`
   * when no precision has been set.
   *
   * \return The stored decimal precision.
   */
  [[nodiscard]] uint8_t get_decimal_precision() const
  {
    return _decimal_precision.value();
  }


  /*!
   * \brief Gets whether a type length has been set for this column.
   *
   * \return True if the optional type length holds a value, false otherwise.
   */
  [[nodiscard]] bool is_type_length_set() const noexcept
  {
    return _type_length.has_value();
  }


  [[nodiscard]] uint8_t get_type_length() const { return _type_length.value(); }


  /*!
   * \brief Gets whether a parquet field id has been set for this column.
   *
   * \return True if the optional parquet field id holds a value, false otherwise.
   */
  [[nodiscard]] bool is_parquet_field_id_set() const noexcept { return _parquet_field_id.has_value(); }


  /*!
   * \brief Gets the parquet field id that was set for this column.
   *
   * Not `noexcept`: `std::optional::value()` throws `std::bad_optional_access`
   * when no field id has been set.
   *
   * \return The stored parquet field id.
   */
  [[nodiscard]] int32_t get_parquet_field_id() const
  {
    return _parquet_field_id.value();
  }


  /*!
   * \brief Gets the number of children of this column.
   *
   * \return The size of the children vector, converted to `size_type`.
   */
  [[nodiscard]] size_type num_children() const noexcept
  {
    // Spell out the size_t -> size_type conversion the return statement performs anyway.
    return static_cast<size_type>(children.size());
  }


  /*!
   * \brief Gets whether to encode this column as binary or string data.
   *
   * \return The stored output-as-binary flag.
   */
  [[nodiscard]] bool is_enabled_output_as_binary() const noexcept
  {
    return _output_as_binary;
  }


  /*!
   * \brief Gets whether to skip compressing this column.
   *
   * \return The stored skip-compression flag.
   */
  [[nodiscard]] bool is_enabled_skip_compression() const noexcept
  {
    return _skip_compression;
  }


  /*!
   * \brief Gets the encoding that was set for this column.
   *
   * \return The stored column encoding.
   */
  [[nodiscard]] column_encoding get_encoding() const
  {
    return _encoding;
  }

 };


 /*!
  * \brief Metadata for a table.
  */
 class table_input_metadata {
  public:
   table_input_metadata() = default;  // Required by cython

   /*!
    * \brief Constructs a new table_input_metadata from a table_view.
    *
    * \param table The table_view to construct the metadata from.
    */
   explicit table_input_metadata(table_view const& table);

   /*!
    * \brief Constructs a new table_input_metadata from a table_metadata object.
    *
    * \param metadata The table_metadata to construct the metadata from.
    */
   explicit table_input_metadata(table_metadata const& metadata);

   std::vector<column_in_metadata> column_metadata;  /*!< List of column metadata. */
 };


 /*!
  * \brief Information used while writing partitioned datasets.
  */
 struct partition_info {
   // Default member initializers: a default-constructed object holds zeros
   // instead of indeterminate values.
   size_type start_row{0};  /*!< The start row of the partition. */
   size_type num_rows{0};   /*!< The number of rows in the partition. */

   partition_info() = default;

   /*!
    * \brief Constructs a new partition_info.
    *
    * \param start_row The start row of the partition.
    * \param num_rows The number of rows in the partition.
    */
   partition_info(size_type start_row, size_type num_rows) : start_row(start_row), num_rows(num_rows)
   {
   }
 };


 /*!
  * \brief Schema element for a reader.
  *
  * Holds the per-column read settings and owns the schema elements of the
  * column's children.
  */
 class reader_column_schema {
   // Whether to read binary data as a string column
   bool _convert_binary_to_strings{true};
   // Length set through set_type_length(); stays 0 until then
   int32_t _type_length{0};

   std::vector<reader_column_schema> children;

  public:
   reader_column_schema() = default;

   /*!
    * \brief Constructs a new reader column schema object.
    *
    * \param number_of_children Number of default-constructed children to create.
    */
   reader_column_schema(size_type number_of_children) { children.resize(number_of_children); }

   /*!
    * \brief Constructs a new reader column schema object with a span defining the children.
    *
    * \param child_span Span of schema elements that are copied into this object's children.
    */
   reader_column_schema(host_span<reader_column_schema> const& child_span)
   {
     children.assign(child_span.begin(), child_span.end());
   }

   /*!
    * \brief Adds the children metadata of this column.
    *
    * \param child The child schema element; a copy is appended to the children.
    * \return This object, for chaining calls.
    */
   reader_column_schema& add_child(reader_column_schema const& child)
   {
     children.push_back(child);
     return *this;
   }

   /*!
    * \brief Gets a reference to a child of this column.
    *
    * \param i Index of the child; it is passed straight to `children[i]` with no bounds check.
    * \return Reference to the child.
    */
   [[nodiscard]] reader_column_schema& child(size_type i) { return children[i]; }

   /*!
    * \brief Gets a const reference to a child of this column.
    *
    * \param i Index of the child; it is passed straight to `children[i]` with no bounds check.
    * \return Const reference to the child.
    */
   [[nodiscard]] reader_column_schema const& child(size_type i) const { return children[i]; }

   /*!
    * \brief Sets whether binary data is converted to strings for this column.
    *
    * \param convert_to_string True to convert binary data to strings, false otherwise.
    * \return This object, for chaining calls.
    */
   reader_column_schema& set_convert_binary_to_strings(bool convert_to_string)
   {
     _convert_binary_to_strings = convert_to_string;
     return *this;
   }

   /*!
    * \brief Sets the length of fixed length data.
    *
    * \param type_length The length to store.
    * \return This object, for chaining calls.
    */
   reader_column_schema& set_type_length(int32_t type_length)
   {
     _type_length = type_length;
     return *this;
   }

   /*!
    * \brief Gets whether binary data is converted to strings for this column.
    *
    * \return The stored convert-binary-to-strings flag.
    */
   [[nodiscard]] bool is_enabled_convert_binary_to_strings() const
   {
     return _convert_binary_to_strings;
   }

   /*!
    * \brief Gets the length in bytes of this fixed length data.
    *
    * \return The stored type length.
    */
   [[nodiscard]] int32_t get_type_length() const { return _type_length; }

   /*!
    * \brief Gets the number of child objects.
    *
    * \return The size of the children vector.
    */
   [[nodiscard]] size_t get_num_children() const { return children.size(); }
 };

  // end of group

 } // namespace io

 } // namespace CUDF_EXPORT cudf

cudf::detail::span_base::end
constexpr CUDF_HOST_DEVICE iterator end() const noexcept
返回指向 span 中最后一个元素之后元素的迭代器。
定义： span.hpp:105

cudf::detail::span_base::begin
constexpr CUDF_HOST_DEVICE iterator begin() const noexcept
返回指向 span 中第一个元素的迭代器。
定义： span.hpp:97

cudf::io::column_in_metadata
列的元数据。
定义： io/types.hpp:634

cudf::io::column_in_metadata::set_name
column_in_metadata & set_name(std::string const &name) noexcept
设置此列的名称。
定义： io/types.hpp:674

cudf::io::column_in_metadata::add_child
column_in_metadata & add_child(column_in_metadata const &child)
添加此列的子元数据。
定义： io/types.hpp:662

cudf::io::column_in_metadata::is_enabled_output_as_binary
bool is_enabled_output_as_binary() const noexcept
获取此列是否编码为二进制或字符串数据。
定义： io/types.hpp:925

cudf::io::column_in_metadata::set_parquet_field_id
column_in_metadata & set_parquet_field_id(int32_t field_id) noexcept
设置此列的 parquet 字段 ID。
定义： io/types.hpp:751

cudf::io::column_in_metadata::set_int96_timestamps
column_in_metadata & set_int96_timestamps(bool req) noexcept
指定此时间戳列是否应使用已弃用的 int96 物理类型进行编码...。
定义： io/types.hpp:713

cudf::io::column_in_metadata::set_decimal_precision
column_in_metadata & set_decimal_precision(uint8_t precision) noexcept
设置此列的十进制精度。仅当此列为十进制（定点）类型时有效。
定义： io/types.hpp:726

cudf::io::column_in_metadata::is_enabled_int96_timestamps
bool is_enabled_int96_timestamps() const noexcept
获取是否使用已弃用的 int96 物理类型对时间戳列进行编码。
定义： io/types.hpp:857

cudf::io::column_in_metadata::is_parquet_field_id_set
bool is_parquet_field_id_set() const noexcept
获取是否已为此列设置了 parquet 字段 ID。
定义： io/types.hpp:899

cudf::io::column_in_metadata::is_decimal_precision_set
bool is_decimal_precision_set() const noexcept
获取是否已为此十进制列设置精度。
定义： io/types.hpp:864

cudf::io::column_in_metadata::is_type_length_set
bool is_type_length_set() const noexcept
获取是否已为此列设置类型长度。
定义： io/types.hpp:883

cudf::io::column_in_metadata::nullable
bool nullable() const
获取此列显式设置的可空性。
定义： io/types.hpp:842

cudf::io::column_in_metadata::num_children
size_type num_children() const noexcept
获取此列的子项数量。
定义： io/types.hpp:918

cudf::io::column_in_metadata::is_map
bool is_map() const noexcept
如果这是列表列的元数据，则返回它是否要编码为 map。
定义： io/types.hpp:849

cudf::io::column_in_metadata::get_decimal_precision
uint8_t get_decimal_precision() const
获取为此列设置的十进制精度。
定义： io/types.hpp:876

cudf::io::column_in_metadata::set_encoding
column_in_metadata & set_encoding(column_encoding encoding) noexcept
设置此列使用的编码。
定义： io/types.hpp:799

cudf::io::column_in_metadata::get_encoding
column_encoding get_encoding() const
获取为此列设置的编码。
定义： io/types.hpp:939

cudf::io::column_in_metadata::get_type_length
uint8_t get_type_length() const
获取为此列设置的类型长度。
定义： io/types.hpp:892

cudf::io::column_in_metadata::set_output_as_binary
column_in_metadata & set_output_as_binary(bool binary) noexcept
指定此列应写为二进制还是字符串数据。仅对以下类型有效...
定义： io/types.hpp:765

cudf::io::column_in_metadata::child
column_in_metadata & child(size_type i) noexcept
获取此列子项的引用。
定义： io/types.hpp:811

cudf::io::column_in_metadata::set_type_length
column_in_metadata & set_type_length(int32_t length) noexcept
设置列的数据长度。仅当此列是固定长度字节数组时有效。
定义： io/types.hpp:739

cudf::io::column_in_metadata::is_enabled_skip_compression
bool is_enabled_skip_compression() const noexcept
获取是否跳过压缩此列。
定义： io/types.hpp:932

cudf::io::column_in_metadata::get_parquet_field_id
int32_t get_parquet_field_id() const
获取为此列设置的 parquet 字段 ID。
定义： io/types.hpp:911

cudf::io::column_in_metadata::set_list_column_as_map
column_in_metadata & set_list_column_as_map() noexcept
指定此列表列应在写入文件中编码为 map。
定义： io/types.hpp:699

cudf::io::column_in_metadata::column_in_metadata
column_in_metadata(std::string_view name)
构造一个新的列元数据对象。
定义： io/types.hpp:655

cudf::io::column_in_metadata::get_name
std::string const & get_name() const noexcept
获取此列的名称。
定义： io/types.hpp:826

cudf::io::column_in_metadata::set_nullability
column_in_metadata & set_nullability(bool nullable) noexcept
设置此列的可空性。
定义： io/types.hpp:686

cudf::io::column_in_metadata::is_nullability_defined
bool is_nullability_defined() const noexcept
获取此列是否已显式设置可空性。
定义： io/types.hpp:833

cudf::io::column_in_metadata::child
column_in_metadata const & child(size_type i) const noexcept
获取此列子项的 const 引用。
定义： io/types.hpp:819

cudf::io::column_in_metadata::set_skip_compression
column_in_metadata & set_skip_compression(bool skip) noexcept
指定此列是否应跳过压缩，无论指定的压缩编解码器是什么...
定义： io/types.hpp:783

cudf::io::data_sink
用于存储 writer 输出数据的接口类。
定义： data_sink.hpp:43

cudf::io::datasource
用于向 reader 提供输入数据的接口类。
定义： datasource.hpp:42

cudf::io::reader_column_schema
reader 的 schema 元素
定义： io/types.hpp:997

cudf::io::reader_column_schema::child
reader_column_schema const & child(size_type i) const
获取此列子项的 const 引用。
定义： io/types.hpp:1050

cudf::io::reader_column_schema::set_type_length
reader_column_schema & set_type_length(int32_t type_length)
设置固定长度数据的长度。
定义： io/types.hpp:1072

cudf::io::reader_column_schema::is_enabled_convert_binary_to_strings
bool is_enabled_convert_binary_to_strings() const
获取此列是否编码为二进制或字符串数据。
定义： io/types.hpp:1083

cudf::io::reader_column_schema::get_type_length
int32_t get_type_length() const
获取此固定长度数据的字节长度。
定义： io/types.hpp:1093

cudf::io::reader_column_schema::reader_column_schema
reader_column_schema(host_span< reader_column_schema > const &child_span)
构造一个包含定义子项的 span 的新 reader 列 schema 对象。
定义： io/types.hpp:1019

cudf::io::reader_column_schema::set_convert_binary_to_strings
reader_column_schema & set_convert_binary_to_strings(bool convert_to_string)
指定此列应写为二进制还是字符串数据。仅对以下类型有效...
定义： io/types.hpp:1060

cudf::io::reader_column_schema::get_num_children
size_t get_num_children() const
获取子对象的数量。
定义： io/types.hpp:1100

cudf::io::reader_column_schema::add_child
reader_column_schema & add_child(reader_column_schema const &child)
添加此列的子元数据。
定义： io/types.hpp:1030

cudf::io::reader_column_schema::child
reader_column_schema & child(size_type i)
获取此列子项的引用。
定义： io/types.hpp:1042

cudf::io::reader_column_schema::reader_column_schema
reader_column_schema(size_type number_of_children)
构造一个新的 reader 列 schema 对象。
定义： io/types.hpp:1012

cudf::io::table_input_metadata
表的元数据。
定义： io/types.hpp:945

cudf::io::table_input_metadata::table_input_metadata
table_input_metadata(table_view const &table)
从 table_view 构造新的 table_input_metadata。

cudf::io::table_input_metadata::table_input_metadata
table_input_metadata(table_metadata const &metadata)
从 table_metadata 对象构造新的 table_input_metadata。

cudf::io::table_input_metadata::column_metadata
std::vector< column_in_metadata > column_metadata
列元数据列表。
定义： io/types.hpp:968

cudf::io::writer_compression_statistics
writer 执行的压缩统计信息。
定义： io/types.hpp:127

cudf::io::writer_compression_statistics::compression_ratio
auto compression_ratio() const noexcept
返回成功压缩块的压缩比。
定义： io/types.hpp:210

cudf::io::writer_compression_statistics::num_total_input_bytes
auto num_total_input_bytes() const noexcept
返回压缩输入的总大小。
定义： io/types.hpp:197

cudf::io::writer_compression_statistics::operator+=
writer_compression_statistics & operator+=(writer_compression_statistics const &other) noexcept
添加另一个 writer_compression_statistics 对象的值。
定义： io/types.hpp:159

cudf::io::writer_compression_statistics::num_failed_bytes
auto num_failed_bytes() const noexcept
返回压缩失败块中的字节数。
定义： io/types.hpp:183

cudf::io::writer_compression_statistics::writer_compression_statistics
writer_compression_statistics()=default
默认构造函数。

cudf::io::writer_compression_statistics::num_skipped_bytes
auto num_skipped_bytes() const noexcept
返回压缩过程中跳过块中的字节数。
定义： io/types.hpp:190

cudf::io::writer_compression_statistics::writer_compression_statistics
writer_compression_statistics(size_t num_compressed_bytes, size_t num_failed_bytes, size_t num_skipped_bytes, size_t num_compressed_output_bytes)
带初始值的构造函数。
定义： io/types.hpp:142

cudf::io::writer_compression_statistics::num_compressed_bytes
auto num_compressed_bytes() const noexcept
返回成功压缩块中的字节数。
定义： io/types.hpp:176

cudf::table_view
一组相同大小的 cudf::column_view。
定义： table_view.hpp:200

cudf::table
一组相同大小的 cudf::column。
定义： table.hpp:40

cudf::io::statistics_freq
statistics_freq
parquet/orc writer 的列统计信息粒度类型。
定义： io/types.hpp:96

cudf::io::column_encoding
column_encoding
可与 column_in_metadata::set_encoding() 一起使用的有效编码。
定义： io/types.hpp:106

cudf::io::quote_style
quote_style
处理字段数据中的引号时的行为。
定义： io/types.hpp:86

cudf::io::is_byte_like_type
constexpr auto is_byte_like_type()
如果类型是字节类（即合理地可以作为字节指针传递），则返回 true。
定义： io/types.hpp:337

cudf::io::dictionary_policy
dictionary_policy
控制 parquet writer 使用字典编码。
定义： io/types.hpp:225

cudf::io::compression_type
compression_type
压缩算法。
定义： io/types.hpp:57

cudf::io::io_type
io_type
数据源或目的地类型。
定义： io/types.hpp:75

cudf::io::STATISTICS_COLUMN
@ STATISTICS_COLUMN
完整的列和偏移量索引。意味着 STATISTICS_ROWGROUP。
定义： io/types.hpp:100

cudf::io::STATISTICS_ROWGROUP
@ STATISTICS_ROWGROUP
每行组列统计信息。
定义： io/types.hpp:98

cudf::io::STATISTICS_NONE
@ STATISTICS_NONE
无列统计信息。
定义： io/types.hpp:97

cudf::io::STATISTICS_PAGE
@ STATISTICS_PAGE
每页列统计信息。
定义： io/types.hpp:99

cudf::io::column_encoding::DELTA_BINARY_PACKED
@ DELTA_BINARY_PACKED
使用 DELTA_BINARY_PACKED 编码（仅对整数列有效）

cudf::io::column_encoding::DELTA_BYTE_ARRAY
@ DELTA_BYTE_ARRAY

cudf::io::column_encoding::USE_DEFAULT
@ USE_DEFAULT
未请求编码，使用默认编码。

cudf::io::column_encoding::DELTA_LENGTH_BYTE_ARRAY
@ DELTA_LENGTH_BYTE_ARRAY

cudf::io::column_encoding::PLAIN
@ PLAIN
使用 plain 编码。

cudf::io::column_encoding::BYTE_STREAM_SPLIT
@ BYTE_STREAM_SPLIT
使用 BYTE_STREAM_SPLIT 编码（对所有固定宽度类型有效）

cudf::io::quote_style::MINIMAL
@ MINIMAL
仅引用包含特殊字符的字段。

cudf::io::quote_style::ALL
@ ALL
引用所有字段。

cudf::io::quote_style::NONNUMERIC
@ NONNUMERIC
引用所有非数字字段。

cudf::io::ALWAYS
@ ALWAYS
无论对压缩的影响如何，都使用字典。
定义： io/types.hpp:228

cudf::io::ADAPTIVE
@ ADAPTIVE
在不影响压缩的情况下使用字典。
定义： io/types.hpp:227

cudf::io::NEVER
@ NEVER
从不使用字典编码。
定义： io/types.hpp:226

cudf::io::compression_type::BROTLI
@ BROTLI
BROTLI 格式，使用 LZ77 + Huffman + 二阶上下文建模。

cudf::io::compression_type::XZ
@ XZ
XZ 格式，使用 LZMA(2) 算法。

cudf::io::compression_type::ZIP
@ ZIP
ZIP 格式，使用 DEFLATE 算法。

cudf::io::compression_type::BZIP2
@ BZIP2
BZIP2 格式，使用 Burrows-Wheeler 变换。

cudf::io::compression_type::AUTO
@ AUTO
自动检测或选择压缩格式。

cudf::io::io_type::HOST_BUFFER
@ HOST_BUFFER
输入/输出是主机内存中的缓冲区。

cudf::io::io_type::USER_IMPLEMENTED
@ USER_IMPLEMENTED
输入/输出由自定义用户类处理。

cudf::io::io_type::VOID
@ VOID
无输入/输出。不执行任何工作。适用于基准测试。

cudf::io::io_type::FILEPATH
@ FILEPATH
输入/输出是文件路径。

cudf::io::io_type::DEVICE_BUFFER
@ DEVICE_BUFFER
输入/输出是设备内存中的缓冲区。

cudf::transform
std::unique_ptr< column > transform(std::vector< column_view > const &inputs, std::string const &transform_udf, data_type output_type, bool is_ptx, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
通过对输入列的每个元素应用转换函数来创建新列。

cudf::size_type
int32_t size_type
列和表的行索引类型。
定义： types.hpp:95

cudf
cuDF 接口
定义： host_udf.hpp:37

cudf::nullable
bool nullable(table_view const &view)
如果表中的任何列是可空的（而不是整个层级结构），则返回 True。

span.hpp
span 的 API。

cudf::device_span
C++20 std::span 的设备版本，功能集有所减少。
定义： span.hpp:355

cudf::host_span
C++20 std::span，功能集有所减少。
定义： span.hpp:194

cudf::io::column_name_info
输出列的详细名称（以及可选的可空性）信息。
定义： io/types.hpp:237

cudf::io::column_name_info::is_nullable
std::optional< bool > is_nullable
列的可空性。
定义： io/types.hpp:239

cudf::io::column_name_info::is_binary
std::optional< bool > is_binary
列是二进制的（即不是列表）
定义： io/types.hpp:240

cudf::io::column_name_info::children
std::vector< column_name_info > children
子列名称。
定义： io/types.hpp:242

cudf::io::column_name_info::operator==
bool operator==(column_name_info const &rhs) const
比较两个列名称信息结构是否相等。
定义： io/types.hpp:266

cudf::io::column_name_info::type_length
std::optional< int32_t > type_length
数据字节宽度（针对固定长度数据）
定义： io/types.hpp:241

cudf::io::column_name_info::name
std::string name
列名。
定义： io/types.hpp:238

cudf::io::column_name_info::column_name_info
column_name_info(std::string _name, std::optional< bool > _is_nullable=std::nullopt, std::optional< bool > _is_binary=std::nullopt)
构造一个带有名称、可选可空性且无子项的列名称信息。
定义： io/types.hpp:251

cudf::io::host_buffer
主机内存缓冲区的非拥有视图。
定义： io/types.hpp:315

cudf::io::host_buffer::host_buffer
host_buffer(char const *data, size_t size)
构造一个新的主机缓冲区对象。
定义： io/types.hpp:326

cudf::io::partition_info
写入分区数据集时使用的信息。
定义： io/types.hpp:977

cudf::io::partition_info::partition_info
partition_info(size_type start_row, size_type num_rows)
构造一个新的 partition_info。
定义： io/types.hpp:988

cudf::io::partition_info::start_row
size_type start_row
分区的起始行。
定义： io/types.hpp:978

cudf::io::partition_info::num_rows
size_type num_rows
分区中的行数。
定义： io/types.hpp:979

cudf::io::sink_info
写入接口的目的地信息。
定义： io/types.hpp:523

cudf::io::sink_info::buffers
auto const & buffers() const
获取输入的宿主缓冲区。
定义： io/types.hpp:613

cudf::io::sink_info::sink_info
sink_info(std::vector< std::vector< char > * > buffers)
为多个宿主缓冲区构造新的 sink info 对象。
定义： io/types.hpp:557

cudf::io::sink_info::filepaths
auto const & filepaths() const
获取输入的文件路径。
定义： io/types.hpp:607

cudf::io::sink_info::sink_info
sink_info(std::string file_path)
为单个文件构造新的 sink info 对象。
定义： io/types.hpp:547

cudf::io::sink_info::sink_info
sink_info(class cudf::io::data_sink *user_sink)
为单个用户实现的 sink 构造新的 sink info 对象。
定义： io/types.hpp:585

cudf::io::sink_info::sink_info
sink_info(std::vector< cudf::io::data_sink * > const &user_sinks)
为多个用户实现的 sink 构造新的 sink info 对象。
定义： io/types.hpp:573

cudf::io::sink_info::num_sinks
auto num_sinks() const
获取 sink 的数量。
定义： io/types.hpp:601

cudf::io::sink_info::user_sinks
auto const & user_sinks() const
获取输入的用户 sink。
定义： io/types.hpp:619

cudf::io::sink_info::sink_info
sink_info(size_t num_sinks)
构造新的 sink info 对象。
定义： io/types.hpp:530

cudf::io::sink_info::type
auto type() const
获取输入的类型。
定义： io/types.hpp:595

cudf::io::sink_info::sink_info
sink_info(std::vector< char > *buffer)
为单个宿主缓冲区构造新的 sink info 对象。
定义： io/types.hpp:566

cudf::io::sink_info::sink_info
sink_info(std::vector< std::string > file_paths)
为多个文件构造新的 sink info 对象。
定义： io/types.hpp:537

cudf::io::source_info
读取接口的源信息。
定义： io/types.hpp:348

cudf::io::source_info::device_buffers
auto const & device_buffers() const
获取输入的设备缓冲区。
定义： io/types.hpp:504

cudf::io::source_info::source_info
source_info(char const *host_data, size_t size)
为单个缓冲区构造新的 source info 对象。
定义： io/types.hpp:398

cudf::io::source_info::source_info
source_info(std::vector< std::string > file_paths)
为多个文件构造新的 source info 对象。
定义： io/types.hpp:356

cudf::io::source_info::filepaths
auto const & filepaths() const
获取输入的文件路径。
定义： io/types.hpp:492

cudf::io::source_info::source_info
source_info(cudf::host_span< T > host_data)
为单个缓冲区构造新的 source info 对象。
定义： io/types.hpp:434

cudf::io::source_info::source_info
source_info(cudf::host_span< cudf::host_span< T >> const host_buffers)
为宿主内存中的多个缓冲区构造新的 source info 对象。
定义： io/types.hpp:411

cudf::io::source_info::source_info
source_info(cudf::device_span< std::byte const > d_buffer)
从设备缓冲区构造新的 source info 对象。
定义： io/types.hpp:456

cudf::io::source_info::source_info
source_info(cudf::io::datasource *source)
为单个用户实现的 source 构造新的 source info 对象。
定义： io/types.hpp:476

cudf::io::source_info::source_info
source_info(std::vector< cudf::io::datasource * > const &sources)
为多个用户实现的 source 构造新的 source info 对象。
定义： io/types.hpp:466

cudf::io::source_info::source_info
source_info(cudf::host_span< cudf::device_span< std::byte const >> device_buffers)
为设备内存中的多个缓冲区构造新的 source info 对象。
定义： io/types.hpp:446

cudf::io::source_info::host_buffers
auto const & host_buffers() const
获取输入的宿主缓冲区。
定义： io/types.hpp:498

cudf::io::source_info::type
auto type() const
获取输入的类型。
定义： io/types.hpp:486

cudf::io::source_info::source_info
source_info(std::string file_path)
为单个文件构造新的 source info 对象。
定义： io/types.hpp:366

cudf::io::source_info::user_sources
auto const & user_sources() const
获取输入的用户 source。
定义： io/types.hpp:510

cudf::io::source_info::source_info
source_info(std::vector< host_buffer > const &host_buffers)
为宿主内存中的多个缓冲区构造新的 source info 对象。
定义： io/types.hpp:378

cudf::io::table_metadata
IO reader 返回的表元数据。
定义： io/types.hpp:277

cudf::io::table_metadata::per_file_user_data
std::vector< std::unordered_map< std::string, std::string > > per_file_user_data
每文件格式相关的元数据，以键值对形式呈现。
定义： io/types.hpp:286

cudf::io::table_metadata::num_row_groups_after_stats_filter
std::optional< size_type > num_row_groups_after_stats_filter
定义： io/types.hpp:291

cudf::io::table_metadata::num_row_groups_after_bloom_filter
std::optional< size_type > num_row_groups_after_bloom_filter
定义： io/types.hpp:295

cudf::io::table_metadata::num_rows_per_source
std::vector< size_t > num_rows_per_source
定义： io/types.hpp:280

cudf::io::table_metadata::schema_info
std::vector< column_name_info > schema_info
整个输出层级的详细名称信息。
定义： io/types.hpp:279

cudf::io::table_metadata::user_data
std::map< std::string, std::string > user_data
定义： io/types.hpp:283

cudf::io::table_with_metadata
带有表元数据的表，IO reader 使用它按值返回元数据。
定义： io/types.hpp:303

cudf::io::table_with_metadata::tbl
std::unique_ptr< table > tbl
表。
定义： io/types.hpp:304

cudf::io::table_with_metadata::metadata
table_metadata metadata
表元数据。
定义： io/types.hpp:305

table.hpp
cudf::table 的类定义。

types.hpp
libcudf 的类型声明。