column_factories.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
17 
18 #include <cudf/column/column.hpp>
19 #include <cudf/types.hpp>
22 #include <cudf/utilities/span.hpp>
24 
25 #include <rmm/cuda_stream_view.hpp>
26 
27 namespace CUDF_EXPORT cudf {
43 /**
44  * @brief Creates an empty column whose element type is given as a full `data_type`.
45  *
46  * @param type Logical data type of the empty column.
47  * @return Owning `std::unique_ptr` to the new empty column of type `type`.
48  */
49 std::unique_ptr<column> make_empty_column(data_type type);
50 
51 /**
52  * @brief Creates an empty column whose element type is given as a bare `type_id`.
53  *
54  * @param id Type ID identifying the element type of the empty column.
55  * @return Owning `std::unique_ptr` to the new empty column of the type identified by `id`.
56  */
57 std::unique_ptr<column> make_empty_column(type_id id);
58 
59 /**
60  * @brief Creates a column of a numeric type with uninitialized storage for `size` elements and
61  * a null mask in the requested allocation state.
62  *
63  * @param type Numeric data type of the column's elements.
64  * @param size Number of elements the column must be able to hold.
65  * @param state Requested null mask allocation state; defaults to `mask_state::UNALLOCATED`.
66  * @param stream CUDA stream used for device memory operations.
67  * @param mr Device memory resource to use for allocations.
68  * @return Owning `std::unique_ptr` to the newly created column.
69  */
70 std::unique_ptr<column> make_numeric_column(
71  data_type type,
72  size_type size,
73  mask_state state = mask_state::UNALLOCATED,
76 
77 /**
78  * @brief Creates a numeric column backed by a newly allocated, uninitialized data buffer of
79  * `size * cudf::size_of(type)` bytes, using a caller-supplied null mask and null count.
80  *
81  * @param type Numeric data type of the column; checked with `is_numeric(type)`.
82  * @param size Number of elements in the column.
83  * @param null_mask Bitmask of null values, expected to cover `size` elements; forwarded (as `B&&`) to `column`.
84  * @param null_count Number of null values, passed through to the `column` constructor.
85  * @param stream CUDA stream passed to the `rmm::device_buffer` allocation.
86  * @param mr Device memory resource passed to the `rmm::device_buffer` allocation.
87  * @return Owning `std::unique_ptr` to the newly created column.
88  */
89 template <typename B>
90 std::unique_ptr<column> make_numeric_column(
91  data_type type,
92  size_type size,
93  B&& null_mask,
97 {
98  CUDF_EXPECTS(is_numeric(type), "无效,非数值类型。");  // precondition: numeric types only
99  return std::make_unique<column>(type,
100  size,
101  rmm::device_buffer{size * cudf::size_of(type), stream, mr},  // element count * element size
102  std::forward<B>(null_mask),
103  null_count);
104 }
105 
106 /**
107  * @brief Creates a column of a fixed point type with uninitialized storage for `size` elements
108  * and a null mask in the requested allocation state.
109  *
110  * @param type Fixed point data type of the column's elements.
111  * @param size Number of elements the column must be able to hold.
112  * @param state Requested null mask allocation state; defaults to `mask_state::UNALLOCATED`.
113  * @param stream CUDA stream used for device memory operations.
114  * @param mr Device memory resource to use for allocations.
115  * @return Owning `std::unique_ptr` to the newly created column.
116  */
117 std::unique_ptr<column> make_fixed_point_column(
118  data_type type,
119  size_type size,
120  mask_state state = mask_state::UNALLOCATED,
123 
124 /**
125  * @brief Creates a fixed point column backed by a newly allocated, uninitialized data buffer of
126  * `size * cudf::size_of(type)` bytes, using a caller-supplied null mask and null count.
127  *
128  * @param type Fixed point data type of the column; checked with `is_fixed_point(type)`.
129  * @param size Number of elements in the column.
130  * @param null_mask Bitmask of null values, expected to cover `size` elements; forwarded (as `B&&`) to `column`.
131  * @param null_count Number of null values, passed through to the `column` constructor.
132  * @param stream CUDA stream passed to the `rmm::device_buffer` allocation.
133  * @param mr Device memory resource passed to the `rmm::device_buffer` allocation.
134  * @return Owning `std::unique_ptr` to the newly created column.
135  */
136 template <typename B>
137 std::unique_ptr<column> make_fixed_point_column(
138  data_type type,
139  size_type size,
140  B&& null_mask,
144 {
145  CUDF_EXPECTS(is_fixed_point(type), "无效,非定点类型。");  // precondition: fixed point types only
146  return std::make_unique<column>(type,
147  size,
148  rmm::device_buffer{size * cudf::size_of(type), stream, mr},  // element count * element size
149  std::forward<B>(null_mask),
150  null_count);
151 }
152 
153 /**
154  * @brief Creates a column of a timestamp type with uninitialized storage for `size` elements
155  * and a null mask in the requested allocation state.
156  *
157  * @param type Timestamp data type of the column's elements.
158  * @param size Number of elements the column must be able to hold.
159  * @param state Requested null mask allocation state; defaults to `mask_state::UNALLOCATED`.
160  * @param stream CUDA stream used for device memory operations.
161  * @param mr Device memory resource to use for allocations.
162  * @return Owning `std::unique_ptr` to the newly created column.
163  */
164 std::unique_ptr<column> make_timestamp_column(
165  data_type type,
166  size_type size,
167  mask_state state = mask_state::UNALLOCATED,
170 
171 /**
172  * @brief Creates a timestamp column backed by a newly allocated, uninitialized data buffer of
173  * `size * cudf::size_of(type)` bytes, using a caller-supplied null mask and null count.
174  *
175  * @param type Timestamp data type of the column; checked with `is_timestamp(type)`.
176  * @param size Number of elements in the column.
177  * @param null_mask Bitmask of null values, expected to cover `size` elements; forwarded (as `B&&`) to `column`.
178  * @param null_count Number of null values, passed through to the `column` constructor.
179  * @param stream CUDA stream passed to the `rmm::device_buffer` allocation.
180  * @param mr Device memory resource passed to the `rmm::device_buffer` allocation.
181  * @return Owning `std::unique_ptr` to the newly created column.
182  */
183 template <typename B>
184 std::unique_ptr<column> make_timestamp_column(
185  data_type type,
186  size_type size,
187  B&& null_mask,
191 {
192  CUDF_EXPECTS(is_timestamp(type), "无效,非时间戳类型。");  // precondition: timestamp types only
193  return std::make_unique<column>(type,
194  size,
195  rmm::device_buffer{size * cudf::size_of(type), stream, mr},  // element count * element size
196  std::forward<B>(null_mask),
197  null_count);
198 }
199 
200 /**
201  * @brief Creates a column of a duration type with uninitialized storage for `size` elements
202  * and a null mask in the requested allocation state.
203  *
204  * @param type Duration data type of the column's elements.
205  * @param size Number of elements the column must be able to hold.
206  * @param state Requested null mask allocation state; defaults to `mask_state::UNALLOCATED`.
207  * @param stream CUDA stream used for device memory operations.
208  * @param mr Device memory resource to use for allocations.
209  * @return Owning `std::unique_ptr` to the newly created column.
210  */
211 std::unique_ptr<column> make_duration_column(
212  data_type type,
213  size_type size,
214  mask_state state = mask_state::UNALLOCATED,
217 
218 /**
219  * @brief Creates a duration column backed by a newly allocated, uninitialized data buffer of
220  * `size * cudf::size_of(type)` bytes, using a caller-supplied null mask and null count.
221  *
222  * @param type Duration data type of the column; checked with `is_duration(type)`.
223  * @param size Number of elements in the column.
224  * @param null_mask Bitmask of null values, expected to cover `size` elements; forwarded (as `B&&`) to `column`.
225  * @param null_count Number of null values, passed through to the `column` constructor.
226  * @param stream CUDA stream passed to the `rmm::device_buffer` allocation.
227  * @param mr Device memory resource passed to the `rmm::device_buffer` allocation.
228  * @return Owning `std::unique_ptr` to the newly created column.
229  */
230 template <typename B>
231 std::unique_ptr<column> make_duration_column(
232  data_type type,
233  size_type size,
234  B&& null_mask,
238 {
239  CUDF_EXPECTS(is_duration(type), "无效,非 duration 类型。");  // precondition: duration types only
240  return std::make_unique<column>(type,
241  size,
242  rmm::device_buffer{size * cudf::size_of(type), stream, mr},  // element count * element size
243  std::forward<B>(null_mask),
244  null_count);
245 }
246 
247 /**
248  * @brief Creates a column of a fixed width type with uninitialized storage for `size` elements
249  * and a null mask in the requested allocation state.
250  *
251  * @param type Fixed width data type of the column's elements.
252  * @param size Number of elements the column must be able to hold.
253  * @param state Requested null mask allocation state; defaults to `mask_state::UNALLOCATED`.
254  * @param stream CUDA stream used for device memory operations.
255  * @param mr Device memory resource to use for allocations.
256  * @return Owning `std::unique_ptr` to the newly created column.
257  */
258 std::unique_ptr<column> make_fixed_width_column(
259  data_type type,
260  size_type size,
261  mask_state state = mask_state::UNALLOCATED,
264 
265 /**
266  * @brief Creates a fixed width column by dispatching on `type` to the timestamp, duration,
267  * fixed point, or numeric factory, passing the caller-supplied null mask and null count through.
268  *
269  * @param type Fixed width data type of the column; checked with `is_fixed_width(type)`.
270  * @param size Number of elements in the column.
271  * @param null_mask Bitmask of null values, expected to cover `size` elements; forwarded (as `B&&`) to the chosen factory.
272  * @param null_count Number of null values, passed through to the chosen factory.
273  * @param stream CUDA stream passed through to the chosen factory.
274  * @param mr Device memory resource passed through to the chosen factory.
275  * @return Owning `std::unique_ptr` to the column created by the chosen factory.
276  */
277 template <typename B>
278 std::unique_ptr<column> make_fixed_width_column(
279  data_type type,
280  size_type size,
281  B&& null_mask,
286  CUDF_EXPECTS(is_fixed_width(type), "无效,非固定宽度类型。");  // precondition: fixed width types only
287  if (is_timestamp(type)) {  // every branch returns, so `null_mask` is forwarded at most once
288  return make_timestamp_column(type, size, std::forward<B>(null_mask), null_count, stream, mr);
289  } else if (is_duration(type)) {
290  return make_duration_column(type, size, std::forward<B>(null_mask), null_count, stream, mr);
291  } else if (is_fixed_point(type)) {
292  return make_fixed_point_column(type, size, std::forward<B>(null_mask), null_count, stream, mr);
293  }
294  return make_numeric_column(type, size, std::forward<B>(null_mask), null_count, stream, mr);  // all remaining fixed width types go to the numeric factory
295 }
296 
297 /**
298  * @brief Construct a STRING type column from a single device span of pointer/size pairs.
299  *
300  * The pairs must be on device. For example:
301  *
302  * @code
303  * thrust::pair<char const*, cudf::size_type> h_strings[] = {{"hi",2}, {"there",5}, {"world",5}};
304  * rmm::device_buffer d_strings(h_strings, sizeof(h_strings), stream);
305  * cudf::device_span<thrust::pair<char const*, cudf::size_type> const>
306  * strings_view(reinterpret_cast<thrust::pair<char const*, cudf::size_type> const*>(d_strings.data()), 3);
307  * auto column = cudf::make_strings_column(strings_view, stream);
308  * @endcode
309  *
310  * All chars must be valid UTF-8 bytes.
311  *
312  * @throw cudf::logic_error if any of the chars are not valid UTF-8.
313  *
314  * @param strings Device span of (`char const*`, `size_type`) pairs: a device pointer and size per string.
315  * @param stream CUDA stream used for device memory operations.
316  * @param mr Device memory resource to use for allocations.
317  * @return Owning `std::unique_ptr` to the new column.
318  */
319 std::unique_ptr<column> make_strings_column(
320  cudf::device_span<thrust::pair<char const*, size_type> const> strings,
323 
324 /**
325  * @brief Given a vector of device spans of pointer/size pairs, construct a batch of STRING type
326  * columns.
327  *
328  * This function takes a `std::vector` of device spans, where each span represents the strings for a
329  * single column. This is an efficient way to construct multiple columns simultaneously.
330  *
331  * All chars must be valid UTF-8 bytes.
332  *
333  * @throw cudf::logic_error if any of the chars are not valid UTF-8.
334  *
335  * @param input A `std::vector` of device spans, where each span holds the pointer/size pairs for one column.
336  * @param stream CUDA stream used for device memory operations.
337  * @param mr Device memory resource to use for allocations.
338  * @return A `std::vector` of owning pointers to the new columns.
339  */
340 std::vector<std::unique_ptr<column>> make_strings_column_batch(
341  std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> const& input,
344 
345 /**