19 #include <cudf/strings/detail/utf8.hpp>
21 #include <cudf/utilities/export.hpp>
30 #include <thrust/count.h>
31 #include <thrust/execution_policy.h>
34 #include <cuda/std/utility>
41 namespace CUDF_EXPORT
cudf {
54 if ((str ==
nullptr) || (bytes == 0))
return 0;
55 auto ptr =
reinterpret_cast<uint8_t const*
>(str);
57 return thrust::count_if(
58 thrust::seq, ptr, ptr + bytes, [](uint8_t chr) {
return is_begin_utf8_char(chr); });
61 auto const end = ptr + bytes;
63 chars += is_begin_utf8_char(*ptr++);
80 __device__
inline cuda::std::pair<size_type, size_type> bytes_to_character_position(
84 auto ptr = d_str.data();
85 auto const end_ptr = ptr + d_str.size_bytes();
86 while ((pos > 0) && (ptr < end_ptr)) {
87 auto const width = strings::detail::bytes_in_utf8_byte(
static_cast<uint8_t
>(*ptr));
// Sentinel bytes stored in __constant__ (device constant) memory: F7 BF BF BF followed by
// the implicit NUL terminator of the string literal, hence the array size of 5.
// F7 BF BF BF sets every payload bit of the 4-byte UTF-8 layout (11110xxx 10xxxxxx x3).
// NOTE(review): presumably this makes the sentinel order after any real string in a
// byte-wise comparison -- confirm against the function that returns {psentinel, 4}.
104 static __constant__
char max_string_sentinel[5]{
"\xF7\xBF\xBF\xBF"};
129 char const* psentinel{
nullptr};
130 #if defined(__CUDA_ARCH__)
131 psentinel = &cudf::strings::detail::max_string_sentinel[0];
134 cudaGetSymbolAddress((
void**)&psentinel, cudf::strings::detail::max_string_sentinel));
136 return {psentinel, 4};
141 if (_length == UNKNOWN_STRING_LENGTH)
142 _length = strings::detail::characters_in_string(_data, _bytes);
148 __device__
inline string_view::const_iterator::const_iterator(
string_view const& str,
size_type pos)
149 : p{str.data()}, bytes{str.size_bytes()}, char_pos{pos}, byte_pos{str.byte_offset(pos)}
153 __device__
inline string_view::const_iterator::const_iterator(string_view
const& str,
156 : p{str.data()}, bytes{str.size_bytes()}, char_pos{pos}, byte_pos{offset}
160 __device__
inline string_view::const_iterator& string_view::const_iterator::operator++()
162 if (byte_pos < bytes) {
165 cuda::std::max(1, strings::detail::bytes_in_utf8_byte(
static_cast<uint8_t
>(p[byte_pos])));
171 __device__
inline string_view::const_iterator string_view::const_iterator::operator++(
int)
173 string_view::const_iterator tmp(*
this);
179 string_view::const_iterator::difference_type offset)
const
181 const_iterator tmp(*
this);
184 offset > 0 ? ++tmp : --tmp;
188 __device__
inline string_view::const_iterator& string_view::const_iterator::operator+=(
189 string_view::const_iterator::difference_type offset)
193 offset > 0 ? operator++() : operator--();
197 __device__
inline string_view::const_iterator& string_view::const_iterator::operator--()
200 if (byte_pos == char_pos) {
203 while (strings::detail::bytes_in_utf8_byte(
static_cast<uint8_t
>(p[--byte_pos])) == 0)
211 __device__
inline string_view::const_iterator string_view::const_iterator::operator--(
int)
213 string_view::const_iterator tmp(*
this);
218 __device__
inline string_view::const_iterator& string_view::const_iterator::operator-=(
219 string_view::const_iterator::difference_type offset)
223 offset > 0 ? operator--() : operator++();
228 string_view::const_iterator::difference_type offset)
const
230 const_iterator tmp(*
this);
233 offset > 0 ? --tmp : ++tmp;
237 __device__
inline string_view::const_iterator& string_view::const_iterator::move_to(
240 *
this += (new_pos - char_pos);
245 string_view::const_iterator
const& rhs)
const
247 return (p == rhs.p) && (char_pos == rhs.char_pos);
251 string_view::const_iterator
const& rhs)
const
253 return (p != rhs.p) || (char_pos != rhs.char_pos);
257 string_view::const_iterator
const& rhs)
const
259 return (p == rhs.p) && (char_pos < rhs.char_pos);
263 string_view::const_iterator
const& rhs)
const
265 return (p == rhs.p) && (char_pos <= rhs.char_pos);
269 string_view::const_iterator
const& rhs)
const
271 return (p == rhs.p) && (char_pos > rhs.char_pos);
275 string_view::const_iterator
const& rhs)
const
277 return (p == rhs.p) && (char_pos >= rhs.char_pos);
283 strings::detail::to_char_utf8(p +
byte_offset(), chr);
// Returns the character position this iterator currently refers to (the char_pos member).
287 __device__
inline size_type string_view::const_iterator::position()
const {
return char_pos; }
// Returns the byte position this iterator currently refers to (the byte_pos member),
// i.e. the index applied to the iterator's data pointer p when bytes are read.
289 __device__
inline size_type string_view::const_iterator::byte_offset()
const {
return byte_pos; }
// Returns an iterator for the start of this string. It uses the three-argument
// const_iterator constructor with character position 0 and byte offset 0, which avoids
// the str.byte_offset(pos) lookup performed by the two-argument constructor.
291 __device__
inline string_view::const_iterator
string_view::begin()
const {
return {*
this, 0, 0}; }
302 if (offset >= _bytes)
return 0;
304 strings::detail::to_char_utf8(
data() + offset, chr);
311 return cuda::std::get<0>(strings::detail::bytes_to_character_position(*
this, pos));
322 auto const* ptr1 =
reinterpret_cast<unsigned char const*
>(this->
data());
323 auto const* ptr2 =
reinterpret_cast<unsigned char const*
>(
data);
324 if ((ptr1 == ptr2) && (bytes == len1))
return 0;
326 for (; (idx < len1) && (idx < bytes); ++idx) {
327 if (*ptr1 != *ptr2)
return static_cast<int32_t
>(*ptr1) -
static_cast<int32_t
>(*ptr2);
331 if (idx < len1)
return 1;
332 if (idx < bytes)
return -1;
359 return (rc == 0) || (rc < 0);
365 return (rc == 0) || (rc > 0);
375 template <
bool forward>
376 __device__
inline size_type string_view::find_impl(
char const* str,
381 if (!str || pos < 0) {
return npos; }
382 if (pos > 0 && pos >
length()) {
return npos; }
385 auto const itr =
begin() + pos;
386 auto const spos = itr.byte_offset();
388 (count >= 0) && ((pos + count) <
length()) ? (itr + count).byte_offset() :
size_bytes();
390 auto const find_length = (epos - spos) - bytes + 1;
393 auto ptr =
data() + (forward ? spos : (epos - bytes));
394 for (
size_type idx = 0; idx < find_length; ++idx) {
395 if (d_target.compare(ptr, bytes) == 0) {
396 return forward ? pos : character_offset(epos - bytes - idx);
399 pos += strings::detail::is_begin_utf8_char(*ptr);
400 forward ? ++ptr : --ptr;
410 return find_impl<true>(str, bytes, pos, count);
416 size_type chwidth = strings::detail::from_char_utf8(chr, str);
417 return find(str, chwidth, pos, count);
432 return find_impl<false>(str, bytes, pos, count);
438 size_type chwidth = strings::detail::from_char_utf8(chr, str);
439 return rfind(str, chwidth, pos, count);
446 auto const spos =
begin() + pos;
448 auto ss =
string_view{
data() + spos.byte_offset(), epos.byte_offset() - spos.byte_offset()};
450 if (_length != UNKNOWN_STRING_LENGTH) { ss._length = epos.position() - spos.position(); }
457 return strings::detail::characters_in_string(
data(), bytepos);
对设备数据的非拥有、不可变视图,该数据是表示 UTF-8 字符串的可变长度字符数组...
CUDF_HOST_DEVICE size_type size_bytes() const
返回此字符串中的字节数。
size_type rfind(string_view const &str, size_type pos=0, size_type count=-1) const
返回参数 str 在此字符串中最后一次出现的字符位置...
CUDF_HOST_DEVICE string_view()
默认构造函数表示一个空字符串。
size_type length() const
返回此字符串中的字符数。
string_view substr(size_type start, size_type length) const
返回此字符串的子字符串。在返回的子字符串视图的生命周期内,原始字符串及其设备内存必须保持有效。
bool operator==(string_view const &rhs) const
如果 rhs 与此字符串完全匹配,则返回 true。
const_iterator end() const
返回指向此字符串末尾之后的新迭代器。
int compare(string_view const &str) const
将目标字符串与此字符串进行比较。每个字符都按 UTF-8 码点值进行比较。
bool operator>=(string_view const &rhs) const
如果 rhs 匹配或排在此字符串之前,则返回 true。
CUDF_HOST_DEVICE char const * data() const
返回指向内部设备数组的指针。
size_type byte_offset(size_type pos) const
返回给定字符位置相对于 data() 的字节偏移量。
const_iterator begin() const
返回指向此字符串开头的新迭代器。
char_utf8 operator[](size_type pos) const
返回给定字符位置的单个 UTF-8 字符。
bool operator!=(string_view const &rhs) const
如果 rhs 与此字符串不匹配,则返回 true。
size_type find(string_view const &str, size_type pos=0, size_type count=-1) const
返回参数 str 在此字符串中第一次出现的字符位置...
bool operator<=(string_view const &rhs) const
如果此字符串匹配或排在 rhs 之前,则返回 true。
bool operator<(string_view const &rhs) const
如果此字符串排在 rhs 之前,则返回 true。
bool operator>(string_view const &rhs) const
如果 rhs 排在此字符串之前,则返回 true。
static cudf::size_type const npos
无位置值。
bool operator==(polymorphic_allocator< T > const &lhs, polymorphic_allocator< U > const &rhs)
bool operator!=(polymorphic_allocator< T > const &lhs, polymorphic_allocator< U > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator-(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator>=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator<=(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator*(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator>(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE fixed_point< Rep1, Rad1 > operator+(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
CUDF_HOST_DEVICE bool operator<(fixed_point< Rep1, Rad1 > const &lhs, fixed_point< Rep1, Rad1 > const &rhs)
#define CUDF_CUDA_TRY(call)
用于 CUDA 运行时 API 函数的错误检查宏。
int32_t size_type
用于列和表的行索引类型。
uint32_t char_utf8
UTF-8 字符为 1-4 字节。
#define CUDF_HOST_DEVICE
表示函数或方法可在主机和设备上使用。