Re: UTF16 <-> UTF32

Liste des Groupes | Revenir à cl c++
Sujet : Re: UTF16 <-> UTF32
De : Bonita.Montero (at) *nospam* gmail.com (Bonita Montero)
Groupes : comp.lang.c++
Date : 06. Sep 2024, 18:22:40
Autres entêtes
Organisation : A noiseless patient Spider
Message-ID : <vbfa8c$smqu$1@raubtier-asyl.eternal-september.org>
References : 1 2
User-Agent : Mozilla Thunderbird
This is my final unicode.h. Have a look at u8_feeder::next<bool>
and u8_gen<bool>. As with all the routines, the bool template parameter
switches off error detection when you know your string is valid Unicode.
#include <bit>
#include <cassert>
#include <chrono>
#include <concepts>
#include <cstddef>
#include <cstdint>
#include <random>
#include <span>
#include <string>
#include <string_view>
#include <utility>
#include "inline.h"
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable: 4554)
#endif
#if defined(__llvm__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wshift-op-parentheses"
#pragma clang diagnostic ignored "-Wlogical-op-parentheses"
#pragma clang diagnostic ignored "-Wbitwise-op-parentheses"
#pragma clang diagnostic ignored "-Wunqualified-std-cast-call"
#endif
#pragma push_macro("FI")
#define FI FORCEINLINE
#pragma push_macro("LFI")
#define LFI L_FORCEINLINE
// Forward reader over a UTF-8 string view that hands out one code point
// per call to next().
struct u8_feeder
{
    // Starts reading at the beginning of sv.
    u8_feeder( std::u8string_view sv );

    // Re-targets the feeder at the beginning of sv.
    u8_feeder &operator =( std::u8string_view sv );

    // Returns the next code point, or a negative value when nothing (more)
    // can be delivered.  Err selects whether the input is validated.
    template<bool Err = true>
    int32_t next();

    // Yields the current read position inside the view.
    operator std::u8string_view::iterator() const;

private:
    std::u8string_view::iterator m_cur;    // next code unit to read
    std::u8string_view::iterator m_end;    // one past the last code unit
};
inline u8_feeder::u8_feeder( std::u8string_view sv ) :
m_cur( sv.begin() ),
m_end( sv.end() )
{
}
// Rewinds the feeder onto a (possibly different) view; returns *this.
inline u8_feeder &u8_feeder::operator =( std::u8string_view sv )
{
    m_end = sv.cend();
    m_cur = sv.cbegin();
    return *this;
}
// Decodes one code point from the UTF-8 input and advances past it.
//
// Err = true validates the sequence: a stray continuation byte, a lead byte
// announcing more than four bytes, a sequence cut off by the end of the
// input, a malformed continuation byte, an overlong encoding, a value above
// U+10FFFF and a surrogate all yield -1.  Err = false skips these checks and
// must only be used on input that is already known to be well-formed.
//
// Returns the code point, or -1 at the end of the input or on an error.
// When -1 is returned the cursor is not moved, so the iterator conversion
// still designates the first code unit that could not be decoded.
template<bool Err>
FI int32_t u8_feeder::next()
{
    ptrdiff_t rem = m_end - m_cur;
    if( !rem ) [[unlikely]]
        return -1;
    // The number of leading one-bits of the lead byte is the length of the
    // sequence in bytes (zero for a plain ASCII byte).
    unsigned ones = std::countl_zero( (unsigned char)~*m_cur );
    if( !ones ) [[likely]]
        return *m_cur++;
    if constexpr( Err )
    {
        if( ones == 1 || ones > 4 ) [[unlikely]]
            return -1;
        // Never read continuation bytes beyond the end of the view.
        if( (size_t)rem < ones ) [[unlikely]]
            return -1;
    }
    // Decode through a copy so that m_cur is only committed on success.
    auto cur = m_cur;
    char32_t c = *cur++ & ((char8_t)0x7F >> ones);
    // Appends the payload of one continuation byte to c.
    auto step = [&]() LFI
    {
        char8_t c8 = *cur++;
        if constexpr( Err )
            if( (c8 & 0xC0) != 0x80 ) [[unlikely]]
                return false;
        c = c << 6 | (c8 & 0x3F);
        return true;
    };
    if( !step() ) [[unlikely]]
        return -1;
    if( ones > 2 )
    {
        if( !step() ) [[unlikely]]
            return -1;
        if( ones > 3 )
            if( !step() ) [[unlikely]]
                return -1;
    }
    if constexpr( Err )
    {
        // Smallest value that legitimately needs this many bytes; anything
        // below it is an overlong encoding.
        char32_t const minimum = ones == 2 ? 0x80 : ones == 3 ? 0x800 : 0x10000;
        if( c < minimum ) [[unlikely]]
            return -1;
        if( c > 0x10FFFF ) [[unlikely]]
            return -1;
        if( (c & -0x800) == 0xD800 ) [[unlikely]]
            return -1;
    }
    m_cur = cur;
    [[assume(c <= 0x10FFFF)]];
    return c;
}
// Reports the position of the next unread code unit.
inline u8_feeder::operator std::u8string_view::iterator() const
{
    return this->m_cur;
}
// Forward reader over a UTF-16 string view that hands out one code point
// per call to next().
struct u16_feeder
{
    // Starts reading at the beginning of sv.
    u16_feeder( std::u16string_view sv );

    // Re-targets the feeder at the beginning of sv.
    u16_feeder &operator =( std::u16string_view sv );

    // Returns the next code point, or a negative value when nothing (more)
    // can be delivered.  Err selects whether the input is validated.
    template<bool Err = true>
    int32_t next();

    // Yields the current read position inside the view.
    operator std::u16string_view::iterator() const;

private:
    std::u16string_view::iterator m_cur;    // next code unit to read
    std::u16string_view::iterator m_end;    // one past the last code unit
};
inline u16_feeder::u16_feeder( std::u16string_view sv ) :
m_cur( sv.begin() ),
m_end( sv.end() )
{
}
// Rewinds the feeder onto a (possibly different) view; returns *this.
inline u16_feeder &u16_feeder::operator =( std::u16string_view sv )
{
    m_end = sv.cend();
    m_cur = sv.cbegin();
    return *this;
}
// Decodes one code point from the UTF-16 input and advances past it.
//
// A code unit outside 0xD800..0xDFFF is returned as is.  A unit inside that
// range is treated as the first half of a surrogate pair: if fewer than two
// units are left, -1 is returned; with Err = true the first unit must be a
// high surrogate and the second a low surrogate, otherwise -1 is returned.
//
// Returns the code point, or -1 at the end of the input or on an error.
// The cursor is only advanced when a code point is returned.
template<bool Err>
FI int32_t u16_feeder::next()
{
    constexpr char16_t
        HIGH_SURR = 0xD800,     // tag bits of a high (leading) surrogate
        LOW_SURR  = 0xDC00,     // tag bits of a low (trailing) surrogate
        SURR_MASK = 0xFC00;     // isolates the tag bits of either kind
    constexpr uint32_t
        SURR_HDR_MSK = 0xFFFFF800u;     // isolates the bits shared by all surrogates
    ptrdiff_t rem = m_end - m_cur;
    if( !rem ) [[unlikely]]
        return -1;
    char16_t const first = *m_cur;
    if( (first & SURR_HDR_MSK) != HIGH_SURR ) [[likely]]
    {
        ++m_cur;
        return first;
    }
    // A surrogate always needs a partner unit.
    if( rem < 2 ) [[unlikely]]
        return -1;
    char16_t const second = m_cur[1];
    if constexpr( Err )
    {
        if( (first & SURR_MASK) != HIGH_SURR ) [[unlikely]]
            return -1;
        if( (second & SURR_MASK) != LOW_SURR ) [[unlikely]]
            return -1;
    }
    m_cur += 2;
    // Ten payload bits per unit, offset by 0x10000; the result can therefore
    // never fall into the surrogate range itself.
    return 0x10000 + ((char32_t)(first & 0x3FF) << 10 | (char32_t)(second & 0x3FF));
}
// Reports the position of the next unread code unit.
inline u16_feeder::operator std::u16string_view::iterator() const
{
    return this->m_cur;
}
// Forward reader over a UTF-32 string view that hands out one code point
// per call to next().
struct u32_feeder
{
    // Starts reading at the beginning of sv.
    u32_feeder( std::u32string_view sv );

    // Re-targets the feeder at the beginning of sv.
    u32_feeder &operator =( std::u32string_view sv );

    // Returns the next code point, or -1 when nothing (more) can be
    // delivered.  Err selects whether the input is validated.
    template<bool Err = true>
    int32_t next();

    // Yields the current read position inside the view.
    operator std::u32string_view::iterator() const;

private:
    std::u32string_view::iterator m_cur, m_end;
};

// Positions the feeder on the first code unit of sv and remembers its end.
// Declared inline like the other feeders' constructors so that the header
// can be included from several translation units.
inline u32_feeder::u32_feeder( std::u32string_view sv ) :
    m_cur( sv.begin() ),
    m_end( sv.end() )
{
}

// Rewinds the feeder onto a (possibly different) view; returns *this.
inline u32_feeder &u32_feeder::operator =( std::u32string_view sv )
{
    m_cur = sv.begin();
    m_end = sv.end();
    return *this;
}

// Reads one code point and advances past it.
//
// With Err = true a surrogate (0xD800..0xDFFF) or a value above 0x10FFFF
// yields -1.  Returns the code point, or -1 at the end of the input or on
// an error.  The cursor is only advanced when a code point is returned, so
// after an error the iterator conversion designates the offending element.
template<bool Err>
FI int32_t u32_feeder::next()
{
    if( m_cur == m_end ) [[unlikely]]
        return -1;
    char32_t c = *m_cur;
    if constexpr( Err )
    {
        if( (c & -0x800) == 0xD800 ) [[unlikely]]
            return -1;
        if( c > 0x10FFFF ) [[unlikely]]
            return -1;
    }
    ++m_cur;
    return c;
}

// Reports the position of the next unread code unit.
inline u32_feeder::operator std::u32string_view::iterator() const
{
    return m_cur;
}
#define YYY
// Encodes the code point c as UTF-8 and passes every resulting byte, in
// order, to consumer.
//
// With Err = true a value above 0x10FFFF or a surrogate (0xD800..0xDFFF) is
// rejected: nothing is passed to consumer and 0 is returned.  With
// Err = false the caller must guarantee that c is a valid code point.
//
// Returns the number of bytes handed to consumer.
template<bool Err, typename Consumer>
requires requires( Consumer consumer, char8_t c ) { { consumer( c ) }; }
FI size_t u8_gen( char32_t c, Consumer consumer )
{
    // Lead-byte tag and shift of the lead byte's payload, indexed by
    // 21 minus the number of significant bits of c (trailing comment).
    static struct Map
    {
        uint8_t head, firstBit;
    } const rawMap[22] =
    {
        { 0xF0, 18 }, // 21
        { 0xF0, 18 }, // 20
        { 0xF0, 18 }, // 19
        { 0xF0, 18 }, // 18
        { 0xF0, 18 }, // 17
        { 0xE0, 12 }, // 16
        { 0xE0, 12 }, // 15
        { 0xE0, 12 }, // 14
        { 0xE0, 12 }, // 13
        { 0xE0, 12 }, // 12
        { 0xC0, 6 },  // 11
        { 0xC0, 6 },  // 10
        { 0xC0, 6 },  // 9
        { 0xC0, 6 },  // 8
        { 0, 0 },     // 7
        { 0, 0 },     // 6
        { 0, 0 },     // 5
        { 0, 0 },     // 4
        { 0, 0 },     // 3
        { 0, 0 },     // 2
        { 0, 0 },     // 1
        { 0, 0 }      // 0
    };
    if constexpr( Err )
    {
        // Also covers every value with more than 21 significant bits, which
        // would index in front of the table.
        if( c > 0x10FFFF ) [[unlikely]]
            return 0;
        // Surrogates are not encodable, same as in u16_gen.
        if( (c & -0x800) == 0xD800 ) [[unlikely]]
            return 0;
    }
    std::span<Map const> map( rawMap );
    unsigned lzCnt = std::countl_zero( (uint32_t)c );
    Map const &mapped = map[lzCnt - 11];
    int8_t bit = mapped.firstBit;
    consumer( (char8_t)(mapped.head | c >> bit) );
    size_t n = 1;
    // Emit the remaining payload six bits at a time as continuation bytes.
    for( ; (bit -= 6) >= 0; ++n )
        consumer( (char8_t)(0x80 | (c >> bit & 0x3F)) );
    assert(bit == -6);
    return n;
}
// Encodes the code point c as UTF-16 and passes every resulting code unit,
// in order, to consumer.
//
// With Err = true a surrogate value or a value above 0x10FFFF is rejected:
// nothing is passed to consumer and 0 is returned.
//
// Returns the number of code units handed to consumer (1 or 2, or 0 on
// rejection).
template<bool Err, typename Consumer>
requires requires( Consumer consumer, char16_t c ) { { consumer( c ) }; }
FI size_t u16_gen( char32_t c, Consumer consumer )
{
    if( c > 0xFFFF ) [[unlikely]]
    {
        if constexpr( Err )
            if( c > 0x10FFFF ) [[unlikely]]
                return 0;
        // Split the 20-bit offset into a high and a low surrogate.
        char32_t const offset = c - 0x10000;
        consumer( (char16_t)(0xD800 | (offset >> 10)) );
        consumer( (char16_t)(0xDC00 | (offset & 0x3FF)) );
        return 2;
    }
    if constexpr( Err )
        if( (c & -0x800) == 0xD800 ) [[unlikely]]
            return 0;
    consumer( (char16_t)c );
    return 1;
}
// Converts UTF-8 to UTF-32 in two passes over u8Str.
//
// The first pass decodes with next<Err>() and counts the code points.  If
// the feeder did not reach the end of u8Str, the distance from the feeder's
// position to the end is returned and reserve is never called.  Otherwise
// reserve( count ) supplies the output span, which the second pass fills.
//
// Returns 0 on success.
template<bool Err, typename Reserve>
requires requires( Reserve reserve, size_t n ) { { reserve( n ) } -> std::convertible_to<std::span<char32_t>>; }
size_t u8_to_u32( std::u8string_view u8Str, Reserve reserve )
{
    u8_feeder counter( u8Str );
    size_t count = 0;
    while( counter.next<Err>() >= 0 )
        ++count;
    if( counter != u8Str.end() ) [[unlikely]]
        return u8Str.end() - counter;

    std::span<char32_t> out = reserve( count );
    auto dst = out.begin();
    u8_feeder writer( u8Str );
    for( int32_t cp = writer.next<false>(); cp >= 0; cp = writer.next<false>() )
        *dst++ = cp;
    return 0;
}
// Converts UTF-8 to UTF-32, storing the result in u32Str.  The string is
// resized to the exact output length before it is filled.  Returns whatever
// the span-based overload returns.
template<bool Err>
size_t u8_to_u32( std::u8string_view u8Str, std::u32string &u32Str )
{
    auto const grow = [&u32Str]( size_t n ) LFI -> std::span<char32_t>
    {
        // Resize without initialising the new characters.
        u32Str.resize_and_overwrite( n, []( char32_t *, size_t len ) LFI { return len; } );
        return u32Str;
    };
    return u8_to_u32<Err>( u8Str, grow );
}
// Converts UTF-8 to UTF-32 and returns the conversion's return value
// together with the produced string.
template<bool Err>
std::pair<size_t, std::u32string> u8_to_u32( std::u8string_view u8Str )
{
    std::pair<size_t, std::u32string> result;
    result.first = u8_to_u32<Err>( u8Str, result.second );
    return result;
}
// Converts UTF-8 to UTF-16 in two passes over u8Str.
//
// The first pass decodes with next<Err>() and adds up the UTF-16 code units
// each code point needs.  If the feeder did not reach the end of u8Str, the
// distance from the feeder's position to the end is returned and reserve is
// never called.  Otherwise reserve( count ) supplies the output span, which
// the second pass fills.
//
// Returns 0 on success.
template<bool Err, typename Reserve>
requires requires( Reserve reserve, size_t n ) { { reserve( n ) } -> std::convertible_to<std::span<char16_t>>; }
size_t u8_to_u16( std::u8string_view u8Str, Reserve reserve )
{
    u8_feeder counter( u8Str );
    size_t count = 0;
    for( int32_t cp = counter.next<Err>(); cp >= 0; cp = counter.next<Err>() )
        count += u16_gen<false>( cp, []( char16_t ) LFI {} );
    if( counter != u8Str.end() ) [[unlikely]]
        return u8Str.end() - counter;

    std::span<char16_t> out( reserve( count ) );
    auto dst = out.begin();
    u8_feeder writer( u8Str );
    for( int32_t cp = writer.next<false>(); cp >= 0; cp = writer.next<false>() )
        u16_gen<false>( cp, [&dst]( char16_t unit ) LFI { *dst++ = unit; } );
    return 0;
}
// Converts UTF-8 to UTF-16, storing the result in u16Str.  The string is
// resized to the exact output length before it is filled.  Returns whatever
// the span-based overload returns.
template<bool Err>
size_t u8_to_u16( std::u8string_view u8Str, std::u16string &u16Str )
{
    auto const grow = [&u16Str]( size_t n ) LFI -> std::span<char16_t>
    {
        // Resize without initialising the new characters.
        u16Str.resize_and_overwrite( n, []( char16_t *, size_t len ) LFI { return len; } );
        return u16Str;
    };
    return u8_to_u16<Err>( u8Str, grow );
}
// Converts UTF-8 to UTF-16 and returns the conversion's return value
// together with the produced string.
template<bool Err>
std::pair<size_t, std::u16string> u8_to_u16( std::u8string_view u8Str )
{
    std::pair<size_t, std::u16string> result;
    result.first = u8_to_u16<Err>( u8Str, result.second );
    return result;
}
template<bool Err, typename Reserve>
requires requires( Reserve reserve, size_t n ) { { reserve( n ) } -> std::convertible_to<std::span<char32_t>>; }
size_t u16_to_u32( std::u16string_view u16Str, Reserve reserve )
{
using namespace std;
size_t n = 0;
u16_feeder u16f( u16Str );
for( ; u16f.next<Err>() >= 0; ++n );
if( u16f != u16Str.end() )
return u16Str.end() - u16f;
span<char32_t> sp = reserve( n );
auto it = sp.begin();
u16f = u16Str;
for( int32_t c; (c = u16f.next<false>()) >= 0; *it++ = c );
return 0;
}
template<bool Err>
size_t u16_to_u32( std::u16string_view u16Str, std::u32string &u32Str )
{
using namespace std;
return u16_to_u32<Err>( u16Str,
[&]( size_t n ) LFI -> span<char32_t>
{
u32Str.resize_and_overwrite( n, [&]( char32_t *, size_t n ) LFI { return n; } );
return u32Str;
} );
}
// Converts UTF-16 to UTF-32 and returns the conversion's return value
// together with the produced string.
template<bool Err>
std::pair<size_t, std::u32string> u16_to_u32( std::u16string_view u16Str )
{
    std::pair<size_t, std::u32string> result;
    result.first = u16_to_u32<Err>( u16Str, result.second );
    return result;
}
template<bool Err, typename Reserve>
requires requires( Reserve reserve, size_t n ) { { reserve( n ) } -> std::convertible_to<std::span<char8_t>>; }
size_t u32_to_u8( std::u32string_view u32Str, Reserve reserve )
{
using namespace std;
size_t n = 0;
u32_feeder u32f( u32Str );
for( int32_t c; (c = u32f.next<Err>()) >= 0; n += u8_gen<false>( c, [&]( char8_t ) LFI {} ) );
if( u32f != u32Str.end() ) [[unlikely]]
return u32Str.end() - u32f;
span<char8_t> sp = reserve( n );
auto it = sp.begin();
u32f = u32Str;
for( int32_t c; (c = u32f.next<false>()) >= 0; u8_gen<false>( c, [&]( char8_t c8 ) LFI { *it++ = c8; } ) );
return 0;
}
template<bool Err>
size_t u32_to_u8( std::u32string_view u32Str, std::u8string &u8Str )
{
using namespace std;
return u32_to_u8<Err>( u32Str,
[&]( size_t n ) LFI -> span<char8_t>
{
u8Str.resize_and_overwrite( n, [&]( char8_t *, size_t n ) LFI { return n; } );
return u8Str;
} );
}
template<bool Err>
std::pair<size_t, std::u8string> u32_to_u8( std::u32string_view u32Str )
{
using namespace std;
u8string u8Str;
size_t rem = u32_to_u8<Err>( u32Str, u8Str );
return { rem, move( u8Str ) };
}
template<bool Err, typename Reserve>
requires requires( Reserve reserve, size_t n ) { { reserve( n ) } -> std::convertible_to<std::span<char16_t>>; }
size_t u32_to_u16( std::u32string_view u32Str, Reserve reserve )
{
using namespace std;
size_t n = 0;
u32_feeder u32f( u32Str );
for( int32_t c; (c = u32f.next<Err>()) >= 0; u16_gen<false>( c, [&]( char16_t ) LFI { ++n; } ) );
if( u32f != u32Str.end() ) [[unlikely]]
return u32Str.end() - u32f;
span<char16_t> sp = reserve( n );
auto it = sp.begin();
u32f = u32Str;
for( int32_t c; (c = u32f.next<false>()) >= 0; u16_gen<false>( c, [&]( char16_t c16 ) LFI { *it++ = c16; } ) );
return 0;
}
template<bool Err>
size_t u32_to_u16( std::u32string_view u32Str, std::u16string &u16Str )
{
using namespace std;
return u32_to_u16<Err>( u32Str,
[&]( size_t n ) LFI -> span<char16_t>
{
u16Str.resize_and_overwrite( n, [&]( char16_t *, size_t n ) LFI { return n; } );
return u16Str;
} );
}
// Converts UTF-32 to UTF-16 and returns the conversion's return value
// together with the produced string.
template<bool Err>
std::pair<size_t, std::u16string> u32_to_u16( std::u32string_view u32Str )
{
    std::pair<size_t, std::u16string> result;
    result.first = u32_to_u16<Err>( u32Str, result.second );
    return result;
}
template<bool Err, typename Reserve>
requires requires( Reserve reserve, size_t n ) { { reserve( n ) } -> std::convertible_to<std::span<char8_t>>; }
size_t u16_to_u8( std::u16string_view u16Str, Reserve reserve )
{
using namespace std;
size_t n = 0;
u16_feeder u16f( u16Str );
for( int32_t c; (c = u16f.next<Err>()) >= 0; n += u8_gen<false>( c, [&]( char8_t ) LFI {} ) );
if( u16f != u16Str.end() )
return u16Str.end() - u16f;
span<char8_t> sp = reserve( n );
auto it = sp.begin();
u16f = u16Str;
for( int32_t c; (c = u16f.next<false>()) >= 0; u8_gen<false>( c, [&]( char8_t c8 ) LFI { *it++ = c8; } ) );
return 0;
}
template<bool Err>
size_t u16_to_u8( std::u16string_view u16Str, std::u8string &u8Str )
{
using namespace std;
return u16_to_u8<Err>( u16Str,
[&]( size_t n ) LFI -> span<char8_t>
{
u8Str.resize_and_overwrite( n, [&]( char8_t *, size_t n ) LFI { return n; } );
return u8Str;
} );
}
template<bool Err>
std::pair<size_t, std::u8string> u16_to_u8( std::u16string_view u16Str )
{
using namespace std;
u8string u8Str;
size_t rem = u16_to_u8<Err>( u16Str,
[&]( size_t n ) -> span<char8_t>
{
u8Str.resize_and_overwrite( n, [&]( char8_t *, size_t n ) LFI { return n; } );
return u8Str;
} );
return { rem, move( u8Str ) };
}
// Decodes u8Str code point by code point and hands each one to consumer.
//
// If the consumer's result is convertible to bool, a false result stops the
// iteration early; otherwise every decodable code point is delivered.
// Decoding also stops as soon as next<Err>() returns a negative value.
//
// Returns the distance from the feeder's final position to the end of
// u8Str.
template<bool Err, typename Consumer>
requires requires( Consumer consumer, char32_t c ) { { consumer( c ) }; }
FI size_t u8_iterate( std::u8string_view u8Str, Consumer consumer )
{
    constexpr bool canStop = requires( Consumer fn, char32_t cp ) { { fn( cp ) } -> std::convertible_to<bool>; };
    u8_feeder feeder( u8Str );
    for( ; ; )
    {
        int32_t cp = feeder.next<Err>();
        if( cp < 0 )
            break;
        if constexpr( canStop )
        {
            if( !consumer( cp ) ) [[unlikely]]
                break;
        }
        else
            consumer( cp );
    }
    return u8Str.end() - feeder;
}
// Decodes u16Str code point by code point and hands each one to consumer.
//
// If the consumer's result is convertible to bool, a false result stops the
// iteration early; otherwise every decodable code point is delivered.
// Decoding also stops as soon as next<Err>() returns a negative value.
//
// Returns the distance from the feeder's final position to the end of
// u16Str.
template<bool Err, typename Consumer>
requires requires( Consumer consumer, char32_t c ) { { consumer( c ) }; }
FI size_t u16_iterate( std::u16string_view u16Str, Consumer consumer )
{
    constexpr bool canStop = requires( Consumer fn, char32_t cp ) { { fn( cp ) } -> std::convertible_to<bool>; };
    u16_feeder feeder( u16Str );
    for( ; ; )
    {
        int32_t cp = feeder.next<Err>();
        if( cp < 0 )
            break;
        if constexpr( canStop )
        {
            if( !consumer( cp ) ) [[unlikely]]
                break;
        }
        else
            consumer( cp );
    }
    return u16Str.end() - feeder;
}
// Reads u32Str code point by code point and hands each one to consumer.
//
// If the consumer's result is convertible to bool, a false result stops the
// iteration early; otherwise every readable code point is delivered.
// Reading also stops as soon as next<Err>() returns a negative value.
//
// Returns the distance from the feeder's final position to the end of
// u32Str.
template<bool Err, typename Consumer>
requires requires( Consumer consumer, char32_t c ) { { consumer( c ) }; }
FI size_t u32_iterate( std::u32string_view u32Str, Consumer consumer )
{
    constexpr bool canStop = requires( Consumer fn, char32_t cp ) { { fn( cp ) } -> std::convertible_to<bool>; };
    u32_feeder feeder( u32Str );
    for( ; ; )
    {
        int32_t cp = feeder.next<Err>();
        if( cp < 0 )
            break;
        if constexpr( canStop )
        {
            if( !consumer( cp ) ) [[unlikely]]
                break;
        }
        else
            consumer( cp );
    }
    return u32Str.end() - feeder;
}
#pragma pop_macro("FI")
#pragma pop_macro("LFI")
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
#if defined(__llvm__)
#pragma clang diagnostic pop
#endif

Date | Sujet | # | Auteur
31 Aug 24 | * UTF16 <-> UTF32 | 3 | Bonita Montero
1 Sep 24 | `* Re: UTF16 <-> UTF32 | 2 | Chris Ahlstrom
6 Sep 24 | `- Re: UTF16 <-> UTF32 | 1 | Bonita Montero

Haut de la page

Les messages affichés proviennent d'usenet.

NewsPortal