geode/codegen/Broma/PEGTL-3.2.7/include/tao/pegtl/contrib/unescape.hpp

215 lines
6.8 KiB
C++
Raw Normal View History

2022-07-30 12:24:03 -04:00
// Copyright (c) 2014-2022 Dr. Colin Hirsch and Daniel Frey
// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
#ifndef TAO_PEGTL_CONTRIB_UNESCAPE_HPP
#define TAO_PEGTL_CONTRIB_UNESCAPE_HPP
#include <cassert>
#include <exception>
#include <string>
#include "../ascii.hpp"
#include "../config.hpp"
#include "../parse_error.hpp"
namespace TAO_PEGTL_NAMESPACE::unescape
{
// Utility functions for the unescape actions.
[[nodiscard]] inline bool utf8_append_utf32( std::string& string, const unsigned utf32 )
{
if( utf32 <= 0x7f ) {
string += char( utf32 & 0xff );
return true;
}
if( utf32 <= 0x7ff ) {
char tmp[] = { char( ( ( utf32 & 0x7c0 ) >> 6 ) | 0xc0 ),
char( ( ( utf32 & 0x03f ) ) | 0x80 ) };
string.append( tmp, sizeof( tmp ) );
return true;
}
if( utf32 <= 0xffff ) {
if( utf32 >= 0xd800 && utf32 <= 0xdfff ) {
// nope, this is a UTF-16 surrogate
return false;
}
char tmp[] = { char( ( ( utf32 & 0xf000 ) >> 12 ) | 0xe0 ),
char( ( ( utf32 & 0x0fc0 ) >> 6 ) | 0x80 ),
char( ( ( utf32 & 0x003f ) ) | 0x80 ) };
string.append( tmp, sizeof( tmp ) );
return true;
}
if( utf32 <= 0x10ffff ) {
char tmp[] = { char( ( ( utf32 & 0x1c0000 ) >> 18 ) | 0xf0 ),
char( ( ( utf32 & 0x03f000 ) >> 12 ) | 0x80 ),
char( ( ( utf32 & 0x000fc0 ) >> 6 ) | 0x80 ),
char( ( ( utf32 & 0x00003f ) ) | 0x80 ) };
string.append( tmp, sizeof( tmp ) );
return true;
}
return false;
}
// This function MUST only be called for characters matching TAO_PEGTL_NAMESPACE::ascii::xdigit!
template< typename I >
[[nodiscard]] I unhex_char( const char c )
{
switch( c ) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
return I( c - '0' );
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
return I( c - 'a' + 10 );
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
return I( c - 'A' + 10 );
default: // LCOV_EXCL_LINE
std::terminate(); // LCOV_EXCL_LINE
}
}
template< typename I >
[[nodiscard]] I unhex_string( const char* begin, const char* end )
{
I r = 0;
while( begin != end ) {
r <<= 4;
r += unhex_char< I >( *begin++ );
}
return r;
}
// Actions for common unescape situations.
struct append_all
{
template< typename ActionInput >
static void apply( const ActionInput& in, std::string& s )
{
s.append( in.begin(), in.size() );
}
};
// This action MUST be called for a character matching T which MUST be TAO_PEGTL_NAMESPACE::one< ... >.
template< typename T, char... Rs >
struct unescape_c
{
template< typename ActionInput >
static void apply( const ActionInput& in, std::string& s )
{
assert( in.size() == 1 );
s += apply_one( in, static_cast< const T* >( nullptr ) );
}
template< typename ActionInput, char... Qs >
[[nodiscard]] static char apply_one( const ActionInput& in, const one< Qs... >* /*unused*/ )
{
static_assert( sizeof...( Qs ) == sizeof...( Rs ), "size mismatch between escaped characters and their mappings" );
return apply_two( in, { Qs... }, { Rs... } );
}
template< typename ActionInput >
[[nodiscard]] static char apply_two( const ActionInput& in, const std::initializer_list< char >& q, const std::initializer_list< char >& r )
{
const char c = *in.begin();
for( std::size_t i = 0; i < q.size(); ++i ) {
if( *( q.begin() + i ) == c ) {
return *( r.begin() + i );
}
}
std::terminate(); // LCOV_EXCL_LINE
}
};
// See src/example/pegtl/unescape.cpp for why the following two actions
// skip the first input character. They also MUST be called
// with non-empty matched inputs!
struct unescape_u
{
#if defined( __cpp_exceptions )
template< typename ActionInput >
static void apply( const ActionInput& in, std::string& s )
{
assert( !in.empty() ); // First character MUST be present, usually 'u' or 'U'.
if( !utf8_append_utf32( s, unhex_string< unsigned >( in.begin() + 1, in.end() ) ) ) {
throw parse_error( "invalid escaped unicode code point", in );
}
}
#else
template< typename ActionInput >
static bool apply( const ActionInput& in, std::string& s )
{
assert( !in.empty() ); // First character MUST be present, usually 'u' or 'U'.
return utf8_append_utf32( s, unhex_string< unsigned >( in.begin() + 1, in.end() ) );
}
#endif
};
struct unescape_x
{
template< typename ActionInput >
static void apply( const ActionInput& in, std::string& s )
{
assert( !in.empty() ); // First character MUST be present, usually 'x'.
s += unhex_string< char >( in.begin() + 1, in.end() );
}
};
// The unescape_j action is similar to unescape_u, however unlike
// unescape_u it
// (a) assumes exactly 4 hexdigits per escape sequence,
// (b) accepts multiple consecutive escaped 16-bit values.
// When applied to more than one escape sequence, unescape_j
// translates UTF-16 surrogate pairs in the input into a single
// UTF-8 sequence in s, as required for JSON by RFC 8259.
struct unescape_j
{
template< typename ActionInput >
static bool apply( const ActionInput& in, std::string& s )
{
assert( ( ( in.size() + 1 ) % 6 ) == 0 ); // Expects multiple "\\u1234", starting with the first "u".
for( const char* b = in.begin() + 1; b < in.end(); b += 6 ) {
const auto c = unhex_string< unsigned >( b, b + 4 );
if( ( 0xd800 <= c ) && ( c <= 0xdbff ) && ( b + 6 < in.end() ) ) {
const auto d = unhex_string< unsigned >( b + 6, b + 10 );
if( ( 0xdc00 <= d ) && ( d <= 0xdfff ) ) {
b += 6;
(void)utf8_append_utf32( s, ( ( ( c & 0x03ff ) << 10 ) | ( d & 0x03ff ) ) + 0x10000 );
continue;
}
}
if( !utf8_append_utf32( s, c ) ) {
#if defined( __cpp_exceptions )
throw parse_error( "invalid escaped unicode code point", in );
#else
return false;
#endif
}
}
return true;
}
};
} // namespace TAO_PEGTL_NAMESPACE::unescape
#endif