diff --git a/3rdparty/simpleini/LICENCE.txt b/3rdparty/simpleini/LICENCE.txt index 88aefbcdbd..405d417296 100644 --- a/3rdparty/simpleini/LICENCE.txt +++ b/3rdparty/simpleini/LICENCE.txt @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2006-2022 Brodie Thiesfield +Copyright (c) 2006-2024 Brodie Thiesfield Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/3rdparty/simpleini/include/ConvertUTF.h b/3rdparty/simpleini/include/ConvertUTF.h index 6ecc819cb7..528c963987 100644 --- a/3rdparty/simpleini/include/ConvertUTF.h +++ b/3rdparty/simpleini/include/ConvertUTF.h @@ -1,8 +1,10 @@ /* + * https://web.archive.org/web/20090529064329/http://www.unicode.org:80/Public/PROGRAMS/CVTUTF/ + * * Copyright 2001-2004 Unicode, Inc. - * + * * Disclaimer - * + * * This source code is provided as is by Unicode, Inc. No claims are * made as to fitness for any particular purpose. No warranties of any * kind are expressed or implied. The recipient agrees to determine @@ -10,9 +12,9 @@ * purchased on magnetic or optical media from Unicode, Inc., the * sole remedy for any claim will be exchange of defective media * within 90 days of receipt. - * + * * Limitations on Rights to Redistribute This Code - * + * * Unicode, Inc. hereby grants the right to freely use the information * supplied in this file in the creation of products supporting the * Unicode Standard, and to make copies of this file in any form @@ -20,74 +22,74 @@ * remains attached. */ -/* --------------------------------------------------------------------- + /* --------------------------------------------------------------------- - Conversions between UTF32, UTF-16, and UTF-8. Header file. + Conversions between UTF32, UTF-16, and UTF-8. Header file. - Several functions are included here, forming a complete set of - conversions between the three formats. UTF-7 is not included - here, but is handled in a separate source file. + Several funtions are included here, forming a complete set of + conversions between the three formats. UTF-7 is not included + here, but is handled in a separate source file. - Each of these routines takes pointers to input buffers and output - buffers. The input buffers are const. + Each of these routines takes pointers to input buffers and output + buffers. The input buffers are const. - Each routine converts the text between *sourceStart and sourceEnd, - putting the result into the buffer between *targetStart and - targetEnd. Note: the end pointers are *after* the last item: e.g. - *(sourceEnd - 1) is the last item. + Each routine converts the text between *sourceStart and sourceEnd, + putting the result into the buffer between *targetStart and + targetEnd. Note: the end pointers are *after* the last item: e.g. + *(sourceEnd - 1) is the last item. - The return result indicates whether the conversion was successful, - and if not, whether the problem was in the source or target buffers. - (Only the first encountered problem is indicated.) + The return result indicates whether the conversion was successful, + and if not, whether the problem was in the source or target buffers. + (Only the first encountered problem is indicated.) - After the conversion, *sourceStart and *targetStart are both - updated to point to the end of last text successfully converted in - the respective buffers. + After the conversion, *sourceStart and *targetStart are both + updated to point to the end of last text successfully converted in + the respective buffers. - Input parameters: - sourceStart - pointer to a pointer to the source buffer. - The contents of this are modified on return so that - it points at the next thing to be converted. - targetStart - similarly, pointer to pointer to the target buffer. - sourceEnd, targetEnd - respectively pointers to the ends of the - two buffers, for overflow checking only. + Input parameters: + sourceStart - pointer to a pointer to the source buffer. + The contents of this are modified on return so that + it points at the next thing to be converted. + targetStart - similarly, pointer to pointer to the target buffer. + sourceEnd, targetEnd - respectively pointers to the ends of the + two buffers, for overflow checking only. - These conversion functions take a ConversionFlags argument. When this - flag is set to strict, both irregular sequences and isolated surrogates - will cause an error. When the flag is set to lenient, both irregular - sequences and isolated surrogates are converted. + These conversion functions take a ConversionFlags argument. When this + flag is set to strict, both irregular sequences and isolated surrogates + will cause an error. When the flag is set to lenient, both irregular + sequences and isolated surrogates are converted. - Whether the flag is strict or lenient, all illegal sequences will cause - an error return. This includes sequences such as: , , - or in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code - must check for illegal sequences. + Whether the flag is strict or lenient, all illegal sequences will cause + an error return. This includes sequences such as: , , + or in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code + must check for illegal sequences. - When the flag is set to lenient, characters over 0x10FFFF are converted - to the replacement character; otherwise (when the flag is set to strict) - they constitute an error. + When the flag is set to lenient, characters over 0x10FFFF are converted + to the replacement character; otherwise (when the flag is set to strict) + they constitute an error. - Output parameters: - The value "sourceIllegal" is returned from some routines if the input - sequence is malformed. When "sourceIllegal" is returned, the source - value will point to the illegal value that caused the problem. E.g., - in UTF-8 when a sequence is malformed, it points to the start of the - malformed sequence. + Output parameters: + The value "sourceIllegal" is returned from some routines if the input + sequence is malformed. When "sourceIllegal" is returned, the source + value will point to the illegal value that caused the problem. E.g., + in UTF-8 when a sequence is malformed, it points to the start of the + malformed sequence. - Author: Mark E. Davis, 1994. - Rev History: Rick McGowan, fixes & updates May 2001. - Fixes & updates, Sept 2001. + Author: Mark E. Davis, 1994. + Rev History: Rick McGowan, fixes & updates May 2001. + Fixes & updates, Sept 2001. ------------------------------------------------------------------------- */ + ------------------------------------------------------------------------ */ -/* --------------------------------------------------------------------- - The following 4 definitions are compiler-specific. - The C standard does not guarantee that wchar_t has at least - 16 bits, so wchar_t is no less portable than unsigned short! - All should be unsigned values to avoid sign extension during - bit mask & shift operations. ------------------------------------------------------------------------- */ + /* --------------------------------------------------------------------- + The following 4 definitions are compiler-specific. + The C standard does not guarantee that wchar_t has at least + 16 bits, so wchar_t is no less portable than unsigned short! + All should be unsigned values to avoid sign extension during + bit mask & shift operations. + ------------------------------------------------------------------------ */ -typedef unsigned int UTF32; /* at least 32 bits */ +typedef unsigned long UTF32; /* at least 32 bits */ typedef unsigned short UTF16; /* at least 16 bits */ typedef unsigned char UTF8; /* typically 8 bits */ typedef unsigned char Boolean; /* 0 or 1 */ @@ -100,15 +102,15 @@ typedef unsigned char Boolean; /* 0 or 1 */ #define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF typedef enum { - conversionOK, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insufficient room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ } ConversionResult; typedef enum { - strictConversion = 0, - lenientConversion + strictConversion = 0, + lenientConversion } ConversionFlags; /* This is for C++ and does no harm in C */ @@ -116,34 +118,34 @@ typedef enum { extern "C" { #endif -ConversionResult ConvertUTF8toUTF16 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); + ConversionResult ConvertUTF8toUTF16( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); -ConversionResult ConvertUTF16toUTF8 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); - -ConversionResult ConvertUTF8toUTF32 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + ConversionResult ConvertUTF16toUTF8( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); -ConversionResult ConvertUTF32toUTF8 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); - -ConversionResult ConvertUTF16toUTF32 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + ConversionResult ConvertUTF8toUTF32( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); -ConversionResult ConvertUTF32toUTF16 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); + ConversionResult ConvertUTF32toUTF8( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); -Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); + ConversionResult ConvertUTF16toUTF32( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); + + ConversionResult ConvertUTF32toUTF16( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); + + Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd); #ifdef __cplusplus } #endif -/* --------------------------------------------------------------------- */ +/* --------------------------------------------------------------------- */ \ No newline at end of file diff --git a/3rdparty/simpleini/include/SimpleIni.h b/3rdparty/simpleini/include/SimpleIni.h index 231ddf860c..9b837f9990 100644 --- a/3rdparty/simpleini/include/SimpleIni.h +++ b/3rdparty/simpleini/include/SimpleIni.h @@ -5,7 +5,7 @@ File SimpleIni.h Author Brodie Thiesfield Source https://github.com/brofield/simpleini - Version 4.22 + Version 4.25 Jump to the @link CSimpleIniTempl CSimpleIni @endlink interface documentation. @@ -53,7 +53,7 @@ -# If you will only be using straight utf8 files and access the data via the char interface, then you do not need any conversion library and could define SI_NO_CONVERSION. Note that no conversion also means no validation of the data. - If no converter is specified then the default converter is SI_CONVERT_GENERIC + If no converter is specified then the default converter is SI_NO_CONVERSION on Mac/Linux and SI_CONVERT_WIN32 on Windows. If you need widechar support on Mac/Linux then use either SI_CONVERT_GENERIC or SI_CONVERT_ICU. These are also supported on all platforms. @@ -161,6 +161,9 @@ @section notes NOTES + - The maximum supported file size is 1 GiB (SI_MAX_FILE_SIZE). Files larger + than this will be rejected with SI_FILE error to prevent excessive memory + allocation and potential denial of service attacks. - To load UTF-8 data on Windows 95, you need to use Microsoft Layer for Unicode, or SI_CONVERT_GENERIC, or SI_CONVERT_ICU. - When using SI_CONVERT_GENERIC, ConvertUTF.c must be compiled and linked. @@ -261,6 +264,10 @@ constexpr int SI_FAIL = -1; //!< Generic failure constexpr int SI_NOMEM = -2; //!< Out of memory error constexpr int SI_FILE = -3; //!< File error (see errno for detail error) +//! Maximum supported file size (1 GiB). Files larger than this will be rejected +//! to prevent excessive memory allocation and potential denial of service. +constexpr size_t SI_MAX_FILE_SIZE = 1024ULL * 1024ULL * 1024ULL; + #define SI_UTF8_SIGNATURE "\xEF\xBB\xBF" #ifdef _WIN32 @@ -357,7 +364,7 @@ public: if (lhs.nOrder != rhs.nOrder) { return lhs.nOrder < rhs.nOrder; } - return KeyOrder()(lhs.pItem, rhs.pItem); + return KeyOrder()(lhs, rhs); } }; }; @@ -1462,9 +1469,14 @@ CSimpleIniTempl::LoadFile( if (lSize == 0) { return SI_OK; } - + + // check file size is within supported limits (SI_MAX_FILE_SIZE) + if (static_cast(lSize) > SI_MAX_FILE_SIZE) { + return SI_FILE; + } + // allocate and ensure NULL terminated - char * pData = new(std::nothrow) char[lSize+static_cast(1)]; + char * pData = new(std::nothrow) char[static_cast(lSize) + 1]; if (!pData) { return SI_NOMEM; } @@ -1516,13 +1528,18 @@ CSimpleIniTempl::LoadData( return SI_FAIL; } + // check converted data size is within supported limits (SI_MAX_FILE_SIZE) + if (uLen >= (SI_MAX_FILE_SIZE / sizeof(SI_CHAR))) { + return SI_FILE; + } + // allocate memory for the data, ensure that there is a NULL // terminator wherever the converted data ends - SI_CHAR * pData = new(std::nothrow) SI_CHAR[uLen+1]; + SI_CHAR * pData = new(std::nothrow) SI_CHAR[uLen + 1]; if (!pData) { return SI_NOMEM; } - memset(pData, 0, sizeof(SI_CHAR)*(uLen+1)); + memset(pData, 0, sizeof(SI_CHAR) * (uLen + 1)); // convert the data if (!converter.ConvertFromStore(a_pData, a_uDataLen, pData, uLen)) { @@ -1800,6 +1817,7 @@ CSimpleIniTempl::IsMultiLineData( } // embedded newlines + const SI_CHAR * pStart = a_pData; while (*a_pData) { if (IsNewLineChar(*a_pData)) { return true; @@ -1807,8 +1825,8 @@ CSimpleIniTempl::IsMultiLineData( ++a_pData; } - // check for suffix - if (IsSpace(*--a_pData)) { + // check for suffix (ensure we don't go before start of string) + if (a_pData > pStart && IsSpace(*(a_pData - 1))) { return true; } @@ -1821,7 +1839,7 @@ CSimpleIniTempl::IsSingleLineQuotedValue( const SI_CHAR* a_pData ) const { - // data needs quoting if it starts or ends with whitespace + // data needs quoting if it starts or ends with whitespace // and doesn't have embedded newlines // empty string @@ -1835,6 +1853,7 @@ CSimpleIniTempl::IsSingleLineQuotedValue( } // embedded newlines + const SI_CHAR * pStart = a_pData; while (*a_pData) { if (IsNewLineChar(*a_pData)) { return false; @@ -1842,8 +1861,8 @@ CSimpleIniTempl::IsSingleLineQuotedValue( ++a_pData; } - // check for suffix - if (IsSpace(*--a_pData)) { + // check for suffix (ensure we don't go before start of string) + if (a_pData > pStart && IsSpace(*(a_pData - 1))) { return true; } @@ -2088,7 +2107,8 @@ CSimpleIniTempl::AddEntry( if (pComment) { DeleteString(a_pComment); a_pComment = pComment; - CopyString(a_pComment); + rc = CopyString(a_pComment); + if (rc < 0) return rc; } Delete(a_pSection, a_pKey); iKey = keyval.end(); @@ -2257,12 +2277,13 @@ CSimpleIniTempl::GetDoubleValue( return a_nDefault; } - char * pszSuffix = NULL; + char * pszSuffix = szValue; double nValue = strtod(szValue, &pszSuffix); // any invalid strings will return the default value - if (!pszSuffix || *pszSuffix) { - return a_nDefault; + // check if no conversion was performed or if there are trailing characters + if (pszSuffix == szValue || *pszSuffix) { + return a_nDefault; } return nValue; @@ -2421,7 +2442,7 @@ CSimpleIniTempl::GetSectionSize( int nCount = 0; const SI_CHAR * pLastKey = NULL; typename TKeyVal::const_iterator iKeyVal = section.begin(); - for (int n = 0; iKeyVal != section.end(); ++iKeyVal, ++n) { + for (; iKeyVal != section.end(); ++iKeyVal) { if (!pLastKey || IsLess(pLastKey, iKeyVal->first.pItem)) { ++nCount; pLastKey = iKeyVal->first.pItem; @@ -2464,7 +2485,7 @@ CSimpleIniTempl::GetAllSections( { a_names.clear(); typename TSection::const_iterator i = m_data.begin(); - for (int n = 0; i != m_data.end(); ++i, ++n ) { + for (; i != m_data.end(); ++i) { a_names.push_back(i->first); } } @@ -2490,7 +2511,7 @@ CSimpleIniTempl::GetAllKeys( const TKeyVal & section = iSection->second; const SI_CHAR * pLastKey = NULL; typename TKeyVal::const_iterator iKeyVal = section.begin(); - for (int n = 0; iKeyVal != section.end(); ++iKeyVal, ++n ) { + for (; iKeyVal != section.end(); ++iKeyVal) { if (!pLastKey || IsLess(pLastKey, iKeyVal->first.pItem)) { a_names.push_back(iKeyVal->first); pLastKey = iKeyVal->first.pItem; @@ -2828,7 +2849,7 @@ CSimpleIniTempl::DeleteString( // strings may exist either inside the data block, or they will be // individually allocated and stored in m_strings. We only physically // delete those stored in m_strings. - if (a_pString < m_pData || a_pString >= m_pData + m_uDataLen) { + if (!m_pData || a_pString < m_pData || a_pString >= m_pData + m_uDataLen) { typename TNamesDepend::iterator i = m_strings.begin(); for (;i != m_strings.end(); ++i) { if (a_pString == i->pItem) { @@ -2850,17 +2871,19 @@ CSimpleIniTempl::DeleteString( // // SI_NO_CONVERSION Do not make the "W" wide character version of the // library available. Only CSimpleIniA etc is defined. +// Default on Linux/MacOS/etc. +// SI_CONVERT_WIN32 Use the Win32 API functions for conversion. +// Default on Windows. // SI_CONVERT_GENERIC Use the Unicode reference conversion library in // the accompanying files ConvertUTF.h/c // SI_CONVERT_ICU Use the IBM ICU conversion library. Requires // ICU headers on include path and icuuc.lib -// SI_CONVERT_WIN32 Use the Win32 API functions for conversion. #if !defined(SI_NO_CONVERSION) && !defined(SI_CONVERT_GENERIC) && !defined(SI_CONVERT_WIN32) && !defined(SI_CONVERT_ICU) # ifdef _WIN32 # define SI_CONVERT_WIN32 # else -# define SI_CONVERT_GENERIC +# define SI_NO_CONVERSION # endif #endif @@ -3079,14 +3102,18 @@ public: return a_uInputDataLen; } -#if defined(SI_NO_MBSTOWCS_NULL) || (!defined(_MSC_VER) && !defined(_linux)) + // get the required buffer size +#if defined(_MSC_VER) + size_t uBufSiz; + errno_t e = mbstowcs_s(&uBufSiz, NULL, 0, a_pInputData, a_uInputDataLen); + return (e == 0) ? uBufSiz : (size_t) -1; +#elif !defined(SI_NO_MBSTOWCS_NULL) + return mbstowcs(NULL, a_pInputData, a_uInputDataLen); +#else // fall back processing for platforms that don't support a NULL dest to mbstowcs // worst case scenario is 1:1, this will be a sufficient buffer size (void)a_pInputData; return a_uInputDataLen; -#else - // get the actual required buffer size - return mbstowcs(NULL, a_pInputData, a_uInputDataLen); #endif } @@ -3135,9 +3162,18 @@ public: } // convert to wchar_t +#if defined(_MSC_VER) + size_t uBufSiz; + errno_t e = mbstowcs_s(&uBufSiz, + a_pOutputData, a_uOutputDataSize, + a_pInputData, a_uInputDataLen); + (void)uBufSiz; + return (e == 0); +#else size_t retval = mbstowcs(a_pOutputData, a_pInputData, a_uOutputDataSize); return retval != (size_t)(-1); +#endif } /** Calculate the number of char required by the storage format of this diff --git a/3rdparty/simpleini/src/ConvertUTF.c b/3rdparty/simpleini/src/ConvertUTF.c index 9eaf933a6f..de6bf13ee8 100644 --- a/3rdparty/simpleini/src/ConvertUTF.c +++ b/3rdparty/simpleini/src/ConvertUTF.c @@ -1,8 +1,10 @@ /* + * https://web.archive.org/web/20090529064329/http://www.unicode.org:80/Public/PROGRAMS/CVTUTF/ + * * Copyright 2001-2004 Unicode, Inc. - * + * * Disclaimer - * + * * This source code is provided as is by Unicode, Inc. No claims are * made as to fitness for any particular purpose. No warranties of any * kind are expressed or implied. The recipient agrees to determine @@ -10,9 +12,9 @@ * purchased on magnetic or optical media from Unicode, Inc., the * sole remedy for any claim will be exchange of defective media * within 90 days of receipt. - * + * * Limitations on Rights to Redistribute This Code - * + * * Unicode, Inc. hereby grants the right to freely use the information * supplied in this file in the creation of products supporting the * Unicode Standard, and to make copies of this file in any form @@ -20,23 +22,23 @@ * remains attached. */ -/* --------------------------------------------------------------------- + /* --------------------------------------------------------------------- - Conversions between UTF32, UTF-16, and UTF-8. Source code file. - Author: Mark E. Davis, 1994. - Rev History: Rick McGowan, fixes & updates May 2001. - Sept 2001: fixed const & error conditions per - mods suggested by S. Parent & A. Lillich. - June 2002: Tim Dodd added detection and handling of incomplete - source sequences, enhanced error detection, added casts - to eliminate compiler warnings. - July 2003: slight mods to back out aggressive FFFE detection. - Jan 2004: updated switches in from-UTF8 conversions. - Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. + Conversions between UTF32, UTF-16, and UTF-8. Source code file. + Author: Mark E. Davis, 1994. + Rev History: Rick McGowan, fixes & updates May 2001. + Sept 2001: fixed const & error conditions per + mods suggested by S. Parent & A. Lillich. + June 2002: Tim Dodd added detection and handling of incomplete + source sequences, enhanced error detection, added casts + to eliminate compiler warnings. + July 2003: slight mods to back out aggressive FFFE detection. + Jan 2004: updated switches in from-UTF8 conversions. + Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. - See the header file "ConvertUTF.h" for complete documentation. + See the header file "ConvertUTF.h" for complete documentation. ------------------------------------------------------------------------- */ + ------------------------------------------------------------------------ */ #include "ConvertUTF.h" @@ -44,7 +46,7 @@ #include #endif -static const int halfShift = 10; /* used for shifting by 10 bits */ +static const int halfShift = 10; /* used for shifting by 10 bits */ static const UTF32 halfBase = 0x0010000UL; static const UTF32 halfMask = 0x3FFUL; @@ -58,108 +60,116 @@ static const UTF32 halfMask = 0x3FFUL; /* --------------------------------------------------------------------- */ -ConversionResult ConvertUTF32toUTF16 ( - const UTF32** sourceStart, const UTF32* sourceEnd, +ConversionResult ConvertUTF32toUTF16( + const UTF32** sourceStart, const UTF32* sourceEnd, UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - if (target >= targetEnd) { - result = targetExhausted; break; - } - ch = *source++; - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + if (target >= targetEnd) { + result = targetExhausted; break; + } + ch = *source++; + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } + else { + *target++ = (UTF16)ch; /* normal case */ + } + } + else if (ch > UNI_MAX_LEGAL_UTF32) { + if (flags == strictConversion) { + result = sourceIllegal; + } + else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } + else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + --source; /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); } - } else { - *target++ = (UTF16)ch; /* normal case */ - } - } else if (ch > UNI_MAX_LEGAL_UTF32) { - if (flags == strictConversion) { - result = sourceIllegal; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - --source; /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); - *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); } - } - *sourceStart = source; - *targetStart = target; - return result; + *sourceStart = source; + *targetStart = target; + return result; } /* --------------------------------------------------------------------- */ -ConversionResult ConvertUTF16toUTF32 ( - const UTF16** sourceStart, const UTF16* sourceEnd, +ConversionResult ConvertUTF16toUTF32( + const UTF16** sourceStart, const UTF16* sourceEnd, UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF32* target = *targetStart; - UTF32 ch, ch2; - while (source < sourceEnd) { - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source buffer... */ - if (source < sourceEnd) { - ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF32* target = *targetStart; + UTF32 ch, ch2; + while (source < sourceEnd) { + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } + else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } } - } else { /* We don't have the 16 bits following the high surrogate. */ - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } + else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + if (target >= targetEnd) { + source = oldSource; /* Back up source pointer! */ + result = targetExhausted; break; + } + *target++ = ch; } - if (target >= targetEnd) { - source = oldSource; /* Back up source pointer! */ - result = targetExhausted; break; - } - *target++ = ch; - } - *sourceStart = source; - *targetStart = target; + *sourceStart = source; + *targetStart = target; #ifdef CVTUTF_DEBUG -if (result == sourceIllegal) { - fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); - fflush(stderr); -} + if (result == sourceIllegal) { + fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); + fflush(stderr); + } #endif - return result; + return result; } /* --------------------------------------------------------------------- */ @@ -172,14 +182,14 @@ if (result == sourceIllegal) { * allowed in earlier algorithms. */ static const char trailingBytesForUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; /* @@ -187,14 +197,14 @@ static const char trailingBytesForUTF8[256] = { * This table contains as many values as there might be trailing bytes * in a UTF-8 sequence. */ -static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; +static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; /* * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed * into the first byte, depending on how many bytes follow. There are * as many entries in this table as there are UTF-8 sequence types. - * (I.e., one byte sequence, two byte... etc.). Remember that sequences + * (I.e., one byte sequence, two byte... etc.). Remember that sequencs * for *legal* UTF-8 will be 4 or fewer bytes total. */ static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; @@ -209,74 +219,86 @@ static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC * into an inline function. */ -/* --------------------------------------------------------------------- */ + /* --------------------------------------------------------------------- */ -ConversionResult ConvertUTF16toUTF8 ( - const UTF16** sourceStart, const UTF16* sourceEnd, +ConversionResult ConvertUTF16toUTF8( + const UTF16** sourceStart, const UTF16* sourceEnd, UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source buffer... */ - if (source < sourceEnd) { - UTF32 ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + UTF32 ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } + else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } + else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* Figure out how many bytes the result will require */ + if (ch < (UTF32)0x80) { + bytesToWrite = 1; + } + else if (ch < (UTF32)0x800) { + bytesToWrite = 2; + } + else if (ch < (UTF32)0x10000) { + bytesToWrite = 3; + } + else if (ch < (UTF32)0x110000) { + bytesToWrite = 4; + } + else { + bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; } - } else { /* We don't have the 16 bits following the high surrogate. */ - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* Figure out how many bytes the result will require */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - } - target += bytesToWrite; - if (target > targetEnd) { - source = oldSource; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; + target += bytesToWrite; + if (target > targetEnd) { + source = oldSource; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; + *sourceStart = source; + *targetStart = target; + return result; } /* --------------------------------------------------------------------- */ @@ -292,29 +314,29 @@ ConversionResult ConvertUTF16toUTF8 ( * definition of UTF-8 goes up to 4-byte sequences. */ -static Boolean isLegalUTF8(const UTF8 *source, int length) { - UTF8 a; - const UTF8 *srcptr = source+length; - switch (length) { - default: return false; - /* Everything else falls through when "true"... */ - case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 2: if ((a = (*--srcptr)) > 0xBF) return false; +static Boolean isLegalUTF8(const UTF8* source, int length) { + UTF8 a; + const UTF8* srcptr = source + length; + switch (length) { + default: return false; + /* Everything else falls through when "true"... */ + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; - switch (*source) { - /* no fall-through in this inner switch */ - case 0xE0: if (a < 0xA0) return false; break; - case 0xED: if (a > 0x9F) return false; break; - case 0xF0: if (a < 0x90) return false; break; - case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; + switch (*source) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; } - - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - } - if (*source > 0xF4) return false; - return true; + if (*source > 0xF4) return false; + return true; } /* --------------------------------------------------------------------- */ @@ -323,217 +345,234 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) { * Exported function to return whether a UTF-8 sequence is legal or not. * This is not used here; it's just exported. */ -Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { - int length = trailingBytesForUTF8[*source]+1; - if (source+length > sourceEnd) { - return false; - } - return isLegalUTF8(source, length); +Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd) { + int length = trailingBytesForUTF8[*source] + 1; + if (source + length > sourceEnd) { + return false; + } + return isLegalUTF8(source, length); } /* --------------------------------------------------------------------- */ -ConversionResult ConvertUTF8toUTF16 ( - const UTF8** sourceStart, const UTF8* sourceEnd, +ConversionResult ConvertUTF8toUTF16( + const UTF8** sourceStart, const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (! isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (!isLegalUTF8(source, extraBytesToRead + 1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead + 1); /* Back up source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead + 1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } + else { + *target++ = (UTF16)ch; /* normal case */ + } + } + else if (ch > UNI_MAX_UTF16) { + if (flags == strictConversion) { + result = sourceIllegal; + source -= (extraBytesToRead + 1); /* return to the start */ + break; /* Bail out; shouldn't continue */ + } + else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } + else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + source -= (extraBytesToRead + 1); /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); } - } else { - *target++ = (UTF16)ch; /* normal case */ - } - } else if (ch > UNI_MAX_UTF16) { - if (flags == strictConversion) { - result = sourceIllegal; - source -= (extraBytesToRead+1); /* return to the start */ - break; /* Bail out; shouldn't continue */ - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); - *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); } - } - *sourceStart = source; - *targetStart = target; - return result; + *sourceStart = source; + *targetStart = target; + return result; } /* --------------------------------------------------------------------- */ -ConversionResult ConvertUTF32toUTF8 ( - const UTF32** sourceStart, const UTF32* sourceEnd, +ConversionResult ConvertUTF32toUTF8( + const UTF32** sourceStart, const UTF32* sourceEnd, UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - ch = *source++; - if (flags == strictConversion ) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + ch = *source++; + if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* + * Figure out how many bytes the result will require. Turn any + * illegally large UTF32 things (> Plane 17) into replacement chars. + */ + if (ch < (UTF32)0x80) { + bytesToWrite = 1; + } + else if (ch < (UTF32)0x800) { + bytesToWrite = 2; + } + else if (ch < (UTF32)0x10000) { + bytesToWrite = 3; + } + else if (ch <= UNI_MAX_LEGAL_UTF32) { + bytesToWrite = 4; + } + else { + bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + result = sourceIllegal; + } + + target += bytesToWrite; + if (target > targetEnd) { + --source; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; } - /* - * Figure out how many bytes the result will require. Turn any - * illegally large UTF32 things (> Plane 17) into replacement chars. - */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - result = sourceIllegal; - } - - target += bytesToWrite; - if (target > targetEnd) { - --source; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; + *sourceStart = source; + *targetStart = target; + return result; } /* --------------------------------------------------------------------- */ -ConversionResult ConvertUTF8toUTF32 ( - const UTF8** sourceStart, const UTF8* sourceEnd, +ConversionResult ConvertUTF8toUTF32( + const UTF8** sourceStart, const UTF8* sourceEnd, UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF32* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (! isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up the source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_LEGAL_UTF32) { - /* - * UTF-16 surrogate values are illegal in UTF-32, and anything - * over Plane 17 (> 0x10FFFF) is illegal. - */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF32* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (!isLegalUTF8(source, extraBytesToRead + 1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; + case 4: ch += *source++; ch <<= 6; + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead + 1); /* Back up the source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_LEGAL_UTF32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. + */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead + 1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } + else { + *target++ = ch; + } + } + else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ + result = sourceIllegal; + *target++ = UNI_REPLACEMENT_CHAR; } - } else { - *target++ = ch; - } - } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ - result = sourceIllegal; - *target++ = UNI_REPLACEMENT_CHAR; } - } - *sourceStart = source; - *targetStart = target; - return result; + *sourceStart = source; + *targetStart = target; + return result; } /* --------------------------------------------------------------------- - Note A. - The fall-through switches in UTF-8 reading code save a - temp variable, some decrements & conditionals. The switches - are equivalent to the following loop: + Note A. + The fall-through switches in UTF-8 reading code save a + temp variable, some decrements & conditionals. The switches + are equivalent to the following loop: { - int tmpBytesToRead = extraBytesToRead+1; - do { + int tmpBytesToRead = extraBytesToRead+1; + do { ch += *source++; --tmpBytesToRead; if (tmpBytesToRead) ch <<= 6; - } while (tmpBytesToRead > 0); + } while (tmpBytesToRead > 0); } - In UTF-8 writing code, the switches on "bytesToWrite" are - similarly unrolled loops. + In UTF-8 writing code, the switches on "bytesToWrite" are + similarly unrolled loops. - --------------------------------------------------------------------- */ + --------------------------------------------------------------------- */ \ No newline at end of file