LCOV - code coverage report
Current view: top level - core/core/string - SPUnicode.h (source / functions) Hit Total Coverage
Test: coverage.info Lines: 138 144 95.8 %
Date: 2024-05-12 00:16:13 Functions: 41 41 100.0 %

          Line data    Source code
       1             : /**
       2             : Copyright (c) 2016-2022 Roman Katuntsev <sbkarr@stappler.org>
       3             : Copyright (c) 2023 Stappler LLC <admin@stappler.dev>
       4             : 
       5             : Permission is hereby granted, free of charge, to any person obtaining a copy
       6             : of this software and associated documentation files (the "Software"), to deal
       7             : in the Software without restriction, including without limitation the rights
       8             : to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
       9             : copies of the Software, and to permit persons to whom the Software is
      10             : furnished to do so, subject to the following conditions:
      11             : 
      12             : The above copyright notice and this permission notice shall be included in
      13             : all copies or substantial portions of the Software.
      14             : 
      15             : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      16             : IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      17             : FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
      18             : AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      19             : LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
      20             : OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
      21             : THE SOFTWARE.
      22             : **/
      23             : 
      24             : #ifndef STAPPLER_CORE_STRING_SPUNICODE_H_
      25             : #define STAPPLER_CORE_STRING_SPUNICODE_H_
      26             : 
      27             : #include "SPMemString.h"
      28             : 
      29             : namespace STAPPLER_VERSIONIZED stappler::unicode {
      30             : 
      31             : // Length lookup table
      32             : constexpr const uint8_t utf8_length_data[256] = {
                               // Sequence length in bytes, indexed by the first byte of the sequence
                               // (utf8Decode32 reads it as utf8_length_data[uint8_t(*ptr)]).
                               // 0x00 -> 0, 0x01-0xBF -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3,
                               // 0xF0-0xF7 -> 4, 0xF8-0xFB -> 5, 0xFC-0xFD -> 6, 0xFE-0xFF -> 1.
      33             :         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      34             :         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      35             :         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      36             :         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      37             :         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      38             :         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      39             :         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
      40             :         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1
      41             : };
      42             : 
      43             : constexpr const uint8_t utf16_length_data[256] = {
                               // 256 entries, one per byte value:
                               // 0x00 -> 0, 0x01-0xEF -> 1, 0xF0-0xFD -> 2, 0xFE-0xFF -> 1.
                               // NOTE(review): presumably the number of UTF-16 code units needed for the
                               // character whose UTF-8 form starts with that byte; no reader of this
                               // table is visible in this file - confirm against its users.
      44             :         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      45             :         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      46             :         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      47             :         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      48             :         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      49             :         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      50             :         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      51             :         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1
      52             : };
      53             : 
      54             : constexpr const uint8_t utf8_length_mask[256] = {
                           // Bit mask applied to the first byte of a sequence to keep its payload bits
                           // (utf8Decode32 computes ptr[0] & utf8_length_mask[uint8_t(*ptr)]).
                           // 0x00 -> 0x00, 0x01-0xBF -> 0x7f, 0xC0-0xDF -> 0x1f, 0xE0-0xEF -> 0x0f,
                           // 0xF0-0xF7 -> 0x07, 0xF8-0xFB -> 0x03, 0xFC-0xFD -> 0x01, 0xFE-0xFF -> 0x7f.
      55             :     0x00, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
      56             :     0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
      57             :     0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
      58             :     0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
      59             :     0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
      60             :     0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
      61             :     0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
      62             :         0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x03, 0x03, 0x03, 0x03, 0x01, 0x01, 0x7f, 0x7f
      63             : };
      64             : 
      65             : // check if char is not start of utf8 symbol
      66             : SPINLINE constexpr inline bool isUtf8Surrogate(char c)  {
                               // True when the two highest bits of c are 1 and 0 (10xxxxxx), the same
                               // pattern utf8Decode32 requires from every byte after the first one.
      67       15900 :         return (c & 0xC0) == 0x80;
      68             : }
      69             : 
      70     3004988 : constexpr inline char32_t utf8Decode32(const char *ptr, uint8_t &offset) {
                               // Decodes one code point from the bytes at ptr and stores the sequence
                               // length taken from utf8_length_data in offset.
                               // No buffer length is passed in: up to offset bytes are read from ptr.
      71     3004988 :         uint8_t mask = utf8_length_mask[uint8_t(*ptr)];
      72     3004988 :         offset = utf8_length_data[uint8_t(*ptr)];
                               // Payload bits of the first byte.
      73     3004988 :         char32_t ret = ptr[0] & mask;
      74     3682548 :         for (uint8_t c = 1; c < offset; ++c) {
                                       // A following byte without the 10xxxxxx pattern aborts decoding:
                                       // the result becomes 0, while offset keeps the table value.
      75      677560 :                 if ((ptr[c] & 0xc0) != 0x80) { ret = 0; break; }
                                       // Append the low six bits of the following byte.
      76      677560 :                 ret <<= 6; ret |= (ptr[c] & 0x3f);
      77             :         }
      78     3004988 :         return ret;
      79             : }
      80             : 
      81             : char32_t utf8HtmlDecode32(const char *ptr, uint8_t &offset);
      82             : 
      83      284368 : constexpr inline char32_t utf8Decode32(const char *ptr) {
      84             :         uint8_t offset;
      85      284368 :         return utf8Decode32(ptr, offset);
      86             : }
      87             : 
      88      680324 : inline constexpr uint8_t utf8EncodeLength(char16_t c) {
                               // Number of bytes utf8EncodeCb(cb, char16_t) emits for c:
                               // 1 below 0x80, 2 below 0x800, 3 otherwise.
      89             :         return ( c < 0x80 ? 1
      90             :                 : ( c < 0x800 ? 2
      91             :                         :  3
      92             :                 )
      93      680324 :         );
      94             : }
      95             : 
      96         225 : inline constexpr uint8_t utf8EncodeLength(char32_t c) {
                               // Number of bytes utf8EncodeCb(cb, char32_t) emits for c:
                               // 1 below 0x80, 2 below 0x800, 3 below 0x10000, 4 below 0x110000,
                               // and 5 for every larger value.
      97         225 :         if (c < 0x80) {
      98         100 :                 return 1;
      99         125 :         } else if (c < 0x800) {
     100          25 :                 return 2;
     101         100 :         } else if (c < 0x1'0000) {
     102          50 :                 return 3;
     103          50 :         } else if (c < 0x11'0000) {
     104          25 :                 return 4;
     105             :         } else {
     106          25 :                 return 5;
     107             :         }
     108             : }
     109             : 
     110             : template <typename PutCharFn>
     111         125 : inline uint8_t utf8EncodeCb(const PutCharFn &cb, char16_t c) {
                               // Encodes c as 1 to 3 bytes, calling cb once per byte in output order,
                               // and returns the number of calls made.
                               // c is encoded by its numeric value alone; surrogate units are neither
                               // combined nor rejected here.
                               // In the multi-byte branches cb receives int expressions; narrowing to a
                               // byte is left to the parameter type of cb.
     112         125 :         if (c < 0x80) {
     113          75 :                 cb(char(c));
     114          75 :                 return 1;
     115          50 :         } else if (c < 0x800) {
     116           0 :                 cb(0xc0 | (c >> 6));
     117           0 :                 cb(0x80 | (c & 0x3f));
     118           0 :                 return 2;
     119             :         } else {
     120          50 :                 cb(0xe0 | (c >> 12));
     121          50 :                 cb(0x80 | (c >> 6 & 0x3f));
     122          50 :                 cb(0x80 | (c & 0x3f));
     123          50 :                 return 3;
     124             :         }
     125             : }
     126             : 
     127             : template <typename PutCharFn>
     128      680451 : inline uint8_t utf8EncodeCb(const PutCharFn &cb, char32_t c) {
                               // Encodes c as 1 to 5 bytes, calling cb once per byte in output order,
                               // and returns the number of calls made.
                               // Branches: 1 byte below 0x80, 2 below 0x800, 3 below 0x10000,
                               // 4 below 0x110000; every larger value takes the five-byte branch.
                               // No branch rejects a value, so the surrogate range is encoded as 3 bytes.
     129      680451 :         if (c < 0x80) {
     130      429995 :                 cb(char(c));
     131      429995 :                 return 1;
     132      250456 :         } else if (c < 0x800) {
     133      250356 :                 cb(0xc0 | (c >> 6));
     134      250356 :                 cb(0x80 | (c & 0x3f));
     135      250356 :                 return 2;
     136         100 :         } else if (c < 0x1'0000) {
     137          50 :                 cb(0b1110'0000 | (c >> 12));
     138          50 :                 cb(0x80 | (c >> 6 & 0x3f));
     139          50 :                 cb(0x80 | (c & 0x3f));
     140          50 :                 return 3;
     141          50 :         } else if (c < 0x11'0000) {
     142          25 :                 cb(0b1111'0000 | (c >> 18));
     143          25 :                 cb(0x80 | (c >> 12 & 0x3f));
     144          25 :                 cb(0x80 | (c >> 6 & 0x3f));
     145          25 :                 cb(0x80 | (c & 0x3f));
     146          25 :                 return 4;
     147             :         } else {
     148          25 :                 cb(0b1111'1000 | (c >> 24));
     149          25 :                 cb(0x80 | (c >> 18 & 0x3f));
     150          25 :                 cb(0x80 | (c >> 12 & 0x3f));
     151          25 :                 cb(0x80 | (c >> 6 & 0x3f));
     152          25 :                 cb(0x80 | (c & 0x3f));
     153          25 :                 return 5;
     154             :         }
     155             : }
     156             : 
     157          25 : inline uint8_t utf8EncodeBuf(char *ptr, char16_t ch) {
                               // Writes the UTF-8 bytes of ch to consecutive positions starting at ptr
                               // and returns how many bytes were written.
                               // ptr is a by-value parameter, so the increments inside the lambda do not
                               // change the pointer held by the caller. The space behind ptr is not
                               // checked and no terminating zero is written.
     158          50 :         return utf8EncodeCb([&] (char c) {
     159          25 :                 *ptr++ = c;
     160          25 :         }, ch);
     161             : }
     162             : 
     163         500 : inline uint8_t utf8EncodeBuf(char *ptr, char32_t ch) {
                               // Writes the UTF-8 bytes of ch (char32_t overload of utf8EncodeCb) to
                               // consecutive positions starting at ptr and returns the byte count.
                               // The local copy of ptr is advanced; the space behind it is not checked
                               // and no terminating zero is written.
     164        1000 :         return utf8EncodeCb([&] (char c) {
     165         900 :                 *ptr++ = c;
     166         500 :         }, ch);
     167             : }
     168             : 
     169          25 : inline uint8_t utf8Encode(std::string &str, char16_t ch) {
                               // Appends the UTF-8 bytes of ch to str with push_back and returns the
                               // number of bytes appended.
     170          50 :         return utf8EncodeCb([&] (char c) {
     171          25 :                 str.push_back(c);
     172          50 :         }, ch);
     173             : }
     174             : 
     175       17076 : inline uint8_t utf8Encode(std::string &str, char32_t ch) {
                               // Appends the UTF-8 bytes of ch (char32_t overload of utf8EncodeCb) to
                               // str with push_back and returns the number of bytes appended.
     176       34152 :         return utf8EncodeCb([&] (char c) {
     177       22257 :                 str.push_back(c);
     178       34152 :         }, ch);
     179             : }
     180             : 
     181          50 : inline uint8_t utf8Encode(memory::string &str, char16_t ch) {
                               // Same as the std::string overload, for memory::string: appends the
                               // UTF-8 bytes of ch with push_back and returns their count.
     182         100 :         return utf8EncodeCb([&] (char c) {
     183         150 :                 str.push_back(c);
     184          50 :         }, ch);
     185             : }
     186             : 
     187      662875 : inline uint8_t utf8Encode(memory::string &str, char32_t ch) {
                               // Appends the UTF-8 bytes of ch (char32_t overload of utf8EncodeCb) to
                               // the memory::string with push_back and returns their count.
     188     1325750 :         return utf8EncodeCb([&] (char c) {
     189      907925 :                 str.push_back(c);
     190      662875 :         }, ch);
     191             : }
     192             : 
     193             : inline uint8_t utf8Encode(std::ostream &str, char16_t ch) {
                               // Streams the UTF-8 bytes of ch into str one char at a time with
                               // operator<< and returns the number of bytes produced.
                               // The stream state is not inspected after writing.
     194             :         return utf8EncodeCb([&] (char c) {
     195             :                 str << c;
     196             :         }, ch);
     197             : }
     198             : 
     199             : inline uint8_t utf8Encode(std::ostream &str, char32_t ch) {
                               // Streams the UTF-8 bytes of ch (char32_t overload of utf8EncodeCb) into
                               // str with operator<< and returns the number of bytes produced.
                               // The stream state is not inspected after writing.
     200             :         return utf8EncodeCb([&] (char c) {
     201             :                 str << c;
     202             :         }, ch);
     203             : }
     204             : 
     205     1555186 : constexpr inline char32_t utf16Decode32(const char16_t *ptr, uint8_t &offset) {
     206     1555186 :         if ((*ptr & char16_t(0xD800)) != 0) {
     207          25 :                 offset = 2;
     208          25 :                 return char32_t(0b0000'0011'1111'1111 & ptr[0]) << 10 | char32_t(0b0000'0011'1111'1111 & ptr[1]);
     209             :         } else {
     210     1555161 :                 offset = 1;
     211     1555161 :                 return char32_t(*ptr);
     212             :         }
     213             : }
     214             : 
     215          25 : constexpr inline char32_t utf16Decode32(const char16_t *ptr) {
     216             :         uint8_t offset;
     217          25 :         return utf16Decode32(ptr, offset);
     218             : }
     219             : 
     220        1225 : constexpr inline uint8_t utf16EncodeLength(char32_t c) {
     221        1225 :         if (c < 0xD800) {
     222        1175 :                 return 1;
     223          50 :         } else if (c <= 0xDFFF) {
     224             :                 // do nothing, wrong encoding
     225           0 :                 return 0;
     226          50 :         } else if (c < 0x10000) {
     227           0 :                 return 1;
     228             :         } else {
     229          50 :                 return 2;
     230             :         }
     231             : }
     232             : 
     233             : template <typename PutCharFn>
     234     1587586 : inline uint8_t utf16EncodeCb(const PutCharFn &cb, char32_t c) {
     235     1587586 :         if (c < 0xD800) {
     236     1587486 :                 cb(char16_t(c));
     237     1587486 :                 return 1;
     238         100 :         } else if (c <= 0xDFFF) {
     239          25 :                 return 0;
     240          75 :         } else if (c < 0x10000) {
     241          25 :                 cb(char16_t(c));
     242          25 :                 return 1;
     243             :         } else {
     244          50 :                 cb(char16_t(((0b1111'1111'1100'0000'0000 & c) >> 10) + 0xD800));
     245          50 :                 cb(char16_t(((0b0000'0000'0011'1111'1111 & c) >> 00) + 0xDC00));
     246          50 :                 return 2;
     247             :         }
     248             : }
     249             : 
     250         500 : inline uint8_t utf16EncodeBuf(char16_t *ptr, char32_t ch) {
                               // Writes the UTF-16 units of ch to consecutive positions starting at ptr
                               // and returns how many units were written (0 when utf16EncodeCb emits
                               // nothing for ch).
                               // ptr is a by-value parameter, so the caller pointer is not advanced.
                               // The space behind ptr is not checked and no terminating zero is written.
     251        1000 :         return utf16EncodeCb([&] (char16_t c) {
     252         500 :                 *ptr++ = c;
     253         500 :         }, ch);
     254             : }
     255             : 
     256      877972 : inline uint8_t utf16Encode(std::u16string &str, char32_t ch) {
                               // Appends the UTF-16 units of ch to str with push_back and returns the
                               // number of units appended (0 when utf16EncodeCb emits nothing for ch).
     257     1755944 :         return utf16EncodeCb([&] (char16_t c) {
     258      877972 :                 str.push_back(c);
     259     1755944 :         }, ch);
     260             : }
     261             : 
     262      709114 : inline uint8_t utf16Encode(memory::u16string &str, char32_t ch) {
                               // Same as the std::u16string overload, for memory::u16string: appends
                               // the UTF-16 units of ch with push_back and returns their count.
     263     1418228 :         return utf16EncodeCb([&] (char16_t c) {
     264      709139 :                 str.push_back(c);
     265      709114 :         }, ch);
     266             : }
     267             : 
     268             : template <typename std::enable_if<std::is_class<std::ctype<char16_t>>::value>::type* = nullptr>
     269             : inline uint8_t utf16Encode(std::basic_ostream<char16_t> &out, char32_t ch) {
                               // Streams the UTF-16 units of ch into out with operator<< and returns
                               // the number of units produced. The stream state is not inspected.
                               // NOTE(review): the enable_if condition does not depend on a template
                               // parameter; presumably it guards platforms without a usable
                               // std::ctype<char16_t> - confirm the intent.
     270             :         return utf16EncodeCb([&] (char16_t c) {
     271             :                 out << c;
     272             :         }, ch);
     273             : }
     274             : 
     275             : }
     276             : 
     277             : 
     278             : // A part of SPString.h header placed here, to be available in utilities,
     279             : // that included by SPString.h (like StringView)
     280             : 
     281             : namespace STAPPLER_VERSIONIZED stappler::platform {
     282             : 
     283             : char32_t tolower(char32_t c);
     284             : char32_t toupper(char32_t c);
     285             : char32_t totitle(char32_t c);
     286             : 
     287             : }
     288             : 
     289             : namespace STAPPLER_VERSIONIZED stappler::string {
     290             : 
     291             : static constexpr size_t DOUBLE_MAX_DIGITS = 27; // NOTE(review): presumably a buffer size large enough for _dtoa output - confirm against the implementation
     292             : 
     293       22050 : inline char32_t tolower(char32_t c) { return platform::tolower(c); } // thin forwarder to platform::tolower
     294        6480 : inline char32_t toupper(char32_t c) { return platform::toupper(c); } // thin forwarder to platform::toupper
     295             : inline char32_t totitle(char32_t c) { return platform::totitle(c); } // thin forwarder to platform::totitle
     296             : 
     297             : 
     298             : // fast itoa implementation
     299             : // data will be written at the end of buffer, no trailing zero (do not try to use strlen on it!)
     300             : // designed to be used with StringView: StringView(buf + bufSize - ret, ret)
     301             : 
     302             : size_t _itoa(int64_t number, char* buffer, size_t bufSize);
     303             : size_t _itoa(uint64_t number, char* buffer, size_t bufSize);
     304             : 
     305             : size_t _itoa(int64_t number, char16_t* buffer, size_t bufSize);
     306             : size_t _itoa(uint64_t number, char16_t* buffer, size_t bufSize);
     307             : 
     308             : size_t _itoa_len(int64_t number);
     309             : size_t _itoa_len(uint64_t number);
     310             : 
     311             : // fast dtoa implementation
     312             : // data will be written from beginning, no trailing zero (do not try to use strlen on it!)
     313             : // designed to be used with StringView: StringView(buf, ret)
     314             : 
     315             : size_t _dtoa(double number, char* buffer, size_t bufSize);
     316             : size_t _dtoa(double number, char16_t* buffer, size_t bufSize);
     317             : 
     318             : size_t _dtoa_len(double number);
     319             : 
     320             : // read number from string and offset pointers
     321             : 
     322             : template <typename T, typename Char>
     323     5137417 : inline auto readNumber(const Char *ptr, size_t len, int base, uint8_t &offset) -> Result<T> {
     324             :         // prevent to read out of bounds, copy symbols to stack buffer
     325     5137417 :         char buf[32] = { 0 }; // int64_t/scientific double character length max
     326     5137417 :         size_t m = min(size_t(31), len);
     327     5137416 :         size_t i = 0;
     328    53419825 :         for (; i < m; i++) {
     329    48282409 :                 auto c = ptr[i];
     330    48282409 :                 if (c < 127) {
     331    48282409 :                         buf[i] = c;
     332             :                 } else {
     333           0 :                         break;
     334             :                 }
     335             :         }
     336             : 
     337             :         // read number from internal buffer
     338     5137416 :         char * ret = nullptr;
     339     5137416 :         auto val = StringToNumber<T>(buf, &ret, base);
     340     5137417 :         if (*ret == 0) {
     341             :                 // while string was used
     342     5106267 :                 offset = i;
     343       31150 :         } else if (ret && ret != buf) {
     344             :                 // part of string was used
     345       26850 :                 offset = ret - buf;
     346             :         } else {
     347             :                 // fail to read number
     348        4300 :                 offset = 0;
     349        4300 :                 return Result<T>();
     350             :         }
     351     5133117 :         return Result<T>(val);
     352             : }
     353             : 
     354             : }
     355             : 
     356             : #endif /* STAPPLER_CORE_STRING_SPUNICODE_H_ */

Generated by: LCOV version 1.14