LCOV - code coverage report
Current view: top level - core/core/string - SPUnicode.cc (source / functions) Hit Total Coverage
Test: coverage.info Lines: 169 181 93.4 %
Date: 2024-05-12 00:16:13 Functions: 8 8 100.0 %

          Line data    Source code
       1             : /**
       2             : Copyright (c) 2016-2022 Roman Katuntsev <sbkarr@stappler.org>
       3             : Copyright (c) 2023 Stappler LLC <admin@stappler.dev>
       4             : 
       5             : Permission is hereby granted, free of charge, to any person obtaining a copy
       6             : of this software and associated documentation files (the "Software"), to deal
       7             : in the Software without restriction, including without limitation the rights
       8             : to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
       9             : copies of the Software, and to permit persons to whom the Software is
      10             : furnished to do so, subject to the following conditions:
      11             : 
      12             : The above copyright notice and this permission notice shall be included in
      13             : all copies or substantial portions of the Software.
      14             : 
      15             : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      16             : IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      17             : FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
      18             : AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      19             : LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
      20             : OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
      21             : THE SOFTWARE.
      22             : **/
      23             : 
      24             : #include "SPUnicode.h"
      25             : #include "SPString.h"
      26             : 
      27             : #if WIN32
      28             : #include "SPPlatformUnistd.h"
      29             : #include <libloaderapi.h>
      30             : #else
      31             : #include <dlfcn.h>
      32             : #endif
      33             : 
      34             : namespace STAPPLER_VERSIONIZED stappler::unicode {
      35             : 
      36         600 : static char32_t Utf8DecodeHtml32(const char *ptr, uint32_t len) {
      37         600 :         if (ptr[0] == '#') {
      38         200 :                 if (len > 1 && (ptr[1] == 'x' || ptr[1] == 'X')) {
      39         100 :                         return char32_t(strtol(ptr + 2, nullptr, 16));
      40             :                 }
      41         100 :                 return char32_t(strtol(ptr + 1, nullptr, 10));
      42         400 :         } else if (strncmp(ptr, "amp", len) == 0) {
      43          50 :                 return '&';
      44         350 :         } else if (strncmp(ptr, "nbsp", len) == 0) {
      45          50 :                 return 0xA0;
      46         300 :         } else if (strncmp(ptr, "quot", len) == 0) {
      47          50 :                 return '"';
      48         250 :         } else if (strncmp(ptr, "apos", len) == 0) {
      49          50 :                 return '\'';
      50         200 :         } else if (strncmp(ptr, "lt", len) == 0) {
      51          50 :                 return '<';
      52         150 :         } else if (strncmp(ptr, "gt", len) == 0) {
      53          50 :                 return '>';
      54         100 :         } else if (strncmp(ptr, "shy", len) == 0) {
      55          50 :                 return char32_t(0x00AD);
      56             :         }
      57          50 :         return 0;
      58             : }
      59             : 
      60        1825 : char32_t utf8HtmlDecode32(const char *utf8, uint8_t &offset) {
      61        1825 :         if (utf8[0] == '&') {
      62         500 :                 uint32_t len = 0;
      63        2975 :                 while (utf8[len] && utf8[len] != ';' && len < 10) {
      64        2475 :                         len ++;
      65             :                 }
      66             : 
      67         500 :                 char32_t c = 0;
      68         500 :                 if (utf8[len] == ';' && len > 2) {
      69         500 :                         c = Utf8DecodeHtml32(utf8 + 1, len - 2);
      70             :                 }
      71             : 
      72         500 :                 if (c == 0) {
      73          25 :                         return utf8Decode32(utf8, offset);
      74             :                 } else {
      75         475 :                         offset = (len + 1);
      76         475 :                         return c;
      77             :                 }
      78             :         } else {
      79        1325 :                 return utf8Decode32(utf8, offset);
      80             :         }
      81             : }
      82             : 
      83             : }
      84             : 
      85             : namespace STAPPLER_VERSIONIZED stappler::string {
      86             : 
      87             : SPUNUSED inline size_t Utf8CharLength(const uint8_t *ptr, uint8_t &mask);
      88             : 
      89             : static inline void sp_str_replace(const char *target, const char *str, char &b, char &c) {
      90             :         int i = 0;
      91             :         while (str[1] != 0) {
      92             :                 if (str[0] == b && str[1] == c) {
      93             :                         if (i % 2 == 0) {
      94             :                                 b = target[i];
      95             :                                 c = target[i + 1];
      96             :                         }
      97             :                         return;
      98             :                 }
      99             :                 ++ i; ++ str;
     100             :         }
     101             : }
     102             : 
     103             : template <class T> static inline T Utf8NextChar(T p) {
     104             :         return (p + unicode::utf8_length_data[((const uint8_t *)p)[0]]);
     105             : }
     106             : 
     107             : template <class T> static inline T Utf8NextChar(T p, size_t &counter) {
     108             :         auto l = unicode::utf8_length_data[ ((const uint8_t *)p)[0] ];
     109             :         counter += 1;
     110             :         return (p + l);
     111             : }
     112             : 
     113          25 : bool isValidUtf8(StringView r) {
     114             :         static const uint8_t utf8_valid_data[256] = {
     115             :         //      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f
     116             :                 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     117             :                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     118             :                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     119             :                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     120             :                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     121             :                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     122             :                 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     123             :                 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
     124             :         };
     125             : 
     126          25 :         auto ptr = r.data();
     127          25 :         const auto end = ptr + r.size();
     128         225 :         while (ptr < end && *ptr != 0) {
     129         200 :                 auto l = utf8_valid_data[ ((const uint8_t *)ptr)[0] ];
     130         200 :                 if (l == 0) {
     131           0 :                         return false;
     132         200 :                 } else if (l == 1) {
     133           0 :                         ++ ptr;
     134             :                 } else {
     135         500 :                         while (l > 1) {
     136         300 :                                 -- l;
     137         300 :                                 ++ ptr;
     138             : 
     139         300 :                                 if ((((const uint8_t *)ptr)[0] & 0b1100'0000) != 0b1000'0000) {
     140           0 :                                         return false;
     141             :                                 }
     142             :                         }
     143         200 :                         ++ ptr;
     144             :                 }
     145             :         };
     146          25 :         return true;
     147             : }
     148             : 
     149      166325 : size_t getUtf16Length(const StringView &input) {
     150      166325 :         size_t counter = 0;
     151      166325 :         auto ptr = input.data();
     152      166325 :         const auto end = ptr + input.size();
     153     1951336 :         while (ptr < end && *ptr != 0) {
     154     1785011 :                 counter += unicode::utf16_length_data[ uint8_t(*ptr) ];
     155     1785011 :                 ptr += unicode::utf8_length_data[ uint8_t(*ptr) ];
     156             :         };
     157      166325 :         return counter;
     158             : }
     159             : 
     160          75 : size_t getUtf16HtmlLength(const StringView &input) {
     161          75 :         size_t counter = 0;
     162          75 :         auto ptr = input.data();
     163          75 :         const auto end = ptr + input.size();
     164        2275 :         while (ptr < end && *ptr != 0) {
     165        2200 :                 if (ptr[0] == '&') {
     166         500 :                         uint8_t len = 0;
     167        2975 :                         while (ptr[len] && ptr[len] != ';' && len < 10) {
     168        2475 :                                 len ++;
     169             :                         }
     170             : 
     171             : 
     172         500 :                         if (ptr[len] == ';' && len > 2) {
     173         500 :                                 counter ++;
     174         500 :                                 ptr += len;
     175           0 :                         } else if (ptr[len] == 0) {
     176           0 :                                 ptr += len;
     177             :                         } else {
     178           0 :                                 counter += unicode::utf16_length_data[ uint8_t(*ptr) ];
     179           0 :                                 ptr += unicode::utf8_length_data[ uint8_t(*ptr) ];
     180             :                         }
     181             :                 } else {
     182        1700 :                         counter += unicode::utf16_length_data[ uint8_t(*ptr) ];
     183        1700 :                         ptr += unicode::utf8_length_data[ uint8_t(*ptr) ];
     184             :                 }
     185             :         };
     186          75 :         return counter;
     187             : }
     188             : 
     189          25 : size_t getUtf8HtmlLength(const StringView &input) {
     190          25 :         size_t counter = 0;
     191          25 :         auto ptr = input.data();
     192          25 :         const auto end = ptr + input.size();
     193         225 :         while (ptr < end && *ptr != 0) {
     194         200 :                 if (ptr[0] == '&') {
     195         100 :                         uint8_t len = 0;
     196         575 :                         while (ptr[len] && ptr[len] != ';' && len < 10) {
     197         475 :                                 len ++;
     198             :                         }
     199             : 
     200         100 :                         if (ptr[len] == ';' && len > 2) {
     201         100 :                                 auto c = unicode::Utf8DecodeHtml32(ptr + 1, len - 2);
     202         100 :                                 counter += unicode::utf8EncodeLength(c);
     203         100 :                                 ptr += len;
     204         100 :                         } else if (ptr[len] == 0) {
     205           0 :                                 ptr += len;
     206             :                         } else {
     207           0 :                                 counter += 1;
     208           0 :                                 ptr += 1;
     209             :                         }
     210             :                 } else {
     211         100 :                         counter += 1;
     212         100 :                         ptr += 1;
     213             :                 }
     214             :         };
     215          25 :         return counter;
     216             : }
     217             : 
     218      112542 : size_t getUtf8Length(const WideStringView &str) {
     219      112542 :         const char16_t *ptr = str.data();
     220      112542 :         const char16_t *end = ptr + str.size();
     221      112542 :         size_t ret = 0;
     222      792841 :         while (ptr < end) {
     223      680299 :                 auto c = *ptr++;
     224      680299 :                 if (c >= 0xD800 && c <= 0xDFFF) {
     225             :                         // surrogates is 4-byte
     226           0 :                         ret += 4;
     227           0 :                         ++ ptr;
     228             :                 } else {
     229      680299 :                         ret += unicode::utf8EncodeLength(c);
     230             :                 }
     231             :         }
     232      112542 :         return ret;
     233             : }
     234             : 
     235             : //static constexpr const char16_t utf8_small[64] = {
     236             : //      u'А', u'Б', u'В', u'Г', u'Д', u'Е', u'Ж', u'З', u'И', u'Й', u'К', u'Л', u'М', u'Н', u'О', u'П',
     237             : //      u'Р', u'С', u'Т', u'У', u'Ф', u'Х', u'Ц', u'Ч', u'Ш', u'Щ', u'Ъ', u'Ы', u'Ь', u'Э', u'Ю', u'Я',
     238             : //      u'а', u'б', u'в', u'г', u'д', u'е', u'ж', u'з', u'и', u'й', u'к', u'л', u'м', u'н', u'о', u'п',
     239             : //      u'р', u'с', u'т', u'у', u'ф', u'х', u'ц', u'ч', u'ш', u'щ', u'ъ', u'ы', u'ь', u'э', u'ю', u'я',
     240             : //};
     241             : 
     242             : static constexpr const uint8_t koi8r_small[64] = {
     243             :         0xE1, 0xE2, 0xF7, 0xE7, 0xE4, 0xE5, 0xF6, 0xFA, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0,
     244             :         0xF2, 0xF3, 0xF4, 0xF5, 0xE6, 0xE8, 0xE3, 0xFE, 0xFB, 0xFD, 0xFF, 0xF9, 0xF8, 0xFC, 0xE0, 0xF1,
     245             :         0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xD6, 0xDA, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0,
     246             :         0xD2, 0xD3, 0xD4, 0xD5, 0xC6, 0xC8, 0xC3, 0xDE, 0xDB, 0xDD, 0xDF, 0xD9, 0xD8, 0xDC, 0xC0, 0xD1,
     247             : };
     248             : 
     249      309200 : char charToKoi8r(char16_t c) {
     250      309200 :         if (c <= 0x7f) {
     251        3450 :                 return char(c & 0xFF);
     252      305750 :         } else if (c >= u'а' && c <= u'я') {
     253        1600 :                 return char(koi8r_small[c - u'а' + 32]);
     254      304150 :         } else if (c >= u'А' && c <= u'Я') {
     255        1675 :                 return char(koi8r_small[(c - u'A') % 64]);
     256             :         } else {
     257      302475 :                 switch (c) {
     258          25 :                 case 0x2500: return char(0x80); break;
     259          25 :                 case 0x2502: return char(0x81); break;
     260          25 :                 case 0x250C: return char(0x82); break;
     261          25 :                 case 0x2510: return char(0x83); break;
     262          25 :                 case 0x2514: return char(0x84); break;
     263          25 :                 case 0x2518: return char(0x85); break;
     264          25 :                 case 0x251C: return char(0x86); break;
     265          25 :                 case 0x2524: return char(0x87); break;
     266          25 :                 case 0x252C: return char(0x88); break;
     267          25 :                 case 0x2534: return char(0x89); break;
     268          25 :                 case 0x253C: return char(0x8A); break;
     269          25 :                 case 0x2580: return char(0x8B); break;
     270          25 :                 case 0x2584: return char(0x8C); break;
     271          25 :                 case 0x2588: return char(0x8D); break;
     272          25 :                 case 0x258C: return char(0x8E); break;
     273          25 :                 case 0x2590: return char(0x8F); break;
     274             : 
     275          25 :                 case 0x2591: return char(0x90); break;
     276          25 :                 case 0x2592: return char(0x91); break;
     277          25 :                 case 0x2593: return char(0x92); break;
     278          25 :                 case 0x2320: return char(0x93); break;
     279          25 :                 case 0x25A0: return char(0x94); break;
     280          25 :                 case 0x2219: return char(0x95); break;
     281          25 :                 case 0x221A: return char(0x96); break;
     282          25 :                 case 0x2248: return char(0x97); break;
     283          25 :                 case 0x2264: return char(0x98); break;
     284          25 :                 case 0x2265: return char(0x99); break;
     285          25 :                 case 0x00A0: return char(0x9A); break;
     286          25 :                 case 0x2321: return char(0x9B); break;
     287          25 :                 case 0x00B0: return char(0x9C); break;
     288          25 :                 case 0x00B2: return char(0x9D); break;
     289          25 :                 case 0x00B7: return char(0x9E); break;
     290          25 :                 case 0x00F7: return char(0x9F); break;
     291             : 
     292          25 :                 case 0x2550: return char(0xA0); break;
     293          25 :                 case 0x2551: return char(0xA1); break;
     294          25 :                 case 0x2552: return char(0xA2); break;
     295          50 :                 case 0x0451: return char(0xA3); break;
     296          25 :                 case 0x2553: return char(0xA4); break;
     297          25 :                 case 0x2554: return char(0xA5); break;
     298          25 :                 case 0x2555: return char(0xA6); break;
     299          25 :                 case 0x2556: return char(0xA7); break;
     300          25 :                 case 0x2557: return char(0xA8); break;
     301          25 :                 case 0x2558: return char(0xA9); break;
     302          25 :                 case 0x2559: return char(0xAA); break;
     303          25 :                 case 0x255A: return char(0xAB); break;
     304          25 :                 case 0x255B: return char(0xAC); break;
     305          25 :                 case 0x255C: return char(0xAD); break;
     306          25 :                 case 0x255D: return char(0xAE); break;
     307          25 :                 case 0x255E: return char(0xAF); break;
     308             : 
     309          25 :                 case 0x255F: return char(0xB0); break;
     310          25 :                 case 0x2560: return char(0xB1); break;
     311          25 :                 case 0x2561: return char(0xB2); break;
     312          50 :                 case 0x0401: return char(0xB3); break;
     313          25 :                 case 0x2562: return char(0xB4); break;
     314          25 :                 case 0x2563: return char(0xB5); break;
     315          25 :                 case 0x2564: return char(0xB6); break;
     316          25 :                 case 0x2565: return char(0xB7); break;
     317          25 :                 case 0x2566: return char(0xB8); break;
     318          25 :                 case 0x2567: return char(0xB9); break;
     319          25 :                 case 0x2568: return char(0xBA); break;
     320          25 :                 case 0x2569: return char(0xBB); break;
     321          25 :                 case 0x256A: return char(0xBC); break;
     322          25 :                 case 0x256B: return char(0xBD); break;
     323          25 :                 case 0x256C: return char(0xBE); break;
     324          25 :                 case 0x00A9: return char(0xBF); break;
     325      300825 :                 default: break;
     326             :                 }
     327             :         }
     328      300825 :         return ' ';
     329             : }
     330             : 
     331             : }

Generated by: LCOV version 1.14