Line data Source code
1 : /**
2 : Copyright (c) 2016-2022 Roman Katuntsev <sbkarr@stappler.org>
3 : Copyright (c) 2023 Stappler LLC <admin@stappler.dev>
4 :
5 : Permission is hereby granted, free of charge, to any person obtaining a copy
6 : of this software and associated documentation files (the "Software"), to deal
7 : in the Software without restriction, including without limitation the rights
8 : to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 : copies of the Software, and to permit persons to whom the Software is
10 : furnished to do so, subject to the following conditions:
11 :
12 : The above copyright notice and this permission notice shall be included in
13 : all copies or substantial portions of the Software.
14 :
15 : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 : IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 : FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 : AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 : LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 : OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 : THE SOFTWARE.
22 : **/
23 :
24 : #ifndef STAPPLER_CORE_STRING_SPUNICODE_H_
25 : #define STAPPLER_CORE_STRING_SPUNICODE_H_
26 :
27 : #include "SPMemString.h"
28 :
29 : namespace STAPPLER_VERSIONIZED stappler::unicode {
30 :
// UTF-8 sequence length lookup, indexed by the first byte of a sequence.
// Byte 0x00 maps to 0; bytes 0x80-0xBF and 0xFE/0xFF map to 1.
constexpr uint8_t utf8_length_data[256] = {
	/* 0x00 */ 0, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 1, 1
};
42 :
// Lookup indexed by the first byte of a UTF-8 sequence.
// Entries are 0 for byte 0x00, 2 for bytes 0xF0-0xFD and 1 for everything else.
// NOTE(review): presumably the number of UTF-16 code units needed for the
// symbol that starts with this byte - confirm against the callers.
constexpr uint8_t utf16_length_data[256] = {
	/* 0x00 */ 0, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xF0 */ 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 1, 1
};
53 :
// Payload mask lookup, indexed by the first byte of a UTF-8 sequence:
// the bits of that byte that utf8Decode32 keeps as part of the code point.
constexpr uint8_t utf8_length_mask[256] = {
	/* 0x00 */ 0x00, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
	/* 0x10 */ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
	/* 0x20 */ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
	/* 0x30 */ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
	/* 0x40 */ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
	/* 0x50 */ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
	/* 0x60 */ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
	/* 0x70 */ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
	/* 0x80 */ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
	/* 0x90 */ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
	/* 0xA0 */ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
	/* 0xB0 */ 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,  0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
	/* 0xC0 */ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,  0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
	/* 0xD0 */ 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,  0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
	/* 0xE0 */ 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,  0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
	/* 0xF0 */ 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,  0x03, 0x03, 0x03, 0x03, 0x01, 0x01, 0x7f, 0x7f
};
64 :
65 : // check if char is not start of utf8 symbol
66 : SPINLINE constexpr inline bool isUtf8Surrogate(char c) {
67 15900 : return (c & 0xC0) == 0x80;
68 : }
69 :
70 3004988 : constexpr inline char32_t utf8Decode32(const char *ptr, uint8_t &offset) {
71 3004988 : uint8_t mask = utf8_length_mask[uint8_t(*ptr)];
72 3004988 : offset = utf8_length_data[uint8_t(*ptr)];
73 3004988 : char32_t ret = ptr[0] & mask;
74 3682548 : for (uint8_t c = 1; c < offset; ++c) {
75 677560 : if ((ptr[c] & 0xc0) != 0x80) { ret = 0; break; }
76 677560 : ret <<= 6; ret |= (ptr[c] & 0x3f);
77 : }
78 3004988 : return ret;
79 : }
80 :
81 : char32_t utf8HtmlDecode32(const char *ptr, uint8_t &offset); // declared only, defined outside this header. NOTE(review): presumably decodes like utf8Decode32 but also handles HTML entities - confirm against the implementation
82 :
83 284368 : constexpr inline char32_t utf8Decode32(const char *ptr) {
84 : uint8_t offset;
85 284368 : return utf8Decode32(ptr, offset);
86 : }
87 :
// Number of UTF-8 bytes utf8EncodeCb emits for a single UTF-16 code unit:
// 1 below 0x80, 2 below 0x800, otherwise 3.
inline constexpr uint8_t utf8EncodeLength(char16_t c) {
	if (c < 0x80) {
		return 1;
	}
	if (c < 0x800) {
		return 2;
	}
	return 3;
}
95 :
// Number of UTF-8 bytes utf8EncodeCb emits for a 32-bit code point:
// 1 below 0x80, 2 below 0x800, 3 below 0x10000, 4 below 0x110000, otherwise 5.
inline constexpr uint8_t utf8EncodeLength(char32_t c) {
	return c < 0x80 ? 1
		: c < 0x800 ? 2
		: c < 0x1'0000 ? 3
		: c < 0x11'0000 ? 4
		: 5;
}
109 :
// Encodes one UTF-16 code unit as UTF-8, handing every produced byte to cb.
// Returns the number of bytes passed to cb (1, 2 or 3).
template <typename PutCharFn>
inline uint8_t utf8EncodeCb(const PutCharFn &cb, char16_t c) {
	// Single byte: 0xxxxxxx
	if (c < 0x80) {
		cb(char(c));
		return 1;
	}

	// Two bytes: 110xxxxx 10xxxxxx
	if (c < 0x800) {
		cb(0b1100'0000 | (c >> 6));
		cb(0b1000'0000 | (c & 0b0011'1111));
		return 2;
	}

	// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
	cb(0b1110'0000 | (c >> 12));
	cb(0b1000'0000 | ((c >> 6) & 0b0011'1111));
	cb(0b1000'0000 | (c & 0b0011'1111));
	return 3;
}
126 :
// Encodes one 32-bit code point as UTF-8, handing every produced byte to cb.
// Returns the number of bytes passed to cb: 1 to 4 for values below 0x110000,
// 5 for anything larger.
template <typename PutCharFn>
inline uint8_t utf8EncodeCb(const PutCharFn &cb, char32_t c) {
	// Single byte: 0xxxxxxx
	if (c < 0x80) {
		cb(char(c));
		return 1;
	}

	// Two bytes: 110xxxxx 10xxxxxx
	if (c < 0x800) {
		cb(0xc0 | (c >> 6));
		cb(0x80 | (c & 0x3f));
		return 2;
	}

	// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
	if (c < 0x1'0000) {
		cb(0xe0 | (c >> 12));
		cb(0x80 | ((c >> 6) & 0x3f));
		cb(0x80 | (c & 0x3f));
		return 3;
	}

	// Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	if (c < 0x11'0000) {
		cb(0xf0 | (c >> 18));
		cb(0x80 | ((c >> 12) & 0x3f));
		cb(0x80 | ((c >> 6) & 0x3f));
		cb(0x80 | (c & 0x3f));
		return 4;
	}

	// Five bytes: 111110xx followed by four continuation bytes
	cb(0xf8 | (c >> 24));
	cb(0x80 | ((c >> 18) & 0x3f));
	cb(0x80 | ((c >> 12) & 0x3f));
	cb(0x80 | ((c >> 6) & 0x3f));
	cb(0x80 | (c & 0x3f));
	return 5;
}
156 :
157 25 : inline uint8_t utf8EncodeBuf(char *ptr, char16_t ch) {
158 50 : return utf8EncodeCb([&] (char c) {
159 25 : *ptr++ = c;
160 25 : }, ch);
161 : }
162 :
163 500 : inline uint8_t utf8EncodeBuf(char *ptr, char32_t ch) {
164 1000 : return utf8EncodeCb([&] (char c) {
165 900 : *ptr++ = c;
166 500 : }, ch);
167 : }
168 :
169 25 : inline uint8_t utf8Encode(std::string &str, char16_t ch) {
170 50 : return utf8EncodeCb([&] (char c) {
171 25 : str.push_back(c);
172 50 : }, ch);
173 : }
174 :
175 17076 : inline uint8_t utf8Encode(std::string &str, char32_t ch) {
176 34152 : return utf8EncodeCb([&] (char c) {
177 22257 : str.push_back(c);
178 34152 : }, ch);
179 : }
180 :
181 50 : inline uint8_t utf8Encode(memory::string &str, char16_t ch) {
182 100 : return utf8EncodeCb([&] (char c) {
183 150 : str.push_back(c);
184 50 : }, ch);
185 : }
186 :
187 662875 : inline uint8_t utf8Encode(memory::string &str, char32_t ch) {
188 1325750 : return utf8EncodeCb([&] (char c) {
189 907925 : str.push_back(c);
190 662875 : }, ch);
191 : }
192 :
193 : inline uint8_t utf8Encode(std::ostream &str, char16_t ch) {
194 : return utf8EncodeCb([&] (char c) {
195 : str << c;
196 : }, ch);
197 : }
198 :
199 : inline uint8_t utf8Encode(std::ostream &str, char32_t ch) {
200 : return utf8EncodeCb([&] (char c) {
201 : str << c;
202 : }, ch);
203 : }
204 :
// Decodes one code point from the UTF-16 data at ptr.
//
// A high surrogate (0xD800-0xDBFF) followed by a low surrogate (0xDC00-0xDFFF)
// is combined into a supplementary code point (0x10000-0x10FFFF) and offset is
// set to 2. Every other unit, including an unpaired surrogate, is returned
// unchanged with offset set to 1.
//
// ptr[1] is read only when ptr[0] is a high surrogate, so the caller must make
// sure a following unit (or terminator) is readable in that case.
constexpr inline char32_t utf16Decode32(const char16_t *ptr, uint8_t &offset) {
	const char16_t lead = ptr[0];
	// Test the top six bits exactly: a plain "any bit of 0xD800 set" check
	// would also match ordinary BMP characters such as U+0800 or U+4E2D.
	if ((lead & 0xFC00) == 0xD800) {
		const char16_t trail = ptr[1];
		if ((trail & 0xFC00) == 0xDC00) {
			offset = 2;
			// 10 payload bits from each unit, then the 0x10000 plane offset.
			return char32_t(0x1'0000)
				+ ((char32_t(lead & 0x03FF) << 10) | char32_t(trail & 0x03FF));
		}
	}
	offset = 1;
	return char32_t(lead);
}

// Convenience overload: decodes the code point at ptr and discards the number
// of consumed units.
constexpr inline char32_t utf16Decode32(const char16_t *ptr) {
	// Must be initialized: an uninitialized local makes a constexpr function
	// ill-formed before C++20.
	uint8_t offset = 0;
	return utf16Decode32(ptr, offset);
}

// Number of UTF-16 code units utf16EncodeCb produces for c.
// Returns 0 for values that cannot be encoded: the surrogate range
// 0xD800-0xDFFF and anything above 0x10FFFF.
constexpr inline uint8_t utf16EncodeLength(char32_t c) {
	if (c >= 0xD800 && c <= 0xDFFF) {
		// do nothing, wrong encoding
		return 0;
	}
	if (c > 0x10'FFFF) {
		// outside of the range UTF-16 can represent
		return 0;
	}
	return c < 0x1'0000 ? 1 : 2;
}

// Encodes c as UTF-16, handing every produced code unit to cb.
// Returns the number of units passed to cb: 1 for BMP values, 2 for
// supplementary values (surrogate pair), 0 when c is a surrogate value or
// lies above 0x10FFFF (nothing is emitted then).
template <typename PutCharFn>
inline uint8_t utf16EncodeCb(const PutCharFn &cb, char32_t c) {
	if (c < 0x1'0000) {
		if (c >= 0xD800 && c <= 0xDFFF) {
			return 0;
		}
		cb(char16_t(c));
		return 1;
	}
	if (c > 0x10'FFFF) {
		return 0;
	}

	// Remove the plane offset first, so the remaining 20 bits split evenly
	// between the high and the low surrogate.
	const char32_t value = c - 0x1'0000;
	cb(char16_t(0xD800 + (value >> 10)));
	cb(char16_t(0xDC00 + (value & 0x03FF)));
	return 2;
}
249 :
250 500 : inline uint8_t utf16EncodeBuf(char16_t *ptr, char32_t ch) {
251 1000 : return utf16EncodeCb([&] (char16_t c) {
252 500 : *ptr++ = c;
253 500 : }, ch);
254 : }
255 :
256 877972 : inline uint8_t utf16Encode(std::u16string &str, char32_t ch) {
257 1755944 : return utf16EncodeCb([&] (char16_t c) {
258 877972 : str.push_back(c);
259 1755944 : }, ch);
260 : }
261 :
262 709114 : inline uint8_t utf16Encode(memory::u16string &str, char32_t ch) {
263 1418228 : return utf16EncodeCb([&] (char16_t c) {
264 709139 : str.push_back(c);
265 709114 : }, ch);
266 : }
267 :
// Streams the UTF-16 form of a code point into a char16_t stream with
// operator<<; returns the number of code units produced.
template <typename std::enable_if<std::is_class<std::ctype<char16_t>>::value>::type* = nullptr>
inline uint8_t utf16Encode(std::basic_ostream<char16_t> &out, char32_t ch) {
	const auto emit = [&out] (char16_t unit) {
		out << unit;
	};
	return utf16EncodeCb(emit, ch);
}
274 :
275 : }
276 :
277 :
278 : // Part of the SPString.h header is placed here so that it is available to utilities
279 : // that are themselves included by SPString.h (like StringView)
280 :
281 : namespace STAPPLER_VERSIONIZED stappler::platform {
282 :
283 : char32_t tolower(char32_t c); // declared only, defined outside this header; string::tolower forwards to it
284 : char32_t toupper(char32_t c); // declared only, defined outside this header; string::toupper forwards to it
285 : char32_t totitle(char32_t c); // declared only, defined outside this header; string::totitle forwards to it
286 :
287 : }
288 :
289 : namespace STAPPLER_VERSIONIZED stappler::string {
290 :
291 : static constexpr size_t DOUBLE_MAX_DIGITS = 27; // NOTE(review): presumably the buffer size that is always enough for _dtoa output - confirm against the implementation
292 :
293 22050 : inline char32_t tolower(char32_t c) { return platform::tolower(c); }
294 6480 : inline char32_t toupper(char32_t c) { return platform::toupper(c); }
295 : inline char32_t totitle(char32_t c) { return platform::totitle(c); }
296 :
297 :
298 : // Fast itoa implementation.
299 : // Digits are written at the END of the buffer, without a terminating zero (do not call strlen on the result!).
300 : // The return value is meant to be used as a length with StringView: StringView(buf + bufSize - ret, ret)
301 :
302 : size_t _itoa(int64_t number, char* buffer, size_t bufSize);
303 : size_t _itoa(uint64_t number, char* buffer, size_t bufSize);
304 :
305 : size_t _itoa(int64_t number, char16_t* buffer, size_t bufSize);
306 : size_t _itoa(uint64_t number, char16_t* buffer, size_t bufSize);
307 :
308 : size_t _itoa_len(int64_t number); // NOTE(review): presumably the length _itoa would produce for this value - confirm
309 : size_t _itoa_len(uint64_t number); // NOTE(review): presumably the length _itoa would produce for this value - confirm
310 :
311 : // Fast dtoa implementation.
312 : // Characters are written from the BEGINNING of the buffer, without a terminating zero (do not call strlen on the result!).
313 : // The return value is meant to be used as a length with StringView: StringView(buf, ret)
314 :
315 : size_t _dtoa(double number, char* buffer, size_t bufSize);
316 : size_t _dtoa(double number, char16_t* buffer, size_t bufSize);
317 :
318 : size_t _dtoa_len(double number); // NOTE(review): presumably the length _dtoa would produce for this value - confirm
319 :
320 : // read number from string and offset pointers
321 :
322 : template <typename T, typename Char>
323 5137417 : inline auto readNumber(const Char *ptr, size_t len, int base, uint8_t &offset) -> Result<T> {
324 : // prevent to read out of bounds, copy symbols to stack buffer
325 5137417 : char buf[32] = { 0 }; // int64_t/scientific double character length max
326 5137417 : size_t m = min(size_t(31), len);
327 5137416 : size_t i = 0;
328 53419825 : for (; i < m; i++) {
329 48282409 : auto c = ptr[i];
330 48282409 : if (c < 127) {
331 48282409 : buf[i] = c;
332 : } else {
333 0 : break;
334 : }
335 : }
336 :
337 : // read number from internal buffer
338 5137416 : char * ret = nullptr;
339 5137416 : auto val = StringToNumber<T>(buf, &ret, base);
340 5137417 : if (*ret == 0) {
341 : // while string was used
342 5106267 : offset = i;
343 31150 : } else if (ret && ret != buf) {
344 : // part of string was used
345 26850 : offset = ret - buf;
346 : } else {
347 : // fail to read number
348 4300 : offset = 0;
349 4300 : return Result<T>();
350 : }
351 5133117 : return Result<T>(val);
352 : }
353 :
354 : }
355 :
356 : #endif /* STAPPLER_CORE_STRING_SPUNICODE_H_ */
|