Line data Source code
1 : /**
2 : Copyright (c) 2016-2022 Roman Katuntsev <sbkarr@stappler.org>
3 : Copyright (c) 2023 Stappler LLC <admin@stappler.dev>
4 :
5 : Permission is hereby granted, free of charge, to any person obtaining a copy
6 : of this software and associated documentation files (the "Software"), to deal
7 : in the Software without restriction, including without limitation the rights
8 : to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 : copies of the Software, and to permit persons to whom the Software is
10 : furnished to do so, subject to the following conditions:
11 :
12 : The above copyright notice and this permission notice shall be included in
13 : all copies or substantial portions of the Software.
14 :
15 : THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 : IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 : FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 : AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 : LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 : OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 : THE SOFTWARE.
22 : **/
23 :
24 : #include "SPUnicode.h"
25 : #include "SPString.h"
26 :
27 : #if WIN32
28 : #include "SPPlatformUnistd.h"
29 : #include <libloaderapi.h>
30 : #else
31 : #include <dlfcn.h>
32 : #endif
33 :
34 : namespace STAPPLER_VERSIONIZED stappler::unicode {
35 :
// Decodes the body of an HTML character reference, i.e. the text that follows '&'.
//
// Numeric references ("#123", "#x7B", "#X7B") are parsed with strtol; the named
// references amp, nbsp, quot, apos, lt, gt and shy are recognised.
//
//   ptr - first character after '&'
//   len - number of characters of the name to examine
//
// Returns the decoded code point, or 0 when the name is empty, is not recognised,
// or is a numeric reference outside 1..0x10FFFF.
//
// Named references are compared with strncmp bounded by `len`: when `len` is shorter
// than an entity name, a matching prefix is accepted, and entities are tried in the
// order of the table below.
static char32_t Utf8DecodeHtml32(const char *ptr, uint32_t len) {
	// With zero characters to compare every strncmp below would report a match.
	if (len == 0) {
		return 0;
	}

	if (ptr[0] == '#') {
		const bool isHex = len > 1 && (ptr[1] == 'x' || ptr[1] == 'X');
		const long value = isHex ? strtol(ptr + 2, nullptr, 16) : strtol(ptr + 1, nullptr, 10);
		// Refuse negative or beyond-Unicode values instead of truncating them into char32_t.
		if (value <= 0 || value > 0x10FFFF) {
			return 0;
		}
		return char32_t(value);
	}

	static constexpr struct {
		const char *name;
		char32_t code;
	} entities[] = {
		{ "amp", char32_t('&') },
		{ "nbsp", char32_t(0xA0) },
		{ "quot", char32_t('"') },
		{ "apos", char32_t('\'') },
		{ "lt", char32_t('<') },
		{ "gt", char32_t('>') },
		{ "shy", char32_t(0x00AD) },
	};

	for (const auto &entity : entities) {
		if (strncmp(ptr, entity.name, len) == 0) {
			return entity.code;
		}
	}
	return 0;
}
59 :
60 1825 : char32_t utf8HtmlDecode32(const char *utf8, uint8_t &offset) {
61 1825 : if (utf8[0] == '&') {
62 500 : uint32_t len = 0;
63 2975 : while (utf8[len] && utf8[len] != ';' && len < 10) {
64 2475 : len ++;
65 : }
66 :
67 500 : char32_t c = 0;
68 500 : if (utf8[len] == ';' && len > 2) {
69 500 : c = Utf8DecodeHtml32(utf8 + 1, len - 2);
70 : }
71 :
72 500 : if (c == 0) {
73 25 : return utf8Decode32(utf8, offset);
74 : } else {
75 475 : offset = (len + 1);
76 475 : return c;
77 : }
78 : } else {
79 1325 : return utf8Decode32(utf8, offset);
80 : }
81 : }
82 :
83 : }
84 :
85 : namespace STAPPLER_VERSIONIZED stappler::string {
86 :
87 : SPUNUSED inline size_t Utf8CharLength(const uint8_t *ptr, uint8_t &mask);
88 :
// Looks for the adjacent byte pair (b, c) in `str` and, when the first occurrence
// starts at an even index i, overwrites b and c with target[i] and target[i + 1].
//
// The search stops at the first occurrence: a match at an odd index leaves b and c
// untouched and is not followed by a further search. `target` is indexed with the
// position found in `str`, so it has to be at least as long as `str`.
static inline void sp_str_replace(const char *target, const char *str, char &b, char &c) {
	// Test str[i] before str[i + 1] so that an empty `str` is never read past its terminator.
	for (size_t i = 0; str[i] != 0 && str[i + 1] != 0; ++ i) {
		if (str[i] == b && str[i + 1] == c) {
			if (i % 2 == 0) {
				b = target[i];
				c = target[i + 1];
			}
			return;
		}
	}
}
102 :
103 : template <class T> static inline T Utf8NextChar(T p) {
104 : return (p + unicode::utf8_length_data[((const uint8_t *)p)[0]]);
105 : }
106 :
107 : template <class T> static inline T Utf8NextChar(T p, size_t &counter) {
108 : auto l = unicode::utf8_length_data[ ((const uint8_t *)p)[0] ];
109 : counter += 1;
110 : return (p + l);
111 : }
112 :
113 25 : bool isValidUtf8(StringView r) {
114 : static const uint8_t utf8_valid_data[256] = {
115 : // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f
116 : 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
117 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
118 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
119 : 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
120 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
121 : 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
122 : 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
123 : 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
124 : };
125 :
126 25 : auto ptr = r.data();
127 25 : const auto end = ptr + r.size();
128 225 : while (ptr < end && *ptr != 0) {
129 200 : auto l = utf8_valid_data[ ((const uint8_t *)ptr)[0] ];
130 200 : if (l == 0) {
131 0 : return false;
132 200 : } else if (l == 1) {
133 0 : ++ ptr;
134 : } else {
135 500 : while (l > 1) {
136 300 : -- l;
137 300 : ++ ptr;
138 :
139 300 : if ((((const uint8_t *)ptr)[0] & 0b1100'0000) != 0b1000'0000) {
140 0 : return false;
141 : }
142 : }
143 200 : ++ ptr;
144 : }
145 : };
146 25 : return true;
147 : }
148 :
149 166325 : size_t getUtf16Length(const StringView &input) {
150 166325 : size_t counter = 0;
151 166325 : auto ptr = input.data();
152 166325 : const auto end = ptr + input.size();
153 1951336 : while (ptr < end && *ptr != 0) {
154 1785011 : counter += unicode::utf16_length_data[ uint8_t(*ptr) ];
155 1785011 : ptr += unicode::utf8_length_data[ uint8_t(*ptr) ];
156 : };
157 166325 : return counter;
158 : }
159 :
160 75 : size_t getUtf16HtmlLength(const StringView &input) {
161 75 : size_t counter = 0;
162 75 : auto ptr = input.data();
163 75 : const auto end = ptr + input.size();
164 2275 : while (ptr < end && *ptr != 0) {
165 2200 : if (ptr[0] == '&') {
166 500 : uint8_t len = 0;
167 2975 : while (ptr[len] && ptr[len] != ';' && len < 10) {
168 2475 : len ++;
169 : }
170 :
171 :
172 500 : if (ptr[len] == ';' && len > 2) {
173 500 : counter ++;
174 500 : ptr += len;
175 0 : } else if (ptr[len] == 0) {
176 0 : ptr += len;
177 : } else {
178 0 : counter += unicode::utf16_length_data[ uint8_t(*ptr) ];
179 0 : ptr += unicode::utf8_length_data[ uint8_t(*ptr) ];
180 : }
181 : } else {
182 1700 : counter += unicode::utf16_length_data[ uint8_t(*ptr) ];
183 1700 : ptr += unicode::utf8_length_data[ uint8_t(*ptr) ];
184 : }
185 : };
186 75 : return counter;
187 : }
188 :
189 25 : size_t getUtf8HtmlLength(const StringView &input) {
190 25 : size_t counter = 0;
191 25 : auto ptr = input.data();
192 25 : const auto end = ptr + input.size();
193 225 : while (ptr < end && *ptr != 0) {
194 200 : if (ptr[0] == '&') {
195 100 : uint8_t len = 0;
196 575 : while (ptr[len] && ptr[len] != ';' && len < 10) {
197 475 : len ++;
198 : }
199 :
200 100 : if (ptr[len] == ';' && len > 2) {
201 100 : auto c = unicode::Utf8DecodeHtml32(ptr + 1, len - 2);
202 100 : counter += unicode::utf8EncodeLength(c);
203 100 : ptr += len;
204 100 : } else if (ptr[len] == 0) {
205 0 : ptr += len;
206 : } else {
207 0 : counter += 1;
208 0 : ptr += 1;
209 : }
210 : } else {
211 100 : counter += 1;
212 100 : ptr += 1;
213 : }
214 : };
215 25 : return counter;
216 : }
217 :
218 112542 : size_t getUtf8Length(const WideStringView &str) {
219 112542 : const char16_t *ptr = str.data();
220 112542 : const char16_t *end = ptr + str.size();
221 112542 : size_t ret = 0;
222 792841 : while (ptr < end) {
223 680299 : auto c = *ptr++;
224 680299 : if (c >= 0xD800 && c <= 0xDFFF) {
225 : // surrogates is 4-byte
226 0 : ret += 4;
227 0 : ++ ptr;
228 : } else {
229 680299 : ret += unicode::utf8EncodeLength(c);
230 : }
231 : }
232 112542 : return ret;
233 : }
234 :
235 : //static constexpr const char16_t utf8_small[64] = {
236 : // u'А', u'Б', u'В', u'Г', u'Д', u'Е', u'Ж', u'З', u'И', u'Й', u'К', u'Л', u'М', u'Н', u'О', u'П',
237 : // u'Р', u'С', u'Т', u'У', u'Ф', u'Х', u'Ц', u'Ч', u'Ш', u'Щ', u'Ъ', u'Ы', u'Ь', u'Э', u'Ю', u'Я',
238 : // u'а', u'б', u'в', u'г', u'д', u'е', u'ж', u'з', u'и', u'й', u'к', u'л', u'м', u'н', u'о', u'п',
239 : // u'р', u'с', u'т', u'у', u'ф', u'х', u'ц', u'ч', u'ш', u'щ', u'ъ', u'ы', u'ь', u'э', u'ю', u'я',
240 : //};
241 :
// KOI8-R codes for the basic Russian alphabet in Unicode order:
// entries 0..31 are U+0410..U+042F (upper case), entries 32..63 are U+0430..U+044F (lower case).
static constexpr const uint8_t koi8r_small[64] = {
	0xE1, 0xE2, 0xF7, 0xE7, 0xE4, 0xE5, 0xF6, 0xFA, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0,
	0xF2, 0xF3, 0xF4, 0xF5, 0xE6, 0xE8, 0xE3, 0xFE, 0xFB, 0xFD, 0xFF, 0xF9, 0xF8, 0xFC, 0xE0, 0xF1,
	0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xD6, 0xDA, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0,
	0xD2, 0xD3, 0xD4, 0xD5, 0xC6, 0xC8, 0xC3, 0xDE, 0xDB, 0xDD, 0xDF, 0xD9, 0xD8, 0xDC, 0xC0, 0xD1,
};

// Converts one UTF-16 code unit to its KOI8-R byte.
//
// ASCII (<= 0x7F) is passed through, U+0410..U+044F go through koi8r_small, and the
// remaining KOI8-R characters (codes 0x80..0xBF) are mapped by the switch below.
// Anything without a KOI8-R representation yields a space.
//
// Code points are written numerically on purpose: Cyrillic and Latin letters such as
// U+0410 and U+0041 look identical in source, and using the Latin one as the base of
// the upper-case range selects the wrong table entries.
char charToKoi8r(char16_t c) {
	if (c <= 0x7f) {
		return char(c & 0xFF);
	} else if (c >= 0x0430 && c <= 0x044F) {
		// lower case: second half of the table
		return char(koi8r_small[c - 0x0430 + 32]);
	} else if (c >= 0x0410 && c <= 0x042F) {
		// upper case: first half of the table
		return char(koi8r_small[c - 0x0410]);
	} else {
		switch (c) {
		case 0x2500: return char(0x80);
		case 0x2502: return char(0x81);
		case 0x250C: return char(0x82);
		case 0x2510: return char(0x83);
		case 0x2514: return char(0x84);
		case 0x2518: return char(0x85);
		case 0x251C: return char(0x86);
		case 0x2524: return char(0x87);
		case 0x252C: return char(0x88);
		case 0x2534: return char(0x89);
		case 0x253C: return char(0x8A);
		case 0x2580: return char(0x8B);
		case 0x2584: return char(0x8C);
		case 0x2588: return char(0x8D);
		case 0x258C: return char(0x8E);
		case 0x2590: return char(0x8F);

		case 0x2591: return char(0x90);
		case 0x2592: return char(0x91);
		case 0x2593: return char(0x92);
		case 0x2320: return char(0x93);
		case 0x25A0: return char(0x94);
		case 0x2219: return char(0x95);
		case 0x221A: return char(0x96);
		case 0x2248: return char(0x97);
		case 0x2264: return char(0x98);
		case 0x2265: return char(0x99);
		case 0x00A0: return char(0x9A);
		case 0x2321: return char(0x9B);
		case 0x00B0: return char(0x9C);
		case 0x00B2: return char(0x9D);
		case 0x00B7: return char(0x9E);
		case 0x00F7: return char(0x9F);

		case 0x2550: return char(0xA0);
		case 0x2551: return char(0xA1);
		case 0x2552: return char(0xA2);
		case 0x0451: return char(0xA3);
		case 0x2553: return char(0xA4);
		case 0x2554: return char(0xA5);
		case 0x2555: return char(0xA6);
		case 0x2556: return char(0xA7);
		case 0x2557: return char(0xA8);
		case 0x2558: return char(0xA9);
		case 0x2559: return char(0xAA);
		case 0x255A: return char(0xAB);
		case 0x255B: return char(0xAC);
		case 0x255C: return char(0xAD);
		case 0x255D: return char(0xAE);
		case 0x255E: return char(0xAF);

		case 0x255F: return char(0xB0);
		case 0x2560: return char(0xB1);
		case 0x2561: return char(0xB2);
		case 0x0401: return char(0xB3);
		case 0x2562: return char(0xB4);
		case 0x2563: return char(0xB5);
		case 0x2564: return char(0xB6);
		case 0x2565: return char(0xB7);
		case 0x2566: return char(0xB8);
		case 0x2567: return char(0xB9);
		case 0x2568: return char(0xBA);
		case 0x2569: return char(0xBB);
		case 0x256A: return char(0xBC);
		case 0x256B: return char(0xBD);
		case 0x256C: return char(0xBE);
		case 0x00A9: return char(0xBF);
		default: break;
		}
	}
	return ' ';
}
330 :
331 : }
|