1// tStandard.cpp 
2// 
3// Tacent functions and types that are standard across all platforms. Includes global functions like itoa which are not 
4// available on some platforms, but are common enough that they should be. 
5// 
6// Copyright (c) 2004-2006, 2015, 2023-2025 Tristan Grimmer. 
7// Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby 
8// granted, provided that the above copyright notice and this permission notice appear in all copies. 
9// 
10// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL 
11// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 
12// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN 
13// AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 
14// PERFORMANCE OF THIS SOFTWARE. 
15 
#include <stdlib.h>
#include <string.h>
#ifdef PLATFORM_WINDOWS
#include <Windows.h>
#endif
#include "Foundation/tStandard.h"
#include "Foundation/tString.h"
#include "Foundation/tFundamentals.h"
23#pragma warning (disable: 4146) 
24#pragma warning (disable: 4018) 
25 
26 
27const char* tStd::SeparatorSubStr = "\x1a"
28const char* tStd::SeparatorFileStr = "\x1c"
29const char* tStd::SeparatorGroupStr = "\x1d"
30const char* tStd::SeparatorRecordStr = "\x1e"
31const char* tStd::SeparatorUnitStr = "\x1f"
32const char* tStd::SeparatorAStr = tStd::SeparatorUnitStr
33const char* tStd::SeparatorBStr = tStd::SeparatorRecordStr
34const char* tStd::SeparatorCStr = tStd::SeparatorGroupStr
35const char* tStd::SeparatorDStr = tStd::SeparatorFileStr
36const char* tStd::SeparatorEStr = tStd::SeparatorSubStr
37const char8_t* tStd::u8SeparatorSubStr = (const char8_t*)tStd::SeparatorSubStr
38const char8_t* tStd::u8SeparatorFileStr = (const char8_t*)tStd::SeparatorFileStr
39const char8_t* tStd::u8SeparatorGroupStr = (const char8_t*)tStd::SeparatorGroupStr
40const char8_t* tStd::u8SeparatorRecordStr = (const char8_t*)tStd::SeparatorRecordStr
41const char8_t* tStd::u8SeparatorUnitStr = (const char8_t*)tStd::SeparatorUnitStr
42const char8_t* tStd::u8SeparatorAStr = (const char8_t*)tStd::SeparatorAStr
43const char8_t* tStd::u8SeparatorBStr = (const char8_t*)tStd::SeparatorBStr
44const char8_t* tStd::u8SeparatorCStr = (const char8_t*)tStd::SeparatorCStr
45const char8_t* tStd::u8SeparatorDStr = (const char8_t*)tStd::SeparatorDStr
46const char8_t* tStd::u8SeparatorEStr = (const char8_t*)tStd::SeparatorEStr
47 
48 
49void* tStd::tMemsrch(void* haystack, int haystackNumBytes, void* needle, int needleNumBytes
50
51 if ((haystackNumBytes <= 0) || (needleNumBytes <= 0) || (haystackNumBytes < needleNumBytes)) 
52 return nullptr
53 
54 // Serach for the pattern from the first haystack byte (0) to numNeedleBytes from the end. For example, if we are 
55 // seraching for 4 bytes in 8, there will be 5 mem compares of 4 bytes each. 
56 for (int i = 0; i <= haystackNumBytes-needleNumBytes; i++) 
57
58 if (tMemcmp(a: (uint8*)haystack + i, b: needle, numBytes: needleNumBytes) == 0
59 return (uint8*)haystack + i
60
61 
62 return nullptr
63
64 
65 
66int tStd::tNstrcmp(const char* a, const char* b
67
68 const char* origa = a
69 const char* origb = b
70 
71 bool aStartsDig = a && tIsdigit(c: *a);  
72 bool bStartsDig = b && tIsdigit(c: *b);  
73 
74 // This implementation of tNstrcmp is a modified version of the one written by GitHub user ClangPan. 
75 while (*a && *b
76
77 bool aDigit = tIsdigit(c: *a); 
78 bool bDigit = tIsdigit(c: *b); 
79 
80 if (!aDigit && (*a == '-')) 
81
82 ++a
83 continue
84
85 
86 if (!bDigit && (*b == '-')) 
87
88 ++b
89 continue
90
91 
92 // We're comparing (possibly multidigit) numbers. 
93 if (aDigit && bDigit
94
95 char* enda
96 char* endb
97 
98 // Get the left number. 
99 int aInt = strtoul(nptr: (char*)a, endptr: &enda, base: 10); 
100 
101 // Get the right number. 
102 int bInt = strtoul(nptr: (char*)b, endptr: &endb, base: 10); 
103 
104 // if the difference is not equal to zero, we have a comparison result 
105 int sign = tMath::tSign(val: aInt - bInt); 
106 if (sign) return sign
107 
108 a = enda
109 b = endb
110 continue
111 }  
112 
113 // If only the left char is a digit, we have a result. 
114 if (aDigit) return aStartsDig ? -1 : +1
115 
116 // If only the right char is a digit, we have a result. 
117 if (bDigit) return bStartsDig ? +1 : -1
118 
119 // compute the difference of both characters 
120 int sign = tMath::tSign(val: tToLower(c: *a) - tToLower(c: *b)); 
121 
122 // If they differ we have a result. 
123 if (sign) return sign
124 
125 // Otherwise process the next characters. 
126 ++a; ++b
127
128 
129 // If both a and b are at end, we consider letter-case and compare as if we had never done the tToLowers. 
130 if (!(*a) && !(*b)) 
131 return tStrcmp(a: origa, b: origb); 
132 
133 // Now only one of *a or *b are non-zero. 
134 if (*b) return -1
135 if (*a) return +1
136 
137 return 0
138
139 
140 
141int tStd::tNstrcmpEx(const char* a, const char* b
142
143 if (tStrcmp(a, b) == 0
144 return 0
145 
146 // Code modified from https://github.com/scopeInfinity/NaturalSort 
147 bool foundSpace1 = false
148 bool foundSpace2 = false
149 
150 // Loop on every character. 
151 while (*a && *b
152
153 // Ignore More than one continous space. 
154 while (foundSpace1 && *a && *a == ' '
155 a++; 
156 foundSpace1 = false
157 if (*a == ' '
158 foundSpace1 = true
159 
160 while (foundSpace2 && *b && *b == ' '
161 b++; 
162 foundSpace2 = false
163 if (*b == ' '
164 foundSpace2 = true
165 
166 // If one character is alphanumeric, compare as usual. Edge case when we encounter a zero first, to avoid 
167 // problematic situations like '01.png' & '001.png' that would otherwise be considered equal. 
168 if (!tIsdigit(c: *a) || !tIsdigit(c: *b) || (*a == '0') || (*b == '0')) 
169
170 // Normal comparision if any of character is non digit character. 
171 if (tToLower(c: *a) < tToLower(c: *b)) 
172 return -1
173 
174 if (tToLower(c: *b) < tToLower(c: *a)) 
175 return +1
176 
177 a++; b++; 
178
179 // If both characters are numbers do a numeral comparison. 
180 else 
181
182 // Get the full number with tAtoi() to account for when you're comparing e.g. '1.png' & '10.png'. 
183 int digit1 = tAtoi(s: (const char*)a); 
184 int digit2 = tAtoi(s: (const char*)b); 
185 
186 // Compare the numbers. If they are the same we just continue. 
187 if (digit1 < digit2
188 return -1
189 if (digit2 < digit1
190 return +1
191 
192 using namespace tMath
193 
194 // Advance the pointers by the length of the digits (math, yay). 
195 a += int(tFloor(v: tLog10(x: float(digit1)))) + 1
196 b += int(tFloor(v: tLog10(x: float(digit2)))) + 1
197
198
199 
200 return +1
201
202 
203 
204bool tStd::tStrtob(const char* str
205
206 tString lower(str); 
207 lower.ToLower(); 
208 
209 if 
210
211 (lower == "true") || (lower == "t") || 
212 (lower == "yes") || (lower == "y") || 
213 (lower == "on") || (lower == "1") || (lower == "+") || 
214 (lower == "enable") || (lower == "enabled") || (tStrtoi(s: str) != 0
215
216 return true
217 else 
218 return false
219
220 
221 
222float tStd::tStrtof(const char* s
223
224 // Both tStrchr and tStrlen assert on nullptrs so we check here. 
225 if (!s
226 return 0.0f
227 
228 char* hash = tStrchr(s, c: '#'); 
229 if (hash && (tStrlen(s: hash+1) == 8)) 
230
231 uint32 bin = tStd::tStrtoui32(s: hash+1, base: 16); 
232 return *((float*)(&bin)); 
233
234 
235 return float( tStrtod(s) ); 
236
237 
238 
239double tStd::tStrtod(const char* s
240
241 // tStrlen asserts on nullptrs so we check here. 
242 if (!s
243 return 0.0
244 
245 int l = tStrlen(s); 
246 if (!l
247 return 0.0
248 
249 char* hash = tStrchr(s, c: '#'); 
250 if (hash && (tStrlen(s: hash+1) == 16)) 
251
252 uint64 bin = tStrtoui64(s: hash+1, base: 16); 
253 return *((double*)&bin); 
254
255 
256 // This error checking is essential. Sometimes NANs are written in text format to a string. 
257 // Like "nan(snan)". We want these to evaluate to 0.0, not -1 or something else. We allow 
258 // 'e' and 'E' for numbers in exponential form like 3.09E08. 
259 for (int i = 0; i < l; i++) 
260
261 char ch = s[i]; 
262 if 
263
264 ((ch >= 'a') && (ch <= 'z') && (ch != 'e')) || 
265 ((ch >= 'A') && (ch <= 'Z') && (ch != 'E')) 
266
267 return 0.0
268
269 
270 // Will be 0.0 if there was a problem. 
271 return strtod(nptr: s, endptr: nullptr); 
272
273 
274 
275void tStd::tStrrev(char* begin, char* end
276{  
277 char aux
278 while (end > begin
279 aux = *end, *end-- = *begin, *begin++ = aux
280
281 
282 
283// The UTF 8 <-> 16 conversion code below was based on https://github.com/Davipb/utf8-utf16-converter 
284// under the MIT licence. See Docs/Licence_Utf8Utf16.txt 
285namespace tUTF 
286
287 // BMP = Basic Multilingual Plane. 
288 // CP = Unicode codepoint. 
289 const char32_t cCodepoint_LastValidBMP = 0x0000FFFD; // Last valid codepoint. Note that U+FFFF and U+FFFE are guaranteed 'non-characters'. They do not appear if codepoint is valid. 
290 const char32_t cCodepoint_UnicodeMax = 0x0010FFFF; // The highest valid Unicode codepoint. 
291 const char32_t cCodepoint_UTF8Max1 = 0x0000007F; // The highest codepoint that can be encoded with 1 byte in UTF-8. 
292 const char32_t cCodepoint_UTF8Max2 = 0x000007FF; // The highest codepoint that can be encoded with 2 bytes in UTF-8. 
293 const char32_t cCodepoint_UTF8Max3 = 0x0000FFFF; // The highest codepoint that can be encoded with 3 bytes in UTF-8. 
294 
295 const char16_t cSurrogate_GenericMask16 = 0xF800; // The mask to apply before testing it against cSurrogate_GenericVal16 
296 const char16_t cSurrogate_GenericVal16 = 0xD800; // If masked with cSurrogate_GenericMask16, matches this value, it is a surrogate. 
297 const char32_t cSurrogate_GenericMask32 = 0x0000F800; // The mask to apply before testing it against cSurrogate_GenericVal32 
298 const char32_t cSurrogate_GenericVal32 = 0x0000D800; // If masked with cSurrogate_GenericMask32, matches this value, it is a surrogate. 
299 
300 const char16_t cSurrogate_Mask16 = 0xFC00; // The mask to apply to a character before testing it against cSurrogate_HighVal16 or cSurrogate_LowVal16. 
301 const char16_t cSurrogate_HighVal16 = 0xD800; // If a character, masked with cSurrogate_Mask16, matches this value, it is a high surrogate. 
302 const char16_t cSurrogate_LowVal16 = 0xDC00; // If a character, masked with cSurrogate_Mask16, matches this value, it is a low surrogate. 
303 
304 const char16_t cSurrogate_CodepointMask16 = 0x03FF; // A mask that can be applied to a surrogate to extract the codepoint value contained in it. 
305 const char32_t cSurrogate_CodepointMask32 = 0x000003FF; // A mask that can be applied to a surrogate to extract the codepoint value contained in it. 
306 const int cSurrogate_CodepointBits = 10; // The number of LS bits of cSurrogate_CodepointMask that are set. 
307 const char32_t cSurrogate_CodepointOffset = 0x00010000; // The value that is subtracted from a codepoint before encoding it in a surrogate pair. 
308 
309 const char8_t cContinuation_UTF8Mask = 0xC0; // The mask to a apply to a character before testing it against cContinuation_UTF8Val 
310 const char8_t cContinuation_UTF8Val = 0x80; // If a character, masked with cContinuation_UTF8Mask, matches this value, it is a UTF-8 continuation byte. 
311 const int cContinuation_CodepointBits = 6; // The number of bits of a codepoint that are contained in a UTF-8 continuation byte. 
312 
313 // A UTF-8 bit-pattern that can be set or verified. 
314 struct UTF8Pattern 
315
316 char8_t Mask; // The mask that should be applied to the character before testing it. 
317 char8_t Value; // The value that the character should be tested against after applying the mask. 
318 }; 
319 
320 // Bit-patterns for leading bytes in a UTF-8 codepoint encoding. Each pattern represents the leading byte for a 
321 // character encoded with N UTF-8 bytes where N is the index + 1. 
322 static const UTF8Pattern UTF8LeadingBytes[] = 
323
324 { .Mask: 0x80, .Value: 0x00 }, // 0xxxxxxx 
325 { .Mask: 0xE0, .Value: 0xC0 }, // 110xxxxx 
326 { .Mask: 0xF0, .Value: 0xE0 }, // 1110xxxx 
327 { .Mask: 0xF8, .Value: 0xF0 } // 11110xxx 
328 }; 
329 const int UTF8LeadingBytes_NumElements = tNumElements(UTF8LeadingBytes); 
330 
331 // Calculates the number of UTF-16 16-bit characters it would take to encode a codepoint. The codepoint is not 
332 // checked for validity. That should be done beforehand. 
333 int CalculateUtf16Length(char32_t codepoint); 
334 
335 // Gets a single codepoint from a UTF-16 string (string does not need null-termination). Returns how many char16s 
336 // were read to generate the codepoint. If 2 char16s (surrogate pairs) were read, returns 2. Otherwise returns 1. 
337 // For invalid encodings, the codepoint is set to the special 'replacement' (from the BMP) and 1 is returned. 
338 int DecodeUtf16(char32_t& codepoint, const char16_t* src); 
339 
340 // Encodes a 32-bit codepoint into a UTF-16 string. The codepoint is not checked for validity by this function. You 
341 // must ensure the dst buffer is big enough -- 2 is always big enough, but you can call CalculateUtf16Length to get 
342 // an exact size. Returns the number of char16s written to dst [0,2]. Returns 0 is dst is nullptr. 
343 int EncodeUtf16(char16_t* dst, char32_t codepoint); 
344 
345 // Calculates the number of UTF-8 8-bit chars it would take to encode a codepoint. The codepoint is not checked 
346 // for validity. That should be done beforehand. 
347 int CalculateUtf8Length(char32_t codepoint); 
348 
349 // Gets a single codepoint from a UTF-8 string (string does not need null-termination). Returns how many char8s 
350 // were read to generate the codepoint. eg. If 3 char8s (surrogates) were read, returns 3. 
351 // For invalid encodings, the codepoint is set to the special 'replacement' num bytes read from src is returned. 
352 int DecodeUtf8(char32_t& codepoint, const char8_t* src); 
353 
354 // Encodes a 32-bit codepoint into a UTF-8 string. The codepoint is not checked for validity by this function. You 
355 // must ensure the dst buffer is big enough -- 4 is always big enough, but you can call CalculateUtf8Length to get 
356 // an exact size. Returns the number of char8s written to dst [0,4]. Returns 0 is dst is nullptr. 
357 int EncodeUtf8(char8_t* dst, char32_t codepoint); 
358}; 
359 
360 
361int tStd::tUTF8(char8_t* dst, const char16_t* src, int srcLen
362
363 // Compute fast worst-case size needed. 
364 // UTF-8 can use up to 3 bytes to encode some codepoints in the BMP (Basic Multilingual Plane). This has 
365 // implications for how much room UTF-8 encoded text could take up from src data that's UTF-16. Eg. 2 char16s could be 
366 // either 2 codepoints in the BMP (6 bytes in UTF-8) or a single codepoint if the second char16 is a surrogate (4 bytes 
367 // in UTF-8). Therefore worst case without inspecting data is 3*numChar16s. 
368 if (!src
369 return srcLen * 3
370 
371 int total = 0
372 while (srcLen > 0
373
374 char32_t codepoint
375 int read = tUTF::DecodeUtf16(codepoint, src); 
376 srcLen -= read
377 src += read
378 
379 int written = 0
380 if (dst
381
382 written = tUTF::EncodeUtf8(dst, codepoint); 
383 dst += written
384
385 else 
386
387 // No encoding. Just compute length. 
388 written = tUTF::CalculateUtf8Length(codepoint); 
389
390 total += written
391
392 
393 return total
394
395 
396 
397int tStd::tUTF8(char8_t* dst, const char32_t* src, int srcLen
398
399 // Compute fast worst-case size needed. 
400 // Worst case is every char32 needing 4 char8s. 
401 if (!src
402 return srcLen * 4
403 
404 int total = 0
405 for (int i = 0; i < srcLen; i++) 
406
407 char32_t codepoint = src[i]; 
408 
409 int written = 0
410 if (dst
411
412 written = tUTF::EncodeUtf8(dst, codepoint); 
413 dst += written
414
415 else 
416
417 // No encoding. Just compute length. 
418 written = tUTF::CalculateUtf8Length(codepoint); 
419
420 total += written
421
422 
423 return total
424
425 
426 
427int tStd::tUTF16(char16_t* dst, const char8_t* src, int srcLen
428
429 // Compute fast worst-case size needed. 
430 // 1 char8 -> 1 char16. 
431 // 2 char8s (surrogates) -> 1 char16. 
432 // 3 char8s (surrogates) -> also guaranteed 1 char16. 
433 // 4 char8s (surrogates) -> 2 char16s. 
434 // So worst-case is every byte needing 1 whole char16. 
435 if (!src
436 return srcLen
437 
438 int total = 0
439 while (srcLen > 0
440
441 char32_t codepoint
442 int read = tUTF::DecodeUtf8(codepoint, src); 
443 srcLen -= read
444 src += read
445 
446 int written = 0
447 if (dst
448
449 written = tUTF::EncodeUtf16(dst, codepoint); 
450 dst += written
451
452 else 
453
454 // No encoding. Just compute length. 
455 written = tUTF::CalculateUtf16Length(codepoint); 
456
457 total += written
458
459 
460 return total
461
462 
463 
464int tStd::tUTF16(char16_t* dst, const char32_t* src, int srcLen
465
466 // Compute fast worst-case size needed. 
467 // Worst case is every char32 needing 2 char16s. 
468 if (!src
469 return srcLen * 2
470 
471 int total = 0
472 for (int i = 0; i < srcLen; i++) 
473
474 char32_t codepoint = src[i]; 
475 
476 int written = 0
477 if (dst
478
479 written = tUTF::EncodeUtf16(dst, codepoint); 
480 dst += written
481
482 else 
483
484 // No encoding. Just compute length. 
485 written = tUTF::CalculateUtf16Length(codepoint); 
486
487 total += written
488
489 
490 return total
491
492 
493 
494int tStd::tUTF32(char32_t* dst, const char8_t* src, int srcLen
495
496 // Compute fast worst-case size needed. 
497 // Worst-case is every char8 needing 1 whole char32. 
498 if (!src
499 return srcLen
500 
501 int total = 0
502 while (srcLen > 0
503
504 char32_t codepoint
505 int read = tUTF::DecodeUtf8(codepoint, src); 
506 srcLen -= read
507 src += read
508 
509 if (dst
510
511 dst[0] = codepoint
512 dst++; 
513
514 total++; 
515
516 
517 return total
518
519 
520 
521int tStd::tUTF32(char32_t* dst, const char16_t* src, int srcLen
522
523 // Compute fast worst-case size needed. 
524 // Worst-case is every char16 needing 1 whole char32. 
525 if (!src
526 return srcLen
527 
528 int total = 0
529 while (srcLen > 0
530
531 char32_t codepoint
532 int read = tUTF::DecodeUtf16(codepoint, src); 
533 srcLen -= read
534 src += read
535 
536 if (dst
537
538 dst[0] = codepoint
539 dst++; 
540
541 total++; 
542
543 
544 return total
545
546 
547 
548int tStd::tUTF8s(char8_t* dst, const char16_t* src
549
550 if (!src
551 return 0
552 
553 int length = tUTF8(dst, src, srcLen: tStrlen(s: src)); 
554 if (dst
555 dst[length] = u8'\0'; 
556 
557 return length
558
559 
560 
561int tStd::tUTF8s(char8_t* dst, const char32_t* src
562
563 if (!src
564 return 0
565 
566 int length = tUTF8(dst, src, srcLen: tStrlen(s: src)); 
567 if (dst
568 dst[length] = u8'\0'; 
569 
570 return length
571
572 
573 
574int tStd::tUTF16s(char16_t* dst, const char8_t* src
575
576 if (!src
577 return 0
578 
579 int length = tUTF16(dst, src, srcLen: tStrlen(s: src)); 
580 if (dst
581 dst[length] = u'\0'
582 
583 return length
584
585 
586 
587int tStd::tUTF16s(char16_t* dst, const char32_t* src
588
589 if (!src
590 return 0
591 
592 int length = tUTF16(dst, src, srcLen: tStrlen(s: src)); 
593 if (dst
594 dst[length] = u'\0'
595 
596 return length
597
598 
599 
600int tStd::tUTF32s(char32_t* dst, const char8_t* src
601
602 if (!src
603 return 0
604 
605 int length = tUTF32(dst, src, srcLen: tStrlen(s: src)); 
606 if (dst
607 dst[length] = U'\0'
608 
609 return length
610
611 
612 
613int tStd::tUTF32s(char32_t* dst, const char16_t* src
614
615 if (!src
616 return 0
617 
618 int length = tUTF32(dst, src, srcLen: tStrlen(s: src)); 
619 if (dst
620 dst[length] = U'\0'
621 
622 return length
623
624 
625 
626char32_t tStd::tUTF32c(const char8_t* srcPoint
627
628 char32_t codepoint = cCodepoint_Replacement
629 if (!srcPoint
630 return codepoint
631 
632 tUTF::DecodeUtf8(codepoint, src: srcPoint); 
633 return codepoint
634
635 
636 
637char32_t tStd::tUTF32c(const char16_t* srcPoint
638
639 char32_t codepoint = cCodepoint_Replacement
640 if (!srcPoint
641 return codepoint
642 
643 tUTF::DecodeUtf16(codepoint, src: srcPoint); 
644 return codepoint
645
646 
647 
648char32_t tStd::tUTF32c(const char32_t* srcPoint
649
650 char32_t codepoint = cCodepoint_Replacement
651 if (!srcPoint
652 return codepoint
653 
654 if (*srcPoint > tUTF::cCodepoint_UnicodeMax
655 codepoint = cCodepoint_Replacement
656 else 
657 codepoint = *srcPoint
658 
659 return codepoint
660
661 
662 
663int tStd::tUTF32c(char32_t dst[1], const char8_t* srcPoint
664
665 char32_t codepoint = cCodepoint_Replacement
666 if (!srcPoint
667
668 if (dst) dst[0] = codepoint
669 return 0
670
671 
672 // Decode is a low-level function. It expects srcPoint to be valid. 
673 int unitCount = tUTF::DecodeUtf8(codepoint, src: srcPoint); 
674 if (dst) dst[0] = codepoint
675 return unitCount
676
677 
678 
679int tStd::tUTF32c(char32_t dst[1], const char16_t* srcPoint
680
681 char32_t codepoint = cCodepoint_Replacement
682 if (!srcPoint
683
684 if (dst) dst[0] = codepoint
685 return 0
686
687 
688 // Decode is a low-level function. It expects srcPoint to be valid. 
689 int unitCount = tUTF::DecodeUtf16(codepoint, src: srcPoint); 
690 if (dst) dst[0] = codepoint
691 return unitCount
692
693 
694 
695int tStd::tUTF32c(char32_t dst[1], const char32_t* srcPoint
696
697 if (!srcPoint
698
699 if (dst) dst[0] = cCodepoint_Replacement
700 return 0
701
702 
703 if (dst) dst[0] = *srcPoint
704 return 1
705
706 
707 
708int tStd::tUTF8c(char8_t dst[4], char32_t srcPoint
709
710 return tUTF::EncodeUtf8(dst, codepoint: srcPoint); 
711
712 
713 
714int tStd::tUTF16c(char16_t dst[2], char32_t srcPoint
715
716 return tUTF::EncodeUtf16(dst, codepoint: srcPoint); 
717
718 
719 
720int tStd::tUTF32c(char32_t dst[1], char32_t srcPoint
721
722 if (!dst
723 return 0
724 
725 if (srcPoint > tUTF::cCodepoint_UnicodeMax
726 dst[0] = cCodepoint_Replacement
727 else 
728 dst[0] = srcPoint
729 return 1
730
731 
732 
733int tUTF::CalculateUtf16Length(char32_t codepoint
734
735 if (codepoint <= cCodepoint_LastValidBMP
736 return 1
737 
738 return 2
739
740 
741 
742int tUTF::DecodeUtf16(char32_t& codepoint, const char16_t* src
743
744 tAssert(src); 
745 char16_t high = src[0]; 
746 
747 // If BMP character, we're done. 
748 if ((high & cSurrogate_GenericMask16) != cSurrogate_GenericVal16
749
750 codepoint = high
751 return 1
752
753 
754 // If unmatched low surrogate it's invalid. Return replacement. 
755 if ((high & cSurrogate_Mask16) != cSurrogate_HighVal16
756
757 codepoint = tStd::cCodepoint_Replacement
758 return 1
759
760  
761 char16_t low = src[1]; 
762 
763 // If unmatched high surrogate it's invalid. Return replacement. 
764 if ((low & cSurrogate_Mask16) != cSurrogate_LowVal16
765
766 codepoint = tStd::cCodepoint_Replacement
767 return 1
768
769 
770 // Two correctly matched surrogates if we ade it this far. 
771 // The high bits of the codepoint are the value bits of the high surrogate. 
772 // The low bits of the codepoint are the value bits of the low surrogate. 
773 codepoint = high & cSurrogate_CodepointMask16
774 codepoint <<= cSurrogate_CodepointBits
775 codepoint |= low & cSurrogate_CodepointMask16
776 codepoint += cSurrogate_CodepointOffset;  
777 return 2
778
779 
780 
781int tUTF::EncodeUtf16(char16_t* dst, char32_t codepoint
782
783 if (!dst
784 return 0
785 
786 // If codepoint in the BMP just write the single char16. 
787 if (codepoint <= cCodepoint_LastValidBMP
788
789 dst[0] = codepoint
790 return 1
791
792 
793 codepoint -= cSurrogate_CodepointOffset
794 char16_t low = cSurrogate_LowVal16
795 low |= codepoint & cSurrogate_CodepointMask32
796 
797 codepoint >>= cSurrogate_CodepointBits
798 char16_t high = cSurrogate_HighVal16
799 high |= codepoint & cSurrogate_CodepointMask32
800 
801 dst[0] = high
802 dst[1] = low
803 return 2
804
805 
806 
807int tUTF::CalculateUtf8Length(char32_t codepoint
808
809 if (codepoint <= cCodepoint_UTF8Max1
810 return 1
811 
812 if (codepoint <= cCodepoint_UTF8Max2
813 return 2
814 
815 if (codepoint <= cCodepoint_UTF8Max3
816 return 3
817 
818 if (codepoint <= cCodepoint_UnicodeMax
819 return 4
820 
821 // Return max 4 in case the UTF-8 standard ever increases cCodepoint_UnicodeMax. What they won't 
822 // break is that UTF-8 can encode all codepoints, so checking UnicodeMax is still valid. 
823 return 4
824
825 
826 
827int tUTF::DecodeUtf8(char32_t& codepoint, const char8_t* src
828
829 tAssert(src); 
830 char8_t leading = src[0]; 
831 int encodingLength = 0
832 UTF8Pattern leadingPattern
833 
834 bool matches = false; // True if the leading byte matches the current leading pattern. 
835 do 
836
837 encodingLength++; 
838 leadingPattern = UTF8LeadingBytes[encodingLength - 1]; 
839 matches = (leading & leadingPattern.Mask) == leadingPattern.Value
840 
841 } while (!matches && (encodingLength < UTF8LeadingBytes_NumElements)); 
842 
843 // If leading byte doesn't match any known pattern it is invalid and we return replacement. 
844 if (!matches
845
846 codepoint = tStd::cCodepoint_Replacement
847 return encodingLength
848
849 
850 codepoint = leading & ~leadingPattern.Mask
851 
852 // This loop only ends up running if continuation codeunits found (not ASCII). 
853 for (int i = 1; i < encodingLength; i++) 
854
855 char8_t continuation = src[i]; 
856 
857 // If number of continuation bytes is not the same as advertised on the leading byte it's an invalid encoding 
858 // so we return the replacement. 
859 if ((continuation & cContinuation_UTF8Mask) != cContinuation_UTF8Val
860
861 codepoint = tStd::cCodepoint_Replacement
862 
863 // I think the best behaviour here is to return how much we processed b4 running into a problem. 
864 // If we returned encodingLength we might skip some input when an invalid is encountered. Hard to say. 
865 return 1+i
866
867 
868 codepoint <<= cContinuation_CodepointBits
869 codepoint |= continuation & ~cContinuation_UTF8Mask
870
871 
872 if 
873
874 // These are guaranteed to be non-characters by the standard and reuire the replacement. 
875 ((codepoint == tStd::cCodepoint_SpecialNonCharA) || (codepoint == tStd::cCodepoint_SpecialNonCharB)) || 
876 
877 // Surrogates are invalid Unicode codepoints and should only be used in UTF-16. Invalid encoding so return replacement. 
878 ((codepoint <= cCodepoint_LastValidBMP) && ((codepoint & cSurrogate_GenericMask32) == cSurrogate_GenericVal32)) || 
879 
880 // UTF-8 can encode codepoints larger than the Unicode standard allows. If it does it's an invalid encoding and we return the replacement codepoint. 
881 (codepoint > cCodepoint_UnicodeMax) || 
882 
883 // Overlong encodings are considered invalid so we return the replacement codepoint and return the actual number read so we skip the overlong completely. 
884 // We do this last cuz of short-circuit expression evaluation in C++ (calc only called if necessary). 
885 (CalculateUtf8Length(codepoint) != encodingLength
886
887
888 codepoint = tStd::cCodepoint_Replacement
889
890 
891 return encodingLength
892
893 
894 
895int tUTF::EncodeUtf8(char8_t* dst, char32_t codepoint
896
897 if (!dst
898 return 0
899 
900 // Write the continuation bytes in reverse order. 
901 int encodeLength = CalculateUtf8Length(codepoint); 
902 for (int contIndex = encodeLength - 1; contIndex > 0; contIndex--) 
903
904 char8_t cont = codepoint & ~cContinuation_UTF8Mask
905 cont |= cContinuation_UTF8Val
906 dst[contIndex] = cont
907 codepoint >>= cContinuation_CodepointBits
908
909 
910 // Write the leading byte. 
911 UTF8Pattern pattern = UTF8LeadingBytes[encodeLength - 1]; 
912 char8_t lead = codepoint & ~(pattern.Mask); 
913 lead |= pattern.Value
914 dst[0] = lead
915 
916 return encodeLength
917
918