tStandard.cpp source code [Modules/Foundation/Src/tStandard.cpp]

1	// tStandard.cpp
2	//
3	// Tacent functions and types that are standard across all platforms. Includes global functions like itoa which are not
4	// available on some platforms, but are common enough that they should be.
5	//
6	// Copyright (c) 2004-2006, 2015, 2023-2025 Tristan Grimmer.
7	// Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby
8	// granted, provided that the above copyright notice and this permission notice appear in all copies.
9	//
10	// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
11	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
12	// INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
13	// AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14	// PERFORMANCE OF THIS SOFTWARE.
15
#include <stdlib.h>
#include <string.h>
#ifdef PLATFORM_WINDOWS
#include <Windows.h>
#endif
#include "Foundation/tStandard.h"
#include "Foundation/tString.h"
#include "Foundation/tFundamentals.h"
23	#pragma warning (disable: 4146)
24	#pragma warning (disable: 4018)
25
26
27	const char* tStd::SeparatorSubStr = "\x1a";
28	const char* tStd::SeparatorFileStr = "\x1c";
29	const char* tStd::SeparatorGroupStr = "\x1d";
30	const char* tStd::SeparatorRecordStr = "\x1e";
31	const char* tStd::SeparatorUnitStr = "\x1f";
32	const char* tStd::SeparatorAStr = tStd::SeparatorUnitStr;
33	const char* tStd::SeparatorBStr = tStd::SeparatorRecordStr;
34	const char* tStd::SeparatorCStr = tStd::SeparatorGroupStr;
35	const char* tStd::SeparatorDStr = tStd::SeparatorFileStr;
36	const char* tStd::SeparatorEStr = tStd::SeparatorSubStr;
37	const char8_t* tStd::u8SeparatorSubStr = (const char8_t*)tStd::SeparatorSubStr;
38	const char8_t* tStd::u8SeparatorFileStr = (const char8_t*)tStd::SeparatorFileStr;
39	const char8_t* tStd::u8SeparatorGroupStr = (const char8_t*)tStd::SeparatorGroupStr;
40	const char8_t* tStd::u8SeparatorRecordStr = (const char8_t*)tStd::SeparatorRecordStr;
41	const char8_t* tStd::u8SeparatorUnitStr = (const char8_t*)tStd::SeparatorUnitStr;
42	const char8_t* tStd::u8SeparatorAStr = (const char8_t*)tStd::SeparatorAStr;
43	const char8_t* tStd::u8SeparatorBStr = (const char8_t*)tStd::SeparatorBStr;
44	const char8_t* tStd::u8SeparatorCStr = (const char8_t*)tStd::SeparatorCStr;
45	const char8_t* tStd::u8SeparatorDStr = (const char8_t*)tStd::SeparatorDStr;
46	const char8_t* tStd::u8SeparatorEStr = (const char8_t*)tStd::SeparatorEStr;
47
48
49	void* tStd::tMemsrch(void* haystack, int haystackNumBytes, void* needle, int needleNumBytes)
50	{
51	if ((haystackNumBytes <= `0`) \|\| (needleNumBytes <= `0`) \|\| (haystackNumBytes < needleNumBytes))
52	return nullptr;
53
54	// Serach for the pattern from the first haystack byte (0) to numNeedleBytes from the end. For example, if we are
55	// seraching for 4 bytes in 8, there will be 5 mem compares of 4 bytes each.
56	for (int i = `0`; i <= haystackNumBytes-needleNumBytes; i++)
57	{
58	if (tMemcmp(a: (uint8*)haystack + i, b: needle, numBytes: needleNumBytes) == `0`)
59	return (uint8*)haystack + i;
60	}
61
62	return nullptr;
63	}
64
65
66	int tStd::tNstrcmp(const char* a, const char* b)
67	{
68	const char* origa = a;
69	const char* origb = b;
70
71	bool aStartsDig = a && tIsdigit(c: *a);
72	bool bStartsDig = b && tIsdigit(c: *b);
73
74	// This implementation of tNstrcmp is a modified version of the one written by GitHub user ClangPan.
75	while (a && b)
76	{
77	bool aDigit = tIsdigit(c: *a);
78	bool bDigit = tIsdigit(c: *b);
79
80	if (!aDigit && (*a == `'-'`))
81	{
82	++a;
83	continue;
84	}
85
86	if (!bDigit && (*b == `'-'`))
87	{
88	++b;
89	continue;
90	}
91
92	// We're comparing (possibly multidigit) numbers.
93	if (aDigit && bDigit)
94	{
95	char* enda;
96	char* endb;
97
98	// Get the left number.
99	int aInt = strtoul(nptr: (char*)a, endptr: &enda, base: `10`);
100
101	// Get the right number.
102	int bInt = strtoul(nptr: (char*)b, endptr: &endb, base: `10`);
103
104	// if the difference is not equal to zero, we have a comparison result
105	int sign = tMath::tSign(val: aInt - bInt);
106	if (sign) return sign;
107
108	a = enda;
109	b = endb;
110	continue;
111	}
112
113	// If only the left char is a digit, we have a result.
114	if (aDigit) return aStartsDig ? -`1` : +`1`;
115
116	// If only the right char is a digit, we have a result.
117	if (bDigit) return bStartsDig ? +`1` : -`1`;
118
119	// compute the difference of both characters
120	int sign = tMath::tSign(val: tToLower(c: a) - tToLower(c: b));
121
122	// If they differ we have a result.
123	if (sign) return sign;
124
125	// Otherwise process the next characters.
126	++a; ++b;
127	}
128
129	// If both a and b are at end, we consider letter-case and compare as if we had never done the tToLowers.
130	if (!(a) && !(b))
131	return tStrcmp(a: origa, b: origb);
132
133	// Now only one of a or b are non-zero.
134	if (b) return* -`1`;
135	if (a) return* +`1`;
136
137	return `0`;
138	}
139
140
141	int tStd::tNstrcmpEx(const char* a, const char* b)
142	{
143	if (tStrcmp(a, b) == `0`)
144	return `0`;
145
146	// Code modified from https://github.com/scopeInfinity/NaturalSort
147	bool foundSpace1 = false;
148	bool foundSpace2 = false;
149
150	// Loop on every character.
151	while (a && b)
152	{
153	// Ignore More than one continous space.
154	while (foundSpace1 && a && a == `' '`)
155	a++;
156	foundSpace1 = false;
157	if (*a == `' '`)
158	foundSpace1 = true;
159
160	while (foundSpace2 && b && b == `' '`)
161	b++;
162	foundSpace2 = false;
163	if (*b == `' '`)
164	foundSpace2 = true;
165
166	// If one character is alphanumeric, compare as usual. Edge case when we encounter a zero first, to avoid
167	// problematic situations like '01.png' & '001.png' that would otherwise be considered equal.
168	if (!tIsdigit(c: a) \|\| !tIsdigit(c: b) \|\| (a == `'0'`) \|\| (b == `'0'`))
169	{
170	// Normal comparision if any of character is non digit character.
171	if (tToLower(c: a) < tToLower(c: b))
172	return -`1`;
173
174	if (tToLower(c: b) < tToLower(c: a))
175	return +`1`;
176
177	a++; b++;
178	}
179	// If both characters are numbers do a numeral comparison.
180	else
181	{
182	// Get the full number with tAtoi() to account for when you're comparing e.g. '1.png' & '10.png'.
183	int digit1 = tAtoi(s: (const char*)a);
184	int digit2 = tAtoi(s: (const char*)b);
185
186	// Compare the numbers. If they are the same we just continue.
187	if (digit1 < digit2)
188	return -`1`;
189	if (digit2 < digit1)
190	return +`1`;
191
192	using namespace tMath;
193
194	// Advance the pointers by the length of the digits (math, yay).
195	a += int(tFloor(v: tLog10(x: float(digit1)))) + `1`;
196	b += int(tFloor(v: tLog10(x: float(digit2)))) + `1`;
197	}
198	}
199
200	return +`1`;
201	}
202
203
204	bool tStd::tStrtob(const char* str)
205	{
206	tString lower(str);
207	lower.ToLower();
208
209	if
210	(
211	(lower == "true") \|\| (lower == "t") \|\|
212	(lower == "yes") \|\| (lower == "y") \|\|
213	(lower == "on") \|\| (lower == "1") \|\| (lower == "+") \|\|
214	(lower == "enable") \|\| (lower == "enabled") \|\| (tStrtoi(s: str) != `0`)
215	)
216	return true;
217	else
218	return false;
219	}
220
221
222	float tStd::tStrtof(const char* s)
223	{
224	// Both tStrchr and tStrlen assert on nullptrs so we check here.
225	if (!s)
226	return `0.0f`;
227
228	char* hash = tStrchr(s, c: `'#'`);
229	if (hash && (tStrlen(s: hash+`1`) == `8`))
230	{
231	uint32 bin = tStd::tStrtoui32(s: hash+`1`, base: `16`);
232	return ((float**)(&bin));
233	}
234
235	return float( tStrtod(s) );
236	}
237
238
239	double tStd::tStrtod(const char* s)
240	{
241	// tStrlen asserts on nullptrs so we check here.
242	if (!s)
243	return `0.0`;
244
245	int l = tStrlen(s);
246	if (!l)
247	return `0.0`;
248
249	char* hash = tStrchr(s, c: `'#'`);
250	if (hash && (tStrlen(s: hash+`1`) == `16`))
251	{
252	uint64 bin = tStrtoui64(s: hash+`1`, base: `16`);
253	return ((double**)&bin);
254	}
255
256	// This error checking is essential. Sometimes NANs are written in text format to a string.
257	// Like "nan(snan)". We want these to evaluate to 0.0, not -1 or something else. We allow
258	// 'e' and 'E' for numbers in exponential form like 3.09E08.
259	for (int i = `0`; i < l; i++)
260	{
261	char ch = s[i];
262	if
263	(
264	((ch >= `'a'`) && (ch <= `'z'`) && (ch != `'e'`)) \|\|
265	((ch >= `'A'`) && (ch <= `'Z'`) && (ch != `'E'`))
266	)
267	return `0.0`;
268	}
269
270	// Will be 0.0 if there was a problem.
271	return strtod(nptr: s, endptr: nullptr);
272	}
273
274
275	void tStd::tStrrev(char* begin, char* end)
276	{
277	char aux;
278	while (end > begin)
279	aux = end, end-- = begin, begin++ = aux;
280	}
281
282
283	// The UTF 8 <-> 16 conversion code below was based on https://github.com/Davipb/utf8-utf16-converter
284	// under the MIT licence. See Docs/Licence_Utf8Utf16.txt
285	namespace tUTF
286	{
287	// BMP = Basic Multilingual Plane.
288	// CP = Unicode codepoint.
289	const char32_t cCodepoint_LastValidBMP = `0x0000FFFD`; // Last valid codepoint. Note that U+FFFF and U+FFFE are guaranteed 'non-characters'. They do not appear if codepoint is valid.
290	const char32_t cCodepoint_UnicodeMax = `0x0010FFFF`; // The highest valid Unicode codepoint.
291	const char32_t cCodepoint_UTF8Max1 = `0x0000007F`; // The highest codepoint that can be encoded with 1 byte in UTF-8.
292	const char32_t cCodepoint_UTF8Max2 = `0x000007FF`; // The highest codepoint that can be encoded with 2 bytes in UTF-8.
293	const char32_t cCodepoint_UTF8Max3 = `0x0000FFFF`; // The highest codepoint that can be encoded with 3 bytes in UTF-8.
294
295	const char16_t cSurrogate_GenericMask16 = `0xF800`; // The mask to apply before testing it against cSurrogate_GenericVal16
296	const char16_t cSurrogate_GenericVal16 = `0xD800`; // If masked with cSurrogate_GenericMask16, matches this value, it is a surrogate.
297	const char32_t cSurrogate_GenericMask32 = `0x0000F800`; // The mask to apply before testing it against cSurrogate_GenericVal32
298	const char32_t cSurrogate_GenericVal32 = `0x0000D800`; // If masked with cSurrogate_GenericMask32, matches this value, it is a surrogate.
299
300	const char16_t cSurrogate_Mask16 = `0xFC00`; // The mask to apply to a character before testing it against cSurrogate_HighVal16 or cSurrogate_LowVal16.
301	const char16_t cSurrogate_HighVal16 = `0xD800`; // If a character, masked with cSurrogate_Mask16, matches this value, it is a high surrogate.
302	const char16_t cSurrogate_LowVal16 = `0xDC00`; // If a character, masked with cSurrogate_Mask16, matches this value, it is a low surrogate.
303
304	const char16_t cSurrogate_CodepointMask16 = `0x03FF`; // A mask that can be applied to a surrogate to extract the codepoint value contained in it.
305	const char32_t cSurrogate_CodepointMask32 = `0x000003FF`; // A mask that can be applied to a surrogate to extract the codepoint value contained in it.
306	const int cSurrogate_CodepointBits = `10`; // The number of LS bits of cSurrogate_CodepointMask that are set.
307	const char32_t cSurrogate_CodepointOffset = `0x00010000`; // The value that is subtracted from a codepoint before encoding it in a surrogate pair.
308
309	const char8_t cContinuation_UTF8Mask = `0xC0`; // The mask to a apply to a character before testing it against cContinuation_UTF8Val
310	const char8_t cContinuation_UTF8Val = `0x80`; // If a character, masked with cContinuation_UTF8Mask, matches this value, it is a UTF-8 continuation byte.
311	const int cContinuation_CodepointBits = `6`; // The number of bits of a codepoint that are contained in a UTF-8 continuation byte.
312
313	// A UTF-8 bit-pattern that can be set or verified.
314	struct UTF8Pattern
315	{
316	char8_t Mask; // The mask that should be applied to the character before testing it.
317	char8_t Value; // The value that the character should be tested against after applying the mask.
318	};
319
320	// Bit-patterns for leading bytes in a UTF-8 codepoint encoding. Each pattern represents the leading byte for a
321	// character encoded with N UTF-8 bytes where N is the index + 1.
322	static const UTF8Pattern UTF8LeadingBytes[] =
323	{
324	{ .Mask: `0x80`, .Value: `0x00` }, // 0xxxxxxx
325	{ .Mask: `0xE0`, .Value: `0xC0` }, // 110xxxxx
326	{ .Mask: `0xF0`, .Value: `0xE0` }, // 1110xxxx
327	{ .Mask: `0xF8`, .Value: `0xF0` } // 11110xxx
328	};
329	const int UTF8LeadingBytes_NumElements = tNumElements(UTF8LeadingBytes);
330
331	// Calculates the number of UTF-16 16-bit characters it would take to encode a codepoint. The codepoint is not
332	// checked for validity. That should be done beforehand.
333	int CalculateUtf16Length(char32_t codepoint);
334
335	// Gets a single codepoint from a UTF-16 string (string does not need null-termination). Returns how many char16s
336	// were read to generate the codepoint. If 2 char16s (surrogate pairs) were read, returns 2. Otherwise returns 1.
337	// For invalid encodings, the codepoint is set to the special 'replacement' (from the BMP) and 1 is returned.
338	int DecodeUtf16(char32_t& codepoint, const char16_t* src);
339
340	// Encodes a 32-bit codepoint into a UTF-16 string. The codepoint is not checked for validity by this function. You
341	// must ensure the dst buffer is big enough -- 2 is always big enough, but you can call CalculateUtf16Length to get
342	// an exact size. Returns the number of char16s written to dst [0,2]. Returns 0 is dst is nullptr.
343	int EncodeUtf16(char16_t* dst, char32_t codepoint);
344
345	// Calculates the number of UTF-8 8-bit chars it would take to encode a codepoint. The codepoint is not checked
346	// for validity. That should be done beforehand.
347	int CalculateUtf8Length(char32_t codepoint);
348
349	// Gets a single codepoint from a UTF-8 string (string does not need null-termination). Returns how many char8s
350	// were read to generate the codepoint. eg. If 3 char8s (surrogates) were read, returns 3.
351	// For invalid encodings, the codepoint is set to the special 'replacement' num bytes read from src is returned.
352	int DecodeUtf8(char32_t& codepoint, const char8_t* src);
353
354	// Encodes a 32-bit codepoint into a UTF-8 string. The codepoint is not checked for validity by this function. You
355	// must ensure the dst buffer is big enough -- 4 is always big enough, but you can call CalculateUtf8Length to get
356	// an exact size. Returns the number of char8s written to dst [0,4]. Returns 0 is dst is nullptr.
357	int EncodeUtf8(char8_t* dst, char32_t codepoint);
358	};
359
360
361	int tStd::tUTF8(char8_t* dst, const char16_t* src, int srcLen)
362	{
363	// Compute fast worst-case size needed.
364	// UTF-8 can use up to 3 bytes to encode some codepoints in the BMP (Basic Multilingual Plane). This has
365	// implications for how much room UTF-8 encoded text could take up from src data that's UTF-16. Eg. 2 char16s could be
366	// either 2 codepoints in the BMP (6 bytes in UTF-8) or a single codepoint if the second char16 is a surrogate (4 bytes
367	// in UTF-8). Therefore worst case without inspecting data is 3numChar16s.*
368	if (!src)
369	return srcLen * `3`;
370
371	int total = `0`;
372	while (srcLen > `0`)
373	{
374	char32_t codepoint;
375	int read = tUTF::DecodeUtf16(codepoint, src);
376	srcLen -= read;
377	src += read;
378
379	int written = `0`;
380	if (dst)
381	{
382	written = tUTF::EncodeUtf8(dst, codepoint);
383	dst += written;
384	}
385	else
386	{
387	// No encoding. Just compute length.
388	written = tUTF::CalculateUtf8Length(codepoint);
389	}
390	total += written;
391	}
392
393	return total;
394	}
395
396
397	int tStd::tUTF8(char8_t* dst, const char32_t* src, int srcLen)
398	{
399	// Compute fast worst-case size needed.
400	// Worst case is every char32 needing 4 char8s.
401	if (!src)
402	return srcLen * `4`;
403
404	int total = `0`;
405	for (int i = `0`; i < srcLen; i++)
406	{
407	char32_t codepoint = src[i];
408
409	int written = `0`;
410	if (dst)
411	{
412	written = tUTF::EncodeUtf8(dst, codepoint);
413	dst += written;
414	}
415	else
416	{
417	// No encoding. Just compute length.
418	written = tUTF::CalculateUtf8Length(codepoint);
419	}
420	total += written;
421	}
422
423	return total;
424	}
425
426
427	int tStd::tUTF16(char16_t* dst, const char8_t* src, int srcLen)
428	{
429	// Compute fast worst-case size needed.
430	// 1 char8 -> 1 char16.
431	// 2 char8s (surrogates) -> 1 char16.
432	// 3 char8s (surrogates) -> also guaranteed 1 char16.
433	// 4 char8s (surrogates) -> 2 char16s.
434	// So worst-case is every byte needing 1 whole char16.
435	if (!src)
436	return srcLen;
437
438	int total = `0`;
439	while (srcLen > `0`)
440	{
441	char32_t codepoint;
442	int read = tUTF::DecodeUtf8(codepoint, src);
443	srcLen -= read;
444	src += read;
445
446	int written = `0`;
447	if (dst)
448	{
449	written = tUTF::EncodeUtf16(dst, codepoint);
450	dst += written;
451	}
452	else
453	{
454	// No encoding. Just compute length.
455	written = tUTF::CalculateUtf16Length(codepoint);
456	}
457	total += written;
458	}
459
460	return total;
461	}
462
463
464	int tStd::tUTF16(char16_t* dst, const char32_t* src, int srcLen)
465	{
466	// Compute fast worst-case size needed.
467	// Worst case is every char32 needing 2 char16s.
468	if (!src)
469	return srcLen * `2`;
470
471	int total = `0`;
472	for (int i = `0`; i < srcLen; i++)
473	{
474	char32_t codepoint = src[i];
475
476	int written = `0`;
477	if (dst)
478	{
479	written = tUTF::EncodeUtf16(dst, codepoint);
480	dst += written;
481	}
482	else
483	{
484	// No encoding. Just compute length.
485	written = tUTF::CalculateUtf16Length(codepoint);
486	}
487	total += written;
488	}
489
490	return total;
491	}
492
493
494	int tStd::tUTF32(char32_t* dst, const char8_t* src, int srcLen)
495	{
496	// Compute fast worst-case size needed.
497	// Worst-case is every char8 needing 1 whole char32.
498	if (!src)
499	return srcLen;
500
501	int total = `0`;
502	while (srcLen > `0`)
503	{
504	char32_t codepoint;
505	int read = tUTF::DecodeUtf8(codepoint, src);
506	srcLen -= read;
507	src += read;
508
509	if (dst)
510	{
511	dst[`0`] = codepoint;
512	dst++;
513	}
514	total++;
515	}
516
517	return total;
518	}
519
520
521	int tStd::tUTF32(char32_t* dst, const char16_t* src, int srcLen)
522	{
523	// Compute fast worst-case size needed.
524	// Worst-case is every char16 needing 1 whole char32.
525	if (!src)
526	return srcLen;
527
528	int total = `0`;
529	while (srcLen > `0`)
530	{
531	char32_t codepoint;
532	int read = tUTF::DecodeUtf16(codepoint, src);
533	srcLen -= read;
534	src += read;
535
536	if (dst)
537	{
538	dst[`0`] = codepoint;
539	dst++;
540	}
541	total++;
542	}
543
544	return total;
545	}
546
547
548	int tStd::tUTF8s(char8_t* dst, const char16_t* src)
549	{
550	if (!src)
551	return `0`;
552
553	int length = tUTF8(dst, src, srcLen: tStrlen(s: src));
554	if (dst)
555	dst[length] = u8'\0';
556
557	return length;
558	}
559
560
561	int tStd::tUTF8s(char8_t* dst, const char32_t* src)
562	{
563	if (!src)
564	return `0`;
565
566	int length = tUTF8(dst, src, srcLen: tStrlen(s: src));
567	if (dst)
568	dst[length] = u8'\0';
569
570	return length;
571	}
572
573
574	int tStd::tUTF16s(char16_t* dst, const char8_t* src)
575	{
576	if (!src)
577	return `0`;
578
579	int length = tUTF16(dst, src, srcLen: tStrlen(s: src));
580	if (dst)
581	dst[length] = u`'\0'`;
582
583	return length;
584	}
585
586
587	int tStd::tUTF16s(char16_t* dst, const char32_t* src)
588	{
589	if (!src)
590	return `0`;
591
592	int length = tUTF16(dst, src, srcLen: tStrlen(s: src));
593	if (dst)
594	dst[length] = u`'\0'`;
595
596	return length;
597	}
598
599
600	int tStd::tUTF32s(char32_t* dst, const char8_t* src)
601	{
602	if (!src)
603	return `0`;
604
605	int length = tUTF32(dst, src, srcLen: tStrlen(s: src));
606	if (dst)
607	dst[length] = U`'\0'`;
608
609	return length;
610	}
611
612
613	int tStd::tUTF32s(char32_t* dst, const char16_t* src)
614	{
615	if (!src)
616	return `0`;
617
618	int length = tUTF32(dst, src, srcLen: tStrlen(s: src));
619	if (dst)
620	dst[length] = U`'\0'`;
621
622	return length;
623	}
624
625
626	char32_t tStd::tUTF32c(const char8_t* srcPoint)
627	{
628	char32_t codepoint = cCodepoint_Replacement;
629	if (!srcPoint)
630	return codepoint;
631
632	tUTF::DecodeUtf8(codepoint, src: srcPoint);
633	return codepoint;
634	}
635
636
637	char32_t tStd::tUTF32c(const char16_t* srcPoint)
638	{
639	char32_t codepoint = cCodepoint_Replacement;
640	if (!srcPoint)
641	return codepoint;
642
643	tUTF::DecodeUtf16(codepoint, src: srcPoint);
644	return codepoint;
645	}
646
647
648	char32_t tStd::tUTF32c(const char32_t* srcPoint)
649	{
650	char32_t codepoint = cCodepoint_Replacement;
651	if (!srcPoint)
652	return codepoint;
653
654	if (*srcPoint > tUTF::cCodepoint_UnicodeMax)
655	codepoint = cCodepoint_Replacement;
656	else
657	codepoint = *srcPoint;
658
659	return codepoint;
660	}
661
662
663	int tStd::tUTF32c(char32_t dst[`1`], const char8_t* srcPoint)
664	{
665	char32_t codepoint = cCodepoint_Replacement;
666	if (!srcPoint)
667	{
668	if (dst) dst[`0`] = codepoint;
669	return `0`;
670	}
671
672	// Decode is a low-level function. It expects srcPoint to be valid.
673	int unitCount = tUTF::DecodeUtf8(codepoint, src: srcPoint);
674	if (dst) dst[`0`] = codepoint;
675	return unitCount;
676	}
677
678
679	int tStd::tUTF32c(char32_t dst[`1`], const char16_t* srcPoint)
680	{
681	char32_t codepoint = cCodepoint_Replacement;
682	if (!srcPoint)
683	{
684	if (dst) dst[`0`] = codepoint;
685	return `0`;
686	}
687
688	// Decode is a low-level function. It expects srcPoint to be valid.
689	int unitCount = tUTF::DecodeUtf16(codepoint, src: srcPoint);
690	if (dst) dst[`0`] = codepoint;
691	return unitCount;
692	}
693
694
695	int tStd::tUTF32c(char32_t dst[`1`], const char32_t* srcPoint)
696	{
697	if (!srcPoint)
698	{
699	if (dst) dst[`0`] = cCodepoint_Replacement;
700	return `0`;
701	}
702
703	if (dst) dst[`0`] = *srcPoint;
704	return `1`;
705	}
706
707
708	int tStd::tUTF8c(char8_t dst[`4`], char32_t srcPoint)
709	{
710	return tUTF::EncodeUtf8(dst, codepoint: srcPoint);
711	}
712
713
714	int tStd::tUTF16c(char16_t dst[`2`], char32_t srcPoint)
715	{
716	return tUTF::EncodeUtf16(dst, codepoint: srcPoint);
717	}
718
719
720	int tStd::tUTF32c(char32_t dst[`1`], char32_t srcPoint)
721	{
722	if (!dst)
723	return `0`;
724
725	if (srcPoint > tUTF::cCodepoint_UnicodeMax)
726	dst[`0`] = cCodepoint_Replacement;
727	else
728	dst[`0`] = srcPoint;
729	return `1`;
730	}
731
732
733	int tUTF::CalculateUtf16Length(char32_t codepoint)
734	{
735	if (codepoint <= cCodepoint_LastValidBMP)
736	return `1`;
737
738	return `2`;
739	}
740
741
742	int tUTF::DecodeUtf16(char32_t& codepoint, const char16_t* src)
743	{
744	tAssert(src);
745	char16_t high = src[`0`];
746
747	// If BMP character, we're done.
748	if ((high & cSurrogate_GenericMask16) != cSurrogate_GenericVal16)
749	{
750	codepoint = high;
751	return `1`;
752	}
753
754	// If unmatched low surrogate it's invalid. Return replacement.
755	if ((high & cSurrogate_Mask16) != cSurrogate_HighVal16)
756	{
757	codepoint = tStd::cCodepoint_Replacement;
758	return `1`;
759	}
760
761	char16_t low = src[`1`];
762
763	// If unmatched high surrogate it's invalid. Return replacement.
764	if ((low & cSurrogate_Mask16) != cSurrogate_LowVal16)
765	{
766	codepoint = tStd::cCodepoint_Replacement;
767	return `1`;
768	}
769
770	// Two correctly matched surrogates if we ade it this far.
771	// The high bits of the codepoint are the value bits of the high surrogate.
772	// The low bits of the codepoint are the value bits of the low surrogate.
773	codepoint = high & cSurrogate_CodepointMask16;
774	codepoint <<= cSurrogate_CodepointBits;
775	codepoint \|= low & cSurrogate_CodepointMask16;
776	codepoint += cSurrogate_CodepointOffset;
777	return `2`;
778	}
779
780
781	int tUTF::EncodeUtf16(char16_t* dst, char32_t codepoint)
782	{
783	if (!dst)
784	return `0`;
785
786	// If codepoint in the BMP just write the single char16.
787	if (codepoint <= cCodepoint_LastValidBMP)
788	{
789	dst[`0`] = codepoint;
790	return `1`;
791	}
792
793	codepoint -= cSurrogate_CodepointOffset;
794	char16_t low = cSurrogate_LowVal16;
795	low \|= codepoint & cSurrogate_CodepointMask32;
796
797	codepoint >>= cSurrogate_CodepointBits;
798	char16_t high = cSurrogate_HighVal16;
799	high \|= codepoint & cSurrogate_CodepointMask32;
800
801	dst[`0`] = high;
802	dst[`1`] = low;
803	return `2`;
804	}
805
806
807	int tUTF::CalculateUtf8Length(char32_t codepoint)
808	{
809	if (codepoint <= cCodepoint_UTF8Max1)
810	return `1`;
811
812	if (codepoint <= cCodepoint_UTF8Max2)
813	return `2`;
814
815	if (codepoint <= cCodepoint_UTF8Max3)
816	return `3`;
817
818	if (codepoint <= cCodepoint_UnicodeMax)
819	return `4`;
820
821	// Return max 4 in case the UTF-8 standard ever increases cCodepoint_UnicodeMax. What they won't
822	// break is that UTF-8 can encode all codepoints, so checking UnicodeMax is still valid.
823	return `4`;
824	}
825
826
827	int tUTF::DecodeUtf8(char32_t& codepoint, const char8_t* src)
828	{
829	tAssert(src);
830	char8_t leading = src[`0`];
831	int encodingLength = `0`;
832	UTF8Pattern leadingPattern;
833
834	bool matches = false; // True if the leading byte matches the current leading pattern.
835	do
836	{
837	encodingLength++;
838	leadingPattern = UTF8LeadingBytes[encodingLength - `1`];
839	matches = (leading & leadingPattern.Mask) == leadingPattern.Value;
840
841	} while (!matches && (encodingLength < UTF8LeadingBytes_NumElements));
842
843	// If leading byte doesn't match any known pattern it is invalid and we return replacement.
844	if (!matches)
845	{
846	codepoint = tStd::cCodepoint_Replacement;
847	return encodingLength;
848	}
849
850	codepoint = leading & ~leadingPattern.Mask;
851
852	// This loop only ends up running if continuation codeunits found (not ASCII).
853	for (int i = `1`; i < encodingLength; i++)
854	{
855	char8_t continuation = src[i];
856
857	// If number of continuation bytes is not the same as advertised on the leading byte it's an invalid encoding
858	// so we return the replacement.
859	if ((continuation & cContinuation_UTF8Mask) != cContinuation_UTF8Val)
860	{
861	codepoint = tStd::cCodepoint_Replacement;
862
863	// I think the best behaviour here is to return how much we processed b4 running into a problem.
864	// If we returned encodingLength we might skip some input when an invalid is encountered. Hard to say.
865	return `1`+i;
866	}
867
868	codepoint <<= cContinuation_CodepointBits;
869	codepoint \|= continuation & ~cContinuation_UTF8Mask;
870	}
871
872	if
873	(
874	// These are guaranteed to be non-characters by the standard and reuire the replacement.
875	((codepoint == tStd::cCodepoint_SpecialNonCharA) \|\| (codepoint == tStd::cCodepoint_SpecialNonCharB)) \|\|
876
877	// Surrogates are invalid Unicode codepoints and should only be used in UTF-16. Invalid encoding so return replacement.
878	((codepoint <= cCodepoint_LastValidBMP) && ((codepoint & cSurrogate_GenericMask32) == cSurrogate_GenericVal32)) \|\|
879
880	// UTF-8 can encode codepoints larger than the Unicode standard allows. If it does it's an invalid encoding and we return the replacement codepoint.
881	(codepoint > cCodepoint_UnicodeMax) \|\|
882
883	// Overlong encodings are considered invalid so we return the replacement codepoint and return the actual number read so we skip the overlong completely.
884	// We do this last cuz of short-circuit expression evaluation in C++ (calc only called if necessary).
885	(CalculateUtf8Length(codepoint) != encodingLength)
886	)
887	{
888	codepoint = tStd::cCodepoint_Replacement;
889	}
890
891	return encodingLength;
892	}
893
894
895	int tUTF::EncodeUtf8(char8_t* dst, char32_t codepoint)
896	{
897	if (!dst)
898	return `0`;
899
900	// Write the continuation bytes in reverse order.
901	int encodeLength = CalculateUtf8Length(codepoint);
902	for (int contIndex = encodeLength - `1`; contIndex > `0`; contIndex--)
903	{
904	char8_t cont = codepoint & ~cContinuation_UTF8Mask;
905	cont \|= cContinuation_UTF8Val;
906	dst[contIndex] = cont;
907	codepoint >>= cContinuation_CodepointBits;
908	}
909
910	// Write the leading byte.
911	UTF8Pattern pattern = UTF8LeadingBytes[encodeLength - `1`];
912	char8_t lead = codepoint & ~(pattern.Mask);
913	lead \|= pattern.Value;
914	dst[`0`] = lead;
915
916	return encodeLength;
917	}
918

Source File Modules/Foundation/Src/tStandard.cppHome Page Browse Root

Source File Modules/Foundation/Src/tStandard.cpp
Home Page Browse Root