-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhoutf.h
executable file
·683 lines (605 loc) · 28.5 KB
/
houtf.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
/*
Copyright (c) 2024 Luke Philipsen
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted.
THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE
FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/* Usage
Do this:
#define HOUTF_IMPLEMENTATION
before you include this file in *one* C or C++ file to create the implementation.
You can define HOUTF_DECL with
#define HOUTF_DECL static
or
#define HOUTF_DECL extern
to specify HOUTF function declarations as static or extern, respectively.
The default specifier is extern.
*/
#ifndef HOUTF_H
#define HOUTF_H
#include <stddef.h> /* NULL, size_t */
#include <stdint.h> /* uint32_t */
#include <string.h> /* memcpy() */
#ifndef HOUTF_DECL
#define HOUTF_DECL
#endif /* HOUTF_DECL */
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/***************/
/* Definitions */
/**
* Values representing the character encodings recognized by houtf.
*/
typedef enum {
HOUTF_ENCODING_UNKNOWN = 0, /**< Encoding is not known. Encoded as UTF-8; decoded as one raw byte per character. */
HOUTF_ENCODING_UTF8, /**< Variable-length character encoding using between one and four bytes per character. */
HOUTF_ENCODING_UTF16BE, /**< Variable-length character encoding using two or four bytes. Big endian. */
HOUTF_ENCODING_UTF16LE, /**< Variable-length character encoding using two or four bytes. Little endian. */
} houtf_encoding_t;
/**
* A single character in both encoded and unencoded forms along with metadata.
* A structure with every field set to zero is returned by the decoder when the character cannot be decoded from the
* bytes it was given.
*/
typedef struct {
uint32_t raw; /**< Character as it appeared in the source. In other words, the original, encoded character. */
uint32_t value; /**< Integer value of the character. In other words, the decoded character. */
size_t bytes; /**< Number of eight-bit bytes of the encoded character, in the [1, 4] range, or 0 on failure. */
houtf_encoding_t encoding; /**< Character encoding (UTF-8, UTF-16BE, etc.) used by the raw character. */
} houtf_char_t;
/**
* Detect the character encoding of a string by searching its initial bytes for possible UTF byte order marks (BOMs).
* As BOMs are typically prepended to text files, the given string would typically be the start of the file's content.
*
* @param str Pointer to search for a byte order mark.
* @param num Number of bytes in the string.
* @return Enumeration value indicating the character encoding of the content.
*/
HOUTF_DECL houtf_encoding_t houtf_detect_bom(const char* str, size_t num);
/**
* Get the length of the byte order mark (BOM) of the given character encoding.
*
* @param enc Character encoding of the BOM.
* @return Length of the BOM in bytes, or zero if the encoding has no known BOM.
*/
HOUTF_DECL size_t houtf_bom_len(houtf_encoding_t enc);
/**
* Decode a single character assuming UTF-8. The given string should point to the first byte of the character to decode.
*
* @param str Pointer to the first byte of the character.
* @param num Number of bytes in the string.
* @return A structure containing the character's raw value, decoded value, number of bytes, and encoding.
*/
HOUTF_DECL houtf_char_t houtf_decode(const char* str, size_t num);
/**
* Decode a single character explicitly given the expected character encoding. The given string should point to the
* first byte of the character to decode.
*
* @param str Pointer to the first byte of the character.
* @param num Number of bytes in the string.
* @param enc Encoding of the character.
* @return A structure containing the character's raw value, decoded value, number of bytes, and encoding.
*/
HOUTF_DECL houtf_char_t houtf_decode_e(const char* str, size_t num, houtf_encoding_t enc);
/**
* Encode a single character value as UTF-8.
*
* @param val Integer value of the character.
* @return A structure containing the character's encoded value, unencoded value, number of bytes, and encoding.
*/
HOUTF_DECL houtf_char_t houtf_encode(uint32_t val);
/**
* Encode a single character value as the given character encoding.
*
* @param val Integer value of the character.
* @param enc Character encoding to use.
* @return A structure containing the character's encoded value, unencoded value, number of bytes, and encoding.
*/
HOUTF_DECL houtf_char_t houtf_encode_e(uint32_t val, houtf_encoding_t enc);
/**
* Appends a copy of the UTF-8 'source' string to the UTF-8 'destination' string. The terminating null character in
* 'destination' is overwritten by the first character of 'source,' and a null character is included at the end of the
* new string formed by the concatenation of both in 'destination.'
*
* @param destination Pointer to the destination array, which should contain a UTF-8 string, and be large enough to
* contain the concatenated resulting string.
* @param source UTF-8 string to be appended. This should not overlap 'destination.'
* @return 'destination' is returned.
*/
HOUTF_DECL char* houtf_strcat(char* destination, const char* source);
/**
* Appends a copy of the 'source' string to the 'destination' string. The terminating null character in 'destination' is
* overwritten by the first character of 'source,' and a null character is included at the end of the new string formed
* by the concatenation of both in 'destination.'
*
* @param destination Pointer to the destination array, which should contain a string, and be large enough to contain
* the concatenated resulting string.
* @param destination_enc Character encoding of the destination string.
* @param source String to be appended. This should not overlap 'destination.'
* @param source_enc Character encoding of the source string.
* @return 'destination' is returned.
*/
HOUTF_DECL char* houtf_strcat_e(char* destination, houtf_encoding_t destination_enc, const char* source,
houtf_encoding_t source_enc);
/**
* Returns a pointer to the first occurrence of 'val' in the UTF-8 string 'str.'
* The terminating null character is considered part of the string. Therefore, it can also be located.
*
* @param str UTF-8 string to be scanned.
* @param val Character to be located.
* @return A pointer to the first byte of the first occurrence of 'val' in 'str.' If the character is not found, a null
* pointer is returned.
*/
HOUTF_DECL const char* houtf_strchr(const char* str, uint32_t val);
/**
* Returns a pointer to the first occurrence of 'val' in the string 'str.'
* The terminating null character is considered part of the string. Therefore, it can also be located.
*
* @param str String to be scanned.
* @param val Character to be located.
* @param str_enc Character encoding of the string.
* @return A pointer to the first byte of the first occurrence of 'val' in 'str.' If the character is not found, a null
* pointer is returned.
*/
HOUTF_DECL const char* houtf_strchr_e(const char* str, uint32_t val, houtf_encoding_t str_enc);
/**
* Compares the UTF-8 string 'str1' to UTF-8 string 'str2.'
* This function starts comparing the first character of each string. If they are equal to each other, it continues with
* the following pairs until the characters differ or until a terminating null character is reached.
* This function performs a comparison of the decoded values of the characters.
*
* @param str1 UTF-8 string to be compared.
* @param str2 UTF-8 string to be compared.
* @return Returns a value indicating the relationship between the strings. A value less than zero indicates the first
* character that does not match has a lower value in 'str1' than in 'str2.' A value greater than zero indicates
* the first character that does not match has a greater value in 'str1' than in 'str2'. Zero indicates the
* contents of both strings are equal.
*/
HOUTF_DECL int houtf_strcmp(const char* str1, const char* str2);
/**
* Compares the string 'str1' to string 'str2.'
* This function starts comparing the first character of each string. If they are equal to each other, it continues with
* the following pairs until the characters differ or until a terminating null character is reached.
* This function performs a comparison of the decoded values of the characters.
*
* @param str1 String to be compared.
* @param str1_enc Character encoding of 'str1.'
* @param str2 String to be compared.
* @param str2_enc Character encoding of 'str2.'
* @return Returns a value indicating the relationship between the strings. A value less than zero indicates the first
* character that does not match has a lower value in 'str1' than in 'str2.' A value greater than zero indicates
* the first character that does not match has a greater value in 'str1' than in 'str2'. Zero indicates the
* contents of both strings are equal.
*/
HOUTF_DECL int houtf_strcmp_e(const char* str1, houtf_encoding_t str1_enc, const char* str2, houtf_encoding_t str2_enc);
/**
* Returns the length of the UTF-8 string 'str.'
* The length of a string is determined by the terminating null character: a string is as long as the number of
* characters between the beginning of the string and the terminating character (without including the terminating null
* character itself).
*
* @param str A UTF-8 string.
* @return The length of the string in characters.
*/
HOUTF_DECL size_t houtf_strlen(const char* str);
/**
* Returns the length of the string 'str.'
* The length of a string is determined by the terminating null character: a string is as long as the number of
* characters between the beginning of the string and the terminating character (without including the terminating null
* character itself).
*
* @param str A string.
* @param enc Character encoding of the string.
* @return The length of the string in characters.
*/
HOUTF_DECL size_t houtf_strlen_e(const char* str, houtf_encoding_t enc);
/**
* Returns a pointer to the first occurrence of UTF-8 string 'str2' in UTF-8 string 'str1,' or a null pointer if 'str2'
* is not part of 'str1.'
* The matching process does not include the terminating null characters, but it stops there.
*
* @param str1 UTF-8 string to be scanned.
* @param str2 UTF-8 string containing the sequence of characters to match.
* @return A pointer to the first occurrence in 'str1' of the entire sequence of characters specified in 'str2,' or a
* null pointer if the sequence is not present in 'str1.'
*/
HOUTF_DECL const char* houtf_strstr(const char* str1, const char* str2);
/**
* Returns a pointer to the first occurrence of 'str2' in 'str1,' or a null pointer if 'str2' is not part of 'str1.'
* The matching process does not include the terminating null characters, but it stops there.
*
* @param str1 String to be scanned.
* @param str1_enc Character encoding of 'str1.'
* @param str2 String containing the sequence of characters to match.
* @param str2_enc Character encoding of 'str2.'
* @return A pointer to the first occurrence in 'str1' of the entire sequence of characters specified in 'str2,' or a
* null pointer if the sequence is not present in 'str1.'
*/
HOUTF_DECL const char* houtf_strstr_e(const char* str1, houtf_encoding_t str1_enc, const char* str2,
houtf_encoding_t str2_enc);
#ifdef __cplusplus
}
#endif /* __cplusplus */
#ifdef HOUTF_IMPLEMENTATION
/******************/
/* Implementation */
/* Identify the encoding announced by a byte order mark (BOM) at the very start of 'str', if there is one. */
houtf_encoding_t houtf_detect_bom(const char* str, size_t num)
{
    /* Every BOM recognized here is at least two bytes long, so anything shorter (or a null pointer) cannot hold one */
    if (str == NULL || num < 2)
        return HOUTF_ENCODING_UNKNOWN;

    /* Mask each byte to eight bits so the comparisons behave the same whether plain 'char' is signed or unsigned */
    const int first = str[0] & 0xFF;
    const int second = str[1] & 0xFF;

    /* UTF-8: EF BB BF. The third byte is only inspected when the caller says it exists. */
    if (first == 0xEF && second == 0xBB && num >= 3 && (str[2] & 0xFF) == 0xBF)
        return HOUTF_ENCODING_UTF8;

    /* UTF-16, big endian: FE FF */
    if (first == 0xFE && second == 0xFF)
        return HOUTF_ENCODING_UTF16BE;

    /* UTF-16, little endian: FF FE */
    if (first == 0xFF && second == 0xFE)
        return HOUTF_ENCODING_UTF16LE;

    /* The leading bytes match none of the known magic numbers */
    return HOUTF_ENCODING_UNKNOWN;
}
/* Report how many bytes the byte order mark (BOM) of the given encoding occupies; zero when there is none. */
size_t houtf_bom_len(houtf_encoding_t enc)
{
    /* UTF-8's BOM (EF BB BF) is three bytes long */
    if (enc == HOUTF_ENCODING_UTF8)
        return 3;

    /* UTF-16BE's BOM (FE FF) and UTF-16LE's BOM (FF FE) are both two bytes long */
    if (enc == HOUTF_ENCODING_UTF16BE || enc == HOUTF_ENCODING_UTF16LE)
        return 2;

    /* Unknown encoding, or any value outside the enumeration: no BOM */
    return 0;
}
houtf_char_t houtf_decode(const char* str, size_t num)
{
return houtf_decode_e(str, num, HOUTF_ENCODING_UTF8);
}
/*
 * Decode the single character that starts at 'str', interpreting the bytes according to 'enc'.
 *
 * str: pointer to the first byte of the character; may be null, in which case decoding fails.
 * num: number of readable bytes at 'str'. No byte at index 'num' or beyond is ever read.
 * enc: encoding of the bytes. HOUTF_ENCODING_UNKNOWN is decoded as one raw byte per character.
 *
 * Returns the decoded character: 'value' is the code point, 'bytes' the encoded length, 'raw' the first 'bytes'
 * encoded bytes copied in memory order, and 'encoding' is 'enc'. On failure (null pointer, empty input, a UTF-8 lead
 * byte that does not start a character, an unrecognized 'enc', or fewer than 'bytes' bytes available) every field of
 * the result is zero.
 *
 * A UTF-16 surrogate pair is only recognized when all four bytes are available; otherwise the leading 16-bit unit is
 * returned on its own as a two-byte character.
 */
houtf_char_t houtf_decode_e(const char* str, size_t num, houtf_encoding_t enc)
{
    const houtf_char_t failed = {0}; /* Returned whenever nothing can be decoded */
    houtf_char_t c = {0}; /* All zeroes */
    /* Work on unsigned bytes: plain 'char' may be signed, and a sign-extended byte >= 0x80 would corrupt the value */
    const unsigned char* b = (const unsigned char*)str;
    size_t msb = 0; /* UTF-16 only: index of the most significant byte within each 16-bit unit */

    /* Nothing can be read from a null pointer or an empty buffer */
    if (str == NULL || num == 0)
        return failed;

    /* Step 1: work out how many bytes the character occupies */
    switch (enc)
    {
    case HOUTF_ENCODING_UNKNOWN:
        c.bytes = 1;
        break;
    case HOUTF_ENCODING_UTF8:
        /* The first byte of a UTF-8 character begins with one of four bit patterns, each indicating the total */
        /* number of bytes: 0XXXXXXX = 1 byte, 110XXXXX = 2 bytes, 1110XXXX = 3 bytes, 11110XXX = 4 bytes. */
        /* Any other lead byte leaves 'bytes' at zero and is reported as a failure below. */
        if ((b[0] & 0x80) == 0x00)
            c.bytes = 1;
        else if ((b[0] & 0xE0) == 0xC0)
            c.bytes = 2;
        else if ((b[0] & 0xF0) == 0xE0)
            c.bytes = 3;
        else if ((b[0] & 0xF8) == 0xF0)
            c.bytes = 4;
        break;
    case HOUTF_ENCODING_UTF16LE:
        /* Little endian only differs in which byte of each 16-bit unit is the most significant one */
        msb = 1;
        /* fallthrough */
    case HOUTF_ENCODING_UTF16BE:
        /* A four-byte character is a unit beginning 110110XX followed by a unit beginning 110111XX. The 'num' */
        /* check comes first so the second unit is never read when the caller did not provide it. */
        if (num >= 4 && (b[msb] & 0xFC) == 0xD8 && (b[msb + 2] & 0xFC) == 0xDC)
            c.bytes = 4;
        else
            c.bytes = 2;
        break;
    default:
        break;
    }

    /* If the length is unknown or the string doesn't have enough bytes in it to decode this character */
    if (c.bytes == 0 || c.bytes > num)
        return failed;

    /* Step 2: extract the payload bits and assemble them into the character's integer value */
    switch (enc)
    {
    case HOUTF_ENCODING_UNKNOWN:
        c.value = b[0];
        break;
    case HOUTF_ENCODING_UTF8:
        switch (c.bytes)
        {
        case 1: /* 0XXXXXXX */
            c.value = (uint32_t)(b[0] & 0x7F);
            break;
        case 2: /* 110XXXXX 10XXXXXX */
            c.value = ((uint32_t)(b[0] & 0x1F) << 6) | (uint32_t)(b[1] & 0x3F);
            break;
        case 3: /* 1110XXXX 10XXXXXX 10XXXXXX */
            c.value = ((uint32_t)(b[0] & 0x0F) << 12) | ((uint32_t)(b[1] & 0x3F) << 6) |
                      (uint32_t)(b[2] & 0x3F);
            break;
        default: /* 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX */
            c.value = ((uint32_t)(b[0] & 0x07) << 18) | ((uint32_t)(b[1] & 0x3F) << 12) |
                      ((uint32_t)(b[2] & 0x3F) << 6) | (uint32_t)(b[3] & 0x3F);
            break;
        }
        break;
    case HOUTF_ENCODING_UTF16BE:
    case HOUTF_ENCODING_UTF16LE:
    {
        /* Rebuild the first 16-bit unit from its most and least significant bytes */
        const uint32_t unit1 = ((uint32_t)b[msb] << 8) | (uint32_t)b[1 - msb];
        if (c.bytes == 2)
            c.value = unit1;
        else
        {
            /* Each surrogate carries ten payload bits: the first unit holds the upper ten, the second the lower */
            /* ten. Together they form the value minus 0x00010000, so that offset is added back. */
            const uint32_t unit2 = ((uint32_t)b[msb + 2] << 8) | (uint32_t)b[3 - msb];
            c.value = (((unit1 & 0x03FF) << 10) | (unit2 & 0x03FF)) + 0x00010000;
        }
        break;
    }
    default:
        break;
    }

    /* Copy exactly the character's own bytes into 'raw' (the rest stays zero). Copying byte by byte avoids reading */
    /* past the character, avoids misaligned or aliasing-violating loads, and keeps the bytes in memory order. */
    memcpy(&c.raw, b, c.bytes);
    c.encoding = enc;
    return c;
}
houtf_char_t houtf_encode(uint32_t val)
{
return houtf_encode_e(val, HOUTF_ENCODING_UTF8);
}
/*
 * Encode the character value 'val' using encoding 'enc'.
 *
 * val: integer value (code point) of the character. Values in the surrogate range 0xD800-0xDFFF and values above
 *      0x10FFFF cannot be encoded.
 * enc: target encoding. HOUTF_ENCODING_UNKNOWN is treated as UTF-8.
 *
 * Returns the encoded character: 'raw' holds the encoded bytes in memory order starting at its lowest address,
 * 'bytes' is how many of them are used, 'value' is 'val', and 'encoding' is the encoding actually used. If the value
 * cannot be encoded, or 'enc' is not a recognized encoding, every field of the result is zero.
 */
houtf_char_t houtf_encode_e(uint32_t val, houtf_encoding_t enc)
{
    houtf_char_t c = {0}; /* All zeroes; returned as-is when the value cannot be encoded */
    uint8_t out[4] = {0}; /* Encoded bytes in the order they appear in a string */
    size_t len = 0; /* Number of bytes of 'out' in use; zero means "not encodable" */
    /* If the encoding is somehow not specified, assume UTF-8 */
    const houtf_encoding_t used = (enc == HOUTF_ENCODING_UNKNOWN) ? HOUTF_ENCODING_UTF8 : enc;

    switch (used)
    {
    case HOUTF_ENCODING_UTF8:
        if (val <= 0x0000007F) /* 0XXXXXXX */
        {
            out[0] = (uint8_t)val;
            len = 1;
        }
        else if (val <= 0x000007FF) /* 110AAAAA 10BBBBBB */
        {
            out[0] = (uint8_t)(0xC0 | (val >> 6));
            /* Only the low six bits belong in a continuation byte; keeping more would clobber the 10 marker */
            out[1] = (uint8_t)(0x80 | (val & 0x3F));
            len = 2;
        }
        else if (val <= 0x0000D7FF || (val >= 0x0000E000 && val <= 0x0000FFFF)) /* 1110AAAA 10BBBBBB 10CCCCCC */
        {
            out[0] = (uint8_t)(0xE0 | (val >> 12));
            out[1] = (uint8_t)(0x80 | ((val >> 6) & 0x3F));
            out[2] = (uint8_t)(0x80 | (val & 0x3F));
            len = 3;
        }
        else if (val >= 0x00010000 && val <= 0x0010FFFF) /* 11110AAA 10BBBBBB 10CCCCCC 10DDDDDD */
        {
            out[0] = (uint8_t)(0xF0 | (val >> 18));
            out[1] = (uint8_t)(0x80 | ((val >> 12) & 0x3F));
            out[2] = (uint8_t)(0x80 | ((val >> 6) & 0x3F));
            out[3] = (uint8_t)(0x80 | (val & 0x3F));
            len = 4;
        }
        break;
    case HOUTF_ENCODING_UTF16BE:
    case HOUTF_ENCODING_UTF16LE:
    {
        /* Endianness only decides where the most and least significant bytes of each 16-bit unit are stored */
        const size_t msb = (used == HOUTF_ENCODING_UTF16BE) ? 0 : 1;
        const size_t lsb = 1 - msb;
        if (val <= 0x0000D7FF || (val >= 0x0000E000 && val <= 0x0000FFFF)) /* Fits in a single 16-bit unit */
        {
            out[msb] = (uint8_t)(val >> 8);
            out[lsb] = (uint8_t)(val & 0xFF);
            len = 2;
        }
        else if (val >= 0x00010000 && val <= 0x0010FFFF) /* Needs a surrogate pair */
        {
            /* Subtracting 0x00010000 leaves a 20-bit number. Its upper ten bits go into the first unit behind the */
            /* 110110 marker (0xD800) and its lower ten bits into the second unit behind 110111 (0xDC00). */
            const uint32_t offset = val - 0x00010000;
            const uint32_t first = 0xD800 | (offset >> 10);
            const uint32_t second = 0xDC00 | (offset & 0x03FF);
            out[msb] = (uint8_t)(first >> 8);
            out[lsb] = (uint8_t)(first & 0xFF);
            out[2 + msb] = (uint8_t)(second >> 8);
            out[2 + lsb] = (uint8_t)(second & 0xFF);
            len = 4;
        }
        break;
    }
    default:
        break;
    }

    /* If the value is not valid (or the encoding is not recognized), don't even try */
    if (len == 0)
        return c;

    /* Store the bytes in memory order so that copying 'bytes' bytes from the address of 'raw' yields the character */
    memcpy(&c.raw, out, len);
    c.value = val;
    c.bytes = len;
    c.encoding = used;
    return c;
}
char* houtf_strcat(char* destination, const char* source)
{
return houtf_strcat_e(destination, HOUTF_ENCODING_UTF8, source, HOUTF_ENCODING_UTF8);
}
/*
 * Append 'source' to 'destination', re-encoding each character from 'source_enc' to 'destination_enc'.
 *
 * destination:     null-terminated string in 'destination_enc'; must have room for the appended characters and the
 *                  terminator.
 * destination_enc: encoding of 'destination', and the encoding the appended characters are written in.
 * source:          null-terminated string in 'source_enc'; should not overlap 'destination'.
 * source_enc:      encoding of 'source'.
 *
 * Returns 'destination'. If either pointer is null nothing is done. A source character whose value cannot be encoded
 * in 'destination_enc' produces no bytes and is therefore skipped.
 */
char* houtf_strcat_e(char* destination, houtf_encoding_t destination_enc, const char* source,
                     houtf_encoding_t source_enc)
{
    if (destination == NULL || source == NULL)
        return destination;

    /* Iterate through the current destination string to look for the null terminator, stopping there */
    char* id = destination; /* Destination iterator */
    houtf_char_t cd = houtf_decode_e(/* str: */ id, /* num: */ 4, /* enc: */ destination_enc);
    while (cd.value != 0) /* While not at the destination's terminator */
    {
        id += cd.bytes; /* Iterate forward by the number of bytes used by the previous character */
        cd = houtf_decode_e(id, 4, destination_enc);
    }

    /* Iterate through both strings, assigning values from the source string to the destination string */
    const char* is = source; /* Source iterator */
    houtf_char_t cs; /* Source character */
    do
    {
        cs = houtf_decode_e(/* str: */ is, /* num: */ 4, /* enc: */ source_enc);
        /* Reuse 'cd' to re-encode the character from the source and copy the character to the destination */
        cd = houtf_encode_e(/* val: */ cs.value, /* enc: */ destination_enc);
        memcpy(id, &(cd.raw), cd.bytes);
        /* Each iterator advances by the size of the character in its OWN encoding: the same character may occupy a */
        /* different number of bytes in the source than in the destination (e.g. UTF-8 versus UTF-16) */
        is += cs.bytes;
        id += cd.bytes;
    }
    while (cs.value != 0); /* While not at the source's terminator (which has just been copied as well) */

    return destination;
}
const char* houtf_strchr(const char* str, uint32_t val)
{
return houtf_strchr_e(str, val, HOUTF_ENCODING_UTF8);
}
/*
 * Find the first character of 'str' (encoded as 'str_enc') whose decoded value equals 'val'.
 * The terminator takes part in the search, so searching for zero yields a pointer to the end of the string.
 * Returns a pointer to the first byte of the match, or a null pointer when 'str' is null or holds no match.
 */
const char* houtf_strchr_e(const char* str, uint32_t val, houtf_encoding_t str_enc)
{
    if (str == NULL)
        return NULL;

    const char* cursor = str; /* Start of the character currently being examined */
    for (;;)
    {
        const houtf_char_t ch = houtf_decode_e(/* str: */ cursor, /* num: */ 4, /* enc: */ str_enc);

        /* The match test comes before the end-of-string test so that the terminator itself can be located */
        if (ch.value == val)
            return cursor;
        if (ch.value == 0)
            return NULL; /* Reached the terminator without a match */

        cursor += ch.bytes; /* Step over the character just examined */
    }
}
int houtf_strcmp(const char* str1, const char* str2)
{
return houtf_strcmp_e(str1, HOUTF_ENCODING_UTF8, str2, HOUTF_ENCODING_UTF8);
}
/*
 * Compare 'str1' (encoded as 'str1_enc') with 'str2' (encoded as 'str2_enc') by decoded character value.
 * Returns zero when the strings are equal; otherwise the difference between the first pair of characters that do
 * not match. A null string orders before a non-null one, and two null strings compare equal.
 */
int houtf_strcmp_e(const char* str1, houtf_encoding_t str1_enc, const char* str2, houtf_encoding_t str2_enc)
{
    /* Null handling: equal when both are null, otherwise the null string is the lesser */
    if (str1 == NULL || str2 == NULL)
    {
        if (str1 == str2)
            return 0;
        return (str1 == NULL) ? -1 : 1;
    }

    const char* left = str1; /* Position in str1 */
    const char* right = str2; /* Position in str2 */
    for (;;)
    {
        /* Decode the current character of each string to learn its value and its size in bytes */
        const houtf_char_t a = houtf_decode_e(/* str: */ left, /* num: */ 4, /* enc: */ str1_enc);
        const houtf_char_t b = houtf_decode_e(right, 4, str2_enc);

        /* Stop at str1's terminator or at the first pair of characters that differ */
        if (a.value == 0 || a.value != b.value)
            return (int)(a.value - b.value);

        /* Both characters matched: move each position past its own character */
        left += a.bytes;
        right += b.bytes;
    }
}
size_t houtf_strlen(const char* str)
{
return houtf_strlen_e(str, HOUTF_ENCODING_UTF8);
}
/*
 * Count the characters (not bytes) of 'str', encoded as 'enc', up to but excluding the terminator.
 * A null 'str' has length zero.
 */
size_t houtf_strlen_e(const char* str, houtf_encoding_t enc)
{
    if (str == NULL)
        return 0;

    size_t count = 0; /* Characters seen so far */
    const char* cursor = str; /* Start of the character being decoded */

    /* Decode one character at a time until the decoded value is zero, i.e. the terminator */
    for (houtf_char_t ch = houtf_decode_e(/* str: */ cursor, /* num: */ 4, /* enc: */ enc); ch.value != 0;
         ch = houtf_decode_e(cursor, 4, enc))
    {
        cursor += ch.bytes; /* Skip over the bytes of the character just counted */
        ++count;
    }

    return count;
}
const char* houtf_strstr(const char* str1, const char* str2)
{
return houtf_strstr_e(str1, HOUTF_ENCODING_UTF8, str2, HOUTF_ENCODING_UTF8);
}
/*
 * Find the first occurrence of 'str2' (encoded as 'str2_enc') inside 'str1' (encoded as 'str1_enc'), comparing
 * decoded character values so the two strings may use different encodings.
 *
 * Returns a pointer to the first byte of the match within 'str1', or a null pointer when either argument is null,
 * when 'str2' is empty, or when 'str2' does not occur in 'str1'.
 */
const char* houtf_strstr_e(const char* str1, houtf_encoding_t str1_enc, const char* str2, houtf_encoding_t str2_enc)
{
    if (str1 == NULL || str2 == NULL)
        return NULL;

    /* The first character of 'str2' is decoded once and never overwritten: every candidate position in 'str1' is */
    /* tested against it */
    const houtf_char_t needle_head = houtf_decode_e(/* str: */ str2, /* num: */ 4, /* enc: */ str2_enc);
    if (needle_head.value == 0)
        return NULL; /* An empty 'str2' is reported as not found */

    const char* start = str1; /* Candidate start of the match in 'str1' */
    houtf_char_t head = houtf_decode_e(start, 4, str1_enc); /* Character at the candidate start */
    while (head.value != 0) /* While not at the terminator of 'str1' */
    {
        if (head.value == needle_head.value)
        {
            /* The first characters agree; compare the remainder with separate iterators and separate character */
            /* variables so that 'start' and 'head' still describe the candidate position afterwards */
            const char* i1 = start + head.bytes;
            const char* i2 = str2 + needle_head.bytes;
            for (;;)
            {
                const houtf_char_t c2 = houtf_decode_e(i2, 4, str2_enc);
                if (c2.value == 0)
                    return start; /* Reached the end of 'str2': every character matched */

                const houtf_char_t c1 = houtf_decode_e(i1, 4, str1_enc);
                if (c1.value != c2.value)
                    break; /* Mismatch (including running out of 'str1'): try the next candidate */

                i1 += c1.bytes;
                i2 += c2.bytes;
            }
        }

        /* Advance by exactly one character of 'str1', using the size of the character at the candidate start */
        start += head.bytes;
        head = houtf_decode_e(start, 4, str1_enc);
    }

    return NULL;
}
#endif /* HOUTF_IMPLEMENTATION */
#endif /* HOUTF_H */