Skip to content

Commit 5fe2e6a

Browse files
committed
Added E.asUTF8 and extra functionality to work with UTF8 strings in Espruino
1 parent ad96d3c commit 5fe2e6a

11 files changed

+189
-79
lines changed

libs/graphics/jswrap_graphics.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -2091,7 +2091,7 @@ void _jswrap_graphics_stringMetrics(JsGraphics *gfx, JsVar *var, int lineStartIn
20912091
int height = fontHeight;
20922092
int maxWidth = 0;
20932093
while (jsvStringIteratorHasChar(&it)) {
2094-
char ch = jsvStringIteratorGetCharAndNext(&it);
2094+
int ch = jsvStringIteratorGetUTF8CharAndNext(&it);
20952095
if (ch=='\n') {
20962096
if (width>maxWidth) maxWidth=width;
20972097
width = 0;
@@ -2211,7 +2211,7 @@ JsVar *jswrap_graphics_wrapString(JsVar *parent, JsVar *str, int maxWidth) {
22112211
jsvStringIteratorNew(&it, str, 0);
22122212

22132213
while (jsvStringIteratorHasChar(&it) || endOfText) {
2214-
char ch = jsvStringIteratorGetCharAndNext(&it);
2214+
int ch = jsvStringIteratorGetUTF8CharAndNext(&it);
22152215
if (endOfText || ch=='\n' || ch==' ') { // newline or space
22162216
int currentPos = jsvStringIteratorGetIndex(&it);
22172217
if ((lineWidth + spaceWidth + wordWidth <= maxWidth) &&
@@ -2377,7 +2377,7 @@ JsVar *jswrap_graphics_drawString(JsVar *parent, JsVar *var, int x, int y, bool
23772377
JsvStringIterator it;
23782378
jsvStringIteratorNew(&it, str, 0);
23792379
while (jsvStringIteratorHasChar(&it)) {
2380-
char ch = jsvStringIteratorGetCharAndNext(&it);
2380+
int ch = jsvStringIteratorGetUTF8CharAndNext(&it);
23812381
if (ch=='\n') {
23822382
x = startx;
23832383
#ifndef SAVE_ON_FLASH

src/jsparse.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "jswrap_functions.h" // insane check for eval in jspeFunctionCall
2020
#include "jswrap_json.h" // for jsfPrintJSON
2121
#include "jswrap_espruino.h" // for jswrap_espruino_memoryArea
22+
#include "jswrap_string.h" // for jswrap_string_charAt
2223
#ifndef ESPR_NO_REGEX
2324
#include "jswrap_regexp.h" // for jswrap_regexp_constructor
2425
#endif
@@ -1061,8 +1062,7 @@ JsVar *jspGetVarNamedField(JsVar *object, JsVar *nameVar, bool returnName) {
10611062
} else if (jsvIsString(object) && jsvIsInt(nameVar)) {
10621063
JsVarInt idx = jsvGetInteger(nameVar);
10631064
if (idx>=0 && idx<(JsVarInt)jsvGetStringLength(object)) {
1064-
char ch = jsvGetCharInString(object, (size_t)idx);
1065-
child = jsvNewStringOfLength(1, &ch);
1065+
return jswrap_string_charAt(object, idx);
10661066
} else if (returnName)
10671067
child = jsvCreateNewChild(object, nameVar, 0); // just return *something* to show this is handled
10681068
} else {

src/jsutils.c

+29-1
Original file line numberDiff line numberDiff line change
@@ -990,10 +990,38 @@ unsigned short int int_sqrt32(unsigned int x) {
990990
}
991991

992992
/// Gets the length of a unicode char sequence by looking at the first char
993-
int jsUnicodeCharLength(char c) {
993+
int jsUTF8LengthFromChar(char c) {
994994
if ((c&0x80)==0) return 1; // ASCII - definitely just one byte
995995
if ((c&0xE0)==0xC0) return 2; // 2-byte code starts with 0b110xxxxx
996996
if ((c&0xF0)==0xE0) return 3; // 3-byte code starts with 0b1110xxxx
997997
if ((c&0xF8)==0xF0) return 4; // 4-byte code starts with 0b11110xxx
998998
return 1;
999999
}
1000+
1001+
/// Given a codepoint, figure out how many bytes it needs for UTF8 encoding.
/// Returns 1..4 for codepoints in the range 0..0x10FFFF, or 0 if the value
/// is negative or above 0x10FFFF and so cannot be encoded.
int jsUTF8Bytes(int codepoint) {
  // Negative values are not codepoints - without this check they would
  // fall through the '<= 0x7F' test and be reported as 1-byte ASCII
  if (codepoint < 0) return 0;
  if (codepoint <= 0x7F) return 1;     // 7 bits  - plain ASCII
  if (codepoint <= 0x7FF) return 2;    // 11 bits - 110xxxxx 10xxxxxx
  if (codepoint <= 0xFFFF) return 3;   // 16 bits - 1110xxxx 10xxxxxx 10xxxxxx
  if (codepoint <= 0x10FFFF) return 4; // 21 bits - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  return 0; // beyond the Unicode range
}
1009+
1010+
/// Encode a codepoint as UTF8 into 'utf8'. The output is NOT null terminated,
/// and 'utf8' must have room for at least 4 bytes.
/// Returns the number of bytes written (1..4), or 0 (nothing written) if the
/// codepoint is negative or jsUTF8Bytes reports that it cannot be encoded.
int jsUTF8Encode(int codepoint, char* utf8) {
  // Marker bits placed at the top of the first byte, indexed by (length-1)
  static const uint8_t leadMarker[] = {
    0x00, // 0xxxxxxx
    0xC0, // 110xxxxx
    0xE0, // 1110xxxx
    0xF0  // 11110xxx
  };
  // Payload bits the first byte can carry, indexed by (length-1)
  static const uint8_t leadPayload[] = {
    0x7F, // 7 bits
    0x1F, // 5 bits
    0x0F, // 4 bits
    0x07  // 3 bits
  };
  if (codepoint < 0) return 0; // not a codepoint - don't emit a bogus byte
  int size = jsUTF8Bytes(codepoint);
  if (!size) return 0;
  // Fill continuation bytes from the end backwards: each is 10xxxxxx and
  // carries the next 6 low bits of the codepoint
  for (int i = size - 1; i > 0; --i) {
    utf8[i] = (char)((codepoint & 0x3F) | 0x80);
    codepoint >>= 6;
  }
  // Whatever bits remain go in the first byte, under the length marker
  utf8[0] = (char)((codepoint & leadPayload[size - 1]) | leadMarker[size - 1]);
  return size;
}
1027+

src/jsutils.h

+7-1
Original file line numberDiff line numberDiff line change
@@ -625,6 +625,12 @@ typedef struct {
625625
} Vector3;
626626

627627
/// Gets the length of a unicode char sequence by looking at the first char
628-
int jsUnicodeCharLength(char c);
628+
int jsUTF8LengthFromChar(char c);
629+
630+
/// Given a codepoint, figure out how many bytes it needs for UTF8 encoding
631+
int jsUTF8Bytes(int codepoint);
632+
633+
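// Encode a codepoint as UTF8 into 'utf8', NOT null terminated ('utf8' must have room for at least 4 bytes). Returns the number of bytes written, or 0 if the codepoint can't be encoded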
634+
int jsUTF8Encode(int codepoint, char* utf8);
629635

630636
#endif /* JSUTILS_H_ */

src/jsvar.c

+24-17
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ unsigned char jsvGetLocks(JsVar *v) { return (unsigned char)((v->flags>>JSV_LOCK
9999
#define JSV_IS_INT(f) ((f)==JSV_INTEGER || JSV_IS_PIN(f) || (f)==JSV_NAME_INT || (f)==JSV_NAME_INT_INT || (f)==JSV_NAME_INT_BOOL)
100100
#define JSV_IS_NUMERIC(f) ((f)>=_JSV_NUMERIC_START && (f)<=_JSV_NUMERIC_END)
101101
#define JSV_IS_STRING(f) ((f)>=_JSV_STRING_START && (f)<=_JSV_STRING_END)
102-
#define JSV_IS_UNICODE_STRING(f) (f)==JSV_UNICODE_STRING
102+
#define JSV_IS_UNICODE_STRING(f) ((f)==JSV_UTF8_STRING) // outer parentheses keep the comparison intact inside larger expressions (eg. after '!')
103103
#define JSV_IS_STRING_EXT(f) ((f)>=JSV_STRING_EXT_0 && (f)<=JSV_STRING_EXT_MAX)
104104
#define JSV_IS_FLAT_STRING(f) ((f)==JSV_FLAT_STRING) // outer parentheses keep the comparison intact inside larger expressions (eg. after '!')
105105
#define JSV_IS_NATIVE_STRING(f) ((f)==JSV_NATIVE_STRING) // outer parentheses keep the comparison intact inside larger expressions (eg. after '!')
@@ -128,7 +128,7 @@ bool jsvIsInt(const JsVar *v) { if (!v) return false; char f = v->flags&JSV_VART
128128
bool jsvIsFloat(const JsVar *v) { return v && (v->flags&JSV_VARTYPEMASK)==JSV_FLOAT; }
129129
bool jsvIsBoolean(const JsVar *v) { if (!v) return false; char f = v->flags&JSV_VARTYPEMASK; return JSV_IS_BOOL(f); }
130130
bool jsvIsString(const JsVar *v) { if (!v) return false; char f = v->flags&JSV_VARTYPEMASK; return JSV_IS_STRING(f); } ///< String, or a NAME too
131-
bool jsvIsUnicodeString(const JsVar *v) { if (!v) return false; char f = v->flags&JSV_VARTYPEMASK; return JSV_IS_UNICODE_STRING(f); } ///< Just a unicode string (Unicode JsVar, pointing to a string)
131+
bool jsvIsUTF8String(const JsVar *v) { if (!v) return false; char f = v->flags&JSV_VARTYPEMASK; return JSV_IS_UNICODE_STRING(f); } ///< Just a unicode string (UTF8 JsVar, pointing to a string)
132132
bool jsvIsBasicString(const JsVar *v) { if (!v) return false; char f = v->flags&JSV_VARTYPEMASK; return f>=JSV_STRING_0 && f<=JSV_STRING_MAX; } ///< Just a string (NOT a name/flatstr/nativestr or flashstr)
133133
bool jsvIsStringExt(const JsVar *v) { if (!v) return false; char f = v->flags&JSV_VARTYPEMASK; return JSV_IS_STRING_EXT(f); } ///< The extra bits dumped onto the end of a string to store more data
134134
bool jsvIsFlatString(const JsVar *v) { if (!v) return false; char f = v->flags&JSV_VARTYPEMASK; return JSV_IS_FLAT_STRING(f); }
@@ -1096,9 +1096,9 @@ JsVar *jsvNewStringOfLength(unsigned int byteLength, const char *initialData) {
10961096
return first;
10971097
}
10981098

1099-
JsVar *jsvNewUnicodeString(JsVar* dataString) {
1099+
JsVar *jsvNewUTF8String(JsVar* dataString) {
11001100
assert(jsvIsString(dataString));
1101-
JsVar *var = jsvNewWithFlags(JSV_UNICODE_STRING);
1101+
JsVar *var = jsvNewWithFlags(JSV_UTF8_STRING);
11021102
if (!var) return 0; // no memory
11031103
jsvSetFirstChild(var, jsvGetRef(jsvRef(dataString)));
11041104
return var;
@@ -1643,6 +1643,18 @@ bool jsvIsEmptyString(JsVar *v) {
16431643

16441644
size_t jsvGetStringLength(const JsVar *v) {
16451645
size_t strLength = 0;
1646+
// For unicode, we just have to iterate to get a length
1647+
if (jsvIsUTF8String(v)) {
1648+
JsvStringIterator it;
1649+
jsvStringIteratorNew(&it, v, 0);
1650+
while (jsvStringIteratorHasChar(&it)) {
1651+
jsvStringIteratorNextUTF8(&it);
1652+
strLength++;
1653+
}
1654+
jsvStringIteratorFree(&it);
1655+
return strLength;
1656+
}
1657+
16461658
const JsVar *var = v;
16471659
JsVar *newVar = 0;
16481660
if (!jsvHasCharacterData(v)) return 0;
@@ -1895,12 +1907,12 @@ void jsvAppendStringVarComplete(JsVar *var, const JsVar *str) {
18951907
jsvAppendStringVar(var, str, 0, JSVAPPENDSTRINGVAR_MAXLENGTH);
18961908
}
18971909

1898-
char jsvGetCharInString(JsVar *v, size_t idx) {
1910+
int jsvGetCharInString(JsVar *v, size_t idx) {
18991911
if (!jsvIsString(v)) return 0;
19001912

19011913
JsvStringIterator it;
19021914
jsvStringIteratorNew(&it, v, idx);
1903-
char ch = jsvStringIteratorGetChar(&it);
1915+
int ch = jsvStringIteratorGetUTF8CharAndNext(&it);
19041916
jsvStringIteratorFree(&it);
19051917
return ch;
19061918
}
@@ -2493,9 +2505,8 @@ int jsvCompareString(JsVar *va, JsVar *vb, size_t starta, size_t startb, bool eq
24932505
jsvStringIteratorNew(&itb, vb, startb);
24942506
// step to first positions
24952507
while (true) {
2496-
int ca = jsvStringIteratorGetCharOrMinusOne(&ita);
2497-
int cb = jsvStringIteratorGetCharOrMinusOne(&itb);
2498-
2508+
int ca = jsvStringIteratorGetUTF8CharAndNext(&ita);
2509+
int cb = jsvStringIteratorGetUTF8CharAndNext(&itb);
24992510
if (ca != cb) {
25002511
jsvStringIteratorFree(&ita);
25012512
jsvStringIteratorFree(&itb);
@@ -2507,8 +2518,6 @@ int jsvCompareString(JsVar *va, JsVar *vb, size_t starta, size_t startb, bool eq
25072518
jsvStringIteratorFree(&itb);
25082519
return 0;
25092520
}
2510-
jsvStringIteratorNext(&ita);
2511-
jsvStringIteratorNext(&itb);
25122521
}
25132522
// never get here, but the compiler warns...
25142523
return true;
@@ -2522,14 +2531,12 @@ JsVar *jsvGetCommonCharacters(JsVar *va, JsVar *vb) {
25222531
JsvStringIterator ita, itb;
25232532
jsvStringIteratorNew(&ita, va, 0);
25242533
jsvStringIteratorNew(&itb, vb, 0);
2525-
int ca = jsvStringIteratorGetCharOrMinusOne(&ita);
2526-
int cb = jsvStringIteratorGetCharOrMinusOne(&itb);
2534+
int ca = jsvStringIteratorGetUTF8CharAndNext(&ita);
2535+
int cb = jsvStringIteratorGetUTF8CharAndNext(&itb);
25272536
while (ca>0 && cb>0 && ca == cb) {
25282537
jsvAppendCharacter(v, (char)ca);
2529-
jsvStringIteratorNext(&ita);
2530-
jsvStringIteratorNext(&itb);
2531-
ca = jsvStringIteratorGetCharOrMinusOne(&ita);
2532-
cb = jsvStringIteratorGetCharOrMinusOne(&itb);
2538+
ca = jsvStringIteratorGetUTF8CharAndNext(&ita);
2539+
cb = jsvStringIteratorGetUTF8CharAndNext(&itb);
25332540
}
25342541
jsvStringIteratorFree(&ita);
25352542
jsvStringIteratorFree(&itb);

src/jsvar.h

+5-5
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,9 @@ typedef enum {
7272
JSV_STRING_MAX = JSV_STRING_0+JSVAR_DATA_STRING_LEN,
7373
JSV_FLAT_STRING = JSV_STRING_MAX+1, ///< Flat strings store the length (in chars) as an int, and then the subsequent JsVars (in memory) store data
7474
JSV_NATIVE_STRING = JSV_FLAT_STRING+1, ///< Native strings store an address and length, and reference the underlying data directly
75-
JSV_UNICODE_STRING = JSV_NATIVE_STRING+1, ///< Unicode just point to a normal string with firstChild, but just tag that the string is a unicode one
75+
JSV_UTF8_STRING = JSV_NATIVE_STRING+1, ///< UTF8 just point to a normal string with firstChild, but just tag that the string is a unicode one
7676
#ifdef SPIFLASH_BASE
77-
JSV_FLASH_STRING = JSV_UNICODE_STRING+1, ///< Like a native String, but not writable and uses jshFlashRead
77+
JSV_FLASH_STRING = JSV_UTF8_STRING+1, ///< Like a native String, but not writable and uses jshFlashRead
7878
_JSV_STRING_END = JSV_FLASH_STRING,
7979
#else
8080
_JSV_STRING_END = JSV_NATIVE_STRING,
@@ -302,7 +302,7 @@ JsVar *jsvNewFromString(const char *str); ///< Create a new string
302302
JsVar *jsvNewNameFromString(const char *str); ///< Create a new name from a string
303303
JsVar *jsvNewStringOfLength(unsigned int byteLength, const char *initialData); ///< Create a new string of the given length - full of 0s (or initialData if specified)
304304
static ALWAYS_INLINE JsVar *jsvNewFromEmptyString() { return jsvNewWithFlags(JSV_STRING_0); } ;///< Create a new empty string
305-
JsVar *jsvNewUnicodeString(JsVar* dataString); ///< Create a new unicode string using the given data string for backing
305+
JsVar *jsvNewUTF8String(JsVar* dataString); ///< Create a new unicode string using the given data string for backing
306306
static ALWAYS_INLINE JsVar *jsvNewNull() { return jsvNewWithFlags(JSV_NULL); } ;///< Create a new null variable
307307
/** Create a new variable from a substring. argument must be a string. stridx = start char or str, maxLength = max number of characters (can be JSVAPPENDSTRINGVAR_MAXLENGTH) */
308308
JsVar *jsvNewFromStringVar(const JsVar *str, size_t stridx, size_t maxLength);
@@ -386,7 +386,7 @@ bool jsvIsInt(const JsVar *v);
386386
bool jsvIsFloat(const JsVar *v);
387387
bool jsvIsBoolean(const JsVar *v);
388388
bool jsvIsString(const JsVar *v); ///< String, or a NAME too
389-
bool jsvIsUnicodeString(const JsVar *v); ///< Just a unicode string (Unicode JsVar, pointing to a string)
389+
bool jsvIsUTF8String(const JsVar *v); ///< Just a unicode string (UTF8 JsVar, pointing to a string)
390390
bool jsvIsBasicString(const JsVar *v); ///< Just a string (NOT a name)
391391
bool jsvIsStringExt(const JsVar *v); ///< The extra bits dumped onto the end of a string to store more data
392392
bool jsvIsFlatString(const JsVar *v);
@@ -513,7 +513,7 @@ static ALWAYS_INLINE void jsvAppendCharacter(JsVar *var, char ch) { jsvAppendStr
513513
#define JSVAPPENDSTRINGVAR_MAXLENGTH (0x7FFFFFFF)
514514
void jsvAppendStringVar(JsVar *var, const JsVar *str, size_t stridx, size_t maxLength); ///< Append str to var. Both must be strings. stridx = start char or str, maxLength = max number of characters (can be JSVAPPENDSTRINGVAR_MAXLENGTH)
515515
void jsvAppendStringVarComplete(JsVar *var, const JsVar *str); ///< Append all of str to var. Both must be strings.
516-
char jsvGetCharInString(JsVar *v, size_t idx); ///< Get a character at the given index in the String
516+
int jsvGetCharInString(JsVar *v, size_t idx); ///< Get a character at the given index in the String (handles unicode)
517517
void jsvSetCharInString(JsVar *v, size_t idx, char ch, bool bitwiseOR); ///< Set a character at the given index in the String. If bitwiseOR, ch will be ORed with the character already at that position.
518518
int jsvGetStringIndexOf(JsVar *str, char ch); ///< Get the index of a character in a string, or -1
519519

0 commit comments

Comments
 (0)