Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement /\p{Basic_Emoji}/u #221

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Implement /\p{Basic_Emoji}/u
Support the Basic_Emoji property from https://unicode.org/reports/tr51/

Support for Emoji_Keycap_Sequence, RGI_Emoji_Flag_Sequence, etc. is not
yet complete because unicode_gen does not know how to generate codepoint
sequences, only ranges.
  • Loading branch information
bnoordhuis committed Dec 15, 2023
commit cbafcc948a895aad449a0929c9f2c0b2b9670295
61 changes: 61 additions & 0 deletions libunicode-table.h
Original file line number Diff line number Diff line change
@@ -4136,6 +4136,27 @@ static const uint8_t unicode_prop_Bidi_Mirrored_table[173] = {
0x80, 0xb8, 0x80, 0xb8, 0x80,
};

/*
 * Encoded code point set for the Basic_Emoji property
 * (https://unicode.org/reports/tr51/), 144 bytes long.
 *
 * NOTE(review): presumably emitted by unicode_gen rather than written by
 * hand -- regenerate instead of editing the bytes; confirm against the
 * build steps. The byte encoding is defined by the code that consumes the
 * table and is not described here.
 */
static const uint8_t unicode_prop_Basic_Emoji_table[144] = {
0x60, 0x23, 0x19, 0x81, 0x40, 0xcc, 0x1a, 0x01,
0x80, 0x42, 0x08, 0x81, 0x94, 0x81, 0xb1, 0x8b,
0xaa, 0x80, 0x92, 0x80, 0x8c, 0x07, 0x81, 0x90,
0x0c, 0x0f, 0x04, 0x80, 0x94, 0x06, 0x08, 0x03,
0x01, 0x06, 0x03, 0x81, 0x9b, 0x80, 0xa2, 0x00,
0x03, 0x10, 0x80, 0xbc, 0x82, 0x97, 0x80, 0x8d,
0x80, 0x43, 0x5a, 0x81, 0xb2, 0x03, 0x80, 0x61,
0xc4, 0xad, 0x80, 0x40, 0xc9, 0x80, 0x40, 0xbd,
0x01, 0x89, 0xe5, 0x80, 0x97, 0x80, 0x93, 0x01,
0x20, 0x82, 0x94, 0x81, 0x40, 0xad, 0xa0, 0x8b,
0x88, 0x80, 0xc5, 0x80, 0x95, 0x8b, 0xaa, 0x1c,
0x8b, 0x90, 0x10, 0x82, 0xc6, 0x00, 0x80, 0x40,
0xba, 0x81, 0xbe, 0x8c, 0x18, 0x97, 0x91, 0x80,
0x99, 0x81, 0x8c, 0x80, 0xd5, 0xd4, 0xaf, 0xc5,
0x28, 0x12, 0x0a, 0x1b, 0x8a, 0x0e, 0x88, 0x40,
0xe2, 0x8b, 0x18, 0x41, 0x1a, 0xae, 0x80, 0x89,
0x80, 0x40, 0xb8, 0xef, 0x8c, 0x82, 0x88, 0x86,
0xad, 0x06, 0x87, 0x8d, 0x83, 0x88, 0x86, 0x88,
};

static const uint8_t unicode_prop_Emoji_table[239] = {
0xa2, 0x05, 0x04, 0x89, 0xee, 0x03, 0x80, 0x5f,
0x8c, 0x80, 0x8b, 0x80, 0x40, 0xd7, 0x80, 0x95,
@@ -4214,6 +4235,21 @@ static const uint8_t unicode_prop_Emoji_Presentation_table[145] = {
0x88,
};

/*
 * Placeholder tables for the emoji *sequence* properties. They are
 * intentionally empty: support for these properties is not complete yet
 * because only code point ranges, not code point sequences, are stored.
 *
 * NOTE(review): a zero-length array with an empty initializer is a
 * compiler extension and not valid ISO C11 -- confirm that every supported
 * compiler accepts it, and that the table consumers handle a length of 0.
 */
static const uint8_t unicode_prop_Emoji_Keycap_Sequence_table[0] = {
};

static const uint8_t unicode_prop_RGI_Emoji_Flag_Sequence_table[0] = {
};

static const uint8_t unicode_prop_RGI_Emoji_Modifier_Sequence_table[0] = {
};

static const uint8_t unicode_prop_RGI_Emoji_Tag_Sequence_table[0] = {
};

static const uint8_t unicode_prop_RGI_Emoji_ZWJ_Sequence_table[0] = {
};

static const uint8_t unicode_prop_Extended_Pictographic_table[156] = {
0x40, 0xa8, 0x03, 0x80, 0x5f, 0x8c, 0x80, 0x8b,
0x80, 0x40, 0xd7, 0x80, 0x95, 0x80, 0xd9, 0x85,
@@ -4289,11 +4325,17 @@ typedef enum {
UNICODE_PROP_Variation_Selector,
UNICODE_PROP_White_Space,
UNICODE_PROP_Bidi_Mirrored,
UNICODE_PROP_Basic_Emoji,
UNICODE_PROP_Emoji,
UNICODE_PROP_Emoji_Component,
UNICODE_PROP_Emoji_Modifier,
UNICODE_PROP_Emoji_Modifier_Base,
UNICODE_PROP_Emoji_Presentation,
UNICODE_PROP_Emoji_Keycap_Sequence,
UNICODE_PROP_RGI_Emoji_Flag_Sequence,
UNICODE_PROP_RGI_Emoji_Modifier_Sequence,
UNICODE_PROP_RGI_Emoji_Tag_Sequence,
UNICODE_PROP_RGI_Emoji_ZWJ_Sequence,
UNICODE_PROP_Extended_Pictographic,
UNICODE_PROP_Default_Ignorable_Code_Point,
UNICODE_PROP_ID_Start,
@@ -4347,11 +4389,17 @@ static const char unicode_prop_name_table[] =
"Variation_Selector,VS" "\0"
"White_Space,space" "\0"
"Bidi_Mirrored,Bidi_M" "\0"
"Basic_Emoji" "\0"
"Emoji" "\0"
"Emoji_Component,EComp" "\0"
"Emoji_Modifier,EMod" "\0"
"Emoji_Modifier_Base,EBase" "\0"
"Emoji_Presentation,EPres" "\0"
"Emoji_Keycap_Sequence" "\0"
"RGI_Emoji_Flag_Sequence" "\0"
"RGI_Emoji_Modifier_Sequence" "\0"
"RGI_Emoji_Tag_Sequence" "\0"
"RGI_Emoji_ZWJ_Sequence" "\0"
"Extended_Pictographic,ExtPict" "\0"
"Default_Ignorable_Code_Point,DI" "\0"
"ID_Start,IDS" "\0"
@@ -4419,11 +4467,17 @@ static const uint8_t * const unicode_prop_table[] = {
unicode_prop_Variation_Selector_table,
unicode_prop_White_Space_table,
unicode_prop_Bidi_Mirrored_table,
unicode_prop_Basic_Emoji_table,
unicode_prop_Emoji_table,
unicode_prop_Emoji_Component_table,
unicode_prop_Emoji_Modifier_table,
unicode_prop_Emoji_Modifier_Base_table,
unicode_prop_Emoji_Presentation_table,
unicode_prop_Emoji_Keycap_Sequence_table,
unicode_prop_RGI_Emoji_Flag_Sequence_table,
unicode_prop_RGI_Emoji_Modifier_Sequence_table,
unicode_prop_RGI_Emoji_Tag_Sequence_table,
unicode_prop_RGI_Emoji_ZWJ_Sequence_table,
unicode_prop_Extended_Pictographic_table,
unicode_prop_Default_Ignorable_Code_Point_table,
unicode_prop_ID_Start_table,
@@ -4472,13 +4526,20 @@ static const uint16_t unicode_prop_len_table[] = {
countof(unicode_prop_Variation_Selector_table),
countof(unicode_prop_White_Space_table),
countof(unicode_prop_Bidi_Mirrored_table),
countof(unicode_prop_Basic_Emoji_table),
countof(unicode_prop_Emoji_table),
countof(unicode_prop_Emoji_Component_table),
countof(unicode_prop_Emoji_Modifier_table),
countof(unicode_prop_Emoji_Modifier_Base_table),
countof(unicode_prop_Emoji_Presentation_table),
countof(unicode_prop_Emoji_Keycap_Sequence_table),
countof(unicode_prop_RGI_Emoji_Flag_Sequence_table),
countof(unicode_prop_RGI_Emoji_Modifier_Sequence_table),
countof(unicode_prop_RGI_Emoji_Tag_Sequence_table),
countof(unicode_prop_RGI_Emoji_ZWJ_Sequence_table),
countof(unicode_prop_Extended_Pictographic_table),
countof(unicode_prop_Default_Ignorable_Code_Point_table),
countof(unicode_prop_ID_Start_table),
countof(unicode_prop_Case_Ignorable_table),
};

2 changes: 2 additions & 0 deletions tests/test_builtin.js
Original file line number Diff line number Diff line change
@@ -590,6 +590,8 @@ function test_regexp()
assert(/{1a}/.toString(), "/{1a}/");
a = /a{1+/.exec("a{11");
assert(a, ["a{11"] );

/\p{Basic_Emoji}/u;
}

function test_symbol()
10 changes: 4 additions & 6 deletions unicode_download.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
#!/bin/sh
set -e

url="ftp://ftp.unicode.org/Public/14.0.0/ucd"
emoji_url="${url}/emoji/emoji-data.txt"

files="CaseFolding.txt DerivedNormalizationProps.txt PropList.txt \
SpecialCasing.txt CompositionExclusions.txt ScriptExtensions.txt \
UnicodeData.txt DerivedCoreProperties.txt NormalizationTest.txt Scripts.txt \
@@ -12,8 +9,9 @@ PropertyValueAliases.txt"
mkdir -p unicode

for f in $files; do
g="${url}/${f}"
wget $g -O unicode/$f
wget "https://www.unicode.org/Public/15.0.0/ucd/${f}" -O unicode/$f
done

wget $emoji_url -O unicode/emoji-data.txt
wget "https://www.unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt" -O unicode/emoji-data.txt
wget "https://www.unicode.org/Public/emoji/15.0/emoji-sequences.txt" -O unicode/emoji-sequences.txt
wget "https://www.unicode.org/Public/emoji/15.0/emoji-zwj-sequences.txt" -O unicode/emoji-zwj-sequences.txt
62 changes: 41 additions & 21 deletions unicode_gen.c
Original file line number Diff line number Diff line change
@@ -678,39 +678,51 @@ void parse_prop_list(const char *filename)
for(;;) {
if (!get_line(line, sizeof(line), f))
break;
// first parse the property name
p = line;
while (isspace(*p))
p++;
if (*p == '#' || *p == '@' || *p == '\0')
continue;
p = strchr(p, ';');
if (!p)
continue;
p++;
p += strspn(p, " \t");
q = buf;
static const char fini[] = " \t;#";
while (!memchr(fini, *p, sizeof(fini))) {
if ((q - buf) < sizeof(buf) - 1)
*q++ = *p;
p++;
}
*q = '\0';
i = find_name(unicode_prop_name, countof(unicode_prop_name), buf);
if (i < 0) {
fprintf(stderr, "Property not found: %s\n", buf);
exit(1);
}
// now parse the codepoint, codepoint range, or sequence
p = line;
c0 = strtoul(p, (char **)&p, 16);
if (*p == '.' && p[1] == '.') {
p += 2;
c1 = strtoul(p, (char **)&p, 16);
} else {
c1 = c0;
}
assert(c1 <= CHARCODE_MAX);
p += strspn(p, " \t");
if (*p == ';') {
p++;
p += strspn(p, " \t");
q = buf;
while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
if ((q - buf) < sizeof(buf) - 1)
*q++ = *p;
p++;
}
*q = '\0';
i = find_name(unicode_prop_name,
countof(unicode_prop_name), buf);
if (i < 0) {
fprintf(stderr, "Property not found: %s\n", buf);
exit(1);
}
assert(c1 <= CHARCODE_MAX);
for(c = c0; c <= c1; c++) {
set_prop(c, i, 1);
}
} else {
c1 = strtoul(p, (char **)&p, 16);
if (c1) {
// TODO(bnoordhuis) store sequence
do {
assert(c1 <= CHARCODE_MAX);
c1 = strtoul(p, (char **)&p, 16);
} while (c1);
} else {
set_prop(c0, i, 1);
}
}
}
fclose(f);
@@ -2951,6 +2963,14 @@ int main(int argc, char **argv)
unicode_db_path);
parse_prop_list(filename);

snprintf(filename, sizeof(filename), "%s/emoji-sequences.txt",
unicode_db_path);
parse_prop_list(filename);

snprintf(filename, sizeof(filename), "%s/emoji-zwj-sequences.txt",
unicode_db_path);
parse_prop_list(filename);

// dump_data(unicode_db);

build_conv_table(unicode_db);
6 changes: 6 additions & 0 deletions unicode_gen_def.h
Original file line number Diff line number Diff line change
@@ -254,11 +254,17 @@ DEF(Unified_Ideograph, "UIdeo")
DEF(Variation_Selector, "VS")
DEF(White_Space, "space")
DEF(Bidi_Mirrored, "Bidi_M")
DEF(Basic_Emoji, "")
DEF(Emoji, "")
DEF(Emoji_Component, "EComp")
DEF(Emoji_Modifier, "EMod")
DEF(Emoji_Modifier_Base, "EBase")
DEF(Emoji_Presentation, "EPres")
DEF(Emoji_Keycap_Sequence, "")
DEF(RGI_Emoji_Flag_Sequence, "")
DEF(RGI_Emoji_Modifier_Sequence, "")
DEF(RGI_Emoji_Tag_Sequence, "")
DEF(RGI_Emoji_ZWJ_Sequence, "")
DEF(Extended_Pictographic, "ExtPict")
DEF(Default_Ignorable_Code_Point, "DI")
DEF(ID_Start, "IDS")