Skip to content

Commit

Permalink
encoder: Optimise SIMD string escaping
Browse files Browse the repository at this point in the history
  • Loading branch information
JakubOnderka committed Jan 1, 2025
1 parent 64722e4 commit 5e0ee8e
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 24 deletions.
24 changes: 12 additions & 12 deletions src/simdjson_encoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -474,38 +474,38 @@ static zend_always_inline size_t simdjson_append_escape(char *buf, const char c)
template<typename T>
static zend_always_inline void simdjson_escape_long_string(smart_str *buf, const char *s, size_t len)
{
size_t i = 0;
T chunk;
const char* start = s;
const size_t vlen = len & (int) (~(sizeof(chunk) - 1)); // max length that can be processed in chunk mode
char *output;

// vlen = len - (len % sizeof(simdjson_vector8))
size_t vlen = len & (int) (~(sizeof(chunk) - 1));

output = simdjson_smart_str_alloc(buf, len + 2);
*output++ = '"';

// Iterate input string in chunks
for (; i < vlen; i += sizeof(chunk)) {
while (s < start + vlen) {
// Load chars to vector
chunk.load((const uint8_t *) &s[i]);
chunk.load((const uint8_t *) s);
// Check chunk if contains char that needs to be escaped
auto needs_escaping = chunk.needs_escaping();
if (EXPECTED(!needs_escaping)) {
// If no escape char found, store chunk in output buffer and move buffer pointer
chunk.store((uint8_t*)output);
output += sizeof(chunk);
s += sizeof(chunk);
} else {
// Allocate enough space for escaped chunk + space for rest of unescaped string
SIDMJSON_ZSTR_ALLOC(sizeof(chunk) * SIMDJSON_ENCODER_ESCAPE_LENGTH + (len - i));
SIDMJSON_ZSTR_ALLOC(sizeof(chunk) * SIMDJSON_ENCODER_ESCAPE_LENGTH + ((start + len) - s));

// Copy first bytes that do not need escaping in chunk without checking
auto j = chunk.escape_index(needs_escaping);
memcpy(output, &s[i], j);
memcpy(output, s, j);
output += j;
s += j;

// Process rest of chunk char by char and escape required char
for (; j < sizeof(chunk); j++) {
char c = s[i + j];
char c = *s++;
if (EXPECTED(simdjson_need_escaping[(uint8_t)c] == 0)) {
*output++ = c;
} else {
Expand All @@ -515,12 +515,12 @@ static zend_always_inline void simdjson_escape_long_string(smart_str *buf, const
}
}

// Ensure that buf contains enoug space that we can call unsafe methods
// Ensure that buf contains enough space that we can call unsafe methods
SIDMJSON_ZSTR_ALLOC(sizeof(chunk) * SIMDJSON_ENCODER_ESCAPE_LENGTH + 1);

// Finish last chars of string
for (; i < len; i++) {
char c = s[i];
while (s < start + len) {
char c = *s++;
if (EXPECTED(simdjson_need_escaping[(uint8_t)c] == 0)) {
*output++ = c;
} else {
Expand Down
69 changes: 57 additions & 12 deletions src/simdjson_encoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,18 +80,63 @@ static const _simdjson_escaped simdjson_escape[] = {
{"", 0},
{"", 0},
{"\\\"", 2}, // 34
{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, // 44
{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, // 54
{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, // 64
{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, // 74
{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0}, // 84
{"", 0}, {"", 0}, {"", 0}, {"", 0}, {"", 0},
{"", 0}, {"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0}, // 44
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0}, // 54
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0}, // 64
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0}, // 74
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0}, // 84
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"", 0},
{"\\\\", 2}, // 92
};

Expand Down

0 comments on commit 5e0ee8e

Please sign in to comment.