From d62aa26e2c3c49e5b8a4298644cff290406d9357 Mon Sep 17 00:00:00 2001 From: Joseph Birr-Pixton Date: Wed, 16 Sep 2015 01:00:13 +0100 Subject: [PATCH] Deal with padding in one place, in blockwise. We introduce two functions, to assist with processing sequences of fixed bytes. One processes a single byte a bunch of times, the other does something more complicated. We use this for all hashes, CMAC and CBCMAC. This gives a good performance improvement. --- src/blockwise.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++ src/blockwise.h | 52 ++++++++++++++++++++++++++++++++++++++ src/cbcmac.c | 15 +++++++++-- src/ccm.c | 4 +-- src/cmac.c | 7 +++--- src/gcm.c | 11 +++++--- src/modes.h | 5 ++++ src/sha1.c | 13 ++++------ src/sha256.c | 13 ++++------ src/sha3.c | 17 +++---------- src/sha512.c | 13 ++++------ 11 files changed, 167 insertions(+), 50 deletions(-) diff --git a/src/blockwise.c b/src/blockwise.c index 19691dc..182c8c5 100644 --- a/src/blockwise.c +++ b/src/blockwise.c @@ -126,3 +126,70 @@ void cf_blockwise_xor(uint8_t *partial, size_t *npartial, size_t nblock, inb += taken; } } + +void cf_blockwise_acc_byte(uint8_t *partial, size_t *npartial, + size_t nblock, + uint8_t byte, size_t nbytes, + cf_blockwise_in_fn process, + void *ctx) +{ + /* only memset the whole of the block once */ + int filled = 0; + + while (nbytes) + { + size_t start = *npartial; + size_t count = MIN(nbytes, nblock - start); + + if (!filled) + memset(partial + start, byte, count); + + if (start == 0 && count == nblock) + filled = 1; + + if (start + count == nblock) + { + process(ctx, partial); + *npartial = 0; + } else { + *npartial += count; + } + + nbytes -= count; + } +} + +void cf_blockwise_acc_pad(uint8_t *partial, size_t *npartial, + size_t nblock, + uint8_t fbyte, uint8_t mbyte, uint8_t lbyte, + size_t nbytes, + cf_blockwise_in_fn process, + void *ctx) +{ + + switch (nbytes) + { + case 0: break; + case 1: fbyte ^= lbyte; + cf_blockwise_accumulate(partial, npartial, nblock, 
&fbyte, 1, process, ctx); + break; + case 2: + cf_blockwise_accumulate(partial, npartial, nblock, &fbyte, 1, process, ctx); + cf_blockwise_accumulate(partial, npartial, nblock, &lbyte, 1, process, ctx); + break; + default: + cf_blockwise_accumulate(partial, npartial, nblock, &fbyte, 1, process, ctx); + + /* If the middle and last bytes differ, then process the last byte separately. + * Otherwise, just extend the middle block size. */ + if (lbyte != mbyte) + { + cf_blockwise_acc_byte(partial, npartial, nblock, mbyte, nbytes - 2, process, ctx); + cf_blockwise_accumulate(partial, npartial, nblock, &lbyte, 1, process, ctx); + } else { + cf_blockwise_acc_byte(partial, npartial, nblock, mbyte, nbytes - 1, process, ctx); + } + + break; + } +} diff --git a/src/blockwise.h b/src/blockwise.h index 37be2aa..a20ff95 100644 --- a/src/blockwise.h +++ b/src/blockwise.h @@ -92,4 +92,56 @@ void cf_blockwise_xor(uint8_t *partial, size_t *npartial, cf_blockwise_out_fn newblock, void *ctx); +/* This function processes a single byte a number of times. It's useful + * for padding, and more efficient than calling cf_blockwise_accumulate + * a bunch of times. + * + * partial is the buffer (maintained by the caller) + * on entry, npartial is the currently valid count of used bytes on + * the front of partial. + * on exit, npartial is updated to reflect the status of partial. + * nblock is the blocksize to accumulate -- partial must be at least + * this long! + * process is the processing function, passed ctx and a pointer + * to the data to process (always exactly nblock bytes long!) + * which may not necessarily be the same as partial. + * byte is the byte to process, nbytes times. + */ +void cf_blockwise_acc_byte(uint8_t *partial, size_t *npartial, + size_t nblock, + uint8_t byte, size_t nbytes, + cf_blockwise_in_fn process, + void *ctx); + +/* This function attempts to process patterns of bytes common in + * block cipher padding. 
+ * + * This takes three bytes: + * - a first byte, fbyte, + * - a middle byte, mbyte, + * - a last byte, lbyte. + * + * If nbytes is zero, nothing happens. + * If nbytes is one, the byte fbyte ^ lbyte is processed. + * If nbytes is two, the fbyte then lbyte are processed. + * If nbytes is three or more, fbyte, then one or more mbytes, then lbyte + * are processed. + * + * partial is the buffer (maintained by the caller) + * on entry, npartial is the currently valid count of used bytes on + * the front of partial. + * on exit, npartial is updated to reflect the status of partial. + * nblock is the blocksize to accumulate -- partial must be at least + * this long! + * process is the processing function, passed ctx and a pointer + * to the data to process (always exactly nblock bytes long!) + * which may not necessarily be the same as partial. + */ +void cf_blockwise_acc_pad(uint8_t *partial, size_t *npartial, + size_t nblock, + uint8_t fbyte, uint8_t mbyte, uint8_t lbyte, + size_t nbytes, + cf_blockwise_in_fn process, + void *ctx); + #endif diff --git a/src/cbcmac.c b/src/cbcmac.c index 503c3a5..f0dfe87 100644 --- a/src/cbcmac.c +++ b/src/cbcmac.c @@ -53,6 +53,16 @@ void cf_cbcmac_stream_update(cf_cbcmac_stream *ctx, const uint8_t *data, size_t ctx); } +void cf_cbcmac_stream_finish_block_zero(cf_cbcmac_stream *ctx) +{ + if (ctx->used == 0) + return; + + memset(ctx->buffer + ctx->used, 0, ctx->prp->blocksz - ctx->used); + cbcmac_process(ctx, ctx->buffer); + ctx->used = 0; +} + void cf_cbcmac_stream_nopad_final(cf_cbcmac_stream *ctx, uint8_t out[CF_MAXBLOCK]) { assert(ctx->used == 0); @@ -62,7 +72,8 @@ void cf_cbcmac_stream_nopad_final(cf_cbcmac_stream *ctx, uint8_t out[CF_MAXBLOCK void cf_cbcmac_stream_pad_final(cf_cbcmac_stream *ctx, uint8_t out[CF_MAXBLOCK]) { uint8_t npad = ctx->prp->blocksz - ctx->used; - for (size_t i = 0; i < npad; i++) - cf_cbcmac_stream_update(ctx, &npad, 1); + cf_blockwise_acc_byte(ctx->buffer, &ctx->used, ctx->prp->blocksz, + npad, npad, + 
cbcmac_process, ctx); cf_cbcmac_stream_nopad_final(ctx, out); } diff --git a/src/ccm.c b/src/ccm.c index 039ea60..7ef87fc 100644 --- a/src/ccm.c +++ b/src/ccm.c @@ -35,9 +35,7 @@ static void write_be(uint8_t *out, size_t value, size_t bytes) static void zero_pad(cf_cbcmac_stream *cm) { - const uint8_t zero_byte = 0; - while (cm->used != 0) - cf_cbcmac_stream_update(cm, &zero_byte, 1); + cf_cbcmac_stream_finish_block_zero(cm); } /* nb. block is general workspace. */ diff --git a/src/cmac.c b/src/cmac.c index 274c4b0..f96d4d2 100644 --- a/src/cmac.c +++ b/src/cmac.c @@ -132,10 +132,9 @@ void cf_cmac_stream_update(cf_cmac_stream *ctx, const uint8_t *data, size_t len, /* Input padding */ if (needpad) { - uint8_t pad_block[CF_MAXBLOCK] = { 0x80 }; - cf_blockwise_accumulate(ctx->buffer, &ctx->used, blocksz, - pad_block, blocksz - ctx->used, - cmac_process_final_pad, ctx); + cf_blockwise_acc_pad(ctx->buffer, &ctx->used, blocksz, + 0x80, 0x00, 0x00, blocksz - ctx->used, + cmac_process_final_pad, ctx); } } diff --git a/src/gcm.c b/src/gcm.c index c9b9907..ca4d266 100644 --- a/src/gcm.c +++ b/src/gcm.c @@ -56,7 +56,7 @@ static void ghash_block(void *vctx, const uint8_t *data) static void ghash_add(ghash_ctx *ctx, const uint8_t *buf, size_t n) { cf_blockwise_accumulate(ctx->buffer, &ctx->buffer_used, - 16, + sizeof ctx->buffer, buf, n, ghash_block, ctx); @@ -64,9 +64,12 @@ static void ghash_add(ghash_ctx *ctx, const uint8_t *buf, size_t n) static void ghash_add_pad(ghash_ctx *ctx) { - uint8_t byte = 0x00; - while (ctx->buffer_used != 0) - ghash_add(ctx, &byte, 1); + if (ctx->buffer_used == 0) + return; + + memset(ctx->buffer + ctx->buffer_used, 0, sizeof(ctx->buffer) - ctx->buffer_used); + ghash_block(ctx, ctx->buffer); + ctx->buffer_used = 0; } static void ghash_add_aad(ghash_ctx *ctx, const uint8_t *buf, size_t n) diff --git a/src/modes.h b/src/modes.h index b9bf677..1b83985 100644 --- a/src/modes.h +++ b/src/modes.h @@ -194,6 +194,11 @@ void 
cf_cbcmac_stream_reset(cf_cbcmac_stream *ctx); * Process ndata bytes at data. */ void cf_cbcmac_stream_update(cf_cbcmac_stream *ctx, const uint8_t *data, size_t ndata); +/* .. c:function:: $DECL + * Finish the current block of data by adding zeroes. Does nothing if there + * are no bytes awaiting processing. */ +void cf_cbcmac_stream_finish_block_zero(cf_cbcmac_stream *ctx); + /* .. c:function:: $DECL * Output the MAC to ctx->prp->blocksz bytes at out. * ctx->used must be zero: the inputed message must be an exact number of diff --git a/src/sha1.c b/src/sha1.c index 909a326..8b7d02f 100644 --- a/src/sha1.c +++ b/src/sha1.c @@ -116,18 +116,15 @@ void cf_sha1_digest_final(cf_sha1_context *ctx, uint8_t hash[CF_SHA1_HASHSZ]) digested_bytes = digested_bytes * CF_SHA1_BLOCKSZ + ctx->npartial; uint64_t digested_bits = digested_bytes * 8; - size_t zeroes = CF_SHA1_BLOCKSZ - ((digested_bytes + 1 + 8) % CF_SHA1_BLOCKSZ); + size_t padbytes = CF_SHA1_BLOCKSZ - ((digested_bytes + 8) % CF_SHA1_BLOCKSZ); /* Hash 0x80 00 ... block first. */ - uint8_t buf[8]; - buf[0] = 0x80; - buf[1] = 0x00; - cf_sha1_update(ctx, &buf[0], 1); - - while (zeroes--) - cf_sha1_update(ctx, &buf[1], 1); + cf_blockwise_acc_pad(ctx->partial, &ctx->npartial, sizeof ctx->partial, + 0x80, 0x00, 0x00, padbytes, + sha1_update_block, ctx); /* Now hash length. */ + uint8_t buf[8]; write64_be(digested_bits, buf); cf_sha1_update(ctx, buf, 8); diff --git a/src/sha256.c b/src/sha256.c index d603e29..6a46598 100644 --- a/src/sha256.c +++ b/src/sha256.c @@ -172,18 +172,15 @@ void cf_sha256_digest_final(cf_sha256_context *ctx, uint8_t hash[CF_SHA256_HASHS digested_bytes = digested_bytes * CF_SHA256_BLOCKSZ + ctx->npartial; uint64_t digested_bits = digested_bytes * 8; - size_t zeroes = CF_SHA256_BLOCKSZ - ((digested_bytes + 1 + 8) % CF_SHA256_BLOCKSZ); + size_t padbytes = CF_SHA256_BLOCKSZ - ((digested_bytes + 8) % CF_SHA256_BLOCKSZ); /* Hash 0x80 00 ... block first. 
*/ - uint8_t buf[8]; - buf[0] = 0x80; - buf[1] = 0x00; - cf_sha256_update(ctx, &buf[0], 1); - - while (zeroes--) - cf_sha256_update(ctx, &buf[1], 1); + cf_blockwise_acc_pad(ctx->partial, &ctx->npartial, sizeof ctx->partial, + 0x80, 0x00, 0x00, padbytes, + sha256_update_block, ctx); /* Now hash length. */ + uint8_t buf[8]; write64_be(digested_bits, buf); cf_sha256_update(ctx, buf, 8); diff --git a/src/sha3.c b/src/sha3.c index 6b11cee..1eccd36 100644 --- a/src/sha3.c +++ b/src/sha3.c @@ -311,21 +311,12 @@ static void sha3_update(cf_sha3_context *ctx, const void *data, size_t nbytes) static void pad(cf_sha3_context *ctx, uint8_t domain, size_t npad) { - uint8_t padding[CF_SHA3_224_BLOCKSZ]; - assert(npad >= 1); - if (npad == 1) - { - padding[0] = domain | 0x80; - sha3_update(ctx, padding, 1); - return; - } - - memset(padding, 0, npad); - padding[0] = domain; - padding[npad - 1] = 0x80; - sha3_update(ctx, padding, npad); + cf_blockwise_acc_pad(ctx->partial, &ctx->npartial, ctx->rate, + domain, 0x00, 0x80, + npad, + sha3_block, ctx); } static void pad_and_squeeze(cf_sha3_context *ctx, uint8_t *out, size_t nout) diff --git a/src/sha512.c b/src/sha512.c index 5551c57..2d1c896 100644 --- a/src/sha512.c +++ b/src/sha512.c @@ -188,18 +188,15 @@ void cf_sha512_digest_final(cf_sha512_context *ctx, uint8_t hash[CF_SHA512_HASHS digested_bytes = digested_bytes * CF_SHA512_BLOCKSZ + ctx->npartial; uint64_t digested_bits = digested_bytes * 8; - size_t zeroes = CF_SHA512_BLOCKSZ - ((digested_bytes + 1 + 16) % CF_SHA512_BLOCKSZ); + size_t padbytes = CF_SHA512_BLOCKSZ - ((digested_bytes + 16) % CF_SHA512_BLOCKSZ); /* Hash 0x80 00 ... block first. */ - uint8_t buf[8]; - buf[0] = 0x80; - buf[1] = 0x00; - cf_sha512_update(ctx, &buf[0], 1); - - while (zeroes--) - cf_sha512_update(ctx, &buf[1], 1); + cf_blockwise_acc_pad(ctx->partial, &ctx->npartial, sizeof ctx->partial, + 0x80, 0x00, 0x00, padbytes, + sha512_update_block, ctx); /* Now hash length (this is 128 bits long). 
*/ + uint8_t buf[8]; write64_be(0, buf); cf_sha512_update(ctx, buf, 8); write64_be(digested_bits, buf);