diff --git a/Makefile.am b/Makefile.am index 4398988cd..07feb08d1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -57,6 +57,8 @@ cpuminer_SOURCES = \ lyra2/Lyra2.c lyra2/Sponge.c \ yescrypt/yescrypt-common.c yescrypt/yescrypt-best.c \ yescrypt/sha256_Y.c \ + yespower-1.0.1/sha256.c \ + yespower-1.0.1/yespower-opt.c \ algo/allium.c \ algo/axiom.c \ algo/bastion.c \ @@ -80,6 +82,7 @@ cpuminer_SOURCES = \ algo/lyra2re.c \ algo/lyra2rev2.c \ algo/lyra2v3.c \ + algo/minotaur.c \ algo/myr-groestl.c \ algo/keccak.c \ algo/pentablake.c \ diff --git a/README.md b/README.md index 315a84a83..6631e70d4 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,8 @@ Algorithms * ✓ __lyra2RE__ (Cryptocoin) * ✓ __lyra2REv2__ * ✓ __lyra2REv3__ (VertCoin [VTC]) + * ✓ __minotaur__ (Ring [RNG]) + * ✓ __minotaurx__ (Avian[AVN], Litecash[LCC], Mazacoin[MAZA]) * ✓ __myr-gr__ Myriad-Groestl (MyriadCoin [MYR]) * ✓ __neoscrypt__ (Feathercoin) * ✓ __nist5__ (MistCoin [MIC], TalkCoin [TAC], ...) diff --git a/algo/minotaur.c b/algo/minotaur.c new file mode 100644 index 000000000..714c7a7db --- /dev/null +++ b/algo/minotaur.c @@ -0,0 +1,290 @@ +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Config +#define MINOTAUR_ALGO_COUNT 16 +//#define MINOTAUR_DEBUG + +static const yespower_params_t yespower_params = {YESPOWER_1_0, 2048, 8, "et in arcadia ego", 17}; + +typedef struct TortureNode TortureNode; +typedef struct TortureGarden TortureGarden; + +// Graph of hash algos plus SPH contexts +struct TortureGarden { + sph_blake512_context context_blake; + sph_bmw512_context context_bmw; + sph_cubehash512_context context_cubehash; + sph_echo512_context context_echo; + sph_fugue512_context context_fugue; + sph_groestl512_context context_groestl; + sph_hamsi512_context context_hamsi; + sph_jh512_context context_jh; + 
sph_keccak512_context context_keccak; + sph_luffa512_context context_luffa; + sph_shabal512_context context_shabal; + sph_shavite512_context context_shavite; + sph_simd512_context context_simd; + sph_skein512_context context_skein; + sph_whirlpool_context context_whirlpool; + sph_sha512_context context_sha2; + + struct TortureNode { + unsigned int algo; + TortureNode *childLeft; + TortureNode *childRight; + } nodes[22]; +}; + +// Get a 64-byte hash for given 64-byte input, using given TortureGarden contexts and given algo index +void get_hash(void *output, const void *input, TortureGarden *garden, unsigned int algo) +{ + unsigned char _ALIGN(64) hash[64]; + memset(hash, 0, sizeof(hash)); // Doesn't affect Minotaur as all hash outputs are 64 bytes; required for MinotaurX due to yespower's 32 byte output. + + switch (algo) { + case 0: + sph_blake512_init(&garden->context_blake); + sph_blake512(&garden->context_blake, input, 64); + sph_blake512_close(&garden->context_blake, hash); + break; + case 1: + sph_bmw512_init(&garden->context_bmw); + sph_bmw512(&garden->context_bmw, input, 64); + sph_bmw512_close(&garden->context_bmw, hash); + break; + case 2: + sph_cubehash512_init(&garden->context_cubehash); + sph_cubehash512(&garden->context_cubehash, input, 64); + sph_cubehash512_close(&garden->context_cubehash, hash); + break; + case 3: + sph_echo512_init(&garden->context_echo); + sph_echo512(&garden->context_echo, input, 64); + sph_echo512_close(&garden->context_echo, hash); + break; + case 4: + sph_fugue512_init(&garden->context_fugue); + sph_fugue512(&garden->context_fugue, input, 64); + sph_fugue512_close(&garden->context_fugue, hash); + break; + case 5: + sph_groestl512_init(&garden->context_groestl); + sph_groestl512(&garden->context_groestl, input, 64); + sph_groestl512_close(&garden->context_groestl, hash); + break; + case 6: + sph_hamsi512_init(&garden->context_hamsi); + sph_hamsi512(&garden->context_hamsi, input, 64); + sph_hamsi512_close(&garden->context_hamsi, 
hash); + break; + case 7: + sph_sha512_init(&garden->context_sha2); + sph_sha512(&garden->context_sha2, input, 64); + sph_sha512_close(&garden->context_sha2, hash); + break; + case 8: + sph_jh512_init(&garden->context_jh); + sph_jh512(&garden->context_jh, input, 64); + sph_jh512_close(&garden->context_jh, hash); + break; + case 9: + sph_keccak512_init(&garden->context_keccak); + sph_keccak512(&garden->context_keccak, input, 64); + sph_keccak512_close(&garden->context_keccak, hash); + break; + case 10: + sph_luffa512_init(&garden->context_luffa); + sph_luffa512(&garden->context_luffa, input, 64); + sph_luffa512_close(&garden->context_luffa, hash); + break; + case 11: + sph_shabal512_init(&garden->context_shabal); + sph_shabal512(&garden->context_shabal, input, 64); + sph_shabal512_close(&garden->context_shabal, hash); + break; + case 12: + sph_shavite512_init(&garden->context_shavite); + sph_shavite512(&garden->context_shavite, input, 64); + sph_shavite512_close(&garden->context_shavite, hash); + break; + case 13: + sph_simd512_init(&garden->context_simd); + sph_simd512(&garden->context_simd, input, 64); + sph_simd512_close(&garden->context_simd, hash); + break; + case 14: + sph_skein512_init(&garden->context_skein); + sph_skein512(&garden->context_skein, input, 64); + sph_skein512_close(&garden->context_skein, hash); + break; + case 15: + sph_whirlpool_init(&garden->context_whirlpool); + sph_whirlpool(&garden->context_whirlpool, input, 64); + sph_whirlpool_close(&garden->context_whirlpool, hash); + break; + // NB: The CPU-hard gate must be case MINOTAUR_ALGO_COUNT. + case 16: + yespower_tls(input, 64, &yespower_params, (yespower_binary_t*)hash); + } + + // Output the hash + memcpy(output, hash, 64); +} + +// Recursively traverse a given torture garden starting with a given hash and given node within the garden. The hash is overwritten with the final hash. 
+void traverse_garden(TortureGarden *garden, void *hash, TortureNode *node) +{ + unsigned char _ALIGN(64) partialHash[64]; + memset(partialHash, 0, sizeof(partialHash)); // Doesn't affect Minotaur as all hash outputs are 64 bytes; required for MinotaurX due to yespower's 32 byte output. + get_hash(partialHash, hash, garden, node->algo); + +#ifdef MINOTAUR_DEBUG + printf("* Ran algo %d. Partial hash:\t", node->algo); + for (int i = 63; i >= 0; i--) printf("%02x", partialHash[i]); + printf("\n"); + fflush(0); +#endif + + if (partialHash[63] % 2 == 0) { // Last byte of output hash is even + if (node->childLeft != NULL) + traverse_garden(garden, partialHash, node->childLeft); + } else { // Last byte of output hash is odd + if (node->childRight != NULL) + traverse_garden(garden, partialHash, node->childRight); + } + + memcpy(hash, partialHash, 64); +} + +// Associate child nodes with a parent node +inline void link_nodes(TortureNode *parent, TortureNode *childLeft, TortureNode *childRight) +{ + parent->childLeft = childLeft; + parent->childRight = childRight; +} + +// Produce a 32-byte hash from 80-byte input data +void minotaurhash(void *output, const void *input, bool minotaurX) +{ + // Create torture garden nodes. Note that both sides of 19 and 20 lead to 21, and 21 has no children (to make traversal complete). + // The successful path through the garden visits 7 nodes. 
+ TortureGarden garden; + link_nodes(&garden.nodes[0], &garden.nodes[1], &garden.nodes[2]); + link_nodes(&garden.nodes[1], &garden.nodes[3], &garden.nodes[4]); + link_nodes(&garden.nodes[2], &garden.nodes[5], &garden.nodes[6]); + link_nodes(&garden.nodes[3], &garden.nodes[7], &garden.nodes[8]); + link_nodes(&garden.nodes[4], &garden.nodes[9], &garden.nodes[10]); + link_nodes(&garden.nodes[5], &garden.nodes[11], &garden.nodes[12]); + link_nodes(&garden.nodes[6], &garden.nodes[13], &garden.nodes[14]); + link_nodes(&garden.nodes[7], &garden.nodes[15], &garden.nodes[16]); + link_nodes(&garden.nodes[8], &garden.nodes[15], &garden.nodes[16]); + link_nodes(&garden.nodes[9], &garden.nodes[15], &garden.nodes[16]); + link_nodes(&garden.nodes[10], &garden.nodes[15], &garden.nodes[16]); + link_nodes(&garden.nodes[11], &garden.nodes[17], &garden.nodes[18]); + link_nodes(&garden.nodes[12], &garden.nodes[17], &garden.nodes[18]); + link_nodes(&garden.nodes[13], &garden.nodes[17], &garden.nodes[18]); + link_nodes(&garden.nodes[14], &garden.nodes[17], &garden.nodes[18]); + link_nodes(&garden.nodes[15], &garden.nodes[19], &garden.nodes[20]); + link_nodes(&garden.nodes[16], &garden.nodes[19], &garden.nodes[20]); + link_nodes(&garden.nodes[17], &garden.nodes[19], &garden.nodes[20]); + link_nodes(&garden.nodes[18], &garden.nodes[19], &garden.nodes[20]); + link_nodes(&garden.nodes[19], &garden.nodes[21], &garden.nodes[21]); + link_nodes(&garden.nodes[20], &garden.nodes[21], &garden.nodes[21]); + garden.nodes[21].childLeft = NULL; + garden.nodes[21].childRight = NULL; + + // Find initial sha512 hash + unsigned char _ALIGN(64) hash[64]; + sph_sha512_init(&garden.context_sha2); + sph_sha512(&garden.context_sha2, input, 80); + sph_sha512_close(&garden.context_sha2, hash); + +#ifdef MINOTAUR_DEBUG + printf("** Initial hash:\t\t"); + for (int i = 63; i >= 0; i--) printf("%02x", hash[i]); + printf("\n"); + fflush(0); +#endif + + // Assign algos to torture garden nodes based on initial hash + 
for (int i = 0; i < 22; i++) + garden.nodes[i].algo = hash[i] % MINOTAUR_ALGO_COUNT; + + // Hardened garden gates on MinotaurX + if (minotaurX) + garden.nodes[21].algo = MINOTAUR_ALGO_COUNT; + + // Send the initial hash through the torture garden + traverse_garden(&garden, hash, &garden.nodes[0]); + + // Truncate the result to 32 bytes + memcpy(output, hash, 32); + +#ifdef MINOTAUR_DEBUG + printf("** Final hash:\t\t\t"); + for (int i = 31; i >= 0; i--) printf("%02x", hash[i]); + printf("\n"); + fflush(0); +#endif +} + +// Scan driver +int scanhash_minotaur(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, bool minotaurX) +{ + uint32_t _ALIGN(64) hash[8]; + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + uint32_t nonce = first_nonce; + volatile uint8_t *restart = &(work_restart[thr_id].restart); + + if (opt_benchmark) + ptarget[7] = 0x0cff; + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + do { + be32enc(&endiandata[19], nonce); + minotaurhash(hash, endiandata, minotaurX); + + if (hash[7] <= Htarg && fulltest(hash, ptarget)) { + work_set_target_ratio(work, hash); + pdata[19] = nonce; + *hashes_done = pdata[19] - first_nonce; + return 1; + } + nonce++; + + } while (nonce < max_nonce && !(*restart)); + + pdata[19] = nonce; + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +} diff --git a/cpu-miner.c b/cpu-miner.c index 5e8948d80..b59db1795 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -105,6 +105,8 @@ enum algos { ALGO_LYRA2, /* Lyra2RE */ ALGO_LYRA2REV2, /* Lyra2REv2 */ ALGO_LYRA2V3, /* Lyra2REv3 (Vertcoin) */ + ALGO_MINOTAUR, /* Minotaur (Ring) */ + ALGO_MINOTAURX, /* Minotaurx (Avian, Litecash, Maza) */ ALGO_MYR_GR, /* Myriad Groestl */ ALGO_NIST5, /* Nist5 */ ALGO_PENTABLAKE, /* Pentablake */ @@ -177,6 +179,8 @@ static const char *algo_names[] = { "lyra2re", "lyra2rev2", 
"lyra2v3", + "minotaur", + "minotaurx", "myr-gr", "nist5", "pentablake", @@ -347,6 +351,8 @@ Options:\n\ lyra2rev2 Lyra2REv2\n\ lyra2v3 Lyra2REv3 (Vertcoin)\n\ myr-gr Myriad-Groestl\n\ + minotaur Ring\n\ + minotaurx Avian, Litecash, Maza\n\ neoscrypt NeoScrypt(128, 2, 1)\n\ nist5 Nist5\n\ pluck Pluck:128 (Supcoin)\n\ @@ -2200,6 +2206,8 @@ static void *miner_thread(void *userdata) max64 = 0x40LL; break; case ALGO_DROP: + case ALGO_MINOTAUR: + case ALGO_MINOTAURX: case ALGO_PLUCK: case ALGO_YESCRYPT: case ALGO_YESCRYPTR8: @@ -2355,6 +2363,12 @@ static void *miner_thread(void *userdata) case ALGO_LYRA2V3: rc = scanhash_lyra2v3(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_MINOTAUR: + rc = scanhash_minotaur(thr_id, &work, max_nonce, &hashes_done, false); + break; + case ALGO_MINOTAURX: + rc = scanhash_minotaur(thr_id, &work, max_nonce, &hashes_done, true); + break; case ALGO_MYR_GR: rc = scanhash_myriad(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/miner.h b/miner.h index 34574030f..59dddc18e 100644 --- a/miner.h +++ b/miner.h @@ -223,6 +223,7 @@ int scanhash_luffa(int thr_id, struct work *work, uint32_t max_nonce, uint64_t * int scanhash_lyra2(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done); int scanhash_lyra2rev2(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done); int scanhash_lyra2v3(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done); +int scanhash_minotaur(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, bool minotaurX); int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done); int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, uint32_t profile); int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done); @@ -531,6 +532,7 @@ void luffahash(void *output, const void *input); void lyra2_hash(void *state, const void *input); void 
lyra2rev2_hash(void *state, const void *input); void lyra2v3_hash(void *state, const void *input); +void minotaurhash(void *output, const void *input, bool minotaurX); void myriadhash(void *output, const void *input); void neoscrypt(unsigned char *output, const unsigned char *password, uint32_t profile); void nist5hash(void *output, const void *input); diff --git a/util.c b/util.c index 5ba320b53..2ef847164 100644 --- a/util.c +++ b/util.c @@ -2401,6 +2401,12 @@ void print_hash_tests(void) cryptonight_hash(&hash[0], &buf[0]); printpfx("monero", hash); + minotaurhash(&hash[0], &buf[0], false); + printpfx("minotaur", hash); + + minotaurhash(&hash[0], &buf[0], true); + printpfx("minotaurx", hash); + myriadhash(&hash[0], &buf[0]); printpfx("myr-gr", hash); diff --git a/yespower-1.0.1/CHANGES b/yespower-1.0.1/CHANGES new file mode 100644 index 000000000..82420210c --- /dev/null +++ b/yespower-1.0.1/CHANGES @@ -0,0 +1,18 @@ + Changes made between 1.0.0 (2018/07/12) and 1.0.1 (2019/06/30). + +Fill the destination buffer with all set bits on error for fail-safety +of the caller's "< target" check in case the caller neglects to check +for errors. + +Simplified SMix2 for its final invocation with Nloop=2 in yespower 0.5. + +Revised the "XOR of yespower" tests to trigger duplicate index in the +last SMix2 invocation in yespower 0.5 for N=2048 with at least one of +the values of r being tested. This is needed to test that a proper +kind of BlockMix is used in that special case, which would previously be +left untested. + +Added x32 ABI support (x86-64 with 32-bit pointers). + +Added a bit more detail to the README on caching of the computed PoW +hashes when integrating yespower in an altcoin based on Bitcoin Core. 
diff --git a/yespower-1.0.1/PERFORMANCE b/yespower-1.0.1/PERFORMANCE new file mode 100644 index 000000000..99cddd051 --- /dev/null +++ b/yespower-1.0.1/PERFORMANCE @@ -0,0 +1,95 @@ +Included with yespower is the "benchmark" program, which is built by +simply invoking "make". When invoked without parameters, it tests +yespower 0.5 at N = 2048, r = 8, which appears to be the lowest setting +in use by existing cryptocurrencies. On an i7-4770K with 4x DDR3-1600 +(on two memory channels) running CentOS 7 for x86-64 (and built with +CentOS 7's default version of gcc) and with thread affinity set, this +reports between 3700 and 3800 hashes per second for both SSE2 and AVX +builds, e.g.: + +$ GOMP_CPU_AFFINITY=0-7 OMP_NUM_THREADS=4 ./benchmark +version=0.5 N=2048 r=8 +Will use 2048.00 KiB RAM +a5 9f ec 4c 4f dd a1 6e 3b 14 05 ad da 66 d5 25 b6 8e 7c ad fc fe 6a c0 66 c7 ad 11 8c d8 05 90 +Benchmarking 1 thread ... +1018 H/s real, 1018 H/s virtual (2047 hashes in 2.01 seconds) +Benchmarking 4 threads ... +3773 H/s real, 950 H/s virtual (8188 hashes in 2.17 seconds) +min 0.984 ms, avg 1.052 ms, max 1.074 ms + +Running 8 threads (to match the logical rather than the physical CPU +core count) results in very slightly worse performance on this system, +but this might be the other way around on another and/or with other +parameters. Upgrading to yespower 1.0, performance at these parameters +improves to almost 4000 hashes per second: + +$ GOMP_CPU_AFFINITY=0-7 OMP_NUM_THREADS=4 ./benchmark 10 +version=1.0 N=2048 r=8 +Will use 2048.00 KiB RAM +d0 78 cd d4 cf 3f 5a a8 4e 3c 4a 58 66 29 81 d8 2d 27 e5 67 36 37 c4 be 77 63 61 32 24 c1 8a 93 +Benchmarking 1 thread ... +1080 H/s real, 1080 H/s virtual (4095 hashes in 3.79 seconds) +Benchmarking 4 threads ... 
+3995 H/s real, 1011 H/s virtual (16380 hashes in 4.10 seconds) +min 0.923 ms, avg 0.989 ms, max 1.137 ms + +Running 8 threads results in substantial slowdown with this new version +(to between 3200 and 3400 hashes per second) because of cache thrashing. + +For higher settings such as those achieving 8 MiB instead of the 2 MiB +above, this system performs at around 800 hashes per second for yespower +0.5 and at around 830 hashes per second for yespower 1.0: + +$ GOMP_CPU_AFFINITY=0-7 OMP_NUM_THREADS=4 ./benchmark 5 2048 32 +version=0.5 N=2048 r=32 +Will use 8192.00 KiB RAM +56 0a 89 1b 5c a2 e1 c6 36 11 1a 9f f7 c8 94 a5 d0 a2 60 2f 43 fd cf a5 94 9b 95 e2 2f e4 46 1e +Benchmarking 1 thread ... +265 H/s real, 265 H/s virtual (1023 hashes in 3.85 seconds) +Benchmarking 4 threads ... +803 H/s real, 200 H/s virtual (4092 hashes in 5.09 seconds) +min 4.924 ms, avg 4.980 ms, max 5.074 ms + +$ GOMP_CPU_AFFINITY=0-7 OMP_NUM_THREADS=4 ./benchmark 10 2048 32 +version=1.0 N=2048 r=32 +Will use 8192.00 KiB RAM +f7 69 26 ae 4a dc 56 53 90 2f f0 22 78 ea aa 39 eb 99 84 11 ac 3e a6 24 2e 19 6d fb c4 3d 68 25 +Benchmarking 1 thread ... +275 H/s real, 275 H/s virtual (1023 hashes in 3.71 seconds) +Benchmarking 4 threads ... +831 H/s real, 209 H/s virtual (4092 hashes in 4.92 seconds) +min 3.614 ms, avg 4.769 ms, max 5.011 ms + +Again, running 8 threads results in a slowdown, albeit not as bad as can +be seen for lower settings. + +On x86(-64), the following code versions may reasonably be built: SSE2, +AVX, and XOP. (There's no reason to build for AVX2 and higher, which is +unsuitable for and thus unused by current yespower anyway. There's also +no reason to build yespower as-is for SSE4, although there's a disabled +by default 32-bit specific SSE4 code version that may be re-enabled and +given a try if someone is so inclined; it may perform slightly slower or +slightly faster across different systems.) 
+ +yescrypt and especially yespower 1.0 have been designed to fit the SSE2 +instruction set almost perfectly, so there's very little benefit from +the AVX and XOP builds, yet even at yespower 1.0 there may be +performance differences between SSE2, AVX, and XOP builds within 2% or +so (and it is unclear which is the fastest on a given system until +tested, except that where XOP is supported it is almost always faster +than AVX). + +Proper setting of thread affinities to run exactly one thread per +physical CPU core is non-trivial. In the above examples, it so happened +that the first 4 logical CPU numbers corresponded to different physical +cores, but this won't always be the case. This can vary even between +apparently similar systems. On Linux, the mapping of logical CPUs to +physical cores may be obtained from /proc/cpuinfo (on x86[-64] and MIC) +or sysfs, which an optimized implementation of e.g. a cryptocurrency +miner could use. If you do not bother obtaining this information from +the operating system, you might be better off not setting thread +affinities at all (in order to avoid the risk of doing this incorrectly, +which would have a greater negative performance impact) and/or running +as many threads as there are logical CPUs. Also, there's no certainty +whether different and future CPUs will run yespower faster using one or +maybe more threads per physical core. diff --git a/yespower-1.0.1/README b/yespower-1.0.1/README new file mode 100644 index 000000000..c7a704db7 --- /dev/null +++ b/yespower-1.0.1/README @@ -0,0 +1,203 @@ + What is yespower? + +yespower is a proof-of-work (PoW) focused fork of yescrypt. While +yescrypt is a password-based key derivation function (KDF) and password +hashing scheme, and thus is meant for processing passwords, yespower is +meant for processing trial inputs such as block headers (including +nonces) in PoW-based blockchains. + +On its own, yespower isn't a complete proof-of-work system. 
Rather, in +the blockchain use case, yespower's return value is meant to be checked +for being numerically no greater than the blockchain's current target +(which is related to mining difficulty) or else the proof attempt +(yespower invocation) is to be repeated (with a different nonce) until +the condition is finally met (allowing a new block to be mined). This +process isn't specific to yespower and isn't part of yespower itself +(rather, it is similar in many PoW-based blockchains and is to be +defined and implemented externally to yespower) and thus isn't described +in here any further. + + + Why or why not yespower? + +Different proof-of-work schemes in existence vary in many aspects, +including in friendliness to different types of hardware. There's +demand for all sorts of hardware (un)friendliness in those - for +different use cases and by different communities. + +yespower in particular is designed to be CPU-friendly, GPU-unfriendly, +and FPGA/ASIC-neutral. In other words, it's meant to be relatively +efficient to compute on current CPUs and relatively inefficient on +current GPUs. Unfortunately, being GPU-unfriendly also means that +eventual FPGA and ASIC implementations will only compete with CPUs, and +at least ASICs will win over the CPUs (FPGAs might not because of this +market's peculiarities - large FPGAs are even more "over-priced" than +large CPUs are), albeit by far not to the extent they did e.g. for +Bitcoin and Litecoin. + +There's a lot of talk about "ASIC resistance". What is (or should be) +meant by that is limiting the advantage of specialized ASICs. While +limiting the advantage at KDF to e.g. 10x and at password hashing to +e.g. 100x (talking orders of magnitude here, in whatever terms) may be +considered "ASIC resistant" (as compared to e.g. 
100,000x we'd have +without trying), similar improvement factors are practically not "ASIC +resistant" for cryptocurrency mining where they can make all the +difference between CPU mining being profitable and not. There might +also exist in-between PoW use cases where moderate ASIC advantage is OK, +such as with non-cryptocurrency and/or private/permissioned blockchains. + +Thus, current yespower may be considered either a short-term choice +(valid until one of its uses provides sufficient perceived incentive to +likely result in specialized ASICs) or a deliberate choice of a pro-CPU, +anti-GPU, moderately-pro-ASIC PoW scheme. It is also possible to +respond to known improvements in future GPUs/implementations and/or to +ASICs with new versions of yespower that users would need to switch to. + + + yespower versions. + +yespower includes optimized and specialized re-implementation of the +obsolete yescrypt 0.5 (based off its first submission to Password +Hashing Competition back in 2014) now re-released as yespower 0.5, and +brand new proof-of-work specific variation known as yespower 1.0. + +yespower 0.5 is intended as a compatible upgrade for cryptocurrencies +that already use yescrypt 0.5 (providing a few percent speedup), and +yespower 1.0 may be used as a further upgrade or a new choice of PoW by +those and other cryptocurrencies and other projects. + +There are many significant differences between yespower 0.5 and 1.0 +under the hood, but the main user visible difference is yespower 1.0 +greatly improving on GPU-unfriendliness in light of improvements seen in +modern GPUs (up to and including NVIDIA Volta) and GPU implementations +of yescrypt 0.5. This is achieved mostly through greater use of CPUs' +L2 cache. 
+ +The version of algorithm to use is requested through parameters, +allowing for both algorithms to co-exist in client and miner +implementations (such as in preparation for a cryptocurrency hard-fork +and/or supporting multiple cryptocurrencies in one program). + + + Parameter selection. + +For new uses of yespower, set the requested version to the highest +supported, and set N*r to the highest you can reasonably afford in terms +of proof verification time (which might in turn be determined by desired +share rate per mining pool server), using one of the following options: + +1 MiB: N = 1024, r = 8 +2 MiB: N = 2048, r = 8 +4 MiB: N = 1024, r = 32 +8 MiB: N = 2048, r = 32 +16 MiB: N = 4096, r = 32 + +and so on for higher N keeping r=32. + +You may also set the personalization string to your liking, but that is +not required (you can set its pointer to NULL and its length to 0). Its +support is provided mostly for compatibility with existing modifications +of yescrypt 0.5. + + + Performance. + +Please refer to PERFORMANCE for some benchmarks and performance tuning. + + + How to test yespower for proper operation. + +On a Unix-like system, invoke "make check". This will build and run a +program called "tests", and check its output against the supplied file +TESTS-OK. If everything matches, the final line of output should be the +word "PASSED". + +We do most of our testing on Linux systems with gcc. The supplied +Makefile assumes that you use gcc. + + + Alternate code versions and make targets. + +Two implementations of yespower are included: reference and optimized. +By default, the optimized implementation is built. Internally, the +optimized implementation uses conditional compilation to choose between +usage of various SIMD instruction sets where supported and scalar code. + +The reference implementation is unoptimized and is very slow, but it has +simpler and shorter source code. 
Its purpose is to provide a simple +human- and machine-readable specification that implementations intended +for actual use should be tested against. It is deliberately mostly not +optimized, and it is not meant to be used in production. + +Similarly to "make check", there's "make check-ref" to build and test +the reference implementation. There's also "make ref" to build the +reference implementation and have the "benchmark" program use it. + +"make clean" may need to be run between making different builds. + + + How to integrate yespower in a program. + +Although yespower.h provides several functions, chances are that you +will only need to use yespower_tls(). Please see the comment on this +function in yespower.h and its example usage in tests.c and benchmark.c, +including parameter sets requesting yescrypt 0.5 as used by certain +existing cryptocurrencies. + +To integrate yespower in an altcoin based on Bitcoin Core, you might +invoke yespower_tls() from either a maybe-new (depending on where you +fork from) CBlockHeader::GetPoWHash() (and invoke that where PoW is +needed like e.g. Litecoin does for scrypt) or CBlockHeader::GetHash() +(easier, but inefficient and you'll be stuck with that inefficiency). + +You'll also want to implement caching of the computed PoW hashes like +e.g. YACoin does for scrypt. Caching is especially important if you +invoke yespower from CBlockHeader::GetHash(). However, even if you use +or introduce CBlockHeader::GetPoWHash() caching may still be desirable +as the PoW hash is commonly requested 4 times per block fetched during a +node's initial blockchain sync (once during prefetch of block headers, +and 3 times more during validation of a fully fetched block). On the +other hand, you'll likely want to bypass the cache when PoW is computed +by the node's built-in miner. + +Further detail on this (generating new genesis blocks, etc.) is even +farther from being yespower-specific and thus is not provided here. 
+Just like (and even more so than) yespower itself, the above guidance is +provided as-is and without guarantee of being correct and safe to +follow. You're supposed to know what you're doing. + + + Credits. + +scrypt has been designed by Colin Percival. yescrypt and yespower have +been designed by Solar Designer building upon scrypt. + +The following other people and projects have also indirectly helped make +yespower what it is: + + - Bill Cox + - Rich Felker + - Anthony Ferrara + - Christian Forler + - Taylor Hornby + - Dmitry Khovratovich + - Samuel Neves + - Marcos Simplicio + - Ken T Takusagawa + - Jakob Wenzel + - Christian Winnerlein + + - DARPA Cyber Fast Track + - Password Hashing Competition + + + Contact info. + +First, please check the yespower homepage for new versions, etc.: + + https://www.openwall.com/yespower/ + +If you have anything valuable to add or a non-trivial question to ask, +you may contact the maintainer of yespower at: + + Solar Designer diff --git a/yespower-1.0.1/TESTS-OK b/yespower-1.0.1/TESTS-OK new file mode 100644 index 000000000..d0bd2e03c --- /dev/null +++ b/yespower-1.0.1/TESTS-OK @@ -0,0 +1,16 @@ +yespower(5, 2048, 8, "Client Key") = a5 9f ec 4c 4f dd a1 6e 3b 14 05 ad da 66 d5 25 b6 8e 7c ad fc fe 6a c0 66 c7 ad 11 8c d8 05 90 +yespower(5, 2048, 8, BSTY) = 5e a2 b2 95 6a 9e ac e3 0a 32 37 ff 1d 44 1e de e1 dc 25 aa b8 f0 ea 15 c1 21 65 f8 3a 7b c2 65 +yespower(5, 4096, 16, "Client Key") = 92 7e 72 d0 de d3 d8 04 75 47 3f 40 f1 74 3c 67 28 9d 45 3d 52 42 d4 f5 5a f4 e3 25 e0 66 99 c5 +yespower(5, 4096, 24, "Jagaricoin") = 0e 13 66 97 32 11 e7 fe a8 ad 9d 81 98 9c 84 a2 54 d9 68 c9 d3 33 dd 8f f0 99 32 4f 38 61 1e 04 +yespower(5, 4096, 32, "WaviBanana") = 3a e0 5a bb 3c 5c f6 f7 54 15 a9 25 54 c9 8d 50 e3 8e c9 55 2c fa 78 37 36 16 f4 80 b2 4e 55 9f +yespower(5, 2048, 32, "Client Key") = 56 0a 89 1b 5c a2 e1 c6 36 11 1a 9f f7 c8 94 a5 d0 a2 60 2f 43 fd cf a5 94 9b 95 e2 2f e4 46 1e +yespower(5, 1024, 32, "Client Key") = 2a 79 
e5 3d 1b e6 66 9b c5 56 cc c4 17 bc e3 d2 2a 74 a2 32 f5 6b 8e 1d 39 b4 57 92 67 5d e1 08 +yespower(5, 2048, 8, NULL) = 5e cb d8 e8 d7 c9 0b ae d4 bb f8 91 6a 12 25 dc c3 c6 5f 5c 91 65 ba e8 1c dd e3 cf fa d1 28 e8 +yespower(10, 2048, 8, NULL) = 69 e0 e8 95 b3 df 7a ee b8 37 d7 1f e1 99 e9 d3 4f 7e c4 6e cb ca 7a 2c 43 08 e5 18 57 ae 9b 46 +yespower(10, 4096, 16, NULL) = 33 fb 8f 06 38 24 a4 a0 20 f6 3d ca 53 5f 5c a6 6a b5 57 64 68 c7 5d 1c ca ac 75 42 f7 64 95 ac +yespower(10, 4096, 32, NULL) = 77 1a ee fd a8 fe 79 a0 82 5b c7 f2 ae e1 62 ab 55 78 57 46 39 ff c6 ca 37 23 cc 18 e5 e3 e2 85 +yespower(10, 2048, 32, NULL) = d5 ef b8 13 cd 26 3e 9b 34 54 01 30 23 3c bb c6 a9 21 fb ff 34 31 e5 ec 1a 1a bd e2 ae a6 ff 4d +yespower(10, 1024, 32, NULL) = 50 1b 79 2d b4 2e 38 8f 6e 7d 45 3c 95 d0 3a 12 a3 60 16 a5 15 4a 68 83 90 dd c6 09 a4 0c 67 99 +yespower(10, 1024, 32, "personality test") = 1f 02 69 ac f5 65 c4 9a dc 0e f9 b8 f2 6a b3 80 8c dc 38 39 4a 25 4f dd ee dc c3 aa cf f6 ad 9d +XOR of yespower(5, ...) = ae f1 32 91 87 0f 55 70 47 f4 2e 9b ef a6 16 df e5 f1 96 77 e1 3f 8b a6 92 f7 c5 97 55 a0 f5 0e +XOR of yespower(10, ...) = 8d 13 c5 fb 07 30 96 75 d1 b8 48 92 77 ba 4b e4 40 33 be df ae 7a 60 43 8a 9b e2 1f 3a 7b 12 37 diff --git a/yespower-1.0.1/benchmark.c b/yespower-1.0.1/benchmark.c new file mode 100644 index 000000000..4842dfa84 --- /dev/null +++ b/yespower-1.0.1/benchmark.c @@ -0,0 +1,262 @@ +/*- + * Copyright 2013-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include /* for atoi() */ +#include +#include +#include +#include +#include + +#include "yespower.h" + +#ifdef _OPENMP +#include + +#define NSAVE 1000 + +static uint64_t time_us(void) +{ + struct timespec t; +#ifdef CLOCK_MONOTONIC_RAW + if (clock_gettime(CLOCK_MONOTONIC_RAW, &t)) + return 0; +#else + if (clock_gettime(CLOCK_MONOTONIC, &t)) + return 0; +#endif + return 1 + (uint64_t)t.tv_sec * 1000000 + t.tv_nsec / 1000; +} +#endif + +int main(int argc, const char * const *argv) +{ + yespower_params_t params = { + .version = YESPOWER_0_5, + .N = 2048, + .r = 8, + .pers = (const uint8_t *)"Client Key", + .perslen = 10 + }; + + if (argc > 1) + params.version = atoi(argv[1]); + if (argc > 2) + params.N = atoi(argv[2]); + if (argc > 3) + params.r = atoi(argv[3]); + + printf("version=%.1f N=%u r=%u\n", + params.version * 0.1, params.N, params.r); + + printf("Will use %.2f KiB RAM\n", 0.125 * params.N * params.r); + + static __thread union { + uint8_t u8[80]; + uint32_t u32[20]; + } src; + yespower_binary_t dst; + unsigned int i; + + for (i = 0; i < sizeof(src); i++) + src.u8[i] = i * 3; + + if (yespower_tls(src.u8, sizeof(src), ¶ms, &dst)) { + puts("FAILED"); + return 1; + } + + for (i = 0; i < sizeof(dst); i++) + printf("%02x%c", dst.uc[i], i < sizeof(dst) - 1 ? 
' ' : '\n'); + + puts("Benchmarking 1 thread ..."); + + clock_t clk_tck = sysconf(_SC_CLK_TCK); + struct tms start_tms, end_tms; + clock_t start = times(&start_tms), end; + unsigned int n; + unsigned long long count; +#ifdef _OPENMP + yespower_binary_t save[NSAVE]; + unsigned int nsave = 0; +#endif + uint32_t seed = start * 1812433253U; + + n = 1; + count = 0; + do { + for (i = 0; i < n; i++) { + yespower_binary_t *p = &dst; +#ifdef _OPENMP + if (nsave < NSAVE) + p = &save[nsave++]; +#endif + src.u32[19] = seed + (count + i); + if (yespower_tls(src.u8, sizeof(src), ¶ms, p)) { + puts("FAILED"); + return 1; + } + } + count += n; + + end = times(&end_tms); + n <<= 1; + } while (end - start < clk_tck * 2); + + clock_t start_v = start_tms.tms_utime + start_tms.tms_stime + + start_tms.tms_cutime + start_tms.tms_cstime; + clock_t end_v = end_tms.tms_utime + end_tms.tms_stime + + end_tms.tms_cutime + end_tms.tms_cstime; + + printf("%llu H/s real, %llu H/s virtual " + "(%llu hashes in %.2f seconds)\n", + count * clk_tck / (end - start), + count * clk_tck / (end_v - start_v), + count, (double)(end - start) / clk_tck); + + for (i = 0; i < nsave; i++) { + unsigned int j; + for (j = i + 1; j < nsave; j++) { + unsigned int k = 8; + if (!memcmp(&save[i], &save[j], k)) { + printf("%u-byte collision(s) detected\n", k); + i = nsave; break; + } + } + } + +#ifdef _OPENMP + unsigned int nt = omp_get_max_threads(); + + printf("Benchmarking %u thread%s ...\n", + nt, nt == 1 ? 
"" : "s"); + + typedef struct { + uint64_t min, max, total; + } thread_data_s; + union { + thread_data_s s; + uint8_t cachelines[2][64]; /* avoid false sharing */ + } thread_data[nt]; /* tricky to align this when on stack */ + + unsigned int t; + for (t = 0; t < nt; t++) { + thread_data_s *td = &thread_data[t].s; + td->min = ~(uint64_t)0; td->max = 0; td->total = 0; + } + + unsigned long long count1 = count, count_restart = 0; + + if (!geteuid()) { + puts("Running as root, so trying to set SCHED_RR"); +#pragma omp parallel + { + struct sched_param param = { .sched_priority = 1 }; + if (sched_setscheduler(getpid(), SCHED_RR, ¶m)) + perror("sched_setscheduler"); + } + } + + start = times(&start_tms); + + n = count * omp_get_max_threads(); + count = 0; + do { +#pragma omp parallel for default(none) copyin(src) private(i, dst) shared(n, thread_data, params, seed, count, save, nsave) + for (i = 0; i < n; i++) { + unsigned int j = count + i; + + src.u32[19] = seed + j; + + uint64_t start1 = time_us(); + + if (yespower_tls(src.u8, sizeof(src), ¶ms, &dst)) { +#pragma omp critical + puts("FAILED"); + } + + uint64_t end1 = time_us(); + if (end1 < start1) + end1 = start1; + uint64_t diff1 = end1 - start1; + + thread_data_s *td = &thread_data[omp_get_thread_num()].s; + td->total += diff1; + if (diff1 < td->min) + td->min = diff1; + if (diff1 > td->max) + td->max = diff1; + +#ifdef _OPENMP + if (j < nsave && memcmp(&save[j], &dst, sizeof(dst))) { +#pragma omp critical + printf("Mismatch at %u\n", j); + } +#endif + } + + count += n; + if ((count - n) < count1 && count >= count1) { +/* Disregard our repeat of single thread's results (could be partially cached + * by same core, but OTOH other cores not yet warmed up to full clock rate). 
*/ + start = times(&start_tms); + count_restart = count; + for (t = 0; t < nt; t++) { + thread_data_s *td = &thread_data[t].s; + td->min = ~(uint64_t)0; td->max = 0; td->total = 0; + } + } else { + n <<= 1; + } + + end = times(&end_tms); + } while (end - start < clk_tck); + + if (!count_restart) + puts("Didn't reach single-thread's hash count"); + count -= count_restart; + + start_v = start_tms.tms_utime + start_tms.tms_stime + + start_tms.tms_cutime + start_tms.tms_cstime; + end_v = end_tms.tms_utime + end_tms.tms_stime + + end_tms.tms_cutime + end_tms.tms_cstime; + + printf("%llu H/s real, %llu H/s virtual " + "(%llu hashes in %.2f seconds)\n", + count * clk_tck / (end - start), + count * clk_tck / (end_v - start_v), + count, (double)(end - start) / clk_tck); + + uint64_t min = ~(uint64_t)0, max = 0, total = 0; + for (t = 0; t < nt; t++) { + thread_data_s *td = &thread_data[t].s; + total += td->total; + if (td->min < min) + min = td->min; + if (td->max > max) + max = td->max; + } + printf("min %.3f ms, avg %.3f ms, max %.3f ms\n", + min / 1000.0, total / 1000.0 / count, max / 1000.0); +#endif + + return 0; +} diff --git a/yespower-1.0.1/insecure_memzero.h b/yespower-1.0.1/insecure_memzero.h new file mode 100644 index 000000000..5a0ba75c4 --- /dev/null +++ b/yespower-1.0.1/insecure_memzero.h @@ -0,0 +1 @@ +#define insecure_memzero(buf, len) /* empty */ diff --git a/yespower-1.0.1/sha256.c b/yespower-1.0.1/sha256.c new file mode 100644 index 000000000..ef6b4ec70 --- /dev/null +++ b/yespower-1.0.1/sha256.c @@ -0,0 +1,646 @@ +/*- + * Copyright 2005-2016 Colin Percival + * Copyright 2016-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include + +#include "insecure_memzero.h" +#include "sysendian.h" + +#include "sha256.h" + +#ifdef __ICC +/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */ +#define restrict +#elif __STDC_VERSION__ >= 199901L +/* Have restrict */ +#elif defined(__GNUC__) +#define restrict __restrict +#else +#define restrict +#endif + +/* + * Encode a length len*2 vector of (uint32_t) into a length len*8 vector of + * (uint8_t) in big-endian form. + */ +static void +be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len) +{ + + /* Encode vector, two words at a time. */ + do { + be32enc(&dst[0], src[0]); + be32enc(&dst[4], src[1]); + src += 2; + dst += 8; + } while (--len); +} + +/* + * Decode a big-endian length len*8 vector of (uint8_t) into a length + * len*2 vector of (uint32_t). + */ +static void +be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len) +{ + + /* Decode vector, two words at a time. 
*/ + do { + dst[0] = be32dec(&src[0]); + dst[1] = be32dec(&src[4]); + src += 8; + dst += 2; + } while (--len); +} + +/* SHA256 round constants. */ +static const uint32_t Krnd[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + h += S1(e) + Ch(e, f, g) + k; \ + d += h; \ + h += S0(a) + Maj(a, b, c); + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, ii) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i + ii] + Krnd[i + ii]) + +/* Message schedule computation */ +#define MSCH(W, ii, i) \ + W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii] + +/* + * SHA256 
block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA256_Transform(uint32_t state[static restrict 8], + const uint8_t block[static restrict 64], + uint32_t W[static restrict 64], uint32_t S[static restrict 8]) +{ + int i; + + /* 1. Prepare the first part of the message schedule W. */ + be32dec_vect(W, block, 8); + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. */ + for (i = 0; i < 64; i += 16) { + RNDr(S, W, 0, i); + RNDr(S, W, 1, i); + RNDr(S, W, 2, i); + RNDr(S, W, 3, i); + RNDr(S, W, 4, i); + RNDr(S, W, 5, i); + RNDr(S, W, 6, i); + RNDr(S, W, 7, i); + RNDr(S, W, 8, i); + RNDr(S, W, 9, i); + RNDr(S, W, 10, i); + RNDr(S, W, 11, i); + RNDr(S, W, 12, i); + RNDr(S, W, 13, i); + RNDr(S, W, 14, i); + RNDr(S, W, 15, i); + + if (i == 48) + break; + MSCH(W, 0, i); + MSCH(W, 1, i); + MSCH(W, 2, i); + MSCH(W, 3, i); + MSCH(W, 4, i); + MSCH(W, 5, i); + MSCH(W, 6, i); + MSCH(W, 7, i); + MSCH(W, 8, i); + MSCH(W, 9, i); + MSCH(W, 10, i); + MSCH(W, 11, i); + MSCH(W, 12, i); + MSCH(W, 13, i); + MSCH(W, 14, i); + MSCH(W, 15, i); + } + + /* 4. Mix local working variables into global state. */ + state[0] += S[0]; + state[1] += S[1]; + state[2] += S[2]; + state[3] += S[3]; + state[4] += S[4]; + state[5] += S[5]; + state[6] += S[6]; + state[7] += S[7]; +} + +static const uint8_t PAD[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Add padding and terminating bit-count. */ +static void +SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72]) +{ + size_t r; + + /* Figure out how many bytes we have buffered. */ + r = (ctx->count >> 3) & 0x3f; + + /* Pad to 56 mod 64, transforming if we finish a block en route. */ + if (r < 56) { + /* Pad to 56 mod 64. 
*/ + memcpy(&ctx->buf[r], PAD, 56 - r); + } else { + /* Finish the current block and mix. */ + memcpy(&ctx->buf[r], PAD, 64 - r); + SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); + + /* The start of the final block is all zeroes. */ + memset(&ctx->buf[0], 0, 56); + } + + /* Add the terminating bit-count. */ + be64enc(&ctx->buf[56], ctx->count); + + /* Mix in the final block. */ + SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); +} + +/* Magic initialization constants. */ +static const uint32_t initial_state[8] = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +/** + * SHA256_Init(ctx): + * Initialize the SHA256 context ${ctx}. + */ +void +SHA256_Init(SHA256_CTX * ctx) +{ + + /* Zero bits processed so far. */ + ctx->count = 0; + + /* Initialize state. */ + memcpy(ctx->state, initial_state, sizeof(initial_state)); +} + +/** + * SHA256_Update(ctx, in, len): + * Input ${len} bytes from ${in} into the SHA256 context ${ctx}. + */ +static void +_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len, + uint32_t tmp32[static restrict 72]) +{ + uint32_t r; + const uint8_t * src = in; + + /* Return immediately if we have nothing to do. */ + if (len == 0) + return; + + /* Number of bytes left in the buffer from previous updates. */ + r = (ctx->count >> 3) & 0x3f; + + /* Update number of bits. */ + ctx->count += (uint64_t)(len) << 3; + + /* Handle the case where we don't need to perform any transforms. */ + if (len < 64 - r) { + memcpy(&ctx->buf[r], src, len); + return; + } + + /* Finish the current block. */ + memcpy(&ctx->buf[r], src, 64 - r); + SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); + src += 64 - r; + len -= 64 - r; + + /* Perform complete blocks. */ + while (len >= 64) { + SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]); + src += 64; + len -= 64; + } + + /* Copy left over data into buffer. 
*/ + memcpy(ctx->buf, src, len); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len) +{ + uint32_t tmp32[72]; + + /* Call the real function. */ + _SHA256_Update(ctx, in, len, tmp32); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); +} + +/** + * SHA256_Final(digest, ctx): + * Output the SHA256 hash of the data input to the context ${ctx} into the + * buffer ${digest}. + */ +static void +_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx, + uint32_t tmp32[static restrict 72]) +{ + + /* Add padding. */ + SHA256_Pad(ctx, tmp32); + + /* Write the hash. */ + be32enc_vect(digest, ctx->state, 4); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx) +{ + uint32_t tmp32[72]; + + /* Call the real function. */ + _SHA256_Final(digest, ctx, tmp32); + + /* Clear the context state. */ + insecure_memzero(ctx, sizeof(SHA256_CTX)); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); +} + +/** + * SHA256_Buf(in, len, digest): + * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. + */ +void +SHA256_Buf(const void * in, size_t len, uint8_t digest[32]) +{ + SHA256_CTX ctx; + uint32_t tmp32[72]; + + SHA256_Init(&ctx); + _SHA256_Update(&ctx, in, len, tmp32); + _SHA256_Final(digest, &ctx, tmp32); + + /* Clean the stack. */ + insecure_memzero(&ctx, sizeof(SHA256_CTX)); + insecure_memzero(tmp32, 288); +} + +/** + * HMAC_SHA256_Init(ctx, K, Klen): + * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from + * ${K}. + */ +static void +_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen, + uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64], + uint8_t khash[static restrict 32]) +{ + const uint8_t * K = _K; + size_t i; + + /* If Klen > 64, the key is really SHA256(K). 
*/ + if (Klen > 64) { + SHA256_Init(&ctx->ictx); + _SHA256_Update(&ctx->ictx, K, Klen, tmp32); + _SHA256_Final(khash, &ctx->ictx, tmp32); + K = khash; + Klen = 32; + } + + /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ + SHA256_Init(&ctx->ictx); + memset(pad, 0x36, 64); + for (i = 0; i < Klen; i++) + pad[i] ^= K[i]; + _SHA256_Update(&ctx->ictx, pad, 64, tmp32); + + /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ + SHA256_Init(&ctx->octx); + memset(pad, 0x5c, 64); + for (i = 0; i < Klen; i++) + pad[i] ^= K[i]; + _SHA256_Update(&ctx->octx, pad, 64, tmp32); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen) +{ + uint32_t tmp32[72]; + uint8_t pad[64]; + uint8_t khash[32]; + + /* Call the real function. */ + _HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); + insecure_memzero(khash, 32); + insecure_memzero(pad, 64); +} + +/** + * HMAC_SHA256_Update(ctx, in, len): + * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}. + */ +static void +_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len, + uint32_t tmp32[static restrict 72]) +{ + + /* Feed data to the inner SHA256 operation. */ + _SHA256_Update(&ctx->ictx, in, len, tmp32); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len) +{ + uint32_t tmp32[72]; + + /* Call the real function. */ + _HMAC_SHA256_Update(ctx, in, len, tmp32); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); +} + +/** + * HMAC_SHA256_Final(digest, ctx): + * Output the HMAC-SHA256 of the data input to the context ${ctx} into the + * buffer ${digest}. 
+ */ +static void +_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx, + uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32]) +{ + + /* Finish the inner SHA256 operation. */ + _SHA256_Final(ihash, &ctx->ictx, tmp32); + + /* Feed the inner hash to the outer SHA256 operation. */ + _SHA256_Update(&ctx->octx, ihash, 32, tmp32); + + /* Finish the outer SHA256 operation. */ + _SHA256_Final(digest, &ctx->octx, tmp32); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx) +{ + uint32_t tmp32[72]; + uint8_t ihash[32]; + + /* Call the real function. */ + _HMAC_SHA256_Final(digest, ctx, tmp32, ihash); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); + insecure_memzero(ihash, 32); +} + +/** + * HMAC_SHA256_Buf(K, Klen, in, len, digest): + * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of + * length ${Klen}, and write the result to ${digest}. + */ +void +HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len, + uint8_t digest[32]) +{ + HMAC_SHA256_CTX ctx; + uint32_t tmp32[72]; + uint8_t tmp8[96]; + + _HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]); + _HMAC_SHA256_Update(&ctx, in, len, tmp32); + _HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]); + + /* Clean the stack. */ + insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX)); + insecure_memzero(tmp32, 288); + insecure_memzero(tmp8, 96); +} + +/* Add padding and terminating bit-count, but don't invoke Transform yet. */ +static int +SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8], + uint32_t tmp32[static restrict 72]) +{ + uint32_t r; + + r = (ctx->count >> 3) & 0x3f; + if (r >= 56) + return -1; + + /* + * Convert length to a vector of bytes -- we do this now rather + * than later because the length will change after we pad. + */ + be64enc(len, ctx->count); + + /* Add 1--56 bytes so that the resulting length is 56 mod 64. 
*/ + _SHA256_Update(ctx, PAD, 56 - r, tmp32); + + /* Add the terminating bit-count. */ + ctx->buf[63] = len[7]; + _SHA256_Update(ctx, len, 7, tmp32); + + return 0; +} + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). + */ +void +PBKDF3_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, + size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) +{ + HMAC_SHA256_CTX Phctx, PShctx, hctx; + uint32_t tmp32[72]; + union { + uint8_t tmp8[96]; + uint32_t state[8]; + } u; + size_t i; + uint8_t ivec[4]; + uint8_t U[32]; + uint8_t T[32]; + uint64_t j; + int k; + size_t clen; + + /* Sanity-check. */ + assert(dkLen <= 32 * (size_t)(UINT32_MAX)); + + if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) { + uint32_t oldcount; + uint8_t * ivecp; + + /* Compute HMAC state after processing P and S. */ + _HMAC_SHA256_Init(&hctx, passwd, passwdlen, + tmp32, &u.tmp8[0], &u.tmp8[64]); + _HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32); + + /* Prepare ictx padding. */ + oldcount = hctx.ictx.count & (0x3f << 3); + _HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32); + if ((hctx.ictx.count & (0x3f << 3)) < oldcount || + SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32)) + goto generic; /* Can't happen due to saltlen check */ + ivecp = hctx.ictx.buf + (oldcount >> 3); + + /* Prepare octx padding. */ + hctx.octx.count += 32 << 3; + SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32); + + /* Iterate through the blocks. */ + for (i = 0; i * 32 < dkLen; i++) { + /* Generate INT(i + 1). */ + be32enc(ivecp, (uint32_t)(i + 1)); + + /* Compute U_1 = PRF(P, S || INT(i)). 
*/ + memcpy(u.state, hctx.ictx.state, sizeof(u.state)); + SHA256_Transform(u.state, hctx.ictx.buf, + &tmp32[0], &tmp32[64]); + be32enc_vect(hctx.octx.buf, u.state, 4); + memcpy(u.state, hctx.octx.state, sizeof(u.state)); + SHA256_Transform(u.state, hctx.octx.buf, + &tmp32[0], &tmp32[64]); + be32enc_vect(&buf[i * 32], u.state, 4); + } + + goto cleanup; + } + +generic: + /* Compute HMAC state after processing P. */ + _HMAC_SHA256_Init(&Phctx, passwd, passwdlen, + tmp32, &u.tmp8[0], &u.tmp8[64]); + + /* Compute HMAC state after processing P and S. */ + memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX)); + _HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32); + + /* Iterate through the blocks. */ + for (i = 0; i * 32 < dkLen; i++) { + /* Generate INT(i + 1). */ + be32enc(ivec, (uint32_t)(i + 1)); + + /* Compute U_1 = PRF(P, S || INT(i)). */ + memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX)); + _HMAC_SHA256_Update(&hctx, ivec, 4, tmp32); + _HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8); + + if (c > 1) { + /* T_i = U_1 ... */ + memcpy(U, T, 32); + + for (j = 2; j <= c; j++) { + /* Compute U_j. */ + memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX)); + _HMAC_SHA256_Update(&hctx, U, 32, tmp32); + _HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8); + + /* ... xor U_j ... */ + for (k = 0; k < 32; k++) + T[k] ^= U[k]; + } + } + + /* Copy as many bytes as necessary into buf. */ + clen = dkLen - i * 32; + if (clen > 32) + clen = 32; + memcpy(&buf[i * 32], T, clen); + } + + /* Clean the stack. 
*/ + insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX)); + insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX)); + insecure_memzero(U, 32); + insecure_memzero(T, 32); + +cleanup: + insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX)); + insecure_memzero(tmp32, 288); + insecure_memzero(&u, sizeof(u)); +} diff --git a/yespower-1.0.1/sha256.h b/yespower-1.0.1/sha256.h new file mode 100644 index 000000000..6210502ff --- /dev/null +++ b/yespower-1.0.1/sha256.h @@ -0,0 +1,129 @@ +/*- + * Copyright 2005-2016 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _SHA256_H_ +#define _SHA256_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Use #defines in order to avoid namespace collisions with anyone else's + * SHA256 code (e.g., the code in OpenSSL). + */ +#define SHA256_Init libcperciva_SHA256_Init +#define SHA256_Update libcperciva_SHA256_Update +#define SHA256_Final libcperciva_SHA256_Final +#define SHA256_Buf libcperciva_SHA256_Buf +#define SHA256_CTX libcperciva_SHA256_CTX +#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init +#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update +#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final +#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf +#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX + +/* Context structure for SHA256 operations. */ +typedef struct { + uint32_t state[8]; + uint64_t count; + uint8_t buf[64]; +} SHA256_CTX; + +/** + * SHA256_Init(ctx): + * Initialize the SHA256 context ${ctx}. + */ +void SHA256_Init(SHA256_CTX *); + +/** + * SHA256_Update(ctx, in, len): + * Input ${len} bytes from ${in} into the SHA256 context ${ctx}. + */ +void SHA256_Update(SHA256_CTX *, const void *, size_t); + +/** + * SHA256_Final(digest, ctx): + * Output the SHA256 hash of the data input to the context ${ctx} into the + * buffer ${digest}. + */ +void SHA256_Final(uint8_t[32], SHA256_CTX *); + +/** + * SHA256_Buf(in, len, digest): + * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. + */ +void SHA256_Buf(const void *, size_t, uint8_t[32]); + +/* Context structure for HMAC-SHA256 operations. */ +typedef struct { + SHA256_CTX ictx; + SHA256_CTX octx; +} HMAC_SHA256_CTX; + +/** + * HMAC_SHA256_Init(ctx, K, Klen): + * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from + * ${K}. + */ +void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t); + +/** + * HMAC_SHA256_Update(ctx, in, len): + * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}. 
+ */ +void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t); + +/** + * HMAC_SHA256_Final(digest, ctx): + * Output the HMAC-SHA256 of the data input to the context ${ctx} into the + * buffer ${digest}. + */ +void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *); + +/** + * HMAC_SHA256_Buf(K, Klen, in, len, digest): + * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of + * length ${Klen}, and write the result to ${digest}. + */ +void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]); + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). + */ +void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t, + uint64_t, uint8_t *, size_t); + +#ifdef __cplusplus +} +#endif + +#endif /* !_SHA256_H_ */ diff --git a/yespower-1.0.1/sysendian.h b/yespower-1.0.1/sysendian.h new file mode 100644 index 000000000..52c1fe73b --- /dev/null +++ b/yespower-1.0.1/sysendian.h @@ -0,0 +1,94 @@ +/*- + * Copyright 2007-2014 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _SYSENDIAN_H_ +#define _SYSENDIAN_H_ + +#include + +/* Avoid namespace collisions with BSD . */ +#define be32dec libcperciva_be32dec +#define be32enc libcperciva_be32enc +#define be64enc libcperciva_be64enc +#define le32dec libcperciva_le32dec +#define le32enc libcperciva_le32enc + +static inline uint32_t +be32dec(const void * pp) +{ + const uint8_t * p = (uint8_t const *)pp; + + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} + +static inline void +be32enc(void * pp, uint32_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} + +static inline void +be64enc(void * pp, uint64_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[7] = x & 0xff; + p[6] = (x >> 8) & 0xff; + p[5] = (x >> 16) & 0xff; + p[4] = (x >> 24) & 0xff; + p[3] = (x >> 32) & 0xff; + p[2] = (x >> 40) & 0xff; + p[1] = (x >> 48) & 0xff; + p[0] = (x >> 56) & 0xff; +} + +static inline uint32_t +le32dec(const void * pp) +{ + const uint8_t * p = (uint8_t const *)pp; + + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} + +static inline void +le32enc(void * pp, uint32_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} + +#endif /* !_SYSENDIAN_H_ */ diff --git 
a/yespower-1.0.1/tests.c b/yespower-1.0.1/tests.c new file mode 100644 index 000000000..121a586a8 --- /dev/null +++ b/yespower-1.0.1/tests.c @@ -0,0 +1,190 @@ +/*- + * Copyright 2013-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include + +#include "yespower.h" + +#undef TEST_PBKDF2_SHA256 + +#ifdef TEST_PBKDF2_SHA256 +#include + +#include "sha256.h" + +static void print_PBKDF2_SHA256_raw(const char *passwd, size_t passwdlen, + const char *salt, size_t saltlen, uint64_t c, size_t dkLen) +{ + uint8_t dk[64]; + size_t i; + + assert(dkLen <= sizeof(dk)); + + /* XXX This prints the strings truncated at first NUL */ + printf("PBKDF2_SHA256(\"%s\", \"%s\", %llu, %llu) = ", + passwd, salt, (unsigned long long)c, (unsigned long long)dkLen); + + PBKDF2_SHA256((const uint8_t *) passwd, passwdlen, + (const uint8_t *) salt, saltlen, c, dk, dkLen); + + for (i = 0; i < dkLen; i++) + printf("%02x%c", dk[i], i < dkLen - 1 ? 
' ' : '\n'); +} + +static void print_PBKDF2_SHA256(const char *passwd, + const char *salt, uint64_t c, size_t dkLen) +{ + print_PBKDF2_SHA256_raw(passwd, strlen(passwd), salt, strlen(salt), c, + dkLen); +} +#endif + +static const char *pers_bsty_magic = "BSTY"; + +static void print_yespower(yespower_version_t version, uint32_t N, uint32_t r, + const char *pers) +{ + yespower_params_t params = { + .version = version, + .N = N, + .r = r, + .pers = (const uint8_t *)pers, + .perslen = pers ? strlen(pers) : 0 + }; + uint8_t src[80]; + yespower_binary_t dst; + size_t i; + + const char *q = (pers && pers != pers_bsty_magic) ? "\"": ""; + printf("yespower(%u, %u, %u, %s%s%s) = ", (unsigned int)version, N, r, + q, pers ? pers : "NULL", q); + + for (i = 0; i < sizeof(src); i++) + src[i] = i * 3; + + if (pers == pers_bsty_magic) { + params.pers = src; + params.perslen = sizeof(src); + } + + if (yespower_tls(src, sizeof(src), ¶ms, &dst)) { + puts("FAILED"); + return; + } + + for (i = 0; i < sizeof(dst); i++) + printf("%02x%c", dst.uc[i], i < sizeof(dst) - 1 ? ' ' : '\n'); +} + +static void print_yespower_loop(yespower_version_t version, const char *pers) +{ + uint32_t N, r; + uint8_t src[80]; + yespower_binary_t dst, xor = {{0}}; + size_t i; + + printf("XOR of yespower(%u, ...) = ", (unsigned int)version); + + /* + * This value of src is chosen to trigger duplicate index in the last + * SMix2 invocation in yespower 0.5 for N=2048 with at least one of the + * values of r below. This is needed to test that a non-save version + * of BlockMix is used in that special case. Most other values of src + * would leave this untested. + */ + src[0] = 43; + for (i = 1; i < sizeof(src); i++) + src[i] = i * 3; + + for (N = 1024; N <= 4096; N <<= 1) { + for (r = 8; r <= 32; r++) { + yespower_params_t params = { + .version = version, + .N = N, + .r = r, + .pers = (const uint8_t *)pers, + .perslen = pers ? 
strlen(pers) : 0 + }; + if (yespower_tls(src, sizeof(src), &params, &dst)) { + puts("FAILED"); + return; + } + for (i = 0; i < sizeof(xor); i++) + xor.uc[i] ^= dst.uc[i]; + } + } + + for (i = 0; i < sizeof(xor); i++) + printf("%02x%c", xor.uc[i], i < sizeof(xor) - 1 ? ' ' : '\n'); +} + +int main(void) +{ + setvbuf(stdout, NULL, _IOLBF, 0); + +#ifdef TEST_PBKDF2_SHA256 + print_PBKDF2_SHA256("password", "salt", 1, 20); + print_PBKDF2_SHA256("password", "salt", 2, 20); + print_PBKDF2_SHA256("password", "salt", 4096, 20); + print_PBKDF2_SHA256("password", "salt", 16777216, 20); + print_PBKDF2_SHA256("passwordPASSWORDpassword", + "saltSALTsaltSALTsaltSALTsaltSALTsalt", 4096, 25); + print_PBKDF2_SHA256_raw("pass\0word", 9, "sa\0lt", 5, 4096, 16); +#if 0 + print_PBKDF2_SHA256("password", "salt", 1, 32); + print_PBKDF2_SHA256("password", "salt", 2, 32); + print_PBKDF2_SHA256("password", "salt", 4096, 32); + print_PBKDF2_SHA256("password", "salt", 16777216, 32); + print_PBKDF2_SHA256("passwordPASSWORDpassword", + "saltSALTsaltSALTsaltSALTsaltSALTsalt", 4096, 40); + print_PBKDF2_SHA256("password", "salt", 4096, 16); + print_PBKDF2_SHA256("password", "salt", 1, 20); + print_PBKDF2_SHA256("password", "salt", 2, 20); + print_PBKDF2_SHA256("password", "salt", 4096, 20); + print_PBKDF2_SHA256("password", "salt", 16777216, 20); + print_PBKDF2_SHA256("password", "salt", 4096, 25); + print_PBKDF2_SHA256("password", "salt", 4096, 16); +#endif +#endif + + print_yespower(YESPOWER_0_5, 2048, 8, "Client Key"); /* yescrypt 0.5 */ + print_yespower(YESPOWER_0_5, 2048, 8, pers_bsty_magic); /* BSTY */ + print_yespower(YESPOWER_0_5, 4096, 16, "Client Key"); /* Cryply */ + print_yespower(YESPOWER_0_5, 4096, 24, "Jagaricoin"); + print_yespower(YESPOWER_0_5, 4096, 32, "WaviBanana"); + print_yespower(YESPOWER_0_5, 2048, 32, "Client Key"); + print_yespower(YESPOWER_0_5, 1024, 32, "Client Key"); + + print_yespower(YESPOWER_0_5, 2048, 8, NULL); /* no personality */ + + print_yespower(YESPOWER_1_0, 2048, 
8, NULL); + print_yespower(YESPOWER_1_0, 4096, 16, NULL); + print_yespower(YESPOWER_1_0, 4096, 32, NULL); + print_yespower(YESPOWER_1_0, 2048, 32, NULL); + print_yespower(YESPOWER_1_0, 1024, 32, NULL); + + print_yespower(YESPOWER_1_0, 1024, 32, "personality test"); + + print_yespower_loop(YESPOWER_0_5, "Client Key"); + print_yespower_loop(YESPOWER_1_0, NULL); + + return 0; +} diff --git a/yespower-1.0.1/yespower-opt.c b/yespower-1.0.1/yespower-opt.c new file mode 100644 index 000000000..02f71406c --- /dev/null +++ b/yespower-1.0.1/yespower-opt.c @@ -0,0 +1,1153 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2012-2019 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + * + * This is a proof-of-work focused fork of yescrypt, including optimized and + * cut-down implementation of the obsolete yescrypt 0.5 (based off its first + * submission to PHC back in 2014) and a new proof-of-work specific variation + * known as yespower 1.0. The former is intended as an upgrade for + * cryptocurrencies that already use yescrypt 0.5 and the latter may be used + * as a further upgrade (hard fork) by those and other cryptocurrencies. The + * version of algorithm to use is requested through parameters, allowing for + * both algorithms to co-exist in client and miner implementations (such as in + * preparation for a hard-fork). + */ + +#ifndef _YESPOWER_OPT_C_PASS_ +#define _YESPOWER_OPT_C_PASS_ 1 +#endif + +#if _YESPOWER_OPT_C_PASS_ == 1 +/* + * AVX and especially XOP speed up Salsa20 a lot, but needlessly result in + * extra instruction prefixes for pwxform (which we make more use of). While + * no slowdown from the prefixes is generally observed on AMD CPUs supporting + * XOP, some slowdown is sometimes observed on Intel CPUs with AVX. + */ +#ifdef __XOP__ +#warning "Note: XOP is enabled. That's great." +#elif defined(__AVX__) +#warning "Note: AVX is enabled. That's OK." +#elif defined(__SSE2__) +#warning "Note: AVX and XOP are not enabled. That's OK." +#elif defined(__x86_64__) || defined(__i386__) +#warning "SSE2 not enabled. Expect poor performance." +#else +#warning "Note: building generic code for non-x86. That's OK." +#endif + +/* + * The SSE4 code version has fewer instructions than the generic SSE2 version, + * but all of the instructions are SIMD, thereby wasting the scalar execution + * units. Thus, the generic SSE2 version below actually runs faster on some + * CPUs due to its balanced mix of SIMD and scalar instructions. 
+ */ +#undef USE_SSE4_FOR_32BIT + +#ifdef __SSE2__ +/* + * GCC before 4.9 would by default unnecessarily use store/load (without + * SSE4.1) or (V)PEXTR (with SSE4.1 or AVX) instead of simply (V)MOV. + * This was tracked as GCC bug 54349. + * "-mtune=corei7" works around this, but is only supported for GCC 4.6+. + * We use inline asm for pre-4.6 GCC, further down this file. + */ +#if __GNUC__ == 4 && __GNUC_MINOR__ >= 6 && __GNUC_MINOR__ < 9 && \ + !defined(__clang__) && !defined(__ICC) +#pragma GCC target ("tune=corei7") +#endif +#include <emmintrin.h> +#ifdef __XOP__ +#include <x86intrin.h> +#endif +#elif defined(__SSE__) +#include <xmmintrin.h> +#endif + +#include <errno.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +#include "insecure_memzero.h" +#include "sha256.h" +#include "sysendian.h" + +#include "yespower.h" + +#include "yespower-platform.c" + +#if __STDC_VERSION__ >= 199901L +/* Have restrict */ +#elif defined(__GNUC__) +#define restrict __restrict +#else +#define restrict +#endif + +#ifdef __GNUC__ +#define unlikely(exp) __builtin_expect(exp, 0) +#else +#define unlikely(exp) (exp) +#endif + +#ifdef __SSE__ +#define PREFETCH(x, hint) _mm_prefetch((const char *)(x), (hint)); +#else +#undef PREFETCH +#endif + +typedef union { + uint32_t w[16]; + uint64_t d[8]; +#ifdef __SSE2__ + __m128i q[4]; +#endif +} salsa20_blk_t; + +static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin, + salsa20_blk_t *Bout) +{ +#define COMBINE(out, in1, in2) \ + Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); + COMBINE(0, 0, 2) + COMBINE(1, 5, 7) + COMBINE(2, 2, 4) + COMBINE(3, 7, 1) + COMBINE(4, 4, 6) + COMBINE(5, 1, 3) + COMBINE(6, 6, 0) + COMBINE(7, 3, 5) +#undef COMBINE +} + +static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, + salsa20_blk_t *Bout) +{ +#define UNCOMBINE(out, in1, in2) \ + Bout->w[out * 2] = Bin->d[in1]; \ + Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; + UNCOMBINE(0, 0, 6) + UNCOMBINE(1, 5, 3) + UNCOMBINE(2, 2, 0) + UNCOMBINE(3, 7, 5) + UNCOMBINE(4, 4, 2) + UNCOMBINE(5, 1, 7) 
+ UNCOMBINE(6, 6, 4) + UNCOMBINE(7, 3, 1) +#undef UNCOMBINE +} + +#ifdef __SSE2__ +#define DECL_X \ + __m128i X0, X1, X2, X3; +#define DECL_Y \ + __m128i Y0, Y1, Y2, Y3; +#define READ_X(in) \ + X0 = (in).q[0]; X1 = (in).q[1]; X2 = (in).q[2]; X3 = (in).q[3]; +#define WRITE_X(out) \ + (out).q[0] = X0; (out).q[1] = X1; (out).q[2] = X2; (out).q[3] = X3; + +#ifdef __XOP__ +#define ARX(out, in1, in2, s) \ + out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s)); +#else +#define ARX(out, in1, in2, s) { \ + __m128i tmp = _mm_add_epi32(in1, in2); \ + out = _mm_xor_si128(out, _mm_slli_epi32(tmp, s)); \ + out = _mm_xor_si128(out, _mm_srli_epi32(tmp, 32 - s)); \ +} +#endif + +#define SALSA20_2ROUNDS \ + /* Operate on "columns" */ \ + ARX(X1, X0, X3, 7) \ + ARX(X2, X1, X0, 9) \ + ARX(X3, X2, X1, 13) \ + ARX(X0, X3, X2, 18) \ + /* Rearrange data */ \ + X1 = _mm_shuffle_epi32(X1, 0x93); \ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x39); \ + /* Operate on "rows" */ \ + ARX(X3, X0, X1, 7) \ + ARX(X2, X3, X0, 9) \ + ARX(X1, X2, X3, 13) \ + ARX(X0, X1, X2, 18) \ + /* Rearrange data */ \ + X1 = _mm_shuffle_epi32(X1, 0x39); \ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x93); + +/** + * Apply the Salsa20 core to the block provided in (X0 ... X3). + */ +#define SALSA20_wrapper(out, rounds) { \ + __m128i Z0 = X0, Z1 = X1, Z2 = X2, Z3 = X3; \ + rounds \ + (out).q[0] = X0 = _mm_add_epi32(X0, Z0); \ + (out).q[1] = X1 = _mm_add_epi32(X1, Z1); \ + (out).q[2] = X2 = _mm_add_epi32(X2, Z2); \ + (out).q[3] = X3 = _mm_add_epi32(X3, Z3); \ +} + +/** + * Apply the Salsa20/2 core to the block provided in X. + */ +#define SALSA20_2(out) \ + SALSA20_wrapper(out, SALSA20_2ROUNDS) + +#define SALSA20_8ROUNDS \ + SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS + +/** + * Apply the Salsa20/8 core to the block provided in X. 
+ */ +#define SALSA20_8(out) \ + SALSA20_wrapper(out, SALSA20_8ROUNDS) + +#define XOR_X(in) \ + X0 = _mm_xor_si128(X0, (in).q[0]); \ + X1 = _mm_xor_si128(X1, (in).q[1]); \ + X2 = _mm_xor_si128(X2, (in).q[2]); \ + X3 = _mm_xor_si128(X3, (in).q[3]); + +#define XOR_X_2(in1, in2) \ + X0 = _mm_xor_si128((in1).q[0], (in2).q[0]); \ + X1 = _mm_xor_si128((in1).q[1], (in2).q[1]); \ + X2 = _mm_xor_si128((in1).q[2], (in2).q[2]); \ + X3 = _mm_xor_si128((in1).q[3], (in2).q[3]); + +#define XOR_X_WRITE_XOR_Y_2(out, in) \ + (out).q[0] = Y0 = _mm_xor_si128((out).q[0], (in).q[0]); \ + (out).q[1] = Y1 = _mm_xor_si128((out).q[1], (in).q[1]); \ + (out).q[2] = Y2 = _mm_xor_si128((out).q[2], (in).q[2]); \ + (out).q[3] = Y3 = _mm_xor_si128((out).q[3], (in).q[3]); \ + X0 = _mm_xor_si128(X0, Y0); \ + X1 = _mm_xor_si128(X1, Y1); \ + X2 = _mm_xor_si128(X2, Y2); \ + X3 = _mm_xor_si128(X3, Y3); + +#define INTEGERIFY _mm_cvtsi128_si32(X0) + +#else /* !defined(__SSE2__) */ + +#define DECL_X \ + salsa20_blk_t X; +#define DECL_Y \ + salsa20_blk_t Y; + +#define COPY(out, in) \ + (out).d[0] = (in).d[0]; \ + (out).d[1] = (in).d[1]; \ + (out).d[2] = (in).d[2]; \ + (out).d[3] = (in).d[3]; \ + (out).d[4] = (in).d[4]; \ + (out).d[5] = (in).d[5]; \ + (out).d[6] = (in).d[6]; \ + (out).d[7] = (in).d[7]; + +#define READ_X(in) COPY(X, in) +#define WRITE_X(out) COPY(out, X) + +/** + * salsa20(B): + * Apply the Salsa20 core to the provided block. 
+ */ +static inline void salsa20(salsa20_blk_t *restrict B, + salsa20_blk_t *restrict Bout, uint32_t doublerounds) +{ + salsa20_blk_t X; +#define x X.w + + salsa20_simd_unshuffle(B, &X); + + do { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } while (--doublerounds); +#undef x + + { + uint32_t i; + salsa20_simd_shuffle(&X, Bout); + for (i = 0; i < 16; i += 4) { + B->w[i] = Bout->w[i] += B->w[i]; + B->w[i + 1] = Bout->w[i + 1] += B->w[i + 1]; + B->w[i + 2] = Bout->w[i + 2] += B->w[i + 2]; + B->w[i + 3] = Bout->w[i + 3] += B->w[i + 3]; + } + } +} + +/** + * Apply the Salsa20/2 core to the block provided in X. + */ +#define SALSA20_2(out) \ + salsa20(&X, &out, 1); + +/** + * Apply the Salsa20/8 core to the block provided in X. 
+ */ +#define SALSA20_8(out) \ + salsa20(&X, &out, 4); + +#define XOR(out, in1, in2) \ + (out).d[0] = (in1).d[0] ^ (in2).d[0]; \ + (out).d[1] = (in1).d[1] ^ (in2).d[1]; \ + (out).d[2] = (in1).d[2] ^ (in2).d[2]; \ + (out).d[3] = (in1).d[3] ^ (in2).d[3]; \ + (out).d[4] = (in1).d[4] ^ (in2).d[4]; \ + (out).d[5] = (in1).d[5] ^ (in2).d[5]; \ + (out).d[6] = (in1).d[6] ^ (in2).d[6]; \ + (out).d[7] = (in1).d[7] ^ (in2).d[7]; + +#define XOR_X(in) XOR(X, X, in) +#define XOR_X_2(in1, in2) XOR(X, in1, in2) +#define XOR_X_WRITE_XOR_Y_2(out, in) \ + XOR(Y, out, in) \ + COPY(out, Y) \ + XOR(X, X, Y) + +#define INTEGERIFY (uint32_t)X.d[0] +#endif + +/** + * Apply the Salsa20 core to the block provided in X ^ in. + */ +#define SALSA20_XOR_MEM(in, out) \ + XOR_X(in) \ + SALSA20(out) + +#define SALSA20 SALSA20_8 +#else /* pass 2 */ +#undef SALSA20 +#define SALSA20 SALSA20_2 +#endif + +/** + * blockmix_salsa(Bin, Bout): + * Compute Bout = BlockMix_{salsa20, 1}(Bin). The input Bin must be 128 + * bytes in length; the output Bout must also be the same size. + */ +static inline void blockmix_salsa(const salsa20_blk_t *restrict Bin, + salsa20_blk_t *restrict Bout) +{ + DECL_X + + READ_X(Bin[1]) + SALSA20_XOR_MEM(Bin[0], Bout[0]) + SALSA20_XOR_MEM(Bin[1], Bout[1]) +} + +static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout) +{ + DECL_X + + XOR_X_2(Bin1[1], Bin2[1]) + XOR_X(Bin1[0]) + SALSA20_XOR_MEM(Bin2[0], Bout[0]) + XOR_X(Bin1[1]) + SALSA20_XOR_MEM(Bin2[1], Bout[1]) + + return INTEGERIFY; +} + +#if _YESPOWER_OPT_C_PASS_ == 1 +/* This is tunable, but it is part of what defines a yespower version */ +/* Version 0.5 */ +#define Swidth_0_5 8 +/* Version 1.0 */ +#define Swidth_1_0 11 + +/* Not tunable in this implementation, hard-coded in a few places */ +#define PWXsimple 2 +#define PWXgather 4 + +/* Derived value. Not tunable on its own. 
*/ +#define PWXbytes (PWXgather * PWXsimple * 8) + +/* (Maybe-)runtime derived values. Not tunable on their own. */ +#define Swidth_to_Sbytes1(Swidth) ((1 << (Swidth)) * PWXsimple * 8) +#define Swidth_to_Smask(Swidth) (((1 << (Swidth)) - 1) * PWXsimple * 8) +#define Smask_to_Smask2(Smask) (((uint64_t)(Smask) << 32) | (Smask)) + +/* These should be compile-time derived */ +#define Smask2_0_5 Smask_to_Smask2(Swidth_to_Smask(Swidth_0_5)) +#define Smask2_1_0 Smask_to_Smask2(Swidth_to_Smask(Swidth_1_0)) + +typedef struct { + uint8_t *S0, *S1, *S2; + size_t w; + uint32_t Sbytes; +} pwxform_ctx_t; + +#define DECL_SMASK2REG /* empty */ +#define MAYBE_MEMORY_BARRIER /* empty */ + +#ifdef __SSE2__ +/* + * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs + * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and + * destination registers, whereas the shifts would require an extra move + * instruction for our code when building without AVX. Unfortunately, PSHUFD + * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ) + * and somewhat slower on some non-Intel CPUs (luckily not including AMD + * Bulldozer and Piledriver). + */ +#ifdef __AVX__ +#define HI32(X) \ + _mm_srli_si128((X), 4) +#elif 1 /* As an option, check for __SSE4_1__ here not to hurt Conroe */ +#define HI32(X) \ + _mm_shuffle_epi32((X), _MM_SHUFFLE(2,3,0,1)) +#else +#define HI32(X) \ + _mm_srli_epi64((X), 32) +#endif + +#if defined(__x86_64__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__ICC) +#ifdef __AVX__ +#define MOVQ "vmovq" +#else +/* "movq" would be more correct, but "movd" is supported by older binutils + * due to an error in AMD's spec for x86-64. 
*/ +#define MOVQ "movd" +#endif +#define EXTRACT64(X) ({ \ + uint64_t result; \ + __asm__(MOVQ " %1, %0" : "=r" (result) : "x" (X)); \ + result; \ +}) +#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__) +/* MSVC and Open64 had bugs */ +#define EXTRACT64(X) _mm_cvtsi128_si64(X) +#elif defined(__x86_64__) && defined(__SSE4_1__) +/* No known bugs for this intrinsic */ +#include <smmintrin.h> +#define EXTRACT64(X) _mm_extract_epi64((X), 0) +#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__) +/* 32-bit */ +#include <smmintrin.h> +#if 0 +/* This is currently unused by the code below, which instead uses these two + * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */ +#define EXTRACT64(X) \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ + ((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32)) +#endif +#else +/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64() */ +#define EXTRACT64(X) \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32)) +#endif + +#if defined(__x86_64__) && (defined(__AVX__) || !defined(__GNUC__)) +/* 64-bit with AVX */ +/* Force use of 64-bit AND instead of two 32-bit ANDs */ +#undef DECL_SMASK2REG +#if defined(__GNUC__) && !defined(__ICC) +#define DECL_SMASK2REG uint64_t Smask2reg = Smask2; +/* Force use of lower-numbered registers to reduce number of prefixes, relying + * on out-of-order execution and register renaming. 
*/ +#define FORCE_REGALLOC_1 \ + __asm__("" : "=a" (x), "+d" (Smask2reg), "+S" (S0), "+D" (S1)); +#define FORCE_REGALLOC_2 \ + __asm__("" : : "c" (lo)); +#else +static volatile uint64_t Smask2var = Smask2; +#define DECL_SMASK2REG uint64_t Smask2reg = Smask2var; +#define FORCE_REGALLOC_1 /* empty */ +#define FORCE_REGALLOC_2 /* empty */ +#endif +#define PWXFORM_SIMD(X) { \ + uint64_t x; \ + FORCE_REGALLOC_1 \ + uint32_t lo = x = EXTRACT64(X) & Smask2reg; \ + FORCE_REGALLOC_2 \ + uint32_t hi = x >> 32; \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, *(__m128i *)(S0 + lo)); \ + X = _mm_xor_si128(X, *(__m128i *)(S1 + hi)); \ +} +#elif defined(__x86_64__) +/* 64-bit without AVX. This relies on out-of-order execution and register + * renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g., + * it runs great on Haswell. */ +#warning "Note: using x86-64 inline assembly for pwxform. That's great." +#undef MAYBE_MEMORY_BARRIER +#define MAYBE_MEMORY_BARRIER \ + __asm__("" : : : "memory"); +#ifdef __ILP32__ /* x32 */ +#define REGISTER_PREFIX "e" +#else +#define REGISTER_PREFIX "r" +#endif +#define PWXFORM_SIMD(X) { \ + __m128i H; \ + __asm__( \ + "movd %0, %%rax\n\t" \ + "pshufd $0xb1, %0, %1\n\t" \ + "andq %2, %%rax\n\t" \ + "pmuludq %1, %0\n\t" \ + "movl %%eax, %%ecx\n\t" \ + "shrq $0x20, %%rax\n\t" \ + "paddq (%3,%%" REGISTER_PREFIX "cx), %0\n\t" \ + "pxor (%4,%%" REGISTER_PREFIX "ax), %0\n\t" \ + : "+x" (X), "=x" (H) \ + : "d" (Smask2), "S" (S0), "D" (S1) \ + : "cc", "ax", "cx"); \ +} +#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__) +/* 32-bit with SSE4.1 */ +#define PWXFORM_SIMD(X) { \ + __m128i x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \ + __m128i s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \ + __m128i s1 = *(__m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1)); \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, s0); \ + X = _mm_xor_si128(X, s1); \ +} +#else +/* 32-bit without SSE4.1 */ +#define PWXFORM_SIMD(X) 
{ \ + uint64_t x = EXTRACT64(X) & Smask2; \ + __m128i s0 = *(__m128i *)(S0 + (uint32_t)x); \ + __m128i s1 = *(__m128i *)(S1 + (x >> 32)); \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, s0); \ + X = _mm_xor_si128(X, s1); \ +} +#endif + +#define PWXFORM_SIMD_WRITE(X, Sw) \ + PWXFORM_SIMD(X) \ + MAYBE_MEMORY_BARRIER \ + *(__m128i *)(Sw + w) = X; \ + MAYBE_MEMORY_BARRIER + +#define PWXFORM_ROUND \ + PWXFORM_SIMD(X0) \ + PWXFORM_SIMD(X1) \ + PWXFORM_SIMD(X2) \ + PWXFORM_SIMD(X3) + +#define PWXFORM_ROUND_WRITE4 \ + PWXFORM_SIMD_WRITE(X0, S0) \ + PWXFORM_SIMD_WRITE(X1, S1) \ + w += 16; \ + PWXFORM_SIMD_WRITE(X2, S0) \ + PWXFORM_SIMD_WRITE(X3, S1) \ + w += 16; + +#define PWXFORM_ROUND_WRITE2 \ + PWXFORM_SIMD_WRITE(X0, S0) \ + PWXFORM_SIMD_WRITE(X1, S1) \ + w += 16; \ + PWXFORM_SIMD(X2) \ + PWXFORM_SIMD(X3) + +#else /* !defined(__SSE2__) */ + +#define PWXFORM_SIMD(x0, x1) { \ + uint64_t x = x0 & Smask2; \ + uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \ + uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \ + x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \ + x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \ +} + +#define PWXFORM_SIMD_WRITE(x0, x1, Sw) \ + PWXFORM_SIMD(x0, x1) \ + ((uint64_t *)(Sw + w))[0] = x0; \ + ((uint64_t *)(Sw + w))[1] = x1; + +#define PWXFORM_ROUND \ + PWXFORM_SIMD(X.d[0], X.d[1]) \ + PWXFORM_SIMD(X.d[2], X.d[3]) \ + PWXFORM_SIMD(X.d[4], X.d[5]) \ + PWXFORM_SIMD(X.d[6], X.d[7]) + +#define PWXFORM_ROUND_WRITE4 \ + PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \ + PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \ + w += 16; \ + PWXFORM_SIMD_WRITE(X.d[4], X.d[5], S0) \ + PWXFORM_SIMD_WRITE(X.d[6], X.d[7], S1) \ + w += 16; + +#define PWXFORM_ROUND_WRITE2 \ + PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \ + PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \ + w += 16; \ + PWXFORM_SIMD(X.d[4], X.d[5]) \ + PWXFORM_SIMD(X.d[6], X.d[7]) +#endif + +#define PWXFORM \ + PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND \ + PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND + +#define Smask2 
Smask2_0_5 + +#else /* pass 2 */ + +#undef PWXFORM +#define PWXFORM \ + PWXFORM_ROUND_WRITE4 PWXFORM_ROUND_WRITE2 PWXFORM_ROUND_WRITE2 \ + w &= Smask2; \ + { \ + uint8_t *Stmp = S2; \ + S2 = S1; \ + S1 = S0; \ + S0 = Stmp; \ + } + +#undef Smask2 +#define Smask2 Smask2_1_0 + +#endif + +/** + * blockmix_pwxform(Bin, Bout, r, S): + * Compute Bout = BlockMix_pwxform{salsa20, r, S}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. + */ +static void blockmix(const salsa20_blk_t *restrict Bin, + salsa20_blk_t *restrict Bout, size_t r, pwxform_ctx_t *restrict ctx) +{ + if (unlikely(!ctx)) { + blockmix_salsa(Bin, Bout); + return; + } + + uint8_t *S0 = ctx->S0, *S1 = ctx->S1; +#if _YESPOWER_OPT_C_PASS_ > 1 + uint8_t *S2 = ctx->S2; + size_t w = ctx->w; +#endif + size_t i; + DECL_X + + /* Convert count of 128-byte blocks to max index of 64-byte block */ + r = r * 2 - 1; + + READ_X(Bin[r]) + + DECL_SMASK2REG + + i = 0; + do { + XOR_X(Bin[i]) + PWXFORM + if (unlikely(i >= r)) + break; + WRITE_X(Bout[i]) + i++; + } while (1); + +#if _YESPOWER_OPT_C_PASS_ > 1 + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; +#endif + + SALSA20(Bout[i]) +} + +static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, pwxform_ctx_t *restrict ctx) +{ + if (unlikely(!ctx)) + return blockmix_salsa_xor(Bin1, Bin2, Bout); + + uint8_t *S0 = ctx->S0, *S1 = ctx->S1; +#if _YESPOWER_OPT_C_PASS_ > 1 + uint8_t *S2 = ctx->S2; + size_t w = ctx->w; +#endif + size_t i; + DECL_X + + /* Convert count of 128-byte blocks to max index of 64-byte block */ + r = r * 2 - 1; + +#ifdef PREFETCH + PREFETCH(&Bin2[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + } +#endif + + XOR_X_2(Bin1[r], Bin2[r]) + + DECL_SMASK2REG + + i = 0; + r--; + do { + XOR_X(Bin1[i]) + XOR_X(Bin2[i]) + PWXFORM + WRITE_X(Bout[i]) + + XOR_X(Bin1[i + 1]) + XOR_X(Bin2[i + 1]) + 
PWXFORM + + if (unlikely(i >= r)) + break; + + WRITE_X(Bout[i + 1]) + + i += 2; + } while (1); + i++; + +#if _YESPOWER_OPT_C_PASS_ > 1 + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; +#endif + + SALSA20(Bout[i]) + + return INTEGERIFY; +} + +static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out, + salsa20_blk_t *restrict Bin2, + size_t r, pwxform_ctx_t *restrict ctx) +{ + uint8_t *S0 = ctx->S0, *S1 = ctx->S1; +#if _YESPOWER_OPT_C_PASS_ > 1 + uint8_t *S2 = ctx->S2; + size_t w = ctx->w; +#endif + size_t i; + DECL_X + DECL_Y + + /* Convert count of 128-byte blocks to max index of 64-byte block */ + r = r * 2 - 1; + +#ifdef PREFETCH + PREFETCH(&Bin2[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + } +#endif + + XOR_X_2(Bin1out[r], Bin2[r]) + + DECL_SMASK2REG + + i = 0; + r--; + do { + XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i]) + PWXFORM + WRITE_X(Bin1out[i]) + + XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1]) + PWXFORM + + if (unlikely(i >= r)) + break; + + WRITE_X(Bin1out[i + 1]) + + i += 2; + } while (1); + i++; + +#if _YESPOWER_OPT_C_PASS_ > 1 + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; +#endif + + SALSA20(Bin1out[i]) + + return INTEGERIFY; +} + +#if _YESPOWER_OPT_C_PASS_ == 1 +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static inline uint32_t integerify(const salsa20_blk_t *B, size_t r) +{ +/* + * Our 64-bit words are in host byte order, which is why we don't just read + * w[0] here (would be wrong on big-endian). Also, our 32-bit words are + * SIMD-shuffled, but we only care about the least significant 32 bits anyway. + */ + return (uint32_t)B[2 * r - 1].d[0]; +} +#endif + +/** + * smix1(B, r, N, V, XY, S): + * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 128r+64 bytes in length. N must be even and at least 4. 
+ * The array V must be aligned to a multiple of 64 bytes, and arrays B and XY + * to a multiple of at least 16 bytes. + */ +static void smix1(uint8_t *B, size_t r, uint32_t N, + salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) +{ + size_t s = 2 * r; + salsa20_blk_t *X = V, *Y = &V[s], *V_j; + uint32_t i, j, n; + +#if _YESPOWER_OPT_C_PASS_ == 1 + for (i = 0; i < 2 * r; i++) { +#else + for (i = 0; i < 2; i++) { +#endif + const salsa20_blk_t *src = (salsa20_blk_t *)&B[i * 64]; + salsa20_blk_t *tmp = Y; + salsa20_blk_t *dst = &X[i]; + size_t k; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + +#if _YESPOWER_OPT_C_PASS_ > 1 + for (i = 1; i < r; i++) + blockmix(&X[(i - 1) * 2], &X[i * 2], 1, ctx); +#endif + + blockmix(X, Y, r, ctx); + X = Y + s; + blockmix(Y, X, r, ctx); + j = integerify(X, r); + + for (n = 2; n < N; n <<= 1) { + uint32_t m = (n < N / 2) ? n : (N - 1 - n); + for (i = 1; i < m; i += 2) { + Y = X + s; + j &= n - 1; + j += i - 1; + V_j = &V[j * s]; + j = blockmix_xor(X, V_j, Y, r, ctx); + j &= n - 1; + j += i; + V_j = &V[j * s]; + X = Y + s; + j = blockmix_xor(Y, V_j, X, r, ctx); + } + } + n >>= 1; + + j &= n - 1; + j += N - 2 - n; + V_j = &V[j * s]; + Y = X + s; + j = blockmix_xor(X, V_j, Y, r, ctx); + j &= n - 1; + j += N - 1 - n; + V_j = &V[j * s]; + blockmix_xor(Y, V_j, XY, r, ctx); + + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = &XY[i]; + salsa20_blk_t *tmp = &XY[s]; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64]; + size_t k; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + +/** + * smix2(B, r, N, Nloop, V, XY, S): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r bytes in length. N must be a power of 2 and at + * least 2. Nloop must be even. 
The array V must be aligned to a multiple of + * 64 bytes, and arrays B and XY to a multiple of at least 16 bytes. + */ +static void smix2(uint8_t *B, size_t r, uint32_t N, uint32_t Nloop, + salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) +{ + size_t s = 2 * r; + salsa20_blk_t *X = XY, *Y = &XY[s]; + uint32_t i, j; + + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (salsa20_blk_t *)&B[i * 64]; + salsa20_blk_t *tmp = Y; + salsa20_blk_t *dst = &X[i]; + size_t k; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + + j = integerify(X, r) & (N - 1); + +#if _YESPOWER_OPT_C_PASS_ == 1 + if (Nloop > 2) { +#endif + do { + salsa20_blk_t *V_j = &V[j * s]; + j = blockmix_xor_save(X, V_j, r, ctx) & (N - 1); + V_j = &V[j * s]; + j = blockmix_xor_save(X, V_j, r, ctx) & (N - 1); + } while (Nloop -= 2); +#if _YESPOWER_OPT_C_PASS_ == 1 + } else { + const salsa20_blk_t * V_j = &V[j * s]; + j = blockmix_xor(X, V_j, Y, r, ctx) & (N - 1); + V_j = &V[j * s]; + blockmix_xor(Y, V_j, X, r, ctx); + } +#endif + + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = &X[i]; + salsa20_blk_t *tmp = Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64]; + size_t k; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + +/** + * smix(B, r, N, V, XY, S): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage + * XY must be 256r bytes in length. N must be a power of 2 and at least 16. + * The array V must be aligned to a multiple of 64 bytes, and arrays B and XY + * to a multiple of at least 16 bytes (aligning them to 64 bytes as well saves + * cache lines, but it might also result in cache bank conflicts). 
+ */ +static void smix(uint8_t *B, size_t r, uint32_t N, + salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) +{ +#if _YESPOWER_OPT_C_PASS_ == 1 + uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */ + uint32_t Nloop_rw = Nloop_all; + + Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */ + Nloop_rw &= ~(uint32_t)1; /* round down to even */ +#else + uint32_t Nloop_rw = (N + 2) / 3; /* 1/3, round up */ + Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */ +#endif + + smix1(B, 1, ctx->Sbytes / 128, (salsa20_blk_t *)ctx->S0, XY, NULL); + smix1(B, r, N, V, XY, ctx); + smix2(B, r, N, Nloop_rw /* must be > 2 */, V, XY, ctx); +#if _YESPOWER_OPT_C_PASS_ == 1 + if (Nloop_all > Nloop_rw) + smix2(B, r, N, 2, V, XY, ctx); +#endif +} + +#if _YESPOWER_OPT_C_PASS_ == 1 +#undef _YESPOWER_OPT_C_PASS_ +#define _YESPOWER_OPT_C_PASS_ 2 +#define blockmix_salsa blockmix_salsa_1_0 +#define blockmix_salsa_xor blockmix_salsa_xor_1_0 +#define blockmix blockmix_1_0 +#define blockmix_xor blockmix_xor_1_0 +#define blockmix_xor_save blockmix_xor_save_1_0 +#define smix1 smix1_1_0 +#define smix2 smix2_1_0 +#define smix smix_1_0 +#include "yespower-opt.c" +#undef smix + +/** + * yespower(local, src, srclen, params, dst): + * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". + * local is the thread-local data structure, allowing to preserve and reuse a + * memory allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. 
 */
int yespower(yespower_local_t *local,
    const uint8_t *src, size_t srclen,
    const yespower_params_t *params,
    yespower_binary_t *dst)
{
	yespower_version_t version = params->version;
	uint32_t N = params->N;
	uint32_t r = params->r;
	const uint8_t *pers = params->pers;
	size_t perslen = params->perslen;
	uint32_t Swidth;
	size_t B_size, V_size, XY_size, need;
	uint8_t *B, *S;
	salsa20_blk_t *V, *XY;
	pwxform_ctx_t ctx;
	uint8_t sha256[32];

	/* Sanity-check parameters */
	if ((version != YESPOWER_0_5 && version != YESPOWER_1_0) ||
	    N < 1024 || N > 512 * 1024 || r < 8 || r > 32 ||
	    (N & (N - 1)) != 0 ||
	    (!pers && perslen)) {
		errno = EINVAL;
		goto fail;
	}

	/* Allocate memory: one region laid out as B | V | XY | S */
	B_size = (size_t)128 * r;
	V_size = B_size * N;
	if (version == YESPOWER_0_5) {
		XY_size = B_size * 2;
		Swidth = Swidth_0_5;
		ctx.Sbytes = 2 * Swidth_to_Sbytes1(Swidth);
	} else {
		XY_size = B_size + 64;
		Swidth = Swidth_1_0;
		ctx.Sbytes = 3 * Swidth_to_Sbytes1(Swidth);
	}
	need = B_size + V_size + XY_size + ctx.Sbytes;
	/* Grow the reusable thread-local region only when it is too small */
	if (local->aligned_size < need) {
		if (free_region(local))
			goto fail;
		if (!alloc_region(local, need))
			goto fail;
	}
	B = (uint8_t *)local->aligned;
	V = (salsa20_blk_t *)((uint8_t *)B + B_size);
	XY = (salsa20_blk_t *)((uint8_t *)V + V_size);
	S = (uint8_t *)XY + XY_size;
	ctx.S0 = S;
	ctx.S1 = S + Swidth_to_Sbytes1(Swidth);

	SHA256_Buf(src, srclen, sha256);

	if (version == YESPOWER_0_5) {
		PBKDF2_SHA256(sha256, sizeof(sha256), src, srclen, 1,
		    B, B_size);
		memcpy(sha256, B, sizeof(sha256));
		smix(B, r, N, V, XY, &ctx);
		PBKDF2_SHA256(sha256, sizeof(sha256), B, B_size, 1,
		    (uint8_t *)dst, sizeof(*dst));

		/* Optional personalization post-processing (v0.5 only) */
		if (pers) {
			HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen,
			    sha256);
			SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst);
		}
	} else {
		ctx.S2 = S + 2 * Swidth_to_Sbytes1(Swidth);
		ctx.w = 0;

		/* v1.0 uses pers (or empty) as the PBKDF2 salt instead */
		if (pers) {
			src = pers;
			srclen = perslen;
		} else {
			srclen = 0;
		}

		/* Only the first 128 bytes are PBKDF2-derived; smix fills the rest */
		PBKDF2_SHA256(sha256, sizeof(sha256), src, srclen, 1, B, 128);
		memcpy(sha256, B, sizeof(sha256));
		smix_1_0(B, r, N, V, XY, &ctx);
		HMAC_SHA256_Buf(B + B_size - 64, 64,
		    sha256, sizeof(sha256), (uint8_t *)dst);
	}

	/* Success! */
	return 0;

fail:
	/* On any failure, fill dst with 0xff so it can never pass "< target" */
	memset(dst, 0xff, sizeof(*dst));
	return -1;
}

/**
 * yespower_tls(src, srclen, params, dst):
 * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target".
 * The memory allocation is maintained internally using thread-local storage.
 *
 * Return 0 on success; or -1 on error.
 */
int yespower_tls(const uint8_t *src, size_t srclen,
    const yespower_params_t *params, yespower_binary_t *dst)
{
	/* Per-thread region, allocated lazily on first use */
	static __thread int initialized = 0;
	static __thread yespower_local_t local;

	if (!initialized) {
		init_region(&local);
		initialized = 1;
	}

	return yespower(&local, src, srclen, params, dst);
}

/* Initialize a caller-managed thread-local region (no allocation yet) */
int yespower_init_local(yespower_local_t *local)
{
	init_region(local);
	return 0;
}

/* Release any memory held by a caller-managed thread-local region */
int yespower_free_local(yespower_local_t *local)
{
	return free_region(local);
}
#endif
diff --git a/yespower-1.0.1/yespower-platform.c b/yespower-1.0.1/yespower-platform.c
new file mode 100644
index 000000000..2b1a03f0f
--- /dev/null
+++ b/yespower-1.0.1/yespower-platform.c
@@ -0,0 +1,107 @@
/*-
 * Copyright 2013-2018 Alexander Peslyak
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef __unix__ +#include +#endif + +#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024) + +#ifdef __x86_64__ +#define HUGEPAGE_SIZE (2 * 1024 * 1024) +#else +#undef HUGEPAGE_SIZE +#endif + +static void *alloc_region(yespower_region_t *region, size_t size) +{ + size_t base_size = size; + uint8_t *base, *aligned; +#ifdef MAP_ANON + int flags = +#ifdef MAP_NOCORE + MAP_NOCORE | +#endif + MAP_ANON | MAP_PRIVATE; +#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE) + size_t new_size = size; + const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1; + if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) { + flags |= MAP_HUGETLB; +/* + * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of + * huge page size, so let's round up to huge page size here. 
+ */ + new_size = size + hugepage_mask; + new_size &= ~hugepage_mask; + } + base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (base != MAP_FAILED) { + base_size = new_size; + } else if (flags & MAP_HUGETLB) { + flags &= ~MAP_HUGETLB; + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + } + +#else + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); +#endif + if (base == MAP_FAILED) + base = NULL; + aligned = base; +#elif defined(HAVE_POSIX_MEMALIGN) + if ((errno = posix_memalign((void **)&base, 64, size)) != 0) + base = NULL; + aligned = base; +#else + base = aligned = NULL; + if (size + 63 < size) { + errno = ENOMEM; + } else if ((base = malloc(size + 63)) != NULL) { + aligned = base + 63; + aligned -= (uintptr_t)aligned & 63; + } +#endif + region->base = base; + region->aligned = aligned; + region->base_size = base ? base_size : 0; + region->aligned_size = base ? size : 0; + return aligned; +} + +static inline void init_region(yespower_region_t *region) +{ + region->base = region->aligned = NULL; + region->base_size = region->aligned_size = 0; +} + +static int free_region(yespower_region_t *region) +{ + if (region->base) { +#ifdef MAP_ANON + if (munmap(region->base, region->base_size)) + return -1; +#else + free(region->base); +#endif + } + init_region(region); + return 0; +} diff --git a/yespower-1.0.1/yespower-ref.c b/yespower-1.0.1/yespower-ref.c new file mode 100644 index 000000000..ad0f607ea --- /dev/null +++ b/yespower-1.0.1/yespower-ref.c @@ -0,0 +1,582 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013-2019 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + * + * This is a proof-of-work focused fork of yescrypt, including reference and + * cut-down implementation of the obsolete yescrypt 0.5 (based off its first + * submission to PHC back in 2014) and a new proof-of-work specific variation + * known as yespower 1.0. The former is intended as an upgrade for + * cryptocurrencies that already use yescrypt 0.5 and the latter may be used + * as a further upgrade (hard fork) by those and other cryptocurrencies. The + * version of algorithm to use is requested through parameters, allowing for + * both algorithms to co-exist in client and miner implementations (such as in + * preparation for a hard-fork). + * + * This is the reference implementation. Its purpose is to provide a simple + * human- and machine-readable specification that implementations intended + * for actual use should be tested against. 
It is deliberately mostly not + * optimized, and it is not meant to be used in production. Instead, use + * yespower-opt.c. + */ + +#warning "This reference implementation is deliberately mostly not optimized. Use yespower-opt.c instead unless you're testing (against) the reference implementation on purpose." + +#include +#include +#include +#include + +#include "sha256.h" +#include "sysendian.h" + +#include "yespower.h" + +static void blkcpy(uint32_t *dst, const uint32_t *src, size_t count) +{ + do { + *dst++ = *src++; + } while (--count); +} + +static void blkxor(uint32_t *dst, const uint32_t *src, size_t count) +{ + do { + *dst++ ^= *src++; + } while (--count); +} + +/** + * salsa20(B): + * Apply the Salsa20 core to the provided block. + */ +static void salsa20(uint32_t B[16], uint32_t rounds) +{ + uint32_t x[16]; + size_t i; + + /* SIMD unshuffle */ + for (i = 0; i < 16; i++) + x[i * 5 % 16] = B[i]; + + for (i = 0; i < rounds; i += 2) { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= 
R(x[14]+x[13],18); +#undef R + } + + /* SIMD shuffle */ + for (i = 0; i < 16; i++) + B[i] += x[i * 5 % 16]; +} + +/** + * blockmix_salsa(B): + * Compute B = BlockMix_{salsa20, 1}(B). The input B must be 128 bytes in + * length. + */ +static void blockmix_salsa(uint32_t *B, uint32_t rounds) +{ + uint32_t X[16]; + size_t i; + + /* 1: X <-- B_{2r - 1} */ + blkcpy(X, &B[16], 16); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < 2; i++) { + /* 3: X <-- H(X xor B_i) */ + blkxor(X, &B[i * 16], 16); + salsa20(X, rounds); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&B[i * 16], X, 16); + } +} + +/* + * These are tunable, but they must meet certain constraints and are part of + * what defines a yespower version. + */ +#define PWXsimple 2 +#define PWXgather 4 +/* Version 0.5 */ +#define PWXrounds_0_5 6 +#define Swidth_0_5 8 +/* Version 1.0 */ +#define PWXrounds_1_0 3 +#define Swidth_1_0 11 + +/* Derived values. Not tunable on their own. */ +#define PWXbytes (PWXgather * PWXsimple * 8) +#define PWXwords (PWXbytes / sizeof(uint32_t)) +#define rmin ((PWXbytes + 127) / 128) + +/* Runtime derived values. Not tunable on their own. */ +#define Swidth_to_Sbytes1(Swidth) ((1 << Swidth) * PWXsimple * 8) +#define Swidth_to_Smask(Swidth) (((1 << Swidth) - 1) * PWXsimple * 8) + +typedef struct { + yespower_version_t version; + uint32_t salsa20_rounds; + uint32_t PWXrounds, Swidth, Sbytes, Smask; + uint32_t *S; + uint32_t (*S0)[2], (*S1)[2], (*S2)[2]; + size_t w; +} pwxform_ctx_t; + +/** + * pwxform(B): + * Transform the provided block using the provided S-boxes. 
 */
static void pwxform(uint32_t *B, pwxform_ctx_t *ctx)
{
	/* View the 64-byte block as PWXgather lanes of PWXsimple 64-bit values */
	uint32_t (*X)[PWXsimple][2] = (uint32_t (*)[PWXsimple][2])B;
	uint32_t (*S0)[2] = ctx->S0, (*S1)[2] = ctx->S1, (*S2)[2] = ctx->S2;
	uint32_t Smask = ctx->Smask;
	size_t w = ctx->w;
	size_t i, j, k;

	/* 1: for i = 0 to PWXrounds - 1 do */
	for (i = 0; i < ctx->PWXrounds; i++) {
		/* 2: for j = 0 to PWXgather - 1 do */
		for (j = 0; j < PWXgather; j++) {
			uint32_t xl = X[j][0][0];
			uint32_t xh = X[j][0][1];
			uint32_t (*p0)[2], (*p1)[2];

			/* 3: p0 <-- (lo(B_{j,0}) & Smask) / (PWXsimple * 8) */
			p0 = S0 + (xl & Smask) / sizeof(*S0);
			/* 4: p1 <-- (hi(B_{j,0}) & Smask) / (PWXsimple * 8) */
			p1 = S1 + (xh & Smask) / sizeof(*S1);

			/* 5: for k = 0 to PWXsimple - 1 do */
			for (k = 0; k < PWXsimple; k++) {
				uint64_t x, s0, s1;

				/* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) xor S1_{p1,k} */
				s0 = ((uint64_t)p0[k][1] << 32) + p0[k][0];
				s1 = ((uint64_t)p1[k][1] << 32) + p1[k][0];

				xl = X[j][k][0];
				xh = X[j][k][1];

				x = (uint64_t)xh * xl;
				x += s0;
				x ^= s1;

				X[j][k][0] = x;
				X[j][k][1] = x >> 32;
			}

			/* v1.0 only: write results back into the S-boxes */
			if (ctx->version != YESPOWER_0_5 &&
			    (i == 0 || j < PWXgather / 2)) {
				if (j & 1) {
					for (k = 0; k < PWXsimple; k++) {
						S1[w][0] = X[j][k][0];
						S1[w][1] = X[j][k][1];
						w++;
					}
				} else {
					/* w itself only advances in the odd-j branch,
					 * so the even-j branch indexes with w + k */
					for (k = 0; k < PWXsimple; k++) {
						S0[w + k][0] = X[j][k][0];
						S0[w + k][1] = X[j][k][1];
					}
				}
			}
		}
	}

	if (ctx->version != YESPOWER_0_5) {
		/* 14: (S0, S1, S2) <-- (S2, S0, S1) */
		ctx->S0 = S2;
		ctx->S1 = S0;
		ctx->S2 = S1;
		/* 15: w <-- w mod 2^Swidth */
		ctx->w = w & ((1 << ctx->Swidth) * PWXsimple - 1);
	}
}

/**
 * blockmix_pwxform(B, ctx, r):
 * Compute B = BlockMix_pwxform{salsa20, ctx, r}(B).  The input B must be
 * 128r bytes in length.
 */
static void blockmix_pwxform(uint32_t *B, pwxform_ctx_t *ctx, size_t r)
{
	uint32_t X[PWXwords];
	size_t r1, i;

	/* Convert 128-byte blocks to PWXbytes blocks */
	/* 1: r_1 <-- 128r / PWXbytes */
	r1 = 128 * r / PWXbytes;

	/* 2: X <-- B'_{r_1 - 1} */
	blkcpy(X, &B[(r1 - 1) * PWXwords], PWXwords);

	/* 3: for i = 0 to r_1 - 1 do */
	for (i = 0; i < r1; i++) {
		/* 4: if r_1 > 1 */
		if (r1 > 1) {
			/* 5: X <-- X xor B'_i */
			blkxor(X, &B[i * PWXwords], PWXwords);
		}

		/* 7: X <-- pwxform(X) */
		pwxform(X, ctx);

		/* 8: B'_i <-- X */
		blkcpy(&B[i * PWXwords], X, PWXwords);
	}

	/* 10: i <-- floor((r_1 - 1) * PWXbytes / 64) */
	i = (r1 - 1) * PWXbytes / 64;

	/* 11: B_i <-- H(B_i) */
	salsa20(&B[i * 16], ctx->salsa20_rounds);

#if 1 /* No-op with our current pwxform settings, but do it to make sure */
	/* 12: for i = i + 1 to 2r - 1 do */
	for (i++; i < 2 * r; i++) {
		/* 13: B_i <-- H(B_i xor B_{i-1}) */
		blkxor(&B[i * 16], &B[(i - 1) * 16], 16);
		salsa20(&B[i * 16], ctx->salsa20_rounds);
	}
#endif
}

/**
 * integerify(B, r):
 * Return the result of parsing B_{2r-1} as a little-endian integer.
 */
static uint32_t integerify(const uint32_t *B, size_t r)
{
/*
 * Our 32-bit words are in host byte order.  Also, they are SIMD-shuffled, but
 * we only care about the least significant 32 bits anyway.
 */
	const uint32_t *X = &B[(2 * r - 1) * 16];
	return X[0];
}

/**
 * p2floor(x):
 * Largest power of 2 not greater than argument.
 */
static uint32_t p2floor(uint32_t x)
{
	uint32_t y;
	/* Repeatedly clear the lowest set bit until one bit remains */
	while ((y = x & (x - 1)))
		x = y;
	return x;
}

/**
 * wrap(x, i):
 * Wrap x to the range 0 to i-1.
 */
static uint32_t wrap(uint32_t x, uint32_t i)
{
	uint32_t n = p2floor(i);
	return (x & (n - 1)) + (i - n);
}

/**
 * smix1(B, r, N, V, X, ctx):
 * Compute first loop of B = SMix_r(B, N).
 The input B must be 128r bytes in
 * length; the temporary storage V must be 128rN bytes in length; the temporary
 * storage X must be 128r bytes in length.
 */
static void smix1(uint32_t *B, size_t r, uint32_t N,
    uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx)
{
	size_t s = 32 * r; /* words per 128r-byte block */
	uint32_t i, j;
	size_t k;

	/* 1: X <-- B (convert from little-endian, undoing the SIMD shuffle) */
	for (k = 0; k < 2 * r; k++)
		for (i = 0; i < 16; i++)
			X[k * 16 + i] = le32dec(&B[k * 16 + (i * 5 % 16)]);

	/* v1.0 only: derive blocks 1..r-1 from block 0 via blockmix */
	if (ctx->version != YESPOWER_0_5) {
		for (k = 1; k < r; k++) {
			blkcpy(&X[k * 32], &X[(k - 1) * 32], 32);
			blockmix_pwxform(&X[k * 32], ctx, 1);
		}
	}

	/* 2: for i = 0 to N - 1 do */
	for (i = 0; i < N; i++) {
		/* 3: V_i <-- X */
		blkcpy(&V[i * s], X, s);

		if (i > 1) {
			/* j <-- Wrap(Integerify(X), i) */
			j = wrap(integerify(X, r), i);

			/* X <-- X xor V_j */
			blkxor(X, &V[j * s], s);
		}

		/* 4: X <-- H(X) -- plain salsa20 blockmix when filling the
		 * S-boxes themselves (V aliases ctx->S in that call) */
		if (V != ctx->S)
			blockmix_pwxform(X, ctx, r);
		else
			blockmix_salsa(X, ctx->salsa20_rounds);
	}

	/* B' <-- X (convert back to little-endian, re-applying the shuffle) */
	for (k = 0; k < 2 * r; k++)
		for (i = 0; i < 16; i++)
			le32enc(&B[k * 16 + (i * 5 % 16)], X[k * 16 + i]);
}

/**
 * smix2(B, r, N, Nloop, V, X, ctx):
 * Compute second loop of B = SMix_r(B, N).  The input B must be 128r bytes in
 * length; the temporary storage V must be 128rN bytes in length; the temporary
 * storage X must be 128r bytes in length.  The value N must be a power of 2
 * greater than 1.
 */
static void smix2(uint32_t *B, size_t r, uint32_t N, uint32_t Nloop,
    uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx)
{
	size_t s = 32 * r; /* words per 128r-byte block */
	uint32_t i, j;
	size_t k;

	/* X <-- B (convert from little-endian, undoing the SIMD shuffle) */
	for (k = 0; k < 2 * r; k++)
		for (i = 0; i < 16; i++)
			X[k * 16 + i] = le32dec(&B[k * 16 + (i * 5 % 16)]);

	/* 6: for i = 0 to N - 1 do */
	for (i = 0; i < Nloop; i++) {
		/* 7: j <-- Integerify(X) mod N */
		j = integerify(X, r) & (N - 1);

		/* 8.1: X <-- X xor V_j */
		blkxor(X, &V[j * s], s);
		/* V_j <-- X (skip the write-back on the short read-only pass) */
		if (Nloop != 2)
			blkcpy(&V[j * s], X, s);

		/* 8.2: X <-- H(X) */
		blockmix_pwxform(X, ctx, r);
	}

	/* 10: B' <-- X (convert back to little-endian, re-applying the shuffle) */
	for (k = 0; k < 2 * r; k++)
		for (i = 0; i < 16; i++)
			le32enc(&B[k * 16 + (i * 5 % 16)], X[k * 16 + i]);
}

/**
 * smix(B, r, N, p, t, V, X, ctx):
 * Compute B = SMix_r(B, N).  The input B must be 128rp bytes in length; the
 * temporary storage V must be 128rN bytes in length; the temporary storage
 * X must be 128r bytes in length.  The value N must be a power of 2 and at
 * least 16.
 */
static void smix(uint32_t *B, size_t r, uint32_t N,
    uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx)
{
	uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */
	uint32_t Nloop_rw = Nloop_all;

	Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */
	if (ctx->version == YESPOWER_0_5) {
		Nloop_rw &= ~(uint32_t)1; /* round down to even */
	} else {
		Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */
	}

	/* Seed the pwxform S-boxes from B, then run the two SMix loops */
	smix1(B, 1, ctx->Sbytes / 128, ctx->S, X, ctx);
	smix1(B, r, N, V, X, ctx);
	smix2(B, r, N, Nloop_rw /* must be > 2 */, V, X, ctx);
	smix2(B, r, N, Nloop_all - Nloop_rw /* 0 or 2 */, V, X, ctx);
}

/**
 * yespower(local, src, srclen, params, dst):
 * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target".
 *
 * Return 0 on success; or -1 on error.
 */
int yespower(yespower_local_t *local,
    const uint8_t *src, size_t srclen,
    const yespower_params_t *params, yespower_binary_t *dst)
{
	yespower_version_t version = params->version;
	uint32_t N = params->N;
	uint32_t r = params->r;
	const uint8_t *pers = params->pers;
	size_t perslen = params->perslen;
	int retval = -1;
	size_t B_size, V_size;
	uint32_t *B, *V, *X, *S;
	pwxform_ctx_t ctx;
	uint32_t sha256[8];

	/* Pre-fill dst with 0xff so a failed call can never pass "< target" */
	memset(dst, 0xff, sizeof(*dst));

	/* Sanity-check parameters */
	if ((version != YESPOWER_0_5 && version != YESPOWER_1_0) ||
	    N < 1024 || N > 512 * 1024 || r < 8 || r > 32 ||
	    (N & (N - 1)) != 0 || r < rmin ||
	    (!pers && perslen)) {
		errno = EINVAL;
		return -1;
	}

	/* Allocate memory */
	B_size = (size_t)128 * r;
	V_size = B_size * N;
	if ((V = malloc(V_size)) == NULL)
		return -1;
	if ((B = malloc(B_size)) == NULL)
		goto free_V;
	if ((X = malloc(B_size)) == NULL)
		goto free_B;
	ctx.version = version;
	if (version == YESPOWER_0_5) {
		ctx.salsa20_rounds = 8;
		ctx.PWXrounds = PWXrounds_0_5;
		ctx.Swidth = Swidth_0_5;
		ctx.Sbytes = 2 * Swidth_to_Sbytes1(ctx.Swidth);
	} else {
		ctx.salsa20_rounds = 2;
		ctx.PWXrounds = PWXrounds_1_0;
		ctx.Swidth = Swidth_1_0;
		ctx.Sbytes = 3 * Swidth_to_Sbytes1(ctx.Swidth);
	}
	if ((S = malloc(ctx.Sbytes)) == NULL)
		goto free_X;
	ctx.S = S;
	ctx.S0 = (uint32_t (*)[2])S;
	ctx.S1 = ctx.S0 + (1 << ctx.Swidth) * PWXsimple;
	ctx.S2 = ctx.S1 + (1 << ctx.Swidth) * PWXsimple;
	ctx.Smask = Swidth_to_Smask(ctx.Swidth);
	ctx.w = 0;

	SHA256_Buf(src, srclen, (uint8_t *)sha256);

	/* v1.0 uses pers (or empty) as the PBKDF2 salt instead of src */
	if (version != YESPOWER_0_5) {
		if (pers) {
			src = pers;
			srclen = perslen;
		} else {
			srclen = 0;
		}
	}

	/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
	PBKDF2_SHA256((uint8_t *)sha256, sizeof(sha256),
	    src, srclen, 1, (uint8_t *)B, B_size);

	blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0]));

	/* 3: B_i <-- MF(B_i, N) */
	smix(B, r, N, V, X, &ctx);

	if (version == YESPOWER_0_5) {
		/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
		PBKDF2_SHA256((uint8_t *)sha256, sizeof(sha256),
		    (uint8_t *)B, B_size, 1, (uint8_t *)dst, sizeof(*dst));

		/* Optional personalization post-processing (v0.5 only) */
		if (pers) {
			HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen,
			    (uint8_t *)sha256);
			SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst);
		}
	} else {
		HMAC_SHA256_Buf((uint8_t *)B + B_size - 64, 64,
		    sha256, sizeof(sha256), (uint8_t *)dst);
	}

	/* Success! */
	retval = 0;

	/* Free memory */
	free(S);
free_X:
	free(X);
free_B:
	free(B);
free_V:
	free(V);

	return retval;
}

int yespower_tls(const uint8_t *src, size_t srclen,
    const yespower_params_t *params, yespower_binary_t *dst)
{
/* The reference implementation doesn't use thread-local storage */
	return yespower(NULL, src, srclen, params, dst);
}

int yespower_init_local(yespower_local_t *local)
{
/* The reference implementation doesn't use the local structure */
	local->base = local->aligned = NULL;
	local->base_size = local->aligned_size = 0;
	return 0;
}

int yespower_free_local(yespower_local_t *local)
{
/* The reference implementation frees its memory in yespower() */
	(void)local; /* unused */
	return 0;
}
diff --git a/yespower-1.0.1/yespower.h b/yespower-1.0.1/yespower.h
new file mode 100644
index 000000000..b388d7217
--- /dev/null
+++ b/yespower-1.0.1/yespower.h
@@ -0,0 +1,130 @@
/*-
 * Copyright 2009 Colin Percival
 * Copyright 2013-2018 Alexander Peslyak
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _YESPOWER_H_ +#define _YESPOWER_H_ + +#include +#include /* for size_t */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Internal type used by the memory allocator. Please do not use it directly. + * Use yespower_local_t instead. + */ +typedef struct { + void *base, *aligned; + size_t base_size, aligned_size; +} yespower_region_t; + +/** + * Type for thread-local (RAM) data structure. + */ +typedef yespower_region_t yespower_local_t; + +/* + * Type for yespower algorithm version numbers. + */ +typedef enum { YESPOWER_0_5 = 5, YESPOWER_1_0 = 10 } yespower_version_t; + +/** + * yespower parameters combined into one struct. 
+ */ +typedef struct { + yespower_version_t version; + uint32_t N, r; + const uint8_t *pers; + size_t perslen; +} yespower_params_t; + +/** + * A 256-bit yespower hash. + */ +typedef struct { + unsigned char uc[32]; +} yespower_binary_t; + +/** + * yespower_init_local(local): + * Initialize the thread-local (RAM) data structure. Actual memory allocation + * is currently fully postponed until a call to yespower(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yespower_init_local(yespower_local_t *local); + +/** + * yespower_free_local(local): + * Free memory that may have been allocated for an initialized thread-local + * (RAM) data structure. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yespower_free_local(yespower_local_t *local); + +/** + * yespower(local, src, srclen, params, dst): + * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". + * local is the thread-local data structure, allowing to preserve and reuse a + * memory allocation across calls, thereby reducing processing overhead. + * + * Return 0 on success; or -1 on error. + * + * local must be initialized with yespower_init_local(). + * + * MT-safe as long as local and dst are local to the thread. + */ +extern int yespower(yespower_local_t *local, + const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst); + +/** + * yespower_tls(src, srclen, params, dst): + * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". + * The memory allocation is maintained internally using thread-local storage. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as dst is local to the thread. + */ +extern int yespower_tls(const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst); + +#ifdef __cplusplus +} +#endif + +#endif /* !_YESPOWER_H_ */