diff --git a/Makefile.am b/Makefile.am index 4398988cd..07feb08d1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -57,6 +57,8 @@ cpuminer_SOURCES = \ lyra2/Lyra2.c lyra2/Sponge.c \ yescrypt/yescrypt-common.c yescrypt/yescrypt-best.c \ yescrypt/sha256_Y.c \ + yespower-1.0.1/sha256.c \ + yespower-1.0.1/yespower-opt.c \ algo/allium.c \ algo/axiom.c \ algo/bastion.c \ @@ -80,6 +82,7 @@ cpuminer_SOURCES = \ algo/lyra2re.c \ algo/lyra2rev2.c \ algo/lyra2v3.c \ + algo/minotaur.c \ algo/myr-groestl.c \ algo/keccak.c \ algo/pentablake.c \ diff --git a/README.md b/README.md index 315a84a83..6631e70d4 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,8 @@ Algorithms * ✓ __lyra2RE__ (Cryptocoin) * ✓ __lyra2REv2__ * ✓ __lyra2REv3__ (VertCoin [VTC]) + * ✓ __minotaur__ (Ring [RNG]) + * ✓ __minotaurx__ (Avian[AVN], Litecash[LCC], Mazacoin[MAZA]) * ✓ __myr-gr__ Myriad-Groestl (MyriadCoin [MYR]) * ✓ __neoscrypt__ (Feathercoin) * ✓ __nist5__ (MistCoin [MIC], TalkCoin [TAC], ...) diff --git a/algo/minotaur.c b/algo/minotaur.c new file mode 100644 index 000000000..714c7a7db --- /dev/null +++ b/algo/minotaur.c @@ -0,0 +1,290 @@ +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Config +#define MINOTAUR_ALGO_COUNT 16 +//#define MINOTAUR_DEBUG + +static const yespower_params_t yespower_params = {YESPOWER_1_0, 2048, 8, "et in arcadia ego", 17}; + +typedef struct TortureNode TortureNode; +typedef struct TortureGarden TortureGarden; + +// Graph of hash algos plus SPH contexts +struct TortureGarden { + sph_blake512_context context_blake; + sph_bmw512_context context_bmw; + sph_cubehash512_context context_cubehash; + sph_echo512_context context_echo; + sph_fugue512_context context_fugue; + sph_groestl512_context context_groestl; + sph_hamsi512_context context_hamsi; + sph_jh512_context context_jh; + 
sph_keccak512_context context_keccak; + sph_luffa512_context context_luffa; + sph_shabal512_context context_shabal; + sph_shavite512_context context_shavite; + sph_simd512_context context_simd; + sph_skein512_context context_skein; + sph_whirlpool_context context_whirlpool; + sph_sha512_context context_sha2; + + struct TortureNode { + unsigned int algo; + TortureNode *childLeft; + TortureNode *childRight; + } nodes[22]; +}; + +// Get a 64-byte hash for given 64-byte input, using given TortureGarden contexts and given algo index +void get_hash(void *output, const void *input, TortureGarden *garden, unsigned int algo) +{ + unsigned char _ALIGN(64) hash[64]; + memset(hash, 0, sizeof(hash)); // Doesn't affect Minotaur as all hash outputs are 64 bytes; required for MinotaurX due to yespower's 32 byte output. + + switch (algo) { + case 0: + sph_blake512_init(&garden->context_blake); + sph_blake512(&garden->context_blake, input, 64); + sph_blake512_close(&garden->context_blake, hash); + break; + case 1: + sph_bmw512_init(&garden->context_bmw); + sph_bmw512(&garden->context_bmw, input, 64); + sph_bmw512_close(&garden->context_bmw, hash); + break; + case 2: + sph_cubehash512_init(&garden->context_cubehash); + sph_cubehash512(&garden->context_cubehash, input, 64); + sph_cubehash512_close(&garden->context_cubehash, hash); + break; + case 3: + sph_echo512_init(&garden->context_echo); + sph_echo512(&garden->context_echo, input, 64); + sph_echo512_close(&garden->context_echo, hash); + break; + case 4: + sph_fugue512_init(&garden->context_fugue); + sph_fugue512(&garden->context_fugue, input, 64); + sph_fugue512_close(&garden->context_fugue, hash); + break; + case 5: + sph_groestl512_init(&garden->context_groestl); + sph_groestl512(&garden->context_groestl, input, 64); + sph_groestl512_close(&garden->context_groestl, hash); + break; + case 6: + sph_hamsi512_init(&garden->context_hamsi); + sph_hamsi512(&garden->context_hamsi, input, 64); + sph_hamsi512_close(&garden->context_hamsi, 
hash); + break; + case 7: + sph_sha512_init(&garden->context_sha2); + sph_sha512(&garden->context_sha2, input, 64); + sph_sha512_close(&garden->context_sha2, hash); + break; + case 8: + sph_jh512_init(&garden->context_jh); + sph_jh512(&garden->context_jh, input, 64); + sph_jh512_close(&garden->context_jh, hash); + break; + case 9: + sph_keccak512_init(&garden->context_keccak); + sph_keccak512(&garden->context_keccak, input, 64); + sph_keccak512_close(&garden->context_keccak, hash); + break; + case 10: + sph_luffa512_init(&garden->context_luffa); + sph_luffa512(&garden->context_luffa, input, 64); + sph_luffa512_close(&garden->context_luffa, hash); + break; + case 11: + sph_shabal512_init(&garden->context_shabal); + sph_shabal512(&garden->context_shabal, input, 64); + sph_shabal512_close(&garden->context_shabal, hash); + break; + case 12: + sph_shavite512_init(&garden->context_shavite); + sph_shavite512(&garden->context_shavite, input, 64); + sph_shavite512_close(&garden->context_shavite, hash); + break; + case 13: + sph_simd512_init(&garden->context_simd); + sph_simd512(&garden->context_simd, input, 64); + sph_simd512_close(&garden->context_simd, hash); + break; + case 14: + sph_skein512_init(&garden->context_skein); + sph_skein512(&garden->context_skein, input, 64); + sph_skein512_close(&garden->context_skein, hash); + break; + case 15: + sph_whirlpool_init(&garden->context_whirlpool); + sph_whirlpool(&garden->context_whirlpool, input, 64); + sph_whirlpool_close(&garden->context_whirlpool, hash); + break; + // NB: The CPU-hard gate must be case MINOTAUR_ALGO_COUNT. + case 16: + yespower_tls(input, 64, &yespower_params, (yespower_binary_t*)hash); + } + + // Output the hash + memcpy(output, hash, 64); +} + +// Recursively traverse a given torture garden starting with a given hash and given node within the garden. The hash is overwritten with the final hash. 
+void traverse_garden(TortureGarden *garden, void *hash, TortureNode *node) +{ + unsigned char _ALIGN(64) partialHash[64]; + memset(partialHash, 0, sizeof(partialHash)); // Doesn't affect Minotaur as all hash outputs are 64 bytes; required for MinotaurX due to yespower's 32 byte output. + get_hash(partialHash, hash, garden, node->algo); + +#ifdef MINOTAUR_DEBUG + printf("* Ran algo %d. Partial hash:\t", node->algo); + for (int i = 63; i >= 0; i--) printf("%02x", partialHash[i]); + printf("\n"); + fflush(0); +#endif + + if (partialHash[63] % 2 == 0) { // Last byte of output hash is even + if (node->childLeft != NULL) + traverse_garden(garden, partialHash, node->childLeft); + } else { // Last byte of output hash is odd + if (node->childRight != NULL) + traverse_garden(garden, partialHash, node->childRight); + } + + memcpy(hash, partialHash, 64); +} + +// Associate child nodes with a parent node +inline void link_nodes(TortureNode *parent, TortureNode *childLeft, TortureNode *childRight) +{ + parent->childLeft = childLeft; + parent->childRight = childRight; +} + +// Produce a 32-byte hash from 80-byte input data +void minotaurhash(void *output, const void *input, bool minotaurX) +{ + // Create torture garden nodes. Note that both sides of 19 and 20 lead to 21, and 21 has no children (to make traversal complete). + // The successful path through the garden visits 7 nodes. 
+ TortureGarden garden; + link_nodes(&garden.nodes[0], &garden.nodes[1], &garden.nodes[2]); + link_nodes(&garden.nodes[1], &garden.nodes[3], &garden.nodes[4]); + link_nodes(&garden.nodes[2], &garden.nodes[5], &garden.nodes[6]); + link_nodes(&garden.nodes[3], &garden.nodes[7], &garden.nodes[8]); + link_nodes(&garden.nodes[4], &garden.nodes[9], &garden.nodes[10]); + link_nodes(&garden.nodes[5], &garden.nodes[11], &garden.nodes[12]); + link_nodes(&garden.nodes[6], &garden.nodes[13], &garden.nodes[14]); + link_nodes(&garden.nodes[7], &garden.nodes[15], &garden.nodes[16]); + link_nodes(&garden.nodes[8], &garden.nodes[15], &garden.nodes[16]); + link_nodes(&garden.nodes[9], &garden.nodes[15], &garden.nodes[16]); + link_nodes(&garden.nodes[10], &garden.nodes[15], &garden.nodes[16]); + link_nodes(&garden.nodes[11], &garden.nodes[17], &garden.nodes[18]); + link_nodes(&garden.nodes[12], &garden.nodes[17], &garden.nodes[18]); + link_nodes(&garden.nodes[13], &garden.nodes[17], &garden.nodes[18]); + link_nodes(&garden.nodes[14], &garden.nodes[17], &garden.nodes[18]); + link_nodes(&garden.nodes[15], &garden.nodes[19], &garden.nodes[20]); + link_nodes(&garden.nodes[16], &garden.nodes[19], &garden.nodes[20]); + link_nodes(&garden.nodes[17], &garden.nodes[19], &garden.nodes[20]); + link_nodes(&garden.nodes[18], &garden.nodes[19], &garden.nodes[20]); + link_nodes(&garden.nodes[19], &garden.nodes[21], &garden.nodes[21]); + link_nodes(&garden.nodes[20], &garden.nodes[21], &garden.nodes[21]); + garden.nodes[21].childLeft = NULL; + garden.nodes[21].childRight = NULL; + + // Find initial sha512 hash + unsigned char _ALIGN(64) hash[64]; + sph_sha512_init(&garden.context_sha2); + sph_sha512(&garden.context_sha2, input, 80); + sph_sha512_close(&garden.context_sha2, hash); + +#ifdef MINOTAUR_DEBUG + printf("** Initial hash:\t\t"); + for (int i = 63; i >= 0; i--) printf("%02x", hash[i]); + printf("\n"); + fflush(0); +#endif + + // Assign algos to torture garden nodes based on initial hash + 
for (int i = 0; i < 22; i++) + garden.nodes[i].algo = hash[i] % MINOTAUR_ALGO_COUNT; + + // Hardened garden gates on MinotaurX + if (minotaurX) + garden.nodes[21].algo = MINOTAUR_ALGO_COUNT; + + // Send the initial hash through the torture garden + traverse_garden(&garden, hash, &garden.nodes[0]); + + // Truncate the result to 32 bytes + memcpy(output, hash, 32); + +#ifdef MINOTAUR_DEBUG + printf("** Final hash:\t\t\t"); + for (int i = 31; i >= 0; i--) printf("%02x", hash[i]); + printf("\n"); + fflush(0); +#endif +} + +// Scan driver +int scanhash_minotaur(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, bool minotaurX) +{ + uint32_t _ALIGN(64) hash[8]; + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t Htarg = ptarget[7]; + const uint32_t first_nonce = pdata[19]; + uint32_t nonce = first_nonce; + volatile uint8_t *restart = &(work_restart[thr_id].restart); + + if (opt_benchmark) + ptarget[7] = 0x0cff; + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + do { + be32enc(&endiandata[19], nonce); + minotaurhash(hash, endiandata, minotaurX); + + if (hash[7] <= Htarg && fulltest(hash, ptarget)) { + work_set_target_ratio(work, hash); + pdata[19] = nonce; + *hashes_done = pdata[19] - first_nonce; + return 1; + } + nonce++; + + } while (nonce < max_nonce && !(*restart)); + + pdata[19] = nonce; + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +} diff --git a/cpu-miner.c b/cpu-miner.c index 5e8948d80..b59db1795 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -105,6 +105,8 @@ enum algos { ALGO_LYRA2, /* Lyra2RE */ ALGO_LYRA2REV2, /* Lyra2REv2 */ ALGO_LYRA2V3, /* Lyra2REv3 (Vertcoin) */ + ALGO_MINOTAUR, /* Minotaur (Ring) */ + ALGO_MINOTAURX, /* Minotaurx (Avian, Litecash, Maza) */ ALGO_MYR_GR, /* Myriad Groestl */ ALGO_NIST5, /* Nist5 */ ALGO_PENTABLAKE, /* Pentablake */ @@ -177,6 +179,8 @@ static const char *algo_names[] = { "lyra2re", "lyra2rev2", 
"lyra2v3", + "minotaur", + "minotaurx", "myr-gr", "nist5", "pentablake", @@ -347,6 +351,8 @@ Options:\n\ lyra2rev2 Lyra2REv2\n\ lyra2v3 Lyra2REv3 (Vertcoin)\n\ myr-gr Myriad-Groestl\n\ + minotaur Ring\n\ + minotaurx Avian, Litecash, Maza\n\ neoscrypt NeoScrypt(128, 2, 1)\n\ nist5 Nist5\n\ pluck Pluck:128 (Supcoin)\n\ @@ -2200,6 +2206,8 @@ static void *miner_thread(void *userdata) max64 = 0x40LL; break; case ALGO_DROP: + case ALGO_MINOTAUR: + case ALGO_MINOTAURX: case ALGO_PLUCK: case ALGO_YESCRYPT: case ALGO_YESCRYPTR8: @@ -2355,6 +2363,12 @@ static void *miner_thread(void *userdata) case ALGO_LYRA2V3: rc = scanhash_lyra2v3(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_MINOTAUR: + rc = scanhash_minotaur(thr_id, &work, max_nonce, &hashes_done, false); + break; + case ALGO_MINOTAURX: + rc = scanhash_minotaur(thr_id, &work, max_nonce, &hashes_done, true); + break; case ALGO_MYR_GR: rc = scanhash_myriad(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/miner.h b/miner.h index 34574030f..59dddc18e 100644 --- a/miner.h +++ b/miner.h @@ -223,6 +223,7 @@ int scanhash_luffa(int thr_id, struct work *work, uint32_t max_nonce, uint64_t * int scanhash_lyra2(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done); int scanhash_lyra2rev2(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done); int scanhash_lyra2v3(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done); +int scanhash_minotaur(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, bool minotaurX); int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done); int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done, uint32_t profile); int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done); @@ -531,6 +532,7 @@ void luffahash(void *output, const void *input); void lyra2_hash(void *state, const void *input); void 
lyra2rev2_hash(void *state, const void *input); void lyra2v3_hash(void *state, const void *input); +void minotaurhash(void *output, const void *input, bool minotaurX); void myriadhash(void *output, const void *input); void neoscrypt(unsigned char *output, const unsigned char *password, uint32_t profile); void nist5hash(void *output, const void *input); diff --git a/util.c b/util.c index 5ba320b53..2ef847164 100644 --- a/util.c +++ b/util.c @@ -2401,6 +2401,12 @@ void print_hash_tests(void) cryptonight_hash(&hash[0], &buf[0]); printpfx("monero", hash); + minotaurhash(&hash[0], &buf[0], false); + printpfx("minotaur", hash); + + minotaurhash(&hash[0], &buf[0], true); + printpfx("minotaurx", hash); + myriadhash(&hash[0], &buf[0]); printpfx("myr-gr", hash); diff --git a/yespower-1.0.1/CHANGES b/yespower-1.0.1/CHANGES new file mode 100644 index 000000000..82420210c --- /dev/null +++ b/yespower-1.0.1/CHANGES @@ -0,0 +1,18 @@ + Changes made between 1.0.0 (2018/07/12) and 1.0.1 (2019/06/30). + +Fill the destination buffer with all set bits on error for fail-safety +of the caller's "< target" check in case the caller neglects to check +for errors. + +Simplified SMix2 for its final invocation with Nloop=2 in yespower 0.5. + +Revised the "XOR of yespower" tests to trigger duplicate index in the +last SMix2 invocation in yespower 0.5 for N=2048 with at least one of +the values of r being tested. This is needed to test that a proper +kind of BlockMix is used in that special case, which would previously be +left untested. + +Added x32 ABI support (x86-64 with 32-bit pointers). + +Added a bit more detail to the README on caching of the computed PoW +hashes when integrating yespower in an altcoin based on Bitcoin Core. 
diff --git a/yespower-1.0.1/PERFORMANCE b/yespower-1.0.1/PERFORMANCE new file mode 100644 index 000000000..99cddd051 --- /dev/null +++ b/yespower-1.0.1/PERFORMANCE @@ -0,0 +1,95 @@ +Included with yespower is the "benchmark" program, which is built by +simply invoking "make". When invoked without parameters, it tests +yespower 0.5 at N = 2048, r = 8, which appears to be the lowest setting +in use by existing cryptocurrencies. On an i7-4770K with 4x DDR3-1600 +(on two memory channels) running CentOS 7 for x86-64 (and built with +CentOS 7's default version of gcc) and with thread affinity set, this +reports between 3700 and 3800 hashes per second for both SSE2 and AVX +builds, e.g.: + +$ GOMP_CPU_AFFINITY=0-7 OMP_NUM_THREADS=4 ./benchmark +version=0.5 N=2048 r=8 +Will use 2048.00 KiB RAM +a5 9f ec 4c 4f dd a1 6e 3b 14 05 ad da 66 d5 25 b6 8e 7c ad fc fe 6a c0 66 c7 ad 11 8c d8 05 90 +Benchmarking 1 thread ... +1018 H/s real, 1018 H/s virtual (2047 hashes in 2.01 seconds) +Benchmarking 4 threads ... +3773 H/s real, 950 H/s virtual (8188 hashes in 2.17 seconds) +min 0.984 ms, avg 1.052 ms, max 1.074 ms + +Running 8 threads (to match the logical rather than the physical CPU +core count) results in very slightly worse performance on this system, +but this might be the other way around on another and/or with other +parameters. Upgrading to yespower 1.0, performance at these parameters +improves to almost 4000 hashes per second: + +$ GOMP_CPU_AFFINITY=0-7 OMP_NUM_THREADS=4 ./benchmark 10 +version=1.0 N=2048 r=8 +Will use 2048.00 KiB RAM +d0 78 cd d4 cf 3f 5a a8 4e 3c 4a 58 66 29 81 d8 2d 27 e5 67 36 37 c4 be 77 63 61 32 24 c1 8a 93 +Benchmarking 1 thread ... +1080 H/s real, 1080 H/s virtual (4095 hashes in 3.79 seconds) +Benchmarking 4 threads ... 
+3995 H/s real, 1011 H/s virtual (16380 hashes in 4.10 seconds) +min 0.923 ms, avg 0.989 ms, max 1.137 ms + +Running 8 threads results in substantial slowdown with this new version +(to between 3200 and 3400 hashes per second) because of cache thrashing. + +For higher settings such as those achieving 8 MiB instead of the 2 MiB +above, this system performs at around 800 hashes per second for yespower +0.5 and at around 830 hashes per second for yespower 1.0: + +$ GOMP_CPU_AFFINITY=0-7 OMP_NUM_THREADS=4 ./benchmark 5 2048 32 +version=0.5 N=2048 r=32 +Will use 8192.00 KiB RAM +56 0a 89 1b 5c a2 e1 c6 36 11 1a 9f f7 c8 94 a5 d0 a2 60 2f 43 fd cf a5 94 9b 95 e2 2f e4 46 1e +Benchmarking 1 thread ... +265 H/s real, 265 H/s virtual (1023 hashes in 3.85 seconds) +Benchmarking 4 threads ... +803 H/s real, 200 H/s virtual (4092 hashes in 5.09 seconds) +min 4.924 ms, avg 4.980 ms, max 5.074 ms + +$ GOMP_CPU_AFFINITY=0-7 OMP_NUM_THREADS=4 ./benchmark 10 2048 32 +version=1.0 N=2048 r=32 +Will use 8192.00 KiB RAM +f7 69 26 ae 4a dc 56 53 90 2f f0 22 78 ea aa 39 eb 99 84 11 ac 3e a6 24 2e 19 6d fb c4 3d 68 25 +Benchmarking 1 thread ... +275 H/s real, 275 H/s virtual (1023 hashes in 3.71 seconds) +Benchmarking 4 threads ... +831 H/s real, 209 H/s virtual (4092 hashes in 4.92 seconds) +min 3.614 ms, avg 4.769 ms, max 5.011 ms + +Again, running 8 threads results in a slowdown, albeit not as bad as can +be seen for lower settings. + +On x86(-64), the following code versions may reasonably be built: SSE2, +AVX, and XOP. (There's no reason to build for AVX2 and higher, which is +unsuitable for and thus unused by current yespower anyway. There's also +no reason to build yespower as-is for SSE4, although there's a disabled +by default 32-bit specific SSE4 code version that may be re-enabled and +given a try if someone is so inclined; it may perform slightly slower or +slightly faster across different systems.) 
+ +yescrypt and especially yespower 1.0 have been designed to fit the SSE2 +instruction set almost perfectly, so there's very little benefit from +the AVX and XOP builds, yet even at yespower 1.0 there may be +performance differences between SSE2, AVX, and XOP builds within 2% or +so (and it is unclear which is the fastest on a given system until +tested, except that where XOP is supported it is almost always faster +than AVX). + +Proper setting of thread affinities to run exactly one thread per +physical CPU core is non-trivial. In the above examples, it so happened +that the first 4 logical CPU numbers corresponded to different physical +cores, but this won't always be the case. This can vary even between +apparently similar systems. On Linux, the mapping of logical CPUs to +physical cores may be obtained from /proc/cpuinfo (on x86[-64] and MIC) +or sysfs, which an optimized implementation of e.g. a cryptocurrency +miner could use. If you do not bother obtaining this information from +the operating system, you might be better off not setting thread +affinities at all (in order to avoid the risk of doing this incorrectly, +which would have a greater negative performance impact) and/or running +as many threads as there are logical CPUs. Also, there's no certainty +whether different and future CPUs will run yespower faster using one or +maybe more threads per physical core. diff --git a/yespower-1.0.1/README b/yespower-1.0.1/README new file mode 100644 index 000000000..c7a704db7 --- /dev/null +++ b/yespower-1.0.1/README @@ -0,0 +1,203 @@ + What is yespower? + +yespower is a proof-of-work (PoW) focused fork of yescrypt. While +yescrypt is a password-based key derivation function (KDF) and password +hashing scheme, and thus is meant for processing passwords, yespower is +meant for processing trial inputs such as block headers (including +nonces) in PoW-based blockchains. + +On its own, yespower isn't a complete proof-of-work system. 
Rather, in +the blockchain use case, yespower's return value is meant to be checked +for being numerically no greater than the blockchain's current target +(which is related to mining difficulty) or else the proof attempt +(yespower invocation) is to be repeated (with a different nonce) until +the condition is finally met (allowing a new block to be mined). This +process isn't specific to yespower and isn't part of yespower itself +(rather, it is similar in many PoW-based blockchains and is to be +defined and implemented externally to yespower) and thus isn't described +in here any further. + + + Why or why not yespower? + +Different proof-of-work schemes in existence vary in many aspects, +including in friendliness to different types of hardware. There's +demand for all sorts of hardware (un)friendliness in those - for +different use cases and by different communities. + +yespower in particular is designed to be CPU-friendly, GPU-unfriendly, +and FPGA/ASIC-neutral. In other words, it's meant to be relatively +efficient to compute on current CPUs and relatively inefficient on +current GPUs. Unfortunately, being GPU-unfriendly also means that +eventual FPGA and ASIC implementations will only compete with CPUs, and +at least ASICs will win over the CPUs (FPGAs might not because of this +market's peculiarities - large FPGAs are even more "over-priced" than +large CPUs are), albeit by far not to the extent they did e.g. for +Bitcoin and Litecoin. + +There's a lot of talk about "ASIC resistance". What is (or should be) +meant by that is limiting the advantage of specialized ASICs. While +limiting the advantage at KDF to e.g. 10x and at password hashing to +e.g. 100x (talking orders of magnitude here, in whatever terms) may be +considered "ASIC resistant" (as compared to e.g. 
100,000x we'd have +without trying), similar improvement factors are practically not "ASIC +resistant" for cryptocurrency mining where they can make all the +difference between CPU mining being profitable and not. There might +also exist in-between PoW use cases where moderate ASIC advantage is OK, +such as with non-cryptocurrency and/or private/permissioned blockchains. + +Thus, current yespower may be considered either a short-term choice +(valid until one of its uses provides sufficient perceived incentive to +likely result in specialized ASICs) or a deliberate choice of a pro-CPU, +anti-GPU, moderately-pro-ASIC PoW scheme. It is also possible to +respond to known improvements in future GPUs/implementations and/or to +ASICs with new versions of yespower that users would need to switch to. + + + yespower versions. + +yespower includes optimized and specialized re-implementation of the +obsolete yescrypt 0.5 (based off its first submission to Password +Hashing Competition back in 2014) now re-released as yespower 0.5, and +brand new proof-of-work specific variation known as yespower 1.0. + +yespower 0.5 is intended as a compatible upgrade for cryptocurrencies +that already use yescrypt 0.5 (providing a few percent speedup), and +yespower 1.0 may be used as a further upgrade or a new choice of PoW by +those and other cryptocurrencies and other projects. + +There are many significant differences between yespower 0.5 and 1.0 +under the hood, but the main user visible difference is yespower 1.0 +greatly improving on GPU-unfriendliness in light of improvements seen in +modern GPUs (up to and including NVIDIA Volta) and GPU implementations +of yescrypt 0.5. This is achieved mostly through greater use of CPUs' +L2 cache. 
+ +The version of algorithm to use is requested through parameters, +allowing for both algorithms to co-exist in client and miner +implementations (such as in preparation for a cryptocurrency hard-fork +and/or supporting multiple cryptocurrencies in one program). + + + Parameter selection. + +For new uses of yespower, set the requested version to the highest +supported, and set N*r to the highest you can reasonably afford in terms +of proof verification time (which might in turn be determined by desired +share rate per mining pool server), using one of the following options: + +1 MiB: N = 1024, r = 8 +2 MiB: N = 2048, r = 8 +4 MiB: N = 1024, r = 32 +8 MiB: N = 2048, r = 32 +16 MiB: N = 4096, r = 32 + +and so on for higher N keeping r=32. + +You may also set the personalization string to your liking, but that is +not required (you can set its pointer to NULL and its length to 0). Its +support is provided mostly for compatibility with existing modifications +of yescrypt 0.5. + + + Performance. + +Please refer to PERFORMANCE for some benchmarks and performance tuning. + + + How to test yespower for proper operation. + +On a Unix-like system, invoke "make check". This will build and run a +program called "tests", and check its output against the supplied file +TESTS-OK. If everything matches, the final line of output should be the +word "PASSED". + +We do most of our testing on Linux systems with gcc. The supplied +Makefile assumes that you use gcc. + + + Alternate code versions and make targets. + +Two implementations of yespower are included: reference and optimized. +By default, the optimized implementation is built. Internally, the +optimized implementation uses conditional compilation to choose between +usage of various SIMD instruction sets where supported and scalar code. + +The reference implementation is unoptimized and is very slow, but it has +simpler and shorter source code. 
Its purpose is to provide a simple +human- and machine-readable specification that implementations intended +for actual use should be tested against. It is deliberately mostly not +optimized, and it is not meant to be used in production. + +Similarly to "make check", there's "make check-ref" to build and test +the reference implementation. There's also "make ref" to build the +reference implementation and have the "benchmark" program use it. + +"make clean" may need to be run between making different builds. + + + How to integrate yespower in a program. + +Although yespower.h provides several functions, chances are that you +will only need to use yespower_tls(). Please see the comment on this +function in yespower.h and its example usage in tests.c and benchmark.c, +including parameter sets requesting yescrypt 0.5 as used by certain +existing cryptocurrencies. + +To integrate yespower in an altcoin based on Bitcoin Core, you might +invoke yespower_tls() from either a maybe-new (depending on where you +fork from) CBlockHeader::GetPoWHash() (and invoke that where PoW is +needed like e.g. Litecoin does for scrypt) or CBlockHeader::GetHash() +(easier, but inefficient and you'll be stuck with that inefficiency). + +You'll also want to implement caching of the computed PoW hashes like +e.g. YACoin does for scrypt. Caching is especially important if you +invoke yespower from CBlockHeader::GetHash(). However, even if you use +or introduce CBlockHeader::GetPoWHash() caching may still be desirable +as the PoW hash is commonly requested 4 times per block fetched during a +node's initial blockchain sync (once during prefetch of block headers, +and 3 times more during validation of a fully fetched block). On the +other hand, you'll likely want to bypass the cache when PoW is computed +by the node's built-in miner. + +Further detail on this (generating new genesis blocks, etc.) is even +farther from being yespower-specific and thus is not provided here. 
+Just like (and even more so than) yespower itself, the above guidance is +provided as-is and without guarantee of being correct and safe to +follow. You're supposed to know what you're doing. + + + Credits. + +scrypt has been designed by Colin Percival. yescrypt and yespower have +been designed by Solar Designer building upon scrypt. + +The following other people and projects have also indirectly helped make +yespower what it is: + + - Bill Cox + - Rich Felker + - Anthony Ferrara + - Christian Forler + - Taylor Hornby + - Dmitry Khovratovich + - Samuel Neves + - Marcos Simplicio + - Ken T Takusagawa + - Jakob Wenzel + - Christian Winnerlein + + - DARPA Cyber Fast Track + - Password Hashing Competition + + + Contact info. + +First, please check the yespower homepage for new versions, etc.: + + https://www.openwall.com/yespower/ + +If you have anything valuable to add or a non-trivial question to ask, +you may contact the maintainer of yespower at: + + Solar Designer diff --git a/yespower-1.0.1/TESTS-OK b/yespower-1.0.1/TESTS-OK new file mode 100644 index 000000000..d0bd2e03c --- /dev/null +++ b/yespower-1.0.1/TESTS-OK @@ -0,0 +1,16 @@ +yespower(5, 2048, 8, "Client Key") = a5 9f ec 4c 4f dd a1 6e 3b 14 05 ad da 66 d5 25 b6 8e 7c ad fc fe 6a c0 66 c7 ad 11 8c d8 05 90 +yespower(5, 2048, 8, BSTY) = 5e a2 b2 95 6a 9e ac e3 0a 32 37 ff 1d 44 1e de e1 dc 25 aa b8 f0 ea 15 c1 21 65 f8 3a 7b c2 65 +yespower(5, 4096, 16, "Client Key") = 92 7e 72 d0 de d3 d8 04 75 47 3f 40 f1 74 3c 67 28 9d 45 3d 52 42 d4 f5 5a f4 e3 25 e0 66 99 c5 +yespower(5, 4096, 24, "Jagaricoin") = 0e 13 66 97 32 11 e7 fe a8 ad 9d 81 98 9c 84 a2 54 d9 68 c9 d3 33 dd 8f f0 99 32 4f 38 61 1e 04 +yespower(5, 4096, 32, "WaviBanana") = 3a e0 5a bb 3c 5c f6 f7 54 15 a9 25 54 c9 8d 50 e3 8e c9 55 2c fa 78 37 36 16 f4 80 b2 4e 55 9f +yespower(5, 2048, 32, "Client Key") = 56 0a 89 1b 5c a2 e1 c6 36 11 1a 9f f7 c8 94 a5 d0 a2 60 2f 43 fd cf a5 94 9b 95 e2 2f e4 46 1e +yespower(5, 1024, 32, "Client Key") = 2a 79 
e5 3d 1b e6 66 9b c5 56 cc c4 17 bc e3 d2 2a 74 a2 32 f5 6b 8e 1d 39 b4 57 92 67 5d e1 08 +yespower(5, 2048, 8, NULL) = 5e cb d8 e8 d7 c9 0b ae d4 bb f8 91 6a 12 25 dc c3 c6 5f 5c 91 65 ba e8 1c dd e3 cf fa d1 28 e8 +yespower(10, 2048, 8, NULL) = 69 e0 e8 95 b3 df 7a ee b8 37 d7 1f e1 99 e9 d3 4f 7e c4 6e cb ca 7a 2c 43 08 e5 18 57 ae 9b 46 +yespower(10, 4096, 16, NULL) = 33 fb 8f 06 38 24 a4 a0 20 f6 3d ca 53 5f 5c a6 6a b5 57 64 68 c7 5d 1c ca ac 75 42 f7 64 95 ac +yespower(10, 4096, 32, NULL) = 77 1a ee fd a8 fe 79 a0 82 5b c7 f2 ae e1 62 ab 55 78 57 46 39 ff c6 ca 37 23 cc 18 e5 e3 e2 85 +yespower(10, 2048, 32, NULL) = d5 ef b8 13 cd 26 3e 9b 34 54 01 30 23 3c bb c6 a9 21 fb ff 34 31 e5 ec 1a 1a bd e2 ae a6 ff 4d +yespower(10, 1024, 32, NULL) = 50 1b 79 2d b4 2e 38 8f 6e 7d 45 3c 95 d0 3a 12 a3 60 16 a5 15 4a 68 83 90 dd c6 09 a4 0c 67 99 +yespower(10, 1024, 32, "personality test") = 1f 02 69 ac f5 65 c4 9a dc 0e f9 b8 f2 6a b3 80 8c dc 38 39 4a 25 4f dd ee dc c3 aa cf f6 ad 9d +XOR of yespower(5, ...) = ae f1 32 91 87 0f 55 70 47 f4 2e 9b ef a6 16 df e5 f1 96 77 e1 3f 8b a6 92 f7 c5 97 55 a0 f5 0e +XOR of yespower(10, ...) = 8d 13 c5 fb 07 30 96 75 d1 b8 48 92 77 ba 4b e4 40 33 be df ae 7a 60 43 8a 9b e2 1f 3a 7b 12 37 diff --git a/yespower-1.0.1/benchmark.c b/yespower-1.0.1/benchmark.c new file mode 100644 index 000000000..4842dfa84 --- /dev/null +++ b/yespower-1.0.1/benchmark.c @@ -0,0 +1,262 @@ +/*- + * Copyright 2013-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include /* for atoi() */ +#include +#include +#include +#include +#include + +#include "yespower.h" + +#ifdef _OPENMP +#include + +#define NSAVE 1000 + +static uint64_t time_us(void) +{ + struct timespec t; +#ifdef CLOCK_MONOTONIC_RAW + if (clock_gettime(CLOCK_MONOTONIC_RAW, &t)) + return 0; +#else + if (clock_gettime(CLOCK_MONOTONIC, &t)) + return 0; +#endif + return 1 + (uint64_t)t.tv_sec * 1000000 + t.tv_nsec / 1000; +} +#endif + +int main(int argc, const char * const *argv) +{ + yespower_params_t params = { + .version = YESPOWER_0_5, + .N = 2048, + .r = 8, + .pers = (const uint8_t *)"Client Key", + .perslen = 10 + }; + + if (argc > 1) + params.version = atoi(argv[1]); + if (argc > 2) + params.N = atoi(argv[2]); + if (argc > 3) + params.r = atoi(argv[3]); + + printf("version=%.1f N=%u r=%u\n", + params.version * 0.1, params.N, params.r); + + printf("Will use %.2f KiB RAM\n", 0.125 * params.N * params.r); + + static __thread union { + uint8_t u8[80]; + uint32_t u32[20]; + } src; + yespower_binary_t dst; + unsigned int i; + + for (i = 0; i < sizeof(src); i++) + src.u8[i] = i * 3; + + if (yespower_tls(src.u8, sizeof(src), ¶ms, &dst)) { + puts("FAILED"); + return 1; + } + + for (i = 0; i < sizeof(dst); i++) + printf("%02x%c", dst.uc[i], i < sizeof(dst) - 1 ? 
' ' : '\n'); + + puts("Benchmarking 1 thread ..."); + + clock_t clk_tck = sysconf(_SC_CLK_TCK); + struct tms start_tms, end_tms; + clock_t start = times(&start_tms), end; + unsigned int n; + unsigned long long count; +#ifdef _OPENMP + yespower_binary_t save[NSAVE]; + unsigned int nsave = 0; +#endif + uint32_t seed = start * 1812433253U; + + n = 1; + count = 0; + do { + for (i = 0; i < n; i++) { + yespower_binary_t *p = &dst; +#ifdef _OPENMP + if (nsave < NSAVE) + p = &save[nsave++]; +#endif + src.u32[19] = seed + (count + i); + if (yespower_tls(src.u8, sizeof(src), ¶ms, p)) { + puts("FAILED"); + return 1; + } + } + count += n; + + end = times(&end_tms); + n <<= 1; + } while (end - start < clk_tck * 2); + + clock_t start_v = start_tms.tms_utime + start_tms.tms_stime + + start_tms.tms_cutime + start_tms.tms_cstime; + clock_t end_v = end_tms.tms_utime + end_tms.tms_stime + + end_tms.tms_cutime + end_tms.tms_cstime; + + printf("%llu H/s real, %llu H/s virtual " + "(%llu hashes in %.2f seconds)\n", + count * clk_tck / (end - start), + count * clk_tck / (end_v - start_v), + count, (double)(end - start) / clk_tck); + + for (i = 0; i < nsave; i++) { + unsigned int j; + for (j = i + 1; j < nsave; j++) { + unsigned int k = 8; + if (!memcmp(&save[i], &save[j], k)) { + printf("%u-byte collision(s) detected\n", k); + i = nsave; break; + } + } + } + +#ifdef _OPENMP + unsigned int nt = omp_get_max_threads(); + + printf("Benchmarking %u thread%s ...\n", + nt, nt == 1 ? 
"" : "s"); + + typedef struct { + uint64_t min, max, total; + } thread_data_s; + union { + thread_data_s s; + uint8_t cachelines[2][64]; /* avoid false sharing */ + } thread_data[nt]; /* tricky to align this when on stack */ + + unsigned int t; + for (t = 0; t < nt; t++) { + thread_data_s *td = &thread_data[t].s; + td->min = ~(uint64_t)0; td->max = 0; td->total = 0; + } + + unsigned long long count1 = count, count_restart = 0; + + if (!geteuid()) { + puts("Running as root, so trying to set SCHED_RR"); +#pragma omp parallel + { + struct sched_param param = { .sched_priority = 1 }; + if (sched_setscheduler(getpid(), SCHED_RR, ¶m)) + perror("sched_setscheduler"); + } + } + + start = times(&start_tms); + + n = count * omp_get_max_threads(); + count = 0; + do { +#pragma omp parallel for default(none) copyin(src) private(i, dst) shared(n, thread_data, params, seed, count, save, nsave) + for (i = 0; i < n; i++) { + unsigned int j = count + i; + + src.u32[19] = seed + j; + + uint64_t start1 = time_us(); + + if (yespower_tls(src.u8, sizeof(src), ¶ms, &dst)) { +#pragma omp critical + puts("FAILED"); + } + + uint64_t end1 = time_us(); + if (end1 < start1) + end1 = start1; + uint64_t diff1 = end1 - start1; + + thread_data_s *td = &thread_data[omp_get_thread_num()].s; + td->total += diff1; + if (diff1 < td->min) + td->min = diff1; + if (diff1 > td->max) + td->max = diff1; + +#ifdef _OPENMP + if (j < nsave && memcmp(&save[j], &dst, sizeof(dst))) { +#pragma omp critical + printf("Mismatch at %u\n", j); + } +#endif + } + + count += n; + if ((count - n) < count1 && count >= count1) { +/* Disregard our repeat of single thread's results (could be partially cached + * by same core, but OTOH other cores not yet warmed up to full clock rate). 
*/ + start = times(&start_tms); + count_restart = count; + for (t = 0; t < nt; t++) { + thread_data_s *td = &thread_data[t].s; + td->min = ~(uint64_t)0; td->max = 0; td->total = 0; + } + } else { + n <<= 1; + } + + end = times(&end_tms); + } while (end - start < clk_tck); + + if (!count_restart) + puts("Didn't reach single-thread's hash count"); + count -= count_restart; + + start_v = start_tms.tms_utime + start_tms.tms_stime + + start_tms.tms_cutime + start_tms.tms_cstime; + end_v = end_tms.tms_utime + end_tms.tms_stime + + end_tms.tms_cutime + end_tms.tms_cstime; + + printf("%llu H/s real, %llu H/s virtual " + "(%llu hashes in %.2f seconds)\n", + count * clk_tck / (end - start), + count * clk_tck / (end_v - start_v), + count, (double)(end - start) / clk_tck); + + uint64_t min = ~(uint64_t)0, max = 0, total = 0; + for (t = 0; t < nt; t++) { + thread_data_s *td = &thread_data[t].s; + total += td->total; + if (td->min < min) + min = td->min; + if (td->max > max) + max = td->max; + } + printf("min %.3f ms, avg %.3f ms, max %.3f ms\n", + min / 1000.0, total / 1000.0 / count, max / 1000.0); +#endif + + return 0; +} diff --git a/yespower-1.0.1/insecure_memzero.h b/yespower-1.0.1/insecure_memzero.h new file mode 100644 index 000000000..5a0ba75c4 --- /dev/null +++ b/yespower-1.0.1/insecure_memzero.h @@ -0,0 +1 @@ +#define insecure_memzero(buf, len) /* empty */ diff --git a/yespower-1.0.1/sha256.c b/yespower-1.0.1/sha256.c new file mode 100644 index 000000000..ef6b4ec70 --- /dev/null +++ b/yespower-1.0.1/sha256.c @@ -0,0 +1,646 @@ +/*- + * Copyright 2005-2016 Colin Percival + * Copyright 2016-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include + +#include "insecure_memzero.h" +#include "sysendian.h" + +#include "sha256.h" + +#ifdef __ICC +/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */ +#define restrict +#elif __STDC_VERSION__ >= 199901L +/* Have restrict */ +#elif defined(__GNUC__) +#define restrict __restrict +#else +#define restrict +#endif + +/* + * Encode a length len*2 vector of (uint32_t) into a length len*8 vector of + * (uint8_t) in big-endian form. + */ +static void +be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len) +{ + + /* Encode vector, two words at a time. */ + do { + be32enc(&dst[0], src[0]); + be32enc(&dst[4], src[1]); + src += 2; + dst += 8; + } while (--len); +} + +/* + * Decode a big-endian length len*8 vector of (uint8_t) into a length + * len*2 vector of (uint32_t). + */ +static void +be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len) +{ + + /* Decode vector, two words at a time. 
*/ + do { + dst[0] = be32dec(&src[0]); + dst[1] = be32dec(&src[4]); + src += 8; + dst += 2; + } while (--len); +} + +/* SHA256 round constants. */ +static const uint32_t Krnd[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + h += S1(e) + Ch(e, f, g) + k; \ + d += h; \ + h += S0(a) + Maj(a, b, c); + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, ii) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i + ii] + Krnd[i + ii]) + +/* Message schedule computation */ +#define MSCH(W, ii, i) \ + W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii] + +/* + * SHA256 
block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA256_Transform(uint32_t state[static restrict 8], + const uint8_t block[static restrict 64], + uint32_t W[static restrict 64], uint32_t S[static restrict 8]) +{ + int i; + + /* 1. Prepare the first part of the message schedule W. */ + be32dec_vect(W, block, 8); + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. */ + for (i = 0; i < 64; i += 16) { + RNDr(S, W, 0, i); + RNDr(S, W, 1, i); + RNDr(S, W, 2, i); + RNDr(S, W, 3, i); + RNDr(S, W, 4, i); + RNDr(S, W, 5, i); + RNDr(S, W, 6, i); + RNDr(S, W, 7, i); + RNDr(S, W, 8, i); + RNDr(S, W, 9, i); + RNDr(S, W, 10, i); + RNDr(S, W, 11, i); + RNDr(S, W, 12, i); + RNDr(S, W, 13, i); + RNDr(S, W, 14, i); + RNDr(S, W, 15, i); + + if (i == 48) + break; + MSCH(W, 0, i); + MSCH(W, 1, i); + MSCH(W, 2, i); + MSCH(W, 3, i); + MSCH(W, 4, i); + MSCH(W, 5, i); + MSCH(W, 6, i); + MSCH(W, 7, i); + MSCH(W, 8, i); + MSCH(W, 9, i); + MSCH(W, 10, i); + MSCH(W, 11, i); + MSCH(W, 12, i); + MSCH(W, 13, i); + MSCH(W, 14, i); + MSCH(W, 15, i); + } + + /* 4. Mix local working variables into global state. */ + state[0] += S[0]; + state[1] += S[1]; + state[2] += S[2]; + state[3] += S[3]; + state[4] += S[4]; + state[5] += S[5]; + state[6] += S[6]; + state[7] += S[7]; +} + +static const uint8_t PAD[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Add padding and terminating bit-count. */ +static void +SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72]) +{ + size_t r; + + /* Figure out how many bytes we have buffered. */ + r = (ctx->count >> 3) & 0x3f; + + /* Pad to 56 mod 64, transforming if we finish a block en route. */ + if (r < 56) { + /* Pad to 56 mod 64. 
*/ + memcpy(&ctx->buf[r], PAD, 56 - r); + } else { + /* Finish the current block and mix. */ + memcpy(&ctx->buf[r], PAD, 64 - r); + SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); + + /* The start of the final block is all zeroes. */ + memset(&ctx->buf[0], 0, 56); + } + + /* Add the terminating bit-count. */ + be64enc(&ctx->buf[56], ctx->count); + + /* Mix in the final block. */ + SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); +} + +/* Magic initialization constants. */ +static const uint32_t initial_state[8] = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +/** + * SHA256_Init(ctx): + * Initialize the SHA256 context ${ctx}. + */ +void +SHA256_Init(SHA256_CTX * ctx) +{ + + /* Zero bits processed so far. */ + ctx->count = 0; + + /* Initialize state. */ + memcpy(ctx->state, initial_state, sizeof(initial_state)); +} + +/** + * SHA256_Update(ctx, in, len): + * Input ${len} bytes from ${in} into the SHA256 context ${ctx}. + */ +static void +_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len, + uint32_t tmp32[static restrict 72]) +{ + uint32_t r; + const uint8_t * src = in; + + /* Return immediately if we have nothing to do. */ + if (len == 0) + return; + + /* Number of bytes left in the buffer from previous updates. */ + r = (ctx->count >> 3) & 0x3f; + + /* Update number of bits. */ + ctx->count += (uint64_t)(len) << 3; + + /* Handle the case where we don't need to perform any transforms. */ + if (len < 64 - r) { + memcpy(&ctx->buf[r], src, len); + return; + } + + /* Finish the current block. */ + memcpy(&ctx->buf[r], src, 64 - r); + SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]); + src += 64 - r; + len -= 64 - r; + + /* Perform complete blocks. */ + while (len >= 64) { + SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]); + src += 64; + len -= 64; + } + + /* Copy left over data into buffer. 
*/ + memcpy(ctx->buf, src, len); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len) +{ + uint32_t tmp32[72]; + + /* Call the real function. */ + _SHA256_Update(ctx, in, len, tmp32); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); +} + +/** + * SHA256_Final(digest, ctx): + * Output the SHA256 hash of the data input to the context ${ctx} into the + * buffer ${digest}. + */ +static void +_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx, + uint32_t tmp32[static restrict 72]) +{ + + /* Add padding. */ + SHA256_Pad(ctx, tmp32); + + /* Write the hash. */ + be32enc_vect(digest, ctx->state, 4); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx) +{ + uint32_t tmp32[72]; + + /* Call the real function. */ + _SHA256_Final(digest, ctx, tmp32); + + /* Clear the context state. */ + insecure_memzero(ctx, sizeof(SHA256_CTX)); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); +} + +/** + * SHA256_Buf(in, len, digest): + * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. + */ +void +SHA256_Buf(const void * in, size_t len, uint8_t digest[32]) +{ + SHA256_CTX ctx; + uint32_t tmp32[72]; + + SHA256_Init(&ctx); + _SHA256_Update(&ctx, in, len, tmp32); + _SHA256_Final(digest, &ctx, tmp32); + + /* Clean the stack. */ + insecure_memzero(&ctx, sizeof(SHA256_CTX)); + insecure_memzero(tmp32, 288); +} + +/** + * HMAC_SHA256_Init(ctx, K, Klen): + * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from + * ${K}. + */ +static void +_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen, + uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64], + uint8_t khash[static restrict 32]) +{ + const uint8_t * K = _K; + size_t i; + + /* If Klen > 64, the key is really SHA256(K). 
*/ + if (Klen > 64) { + SHA256_Init(&ctx->ictx); + _SHA256_Update(&ctx->ictx, K, Klen, tmp32); + _SHA256_Final(khash, &ctx->ictx, tmp32); + K = khash; + Klen = 32; + } + + /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */ + SHA256_Init(&ctx->ictx); + memset(pad, 0x36, 64); + for (i = 0; i < Klen; i++) + pad[i] ^= K[i]; + _SHA256_Update(&ctx->ictx, pad, 64, tmp32); + + /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */ + SHA256_Init(&ctx->octx); + memset(pad, 0x5c, 64); + for (i = 0; i < Klen; i++) + pad[i] ^= K[i]; + _SHA256_Update(&ctx->octx, pad, 64, tmp32); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen) +{ + uint32_t tmp32[72]; + uint8_t pad[64]; + uint8_t khash[32]; + + /* Call the real function. */ + _HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); + insecure_memzero(khash, 32); + insecure_memzero(pad, 64); +} + +/** + * HMAC_SHA256_Update(ctx, in, len): + * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}. + */ +static void +_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len, + uint32_t tmp32[static restrict 72]) +{ + + /* Feed data to the inner SHA256 operation. */ + _SHA256_Update(&ctx->ictx, in, len, tmp32); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len) +{ + uint32_t tmp32[72]; + + /* Call the real function. */ + _HMAC_SHA256_Update(ctx, in, len, tmp32); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); +} + +/** + * HMAC_SHA256_Final(digest, ctx): + * Output the HMAC-SHA256 of the data input to the context ${ctx} into the + * buffer ${digest}. 
+ */ +static void +_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx, + uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32]) +{ + + /* Finish the inner SHA256 operation. */ + _SHA256_Final(ihash, &ctx->ictx, tmp32); + + /* Feed the inner hash to the outer SHA256 operation. */ + _SHA256_Update(&ctx->octx, ihash, 32, tmp32); + + /* Finish the outer SHA256 operation. */ + _SHA256_Final(digest, &ctx->octx, tmp32); +} + +/* Wrapper function for intermediate-values sanitization. */ +void +HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx) +{ + uint32_t tmp32[72]; + uint8_t ihash[32]; + + /* Call the real function. */ + _HMAC_SHA256_Final(digest, ctx, tmp32, ihash); + + /* Clean the stack. */ + insecure_memzero(tmp32, 288); + insecure_memzero(ihash, 32); +} + +/** + * HMAC_SHA256_Buf(K, Klen, in, len, digest): + * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of + * length ${Klen}, and write the result to ${digest}. + */ +void +HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len, + uint8_t digest[32]) +{ + HMAC_SHA256_CTX ctx; + uint32_t tmp32[72]; + uint8_t tmp8[96]; + + _HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]); + _HMAC_SHA256_Update(&ctx, in, len, tmp32); + _HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]); + + /* Clean the stack. */ + insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX)); + insecure_memzero(tmp32, 288); + insecure_memzero(tmp8, 96); +} + +/* Add padding and terminating bit-count, but don't invoke Transform yet. */ +static int +SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8], + uint32_t tmp32[static restrict 72]) +{ + uint32_t r; + + r = (ctx->count >> 3) & 0x3f; + if (r >= 56) + return -1; + + /* + * Convert length to a vector of bytes -- we do this now rather + * than later because the length will change after we pad. + */ + be64enc(len, ctx->count); + + /* Add 1--56 bytes so that the resulting length is 56 mod 64. 
*/ + _SHA256_Update(ctx, PAD, 56 - r, tmp32); + + /* Add the terminating bit-count. */ + ctx->buf[63] = len[7]; + _SHA256_Update(ctx, len, 7, tmp32); + + return 0; +} + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). + */ +void +PBKDF3_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt, + size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen) +{ + HMAC_SHA256_CTX Phctx, PShctx, hctx; + uint32_t tmp32[72]; + union { + uint8_t tmp8[96]; + uint32_t state[8]; + } u; + size_t i; + uint8_t ivec[4]; + uint8_t U[32]; + uint8_t T[32]; + uint64_t j; + int k; + size_t clen; + + /* Sanity-check. */ + assert(dkLen <= 32 * (size_t)(UINT32_MAX)); + + if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) { + uint32_t oldcount; + uint8_t * ivecp; + + /* Compute HMAC state after processing P and S. */ + _HMAC_SHA256_Init(&hctx, passwd, passwdlen, + tmp32, &u.tmp8[0], &u.tmp8[64]); + _HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32); + + /* Prepare ictx padding. */ + oldcount = hctx.ictx.count & (0x3f << 3); + _HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32); + if ((hctx.ictx.count & (0x3f << 3)) < oldcount || + SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32)) + goto generic; /* Can't happen due to saltlen check */ + ivecp = hctx.ictx.buf + (oldcount >> 3); + + /* Prepare octx padding. */ + hctx.octx.count += 32 << 3; + SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32); + + /* Iterate through the blocks. */ + for (i = 0; i * 32 < dkLen; i++) { + /* Generate INT(i + 1). */ + be32enc(ivecp, (uint32_t)(i + 1)); + + /* Compute U_1 = PRF(P, S || INT(i)). 
*/ + memcpy(u.state, hctx.ictx.state, sizeof(u.state)); + SHA256_Transform(u.state, hctx.ictx.buf, + &tmp32[0], &tmp32[64]); + be32enc_vect(hctx.octx.buf, u.state, 4); + memcpy(u.state, hctx.octx.state, sizeof(u.state)); + SHA256_Transform(u.state, hctx.octx.buf, + &tmp32[0], &tmp32[64]); + be32enc_vect(&buf[i * 32], u.state, 4); + } + + goto cleanup; + } + +generic: + /* Compute HMAC state after processing P. */ + _HMAC_SHA256_Init(&Phctx, passwd, passwdlen, + tmp32, &u.tmp8[0], &u.tmp8[64]); + + /* Compute HMAC state after processing P and S. */ + memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX)); + _HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32); + + /* Iterate through the blocks. */ + for (i = 0; i * 32 < dkLen; i++) { + /* Generate INT(i + 1). */ + be32enc(ivec, (uint32_t)(i + 1)); + + /* Compute U_1 = PRF(P, S || INT(i)). */ + memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX)); + _HMAC_SHA256_Update(&hctx, ivec, 4, tmp32); + _HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8); + + if (c > 1) { + /* T_i = U_1 ... */ + memcpy(U, T, 32); + + for (j = 2; j <= c; j++) { + /* Compute U_j. */ + memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX)); + _HMAC_SHA256_Update(&hctx, U, 32, tmp32); + _HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8); + + /* ... xor U_j ... */ + for (k = 0; k < 32; k++) + T[k] ^= U[k]; + } + } + + /* Copy as many bytes as necessary into buf. */ + clen = dkLen - i * 32; + if (clen > 32) + clen = 32; + memcpy(&buf[i * 32], T, clen); + } + + /* Clean the stack. 
*/ + insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX)); + insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX)); + insecure_memzero(U, 32); + insecure_memzero(T, 32); + +cleanup: + insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX)); + insecure_memzero(tmp32, 288); + insecure_memzero(&u, sizeof(u)); +} diff --git a/yespower-1.0.1/sha256.h b/yespower-1.0.1/sha256.h new file mode 100644 index 000000000..6210502ff --- /dev/null +++ b/yespower-1.0.1/sha256.h @@ -0,0 +1,129 @@ +/*- + * Copyright 2005-2016 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _SHA256_H_ +#define _SHA256_H_ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Use #defines in order to avoid namespace collisions with anyone else's + * SHA256 code (e.g., the code in OpenSSL). + */ +#define SHA256_Init libcperciva_SHA256_Init +#define SHA256_Update libcperciva_SHA256_Update +#define SHA256_Final libcperciva_SHA256_Final +#define SHA256_Buf libcperciva_SHA256_Buf +#define SHA256_CTX libcperciva_SHA256_CTX +#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init +#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update +#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final +#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf +#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX + +/* Context structure for SHA256 operations. */ +typedef struct { + uint32_t state[8]; + uint64_t count; + uint8_t buf[64]; +} SHA256_CTX; + +/** + * SHA256_Init(ctx): + * Initialize the SHA256 context ${ctx}. + */ +void SHA256_Init(SHA256_CTX *); + +/** + * SHA256_Update(ctx, in, len): + * Input ${len} bytes from ${in} into the SHA256 context ${ctx}. + */ +void SHA256_Update(SHA256_CTX *, const void *, size_t); + +/** + * SHA256_Final(digest, ctx): + * Output the SHA256 hash of the data input to the context ${ctx} into the + * buffer ${digest}. + */ +void SHA256_Final(uint8_t[32], SHA256_CTX *); + +/** + * SHA256_Buf(in, len, digest): + * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}. + */ +void SHA256_Buf(const void *, size_t, uint8_t[32]); + +/* Context structure for HMAC-SHA256 operations. */ +typedef struct { + SHA256_CTX ictx; + SHA256_CTX octx; +} HMAC_SHA256_CTX; + +/** + * HMAC_SHA256_Init(ctx, K, Klen): + * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from + * ${K}. + */ +void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t); + +/** + * HMAC_SHA256_Update(ctx, in, len): + * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}. 
+ */ +void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t); + +/** + * HMAC_SHA256_Final(digest, ctx): + * Output the HMAC-SHA256 of the data input to the context ${ctx} into the + * buffer ${digest}. + */ +void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *); + +/** + * HMAC_SHA256_Buf(K, Klen, in, len, digest): + * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of + * length ${Klen}, and write the result to ${digest}. + */ +void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]); + +/** + * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen): + * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and + * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1). + */ +void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t, + uint64_t, uint8_t *, size_t); + +#ifdef __cplusplus +} +#endif + +#endif /* !_SHA256_H_ */ diff --git a/yespower-1.0.1/sysendian.h b/yespower-1.0.1/sysendian.h new file mode 100644 index 000000000..52c1fe73b --- /dev/null +++ b/yespower-1.0.1/sysendian.h @@ -0,0 +1,94 @@ +/*- + * Copyright 2007-2014 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _SYSENDIAN_H_ +#define _SYSENDIAN_H_ + +#include + +/* Avoid namespace collisions with BSD . */ +#define be32dec libcperciva_be32dec +#define be32enc libcperciva_be32enc +#define be64enc libcperciva_be64enc +#define le32dec libcperciva_le32dec +#define le32enc libcperciva_le32enc + +static inline uint32_t +be32dec(const void * pp) +{ + const uint8_t * p = (uint8_t const *)pp; + + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} + +static inline void +be32enc(void * pp, uint32_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} + +static inline void +be64enc(void * pp, uint64_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[7] = x & 0xff; + p[6] = (x >> 8) & 0xff; + p[5] = (x >> 16) & 0xff; + p[4] = (x >> 24) & 0xff; + p[3] = (x >> 32) & 0xff; + p[2] = (x >> 40) & 0xff; + p[1] = (x >> 48) & 0xff; + p[0] = (x >> 56) & 0xff; +} + +static inline uint32_t +le32dec(const void * pp) +{ + const uint8_t * p = (uint8_t const *)pp; + + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} + +static inline void +le32enc(void * pp, uint32_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} + +#endif /* !_SYSENDIAN_H_ */ diff --git 
a/yespower-1.0.1/tests.c b/yespower-1.0.1/tests.c new file mode 100644 index 000000000..121a586a8 --- /dev/null +++ b/yespower-1.0.1/tests.c @@ -0,0 +1,190 @@ +/*- + * Copyright 2013-2018 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include + +#include "yespower.h" + +#undef TEST_PBKDF2_SHA256 + +#ifdef TEST_PBKDF2_SHA256 +#include + +#include "sha256.h" + +static void print_PBKDF2_SHA256_raw(const char *passwd, size_t passwdlen, + const char *salt, size_t saltlen, uint64_t c, size_t dkLen) +{ + uint8_t dk[64]; + size_t i; + + assert(dkLen <= sizeof(dk)); + + /* XXX This prints the strings truncated at first NUL */ + printf("PBKDF2_SHA256(\"%s\", \"%s\", %llu, %llu) = ", + passwd, salt, (unsigned long long)c, (unsigned long long)dkLen); + + PBKDF2_SHA256((const uint8_t *) passwd, passwdlen, + (const uint8_t *) salt, saltlen, c, dk, dkLen); + + for (i = 0; i < dkLen; i++) + printf("%02x%c", dk[i], i < dkLen - 1 ? 
' ' : '\n'); +} + +static void print_PBKDF2_SHA256(const char *passwd, + const char *salt, uint64_t c, size_t dkLen) +{ + print_PBKDF2_SHA256_raw(passwd, strlen(passwd), salt, strlen(salt), c, + dkLen); +} +#endif + +static const char *pers_bsty_magic = "BSTY"; + +static void print_yespower(yespower_version_t version, uint32_t N, uint32_t r, + const char *pers) +{ + yespower_params_t params = { + .version = version, + .N = N, + .r = r, + .pers = (const uint8_t *)pers, + .perslen = pers ? strlen(pers) : 0 + }; + uint8_t src[80]; + yespower_binary_t dst; + size_t i; + + const char *q = (pers && pers != pers_bsty_magic) ? "\"": ""; + printf("yespower(%u, %u, %u, %s%s%s) = ", (unsigned int)version, N, r, + q, pers ? pers : "NULL", q); + + for (i = 0; i < sizeof(src); i++) + src[i] = i * 3; + + if (pers == pers_bsty_magic) { + params.pers = src; + params.perslen = sizeof(src); + } + + if (yespower_tls(src, sizeof(src), ¶ms, &dst)) { + puts("FAILED"); + return; + } + + for (i = 0; i < sizeof(dst); i++) + printf("%02x%c", dst.uc[i], i < sizeof(dst) - 1 ? ' ' : '\n'); +} + +static void print_yespower_loop(yespower_version_t version, const char *pers) +{ + uint32_t N, r; + uint8_t src[80]; + yespower_binary_t dst, xor = {{0}}; + size_t i; + + printf("XOR of yespower(%u, ...) = ", (unsigned int)version); + + /* + * This value of src is chosen to trigger duplicate index in the last + * SMix2 invocation in yespower 0.5 for N=2048 with at least one of the + * values of r below. This is needed to test that a non-save version + * of BlockMix is used in that special case. Most other values of src + * would leave this untested. + */ + src[0] = 43; + for (i = 1; i < sizeof(src); i++) + src[i] = i * 3; + + for (N = 1024; N <= 4096; N <<= 1) { + for (r = 8; r <= 32; r++) { + yespower_params_t params = { + .version = version, + .N = N, + .r = r, + .pers = (const uint8_t *)pers, + .perslen = pers ? 
strlen(pers) : 0 + }; + if (yespower_tls(src, sizeof(src), &params, &dst)) { + puts("FAILED"); + return; + } + for (i = 0; i < sizeof(xor); i++) + xor.uc[i] ^= dst.uc[i]; + } + } + + for (i = 0; i < sizeof(xor); i++) + printf("%02x%c", xor.uc[i], i < sizeof(xor) - 1 ? ' ' : '\n'); +} + +int main(void) +{ + setvbuf(stdout, NULL, _IOLBF, 0); + +#ifdef TEST_PBKDF2_SHA256 + print_PBKDF2_SHA256("password", "salt", 1, 20); + print_PBKDF2_SHA256("password", "salt", 2, 20); + print_PBKDF2_SHA256("password", "salt", 4096, 20); + print_PBKDF2_SHA256("password", "salt", 16777216, 20); + print_PBKDF2_SHA256("passwordPASSWORDpassword", + "saltSALTsaltSALTsaltSALTsaltSALTsalt", 4096, 25); + print_PBKDF2_SHA256_raw("pass\0word", 9, "sa\0lt", 5, 4096, 16); +#if 0 + print_PBKDF2_SHA256("password", "salt", 1, 32); + print_PBKDF2_SHA256("password", "salt", 2, 32); + print_PBKDF2_SHA256("password", "salt", 4096, 32); + print_PBKDF2_SHA256("password", "salt", 16777216, 32); + print_PBKDF2_SHA256("passwordPASSWORDpassword", + "saltSALTsaltSALTsaltSALTsaltSALTsalt", 4096, 40); + print_PBKDF2_SHA256("password", "salt", 4096, 16); + print_PBKDF2_SHA256("password", "salt", 1, 20); + print_PBKDF2_SHA256("password", "salt", 2, 20); + print_PBKDF2_SHA256("password", "salt", 4096, 20); + print_PBKDF2_SHA256("password", "salt", 16777216, 20); + print_PBKDF2_SHA256("password", "salt", 4096, 25); + print_PBKDF2_SHA256("password", "salt", 4096, 16); +#endif +#endif + + print_yespower(YESPOWER_0_5, 2048, 8, "Client Key"); /* yescrypt 0.5 */ + print_yespower(YESPOWER_0_5, 2048, 8, pers_bsty_magic); /* BSTY */ + print_yespower(YESPOWER_0_5, 4096, 16, "Client Key"); /* Cryply */ + print_yespower(YESPOWER_0_5, 4096, 24, "Jagaricoin"); + print_yespower(YESPOWER_0_5, 4096, 32, "WaviBanana"); + print_yespower(YESPOWER_0_5, 2048, 32, "Client Key"); + print_yespower(YESPOWER_0_5, 1024, 32, "Client Key"); + + print_yespower(YESPOWER_0_5, 2048, 8, NULL); /* no personality */ + + print_yespower(YESPOWER_1_0, 2048, 
8, NULL); + print_yespower(YESPOWER_1_0, 4096, 16, NULL); + print_yespower(YESPOWER_1_0, 4096, 32, NULL); + print_yespower(YESPOWER_1_0, 2048, 32, NULL); + print_yespower(YESPOWER_1_0, 1024, 32, NULL); + + print_yespower(YESPOWER_1_0, 1024, 32, "personality test"); + + print_yespower_loop(YESPOWER_0_5, "Client Key"); + print_yespower_loop(YESPOWER_1_0, NULL); + + return 0; +} diff --git a/yespower-1.0.1/yespower-opt.c b/yespower-1.0.1/yespower-opt.c new file mode 100644 index 000000000..02f71406c --- /dev/null +++ b/yespower-1.0.1/yespower-opt.c @@ -0,0 +1,1153 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2012-2019 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + * + * This is a proof-of-work focused fork of yescrypt, including optimized and + * cut-down implementation of the obsolete yescrypt 0.5 (based off its first + * submission to PHC back in 2014) and a new proof-of-work specific variation + * known as yespower 1.0. The former is intended as an upgrade for + * cryptocurrencies that already use yescrypt 0.5 and the latter may be used + * as a further upgrade (hard fork) by those and other cryptocurrencies. The + * version of algorithm to use is requested through parameters, allowing for + * both algorithms to co-exist in client and miner implementations (such as in + * preparation for a hard-fork). + */ + +#ifndef _YESPOWER_OPT_C_PASS_ +#define _YESPOWER_OPT_C_PASS_ 1 +#endif + +#if _YESPOWER_OPT_C_PASS_ == 1 +/* + * AVX and especially XOP speed up Salsa20 a lot, but needlessly result in + * extra instruction prefixes for pwxform (which we make more use of). While + * no slowdown from the prefixes is generally observed on AMD CPUs supporting + * XOP, some slowdown is sometimes observed on Intel CPUs with AVX. + */ +#ifdef __XOP__ +#warning "Note: XOP is enabled. That's great." +#elif defined(__AVX__) +#warning "Note: AVX is enabled. That's OK." +#elif defined(__SSE2__) +#warning "Note: AVX and XOP are not enabled. That's OK." +#elif defined(__x86_64__) || defined(__i386__) +#warning "SSE2 not enabled. Expect poor performance." +#else +#warning "Note: building generic code for non-x86. That's OK." +#endif + +/* + * The SSE4 code version has fewer instructions than the generic SSE2 version, + * but all of the instructions are SIMD, thereby wasting the scalar execution + * units. Thus, the generic SSE2 version below actually runs faster on some + * CPUs due to its balanced mix of SIMD and scalar instructions. 
+ */ +#undef USE_SSE4_FOR_32BIT + +#ifdef __SSE2__ +/* + * GCC before 4.9 would by default unnecessarily use store/load (without + * SSE4.1) or (V)PEXTR (with SSE4.1 or AVX) instead of simply (V)MOV. + * This was tracked as GCC bug 54349. + * "-mtune=corei7" works around this, but is only supported for GCC 4.6+. + * We use inline asm for pre-4.6 GCC, further down this file. + */ +#if __GNUC__ == 4 && __GNUC_MINOR__ >= 6 && __GNUC_MINOR__ < 9 && \ + !defined(__clang__) && !defined(__ICC) +#pragma GCC target ("tune=corei7") +#endif +#include <emmintrin.h> +#ifdef __XOP__ +#include <x86intrin.h> +#endif +#elif defined(__SSE__) +#include <xmmintrin.h> +#endif + +#include <errno.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +#include "insecure_memzero.h" +#include "sha256.h" +#include "sysendian.h" + +#include "yespower.h" + +#include "yespower-platform.c" + +#if __STDC_VERSION__ >= 199901L +/* Have restrict */ +#elif defined(__GNUC__) +#define restrict __restrict +#else +#define restrict +#endif + +#ifdef __GNUC__ +#define unlikely(exp) __builtin_expect(exp, 0) +#else +#define unlikely(exp) (exp) +#endif + +#ifdef __SSE__ +#define PREFETCH(x, hint) _mm_prefetch((const char *)(x), (hint)); +#else +#undef PREFETCH +#endif + +typedef union { + uint32_t w[16]; + uint64_t d[8]; +#ifdef __SSE2__ + __m128i q[4]; +#endif +} salsa20_blk_t; + +static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin, + salsa20_blk_t *Bout) +{ +#define COMBINE(out, in1, in2) \ + Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); + COMBINE(0, 0, 2) + COMBINE(1, 5, 7) + COMBINE(2, 2, 4) + COMBINE(3, 7, 1) + COMBINE(4, 4, 6) + COMBINE(5, 1, 3) + COMBINE(6, 6, 0) + COMBINE(7, 3, 5) +#undef COMBINE +} + +static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin, + salsa20_blk_t *Bout) +{ +#define UNCOMBINE(out, in1, in2) \ + Bout->w[out * 2] = Bin->d[in1]; \ + Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; + UNCOMBINE(0, 0, 6) + UNCOMBINE(1, 5, 3) + UNCOMBINE(2, 2, 0) + UNCOMBINE(3, 7, 5) + UNCOMBINE(4, 4, 2) + UNCOMBINE(5, 1, 7) 
+ UNCOMBINE(6, 6, 4) + UNCOMBINE(7, 3, 1) +#undef UNCOMBINE +} + +#ifdef __SSE2__ +#define DECL_X \ + __m128i X0, X1, X2, X3; +#define DECL_Y \ + __m128i Y0, Y1, Y2, Y3; +#define READ_X(in) \ + X0 = (in).q[0]; X1 = (in).q[1]; X2 = (in).q[2]; X3 = (in).q[3]; +#define WRITE_X(out) \ + (out).q[0] = X0; (out).q[1] = X1; (out).q[2] = X2; (out).q[3] = X3; + +#ifdef __XOP__ +#define ARX(out, in1, in2, s) \ + out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s)); +#else +#define ARX(out, in1, in2, s) { \ + __m128i tmp = _mm_add_epi32(in1, in2); \ + out = _mm_xor_si128(out, _mm_slli_epi32(tmp, s)); \ + out = _mm_xor_si128(out, _mm_srli_epi32(tmp, 32 - s)); \ +} +#endif + +#define SALSA20_2ROUNDS \ + /* Operate on "columns" */ \ + ARX(X1, X0, X3, 7) \ + ARX(X2, X1, X0, 9) \ + ARX(X3, X2, X1, 13) \ + ARX(X0, X3, X2, 18) \ + /* Rearrange data */ \ + X1 = _mm_shuffle_epi32(X1, 0x93); \ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x39); \ + /* Operate on "rows" */ \ + ARX(X3, X0, X1, 7) \ + ARX(X2, X3, X0, 9) \ + ARX(X1, X2, X3, 13) \ + ARX(X0, X1, X2, 18) \ + /* Rearrange data */ \ + X1 = _mm_shuffle_epi32(X1, 0x39); \ + X2 = _mm_shuffle_epi32(X2, 0x4E); \ + X3 = _mm_shuffle_epi32(X3, 0x93); + +/** + * Apply the Salsa20 core to the block provided in (X0 ... X3). + */ +#define SALSA20_wrapper(out, rounds) { \ + __m128i Z0 = X0, Z1 = X1, Z2 = X2, Z3 = X3; \ + rounds \ + (out).q[0] = X0 = _mm_add_epi32(X0, Z0); \ + (out).q[1] = X1 = _mm_add_epi32(X1, Z1); \ + (out).q[2] = X2 = _mm_add_epi32(X2, Z2); \ + (out).q[3] = X3 = _mm_add_epi32(X3, Z3); \ +} + +/** + * Apply the Salsa20/2 core to the block provided in X. + */ +#define SALSA20_2(out) \ + SALSA20_wrapper(out, SALSA20_2ROUNDS) + +#define SALSA20_8ROUNDS \ + SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS + +/** + * Apply the Salsa20/8 core to the block provided in X. 
+ */ +#define SALSA20_8(out) \ + SALSA20_wrapper(out, SALSA20_8ROUNDS) + +#define XOR_X(in) \ + X0 = _mm_xor_si128(X0, (in).q[0]); \ + X1 = _mm_xor_si128(X1, (in).q[1]); \ + X2 = _mm_xor_si128(X2, (in).q[2]); \ + X3 = _mm_xor_si128(X3, (in).q[3]); + +#define XOR_X_2(in1, in2) \ + X0 = _mm_xor_si128((in1).q[0], (in2).q[0]); \ + X1 = _mm_xor_si128((in1).q[1], (in2).q[1]); \ + X2 = _mm_xor_si128((in1).q[2], (in2).q[2]); \ + X3 = _mm_xor_si128((in1).q[3], (in2).q[3]); + +#define XOR_X_WRITE_XOR_Y_2(out, in) \ + (out).q[0] = Y0 = _mm_xor_si128((out).q[0], (in).q[0]); \ + (out).q[1] = Y1 = _mm_xor_si128((out).q[1], (in).q[1]); \ + (out).q[2] = Y2 = _mm_xor_si128((out).q[2], (in).q[2]); \ + (out).q[3] = Y3 = _mm_xor_si128((out).q[3], (in).q[3]); \ + X0 = _mm_xor_si128(X0, Y0); \ + X1 = _mm_xor_si128(X1, Y1); \ + X2 = _mm_xor_si128(X2, Y2); \ + X3 = _mm_xor_si128(X3, Y3); + +#define INTEGERIFY _mm_cvtsi128_si32(X0) + +#else /* !defined(__SSE2__) */ + +#define DECL_X \ + salsa20_blk_t X; +#define DECL_Y \ + salsa20_blk_t Y; + +#define COPY(out, in) \ + (out).d[0] = (in).d[0]; \ + (out).d[1] = (in).d[1]; \ + (out).d[2] = (in).d[2]; \ + (out).d[3] = (in).d[3]; \ + (out).d[4] = (in).d[4]; \ + (out).d[5] = (in).d[5]; \ + (out).d[6] = (in).d[6]; \ + (out).d[7] = (in).d[7]; + +#define READ_X(in) COPY(X, in) +#define WRITE_X(out) COPY(out, X) + +/** + * salsa20(B): + * Apply the Salsa20 core to the provided block. 
+ */ +static inline void salsa20(salsa20_blk_t *restrict B, + salsa20_blk_t *restrict Bout, uint32_t doublerounds) +{ + salsa20_blk_t X; +#define x X.w + + salsa20_simd_unshuffle(B, &X); + + do { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } while (--doublerounds); +#undef x + + { + uint32_t i; + salsa20_simd_shuffle(&X, Bout); + for (i = 0; i < 16; i += 4) { + B->w[i] = Bout->w[i] += B->w[i]; + B->w[i + 1] = Bout->w[i + 1] += B->w[i + 1]; + B->w[i + 2] = Bout->w[i + 2] += B->w[i + 2]; + B->w[i + 3] = Bout->w[i + 3] += B->w[i + 3]; + } + } +} + +/** + * Apply the Salsa20/2 core to the block provided in X. + */ +#define SALSA20_2(out) \ + salsa20(&X, &out, 1); + +/** + * Apply the Salsa20/8 core to the block provided in X. 
+ */ +#define SALSA20_8(out) \ + salsa20(&X, &out, 4); + +#define XOR(out, in1, in2) \ + (out).d[0] = (in1).d[0] ^ (in2).d[0]; \ + (out).d[1] = (in1).d[1] ^ (in2).d[1]; \ + (out).d[2] = (in1).d[2] ^ (in2).d[2]; \ + (out).d[3] = (in1).d[3] ^ (in2).d[3]; \ + (out).d[4] = (in1).d[4] ^ (in2).d[4]; \ + (out).d[5] = (in1).d[5] ^ (in2).d[5]; \ + (out).d[6] = (in1).d[6] ^ (in2).d[6]; \ + (out).d[7] = (in1).d[7] ^ (in2).d[7]; + +#define XOR_X(in) XOR(X, X, in) +#define XOR_X_2(in1, in2) XOR(X, in1, in2) +#define XOR_X_WRITE_XOR_Y_2(out, in) \ + XOR(Y, out, in) \ + COPY(out, Y) \ + XOR(X, X, Y) + +#define INTEGERIFY (uint32_t)X.d[0] +#endif + +/** + * Apply the Salsa20 core to the block provided in X ^ in. + */ +#define SALSA20_XOR_MEM(in, out) \ + XOR_X(in) \ + SALSA20(out) + +#define SALSA20 SALSA20_8 +#else /* pass 2 */ +#undef SALSA20 +#define SALSA20 SALSA20_2 +#endif + +/** + * blockmix_salsa(Bin, Bout): + * Compute Bout = BlockMix_{salsa20, 1}(Bin). The input Bin must be 128 + * bytes in length; the output Bout must also be the same size. + */ +static inline void blockmix_salsa(const salsa20_blk_t *restrict Bin, + salsa20_blk_t *restrict Bout) +{ + DECL_X + + READ_X(Bin[1]) + SALSA20_XOR_MEM(Bin[0], Bout[0]) + SALSA20_XOR_MEM(Bin[1], Bout[1]) +} + +static inline uint32_t blockmix_salsa_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout) +{ + DECL_X + + XOR_X_2(Bin1[1], Bin2[1]) + XOR_X(Bin1[0]) + SALSA20_XOR_MEM(Bin2[0], Bout[0]) + XOR_X(Bin1[1]) + SALSA20_XOR_MEM(Bin2[1], Bout[1]) + + return INTEGERIFY; +} + +#if _YESPOWER_OPT_C_PASS_ == 1 +/* This is tunable, but it is part of what defines a yespower version */ +/* Version 0.5 */ +#define Swidth_0_5 8 +/* Version 1.0 */ +#define Swidth_1_0 11 + +/* Not tunable in this implementation, hard-coded in a few places */ +#define PWXsimple 2 +#define PWXgather 4 + +/* Derived value. Not tunable on its own. 
*/ +#define PWXbytes (PWXgather * PWXsimple * 8) + +/* (Maybe-)runtime derived values. Not tunable on their own. */ +#define Swidth_to_Sbytes1(Swidth) ((1 << (Swidth)) * PWXsimple * 8) +#define Swidth_to_Smask(Swidth) (((1 << (Swidth)) - 1) * PWXsimple * 8) +#define Smask_to_Smask2(Smask) (((uint64_t)(Smask) << 32) | (Smask)) + +/* These should be compile-time derived */ +#define Smask2_0_5 Smask_to_Smask2(Swidth_to_Smask(Swidth_0_5)) +#define Smask2_1_0 Smask_to_Smask2(Swidth_to_Smask(Swidth_1_0)) + +typedef struct { + uint8_t *S0, *S1, *S2; + size_t w; + uint32_t Sbytes; +} pwxform_ctx_t; + +#define DECL_SMASK2REG /* empty */ +#define MAYBE_MEMORY_BARRIER /* empty */ + +#ifdef __SSE2__ +/* + * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs + * starting with Sandy Bridge. Additionally, PSHUFD uses separate source and + * destination registers, whereas the shifts would require an extra move + * instruction for our code when building without AVX. Unfortunately, PSHUFD + * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ) + * and somewhat slower on some non-Intel CPUs (luckily not including AMD + * Bulldozer and Piledriver). + */ +#ifdef __AVX__ +#define HI32(X) \ + _mm_srli_si128((X), 4) +#elif 1 /* As an option, check for __SSE4_1__ here not to hurt Conroe */ +#define HI32(X) \ + _mm_shuffle_epi32((X), _MM_SHUFFLE(2,3,0,1)) +#else +#define HI32(X) \ + _mm_srli_epi64((X), 32) +#endif + +#if defined(__x86_64__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__ICC) +#ifdef __AVX__ +#define MOVQ "vmovq" +#else +/* "movq" would be more correct, but "movd" is supported by older binutils + * due to an error in AMD's spec for x86-64. 
*/ +#define MOVQ "movd" +#endif +#define EXTRACT64(X) ({ \ + uint64_t result; \ + __asm__(MOVQ " %1, %0" : "=r" (result) : "x" (X)); \ + result; \ +}) +#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__) +/* MSVC and Open64 had bugs */ +#define EXTRACT64(X) _mm_cvtsi128_si64(X) +#elif defined(__x86_64__) && defined(__SSE4_1__) +/* No known bugs for this intrinsic */ +#include <smmintrin.h> +#define EXTRACT64(X) _mm_extract_epi64((X), 0) +#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__) +/* 32-bit */ +#include <smmintrin.h> +#if 0 +/* This is currently unused by the code below, which instead uses these two + * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */ +#define EXTRACT64(X) \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ + ((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32)) +#endif +#else +/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64() */ +#define EXTRACT64(X) \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32)) +#endif + +#if defined(__x86_64__) && (defined(__AVX__) || !defined(__GNUC__)) +/* 64-bit with AVX */ +/* Force use of 64-bit AND instead of two 32-bit ANDs */ +#undef DECL_SMASK2REG +#if defined(__GNUC__) && !defined(__ICC) +#define DECL_SMASK2REG uint64_t Smask2reg = Smask2; +/* Force use of lower-numbered registers to reduce number of prefixes, relying + * on out-of-order execution and register renaming. 
*/ +#define FORCE_REGALLOC_1 \ + __asm__("" : "=a" (x), "+d" (Smask2reg), "+S" (S0), "+D" (S1)); +#define FORCE_REGALLOC_2 \ + __asm__("" : : "c" (lo)); +#else +static volatile uint64_t Smask2var = Smask2; +#define DECL_SMASK2REG uint64_t Smask2reg = Smask2var; +#define FORCE_REGALLOC_1 /* empty */ +#define FORCE_REGALLOC_2 /* empty */ +#endif +#define PWXFORM_SIMD(X) { \ + uint64_t x; \ + FORCE_REGALLOC_1 \ + uint32_t lo = x = EXTRACT64(X) & Smask2reg; \ + FORCE_REGALLOC_2 \ + uint32_t hi = x >> 32; \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, *(__m128i *)(S0 + lo)); \ + X = _mm_xor_si128(X, *(__m128i *)(S1 + hi)); \ +} +#elif defined(__x86_64__) +/* 64-bit without AVX. This relies on out-of-order execution and register + * renaming. It may actually be fastest on CPUs with AVX(2) as well - e.g., + * it runs great on Haswell. */ +#warning "Note: using x86-64 inline assembly for pwxform. That's great." +#undef MAYBE_MEMORY_BARRIER +#define MAYBE_MEMORY_BARRIER \ + __asm__("" : : : "memory"); +#ifdef __ILP32__ /* x32 */ +#define REGISTER_PREFIX "e" +#else +#define REGISTER_PREFIX "r" +#endif +#define PWXFORM_SIMD(X) { \ + __m128i H; \ + __asm__( \ + "movd %0, %%rax\n\t" \ + "pshufd $0xb1, %0, %1\n\t" \ + "andq %2, %%rax\n\t" \ + "pmuludq %1, %0\n\t" \ + "movl %%eax, %%ecx\n\t" \ + "shrq $0x20, %%rax\n\t" \ + "paddq (%3,%%" REGISTER_PREFIX "cx), %0\n\t" \ + "pxor (%4,%%" REGISTER_PREFIX "ax), %0\n\t" \ + : "+x" (X), "=x" (H) \ + : "d" (Smask2), "S" (S0), "D" (S1) \ + : "cc", "ax", "cx"); \ +} +#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__) +/* 32-bit with SSE4.1 */ +#define PWXFORM_SIMD(X) { \ + __m128i x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \ + __m128i s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \ + __m128i s1 = *(__m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1)); \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, s0); \ + X = _mm_xor_si128(X, s1); \ +} +#else +/* 32-bit without SSE4.1 */ +#define PWXFORM_SIMD(X) 
{ \ + uint64_t x = EXTRACT64(X) & Smask2; \ + __m128i s0 = *(__m128i *)(S0 + (uint32_t)x); \ + __m128i s1 = *(__m128i *)(S1 + (x >> 32)); \ + X = _mm_mul_epu32(HI32(X), X); \ + X = _mm_add_epi64(X, s0); \ + X = _mm_xor_si128(X, s1); \ +} +#endif + +#define PWXFORM_SIMD_WRITE(X, Sw) \ + PWXFORM_SIMD(X) \ + MAYBE_MEMORY_BARRIER \ + *(__m128i *)(Sw + w) = X; \ + MAYBE_MEMORY_BARRIER + +#define PWXFORM_ROUND \ + PWXFORM_SIMD(X0) \ + PWXFORM_SIMD(X1) \ + PWXFORM_SIMD(X2) \ + PWXFORM_SIMD(X3) + +#define PWXFORM_ROUND_WRITE4 \ + PWXFORM_SIMD_WRITE(X0, S0) \ + PWXFORM_SIMD_WRITE(X1, S1) \ + w += 16; \ + PWXFORM_SIMD_WRITE(X2, S0) \ + PWXFORM_SIMD_WRITE(X3, S1) \ + w += 16; + +#define PWXFORM_ROUND_WRITE2 \ + PWXFORM_SIMD_WRITE(X0, S0) \ + PWXFORM_SIMD_WRITE(X1, S1) \ + w += 16; \ + PWXFORM_SIMD(X2) \ + PWXFORM_SIMD(X3) + +#else /* !defined(__SSE2__) */ + +#define PWXFORM_SIMD(x0, x1) { \ + uint64_t x = x0 & Smask2; \ + uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \ + uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \ + x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \ + x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \ +} + +#define PWXFORM_SIMD_WRITE(x0, x1, Sw) \ + PWXFORM_SIMD(x0, x1) \ + ((uint64_t *)(Sw + w))[0] = x0; \ + ((uint64_t *)(Sw + w))[1] = x1; + +#define PWXFORM_ROUND \ + PWXFORM_SIMD(X.d[0], X.d[1]) \ + PWXFORM_SIMD(X.d[2], X.d[3]) \ + PWXFORM_SIMD(X.d[4], X.d[5]) \ + PWXFORM_SIMD(X.d[6], X.d[7]) + +#define PWXFORM_ROUND_WRITE4 \ + PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \ + PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \ + w += 16; \ + PWXFORM_SIMD_WRITE(X.d[4], X.d[5], S0) \ + PWXFORM_SIMD_WRITE(X.d[6], X.d[7], S1) \ + w += 16; + +#define PWXFORM_ROUND_WRITE2 \ + PWXFORM_SIMD_WRITE(X.d[0], X.d[1], S0) \ + PWXFORM_SIMD_WRITE(X.d[2], X.d[3], S1) \ + w += 16; \ + PWXFORM_SIMD(X.d[4], X.d[5]) \ + PWXFORM_SIMD(X.d[6], X.d[7]) +#endif + +#define PWXFORM \ + PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND \ + PWXFORM_ROUND PWXFORM_ROUND PWXFORM_ROUND + +#define Smask2 
Smask2_0_5 + +#else /* pass 2 */ + +#undef PWXFORM +#define PWXFORM \ + PWXFORM_ROUND_WRITE4 PWXFORM_ROUND_WRITE2 PWXFORM_ROUND_WRITE2 \ + w &= Smask2; \ + { \ + uint8_t *Stmp = S2; \ + S2 = S1; \ + S1 = S0; \ + S0 = Stmp; \ + } + +#undef Smask2 +#define Smask2 Smask2_1_0 + +#endif + +/** + * blockmix_pwxform(Bin, Bout, r, S): + * Compute Bout = BlockMix_pwxform{salsa20, r, S}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. + */ +static void blockmix(const salsa20_blk_t *restrict Bin, + salsa20_blk_t *restrict Bout, size_t r, pwxform_ctx_t *restrict ctx) +{ + if (unlikely(!ctx)) { + blockmix_salsa(Bin, Bout); + return; + } + + uint8_t *S0 = ctx->S0, *S1 = ctx->S1; +#if _YESPOWER_OPT_C_PASS_ > 1 + uint8_t *S2 = ctx->S2; + size_t w = ctx->w; +#endif + size_t i; + DECL_X + + /* Convert count of 128-byte blocks to max index of 64-byte block */ + r = r * 2 - 1; + + READ_X(Bin[r]) + + DECL_SMASK2REG + + i = 0; + do { + XOR_X(Bin[i]) + PWXFORM + if (unlikely(i >= r)) + break; + WRITE_X(Bout[i]) + i++; + } while (1); + +#if _YESPOWER_OPT_C_PASS_ > 1 + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; +#endif + + SALSA20(Bout[i]) +} + +static uint32_t blockmix_xor(const salsa20_blk_t *restrict Bin1, + const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, + size_t r, pwxform_ctx_t *restrict ctx) +{ + if (unlikely(!ctx)) + return blockmix_salsa_xor(Bin1, Bin2, Bout); + + uint8_t *S0 = ctx->S0, *S1 = ctx->S1; +#if _YESPOWER_OPT_C_PASS_ > 1 + uint8_t *S2 = ctx->S2; + size_t w = ctx->w; +#endif + size_t i; + DECL_X + + /* Convert count of 128-byte blocks to max index of 64-byte block */ + r = r * 2 - 1; + +#ifdef PREFETCH + PREFETCH(&Bin2[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + } +#endif + + XOR_X_2(Bin1[r], Bin2[r]) + + DECL_SMASK2REG + + i = 0; + r--; + do { + XOR_X(Bin1[i]) + XOR_X(Bin2[i]) + PWXFORM + WRITE_X(Bout[i]) + + XOR_X(Bin1[i + 1]) + XOR_X(Bin2[i + 1]) + 
PWXFORM + + if (unlikely(i >= r)) + break; + + WRITE_X(Bout[i + 1]) + + i += 2; + } while (1); + i++; + +#if _YESPOWER_OPT_C_PASS_ > 1 + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; +#endif + + SALSA20(Bout[i]) + + return INTEGERIFY; +} + +static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out, + salsa20_blk_t *restrict Bin2, + size_t r, pwxform_ctx_t *restrict ctx) +{ + uint8_t *S0 = ctx->S0, *S1 = ctx->S1; +#if _YESPOWER_OPT_C_PASS_ > 1 + uint8_t *S2 = ctx->S2; + size_t w = ctx->w; +#endif + size_t i; + DECL_X + DECL_Y + + /* Convert count of 128-byte blocks to max index of 64-byte block */ + r = r * 2 - 1; + +#ifdef PREFETCH + PREFETCH(&Bin2[r], _MM_HINT_T0) + for (i = 0; i < r; i++) { + PREFETCH(&Bin2[i], _MM_HINT_T0) + } +#endif + + XOR_X_2(Bin1out[r], Bin2[r]) + + DECL_SMASK2REG + + i = 0; + r--; + do { + XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i]) + PWXFORM + WRITE_X(Bin1out[i]) + + XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1]) + PWXFORM + + if (unlikely(i >= r)) + break; + + WRITE_X(Bin1out[i + 1]) + + i += 2; + } while (1); + i++; + +#if _YESPOWER_OPT_C_PASS_ > 1 + ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; + ctx->w = w; +#endif + + SALSA20(Bin1out[i]) + + return INTEGERIFY; +} + +#if _YESPOWER_OPT_C_PASS_ == 1 +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static inline uint32_t integerify(const salsa20_blk_t *B, size_t r) +{ +/* + * Our 64-bit words are in host byte order, which is why we don't just read + * w[0] here (would be wrong on big-endian). Also, our 32-bit words are + * SIMD-shuffled, but we only care about the least significant 32 bits anyway. + */ + return (uint32_t)B[2 * r - 1].d[0]; +} +#endif + +/** + * smix1(B, r, N, V, XY, S): + * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 128r+64 bytes in length. N must be even and at least 4. 
+ * The array V must be aligned to a multiple of 64 bytes, and arrays B and XY + * to a multiple of at least 16 bytes. + */ +static void smix1(uint8_t *B, size_t r, uint32_t N, + salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) +{ + size_t s = 2 * r; + salsa20_blk_t *X = V, *Y = &V[s], *V_j; + uint32_t i, j, n; + +#if _YESPOWER_OPT_C_PASS_ == 1 + for (i = 0; i < 2 * r; i++) { +#else + for (i = 0; i < 2; i++) { +#endif + const salsa20_blk_t *src = (salsa20_blk_t *)&B[i * 64]; + salsa20_blk_t *tmp = Y; + salsa20_blk_t *dst = &X[i]; + size_t k; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + +#if _YESPOWER_OPT_C_PASS_ > 1 + for (i = 1; i < r; i++) + blockmix(&X[(i - 1) * 2], &X[i * 2], 1, ctx); +#endif + + blockmix(X, Y, r, ctx); + X = Y + s; + blockmix(Y, X, r, ctx); + j = integerify(X, r); + + for (n = 2; n < N; n <<= 1) { + uint32_t m = (n < N / 2) ? n : (N - 1 - n); + for (i = 1; i < m; i += 2) { + Y = X + s; + j &= n - 1; + j += i - 1; + V_j = &V[j * s]; + j = blockmix_xor(X, V_j, Y, r, ctx); + j &= n - 1; + j += i; + V_j = &V[j * s]; + X = Y + s; + j = blockmix_xor(Y, V_j, X, r, ctx); + } + } + n >>= 1; + + j &= n - 1; + j += N - 2 - n; + V_j = &V[j * s]; + Y = X + s; + j = blockmix_xor(X, V_j, Y, r, ctx); + j &= n - 1; + j += N - 1 - n; + V_j = &V[j * s]; + blockmix_xor(Y, V_j, XY, r, ctx); + + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = &XY[i]; + salsa20_blk_t *tmp = &XY[s]; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64]; + size_t k; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + +/** + * smix2(B, r, N, Nloop, V, XY, S): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r bytes in length. N must be a power of 2 and at + * least 2. Nloop must be even. 
The array V must be aligned to a multiple of + * 64 bytes, and arrays B and XY to a multiple of at least 16 bytes. + */ +static void smix2(uint8_t *B, size_t r, uint32_t N, uint32_t Nloop, + salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) +{ + size_t s = 2 * r; + salsa20_blk_t *X = XY, *Y = &XY[s]; + uint32_t i, j; + + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (salsa20_blk_t *)&B[i * 64]; + salsa20_blk_t *tmp = Y; + salsa20_blk_t *dst = &X[i]; + size_t k; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + + j = integerify(X, r) & (N - 1); + +#if _YESPOWER_OPT_C_PASS_ == 1 + if (Nloop > 2) { +#endif + do { + salsa20_blk_t *V_j = &V[j * s]; + j = blockmix_xor_save(X, V_j, r, ctx) & (N - 1); + V_j = &V[j * s]; + j = blockmix_xor_save(X, V_j, r, ctx) & (N - 1); + } while (Nloop -= 2); +#if _YESPOWER_OPT_C_PASS_ == 1 + } else { + const salsa20_blk_t * V_j = &V[j * s]; + j = blockmix_xor(X, V_j, Y, r, ctx) & (N - 1); + V_j = &V[j * s]; + blockmix_xor(Y, V_j, X, r, ctx); + } +#endif + + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = &X[i]; + salsa20_blk_t *tmp = Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64]; + size_t k; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + +/** + * smix(B, r, N, V, XY, S): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage + * XY must be 256r bytes in length. N must be a power of 2 and at least 16. + * The array V must be aligned to a multiple of 64 bytes, and arrays B and XY + * to a multiple of at least 16 bytes (aligning them to 64 bytes as well saves + * cache lines, but it might also result in cache bank conflicts). 
+ */ +static void smix(uint8_t *B, size_t r, uint32_t N, + salsa20_blk_t *V, salsa20_blk_t *XY, pwxform_ctx_t *ctx) +{ +#if _YESPOWER_OPT_C_PASS_ == 1 + uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */ + uint32_t Nloop_rw = Nloop_all; + + Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */ + Nloop_rw &= ~(uint32_t)1; /* round down to even */ +#else + uint32_t Nloop_rw = (N + 2) / 3; /* 1/3, round up */ + Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */ +#endif + + smix1(B, 1, ctx->Sbytes / 128, (salsa20_blk_t *)ctx->S0, XY, NULL); + smix1(B, r, N, V, XY, ctx); + smix2(B, r, N, Nloop_rw /* must be > 2 */, V, XY, ctx); +#if _YESPOWER_OPT_C_PASS_ == 1 + if (Nloop_all > Nloop_rw) + smix2(B, r, N, 2, V, XY, ctx); +#endif +} + +#if _YESPOWER_OPT_C_PASS_ == 1 +#undef _YESPOWER_OPT_C_PASS_ +#define _YESPOWER_OPT_C_PASS_ 2 +#define blockmix_salsa blockmix_salsa_1_0 +#define blockmix_salsa_xor blockmix_salsa_xor_1_0 +#define blockmix blockmix_1_0 +#define blockmix_xor blockmix_xor_1_0 +#define blockmix_xor_save blockmix_xor_save_1_0 +#define smix1 smix1_1_0 +#define smix2 smix2_1_0 +#define smix smix_1_0 +#include "yespower-opt.c" +#undef smix + +/** + * yespower(local, src, srclen, params, dst): + * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". + * local is the thread-local data structure, allowing to preserve and reuse a + * memory allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. 
 */
int yespower(yespower_local_t *local,
    const uint8_t *src, size_t srclen,
    const yespower_params_t *params,
    yespower_binary_t *dst)
{
	yespower_version_t version = params->version;
	uint32_t N = params->N;
	uint32_t r = params->r;
	const uint8_t *pers = params->pers;
	size_t perslen = params->perslen;
	uint32_t Swidth;
	size_t B_size, V_size, XY_size, need;
	uint8_t *B, *S;
	salsa20_blk_t *V, *XY;
	pwxform_ctx_t ctx;
	uint8_t sha256[32];

	/* Sanity-check parameters */
	if ((version != YESPOWER_0_5 && version != YESPOWER_1_0) ||
	    N < 1024 || N > 512 * 1024 || r < 8 || r > 32 ||
	    (N & (N - 1)) != 0 ||
	    (!pers && perslen)) {
		errno = EINVAL;
		goto fail;
	}

	/* Allocate memory: one region laid out as B | V | XY | S */
	B_size = (size_t)128 * r;
	V_size = B_size * N;
	if (version == YESPOWER_0_5) {
		XY_size = B_size * 2;
		Swidth = Swidth_0_5;
		ctx.Sbytes = 2 * Swidth_to_Sbytes1(Swidth);
	} else {
		XY_size = B_size + 64;
		Swidth = Swidth_1_0;
		ctx.Sbytes = 3 * Swidth_to_Sbytes1(Swidth);
	}
	need = B_size + V_size + XY_size + ctx.Sbytes;
	/* Grow the reusable thread-local region only when it is too small */
	if (local->aligned_size < need) {
		if (free_region(local))
			goto fail;
		if (!alloc_region(local, need))
			goto fail;
	}
	B = (uint8_t *)local->aligned;
	V = (salsa20_blk_t *)((uint8_t *)B + B_size);
	XY = (salsa20_blk_t *)((uint8_t *)V + V_size);
	S = (uint8_t *)XY + XY_size;
	ctx.S0 = S;
	ctx.S1 = S + Swidth_to_Sbytes1(Swidth);

	SHA256_Buf(src, srclen, sha256);

	if (version == YESPOWER_0_5) {
		PBKDF2_SHA256(sha256, sizeof(sha256), src, srclen, 1,
		    B, B_size);
		memcpy(sha256, B, sizeof(sha256));
		smix(B, r, N, V, XY, &ctx);
		PBKDF2_SHA256(sha256, sizeof(sha256), B, B_size, 1,
		    (uint8_t *)dst, sizeof(*dst));

		/* Optional personalization post-processing (v0.5 only) */
		if (pers) {
			HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen,
			    sha256);
			SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst);
		}
	} else {
		ctx.S2 = S + 2 * Swidth_to_Sbytes1(Swidth);
		ctx.w = 0;

		/* v1.0 uses pers (or empty) as the PBKDF2 salt instead */
		if (pers) {
			src = pers;
			srclen = perslen;
		} else {
			srclen = 0;
		}

		/* Only the first 128 bytes are PBKDF2-derived; smix fills the rest */
		PBKDF2_SHA256(sha256, sizeof(sha256), src, srclen, 1, B, 128);
		memcpy(sha256, B, sizeof(sha256));
		smix_1_0(B, r, N, V, XY, &ctx);
		HMAC_SHA256_Buf(B + B_size - 64, 64,
		    sha256, sizeof(sha256), (uint8_t *)dst);
	}

	/* Success! */
	return 0;

fail:
	/* On any failure, fill dst with 0xff so it can never pass "< target" */
	memset(dst, 0xff, sizeof(*dst));
	return -1;
}

/**
 * yespower_tls(src, srclen, params, dst):
 * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target".
 * The memory allocation is maintained internally using thread-local storage.
 *
 * Return 0 on success; or -1 on error.
 */
int yespower_tls(const uint8_t *src, size_t srclen,
    const yespower_params_t *params, yespower_binary_t *dst)
{
	/* Per-thread region, allocated lazily on first use */
	static __thread int initialized = 0;
	static __thread yespower_local_t local;

	if (!initialized) {
		init_region(&local);
		initialized = 1;
	}

	return yespower(&local, src, srclen, params, dst);
}

/* Initialize a caller-managed thread-local region (no allocation yet) */
int yespower_init_local(yespower_local_t *local)
{
	init_region(local);
	return 0;
}

/* Release any memory held by a caller-managed thread-local region */
int yespower_free_local(yespower_local_t *local)
{
	return free_region(local);
}
#endif
diff --git a/yespower-1.0.1/yespower-platform.c b/yespower-1.0.1/yespower-platform.c
new file mode 100644
index 000000000..2b1a03f0f
--- /dev/null
+++ b/yespower-1.0.1/yespower-platform.c
@@ -0,0 +1,107 @@
/*-
 * Copyright 2013-2018 Alexander Peslyak
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef __unix__ +#include +#endif + +#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024) + +#ifdef __x86_64__ +#define HUGEPAGE_SIZE (2 * 1024 * 1024) +#else +#undef HUGEPAGE_SIZE +#endif + +static void *alloc_region(yespower_region_t *region, size_t size) +{ + size_t base_size = size; + uint8_t *base, *aligned; +#ifdef MAP_ANON + int flags = +#ifdef MAP_NOCORE + MAP_NOCORE | +#endif + MAP_ANON | MAP_PRIVATE; +#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE) + size_t new_size = size; + const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1; + if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) { + flags |= MAP_HUGETLB; +/* + * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of + * huge page size, so let's round up to huge page size here. 
+ */ + new_size = size + hugepage_mask; + new_size &= ~hugepage_mask; + } + base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (base != MAP_FAILED) { + base_size = new_size; + } else if (flags & MAP_HUGETLB) { + flags &= ~MAP_HUGETLB; + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + } + +#else + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); +#endif + if (base == MAP_FAILED) + base = NULL; + aligned = base; +#elif defined(HAVE_POSIX_MEMALIGN) + if ((errno = posix_memalign((void **)&base, 64, size)) != 0) + base = NULL; + aligned = base; +#else + base = aligned = NULL; + if (size + 63 < size) { + errno = ENOMEM; + } else if ((base = malloc(size + 63)) != NULL) { + aligned = base + 63; + aligned -= (uintptr_t)aligned & 63; + } +#endif + region->base = base; + region->aligned = aligned; + region->base_size = base ? base_size : 0; + region->aligned_size = base ? size : 0; + return aligned; +} + +static inline void init_region(yespower_region_t *region) +{ + region->base = region->aligned = NULL; + region->base_size = region->aligned_size = 0; +} + +static int free_region(yespower_region_t *region) +{ + if (region->base) { +#ifdef MAP_ANON + if (munmap(region->base, region->base_size)) + return -1; +#else + free(region->base); +#endif + } + init_region(region); + return 0; +} diff --git a/yespower-1.0.1/yespower-ref.c b/yespower-1.0.1/yespower-ref.c new file mode 100644 index 000000000..ad0f607ea --- /dev/null +++ b/yespower-1.0.1/yespower-ref.c @@ -0,0 +1,582 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013-2019 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + * + * This is a proof-of-work focused fork of yescrypt, including reference and + * cut-down implementation of the obsolete yescrypt 0.5 (based off its first + * submission to PHC back in 2014) and a new proof-of-work specific variation + * known as yespower 1.0. The former is intended as an upgrade for + * cryptocurrencies that already use yescrypt 0.5 and the latter may be used + * as a further upgrade (hard fork) by those and other cryptocurrencies. The + * version of algorithm to use is requested through parameters, allowing for + * both algorithms to co-exist in client and miner implementations (such as in + * preparation for a hard-fork). + * + * This is the reference implementation. Its purpose is to provide a simple + * human- and machine-readable specification that implementations intended + * for actual use should be tested against. 
It is deliberately mostly not + * optimized, and it is not meant to be used in production. Instead, use + * yespower-opt.c. + */ + +#warning "This reference implementation is deliberately mostly not optimized. Use yespower-opt.c instead unless you're testing (against) the reference implementation on purpose." + +#include +#include +#include +#include + +#include "sha256.h" +#include "sysendian.h" + +#include "yespower.h" + +static void blkcpy(uint32_t *dst, const uint32_t *src, size_t count) +{ + do { + *dst++ = *src++; + } while (--count); +} + +static void blkxor(uint32_t *dst, const uint32_t *src, size_t count) +{ + do { + *dst++ ^= *src++; + } while (--count); +} + +/** + * salsa20(B): + * Apply the Salsa20 core to the provided block. + */ +static void salsa20(uint32_t B[16], uint32_t rounds) +{ + uint32_t x[16]; + size_t i; + + /* SIMD unshuffle */ + for (i = 0; i < 16; i++) + x[i * 5 % 16] = B[i]; + + for (i = 0; i < rounds; i += 2) { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= 
R(x[14]+x[13],18); +#undef R + } + + /* SIMD shuffle */ + for (i = 0; i < 16; i++) + B[i] += x[i * 5 % 16]; +} + +/** + * blockmix_salsa(B): + * Compute B = BlockMix_{salsa20, 1}(B). The input B must be 128 bytes in + * length. + */ +static void blockmix_salsa(uint32_t *B, uint32_t rounds) +{ + uint32_t X[16]; + size_t i; + + /* 1: X <-- B_{2r - 1} */ + blkcpy(X, &B[16], 16); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < 2; i++) { + /* 3: X <-- H(X xor B_i) */ + blkxor(X, &B[i * 16], 16); + salsa20(X, rounds); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&B[i * 16], X, 16); + } +} + +/* + * These are tunable, but they must meet certain constraints and are part of + * what defines a yespower version. + */ +#define PWXsimple 2 +#define PWXgather 4 +/* Version 0.5 */ +#define PWXrounds_0_5 6 +#define Swidth_0_5 8 +/* Version 1.0 */ +#define PWXrounds_1_0 3 +#define Swidth_1_0 11 + +/* Derived values. Not tunable on their own. */ +#define PWXbytes (PWXgather * PWXsimple * 8) +#define PWXwords (PWXbytes / sizeof(uint32_t)) +#define rmin ((PWXbytes + 127) / 128) + +/* Runtime derived values. Not tunable on their own. */ +#define Swidth_to_Sbytes1(Swidth) ((1 << Swidth) * PWXsimple * 8) +#define Swidth_to_Smask(Swidth) (((1 << Swidth) - 1) * PWXsimple * 8) + +typedef struct { + yespower_version_t version; + uint32_t salsa20_rounds; + uint32_t PWXrounds, Swidth, Sbytes, Smask; + uint32_t *S; + uint32_t (*S0)[2], (*S1)[2], (*S2)[2]; + size_t w; +} pwxform_ctx_t; + +/** + * pwxform(B): + * Transform the provided block using the provided S-boxes. 
 */
static void pwxform(uint32_t *B, pwxform_ctx_t *ctx)
{
	/* View the 64-byte block as PWXgather lanes of PWXsimple 64-bit values */
	uint32_t (*X)[PWXsimple][2] = (uint32_t (*)[PWXsimple][2])B;
	uint32_t (*S0)[2] = ctx->S0, (*S1)[2] = ctx->S1, (*S2)[2] = ctx->S2;
	uint32_t Smask = ctx->Smask;
	size_t w = ctx->w;
	size_t i, j, k;

	/* 1: for i = 0 to PWXrounds - 1 do */
	for (i = 0; i < ctx->PWXrounds; i++) {
		/* 2: for j = 0 to PWXgather - 1 do */
		for (j = 0; j < PWXgather; j++) {
			uint32_t xl = X[j][0][0];
			uint32_t xh = X[j][0][1];
			uint32_t (*p0)[2], (*p1)[2];

			/* 3: p0 <-- (lo(B_{j,0}) & Smask) / (PWXsimple * 8) */
			p0 = S0 + (xl & Smask) / sizeof(*S0);
			/* 4: p1 <-- (hi(B_{j,0}) & Smask) / (PWXsimple * 8) */
			p1 = S1 + (xh & Smask) / sizeof(*S1);

			/* 5: for k = 0 to PWXsimple - 1 do */
			for (k = 0; k < PWXsimple; k++) {
				uint64_t x, s0, s1;

				/* 6: B_{j,k} <-- (hi(B_{j,k}) * lo(B_{j,k}) + S0_{p0,k}) xor S1_{p1,k} */
				s0 = ((uint64_t)p0[k][1] << 32) + p0[k][0];
				s1 = ((uint64_t)p1[k][1] << 32) + p1[k][0];

				xl = X[j][k][0];
				xh = X[j][k][1];

				x = (uint64_t)xh * xl;
				x += s0;
				x ^= s1;

				X[j][k][0] = x;
				X[j][k][1] = x >> 32;
			}

			/* v1.0 only: write results back into the S-boxes */
			if (ctx->version != YESPOWER_0_5 &&
			    (i == 0 || j < PWXgather / 2)) {
				if (j & 1) {
					for (k = 0; k < PWXsimple; k++) {
						S1[w][0] = X[j][k][0];
						S1[w][1] = X[j][k][1];
						w++;
					}
				} else {
					/* w itself only advances in the odd-j branch,
					 * so the even-j branch indexes with w + k */
					for (k = 0; k < PWXsimple; k++) {
						S0[w + k][0] = X[j][k][0];
						S0[w + k][1] = X[j][k][1];
					}
				}
			}
		}
	}

	if (ctx->version != YESPOWER_0_5) {
		/* 14: (S0, S1, S2) <-- (S2, S0, S1) */
		ctx->S0 = S2;
		ctx->S1 = S0;
		ctx->S2 = S1;
		/* 15: w <-- w mod 2^Swidth */
		ctx->w = w & ((1 << ctx->Swidth) * PWXsimple - 1);
	}
}

/**
 * blockmix_pwxform(B, ctx, r):
 * Compute B = BlockMix_pwxform{salsa20, ctx, r}(B).  The input B must be
 * 128r bytes in length.
 */
static void blockmix_pwxform(uint32_t *B, pwxform_ctx_t *ctx, size_t r)
{
	uint32_t X[PWXwords];
	size_t r1, i;

	/* Convert 128-byte blocks to PWXbytes blocks */
	/* 1: r_1 <-- 128r / PWXbytes */
	r1 = 128 * r / PWXbytes;

	/* 2: X <-- B'_{r_1 - 1} */
	blkcpy(X, &B[(r1 - 1) * PWXwords], PWXwords);

	/* 3: for i = 0 to r_1 - 1 do */
	for (i = 0; i < r1; i++) {
		/* 4: if r_1 > 1 */
		if (r1 > 1) {
			/* 5: X <-- X xor B'_i */
			blkxor(X, &B[i * PWXwords], PWXwords);
		}

		/* 7: X <-- pwxform(X) */
		pwxform(X, ctx);

		/* 8: B'_i <-- X */
		blkcpy(&B[i * PWXwords], X, PWXwords);
	}

	/* 10: i <-- floor((r_1 - 1) * PWXbytes / 64) */
	i = (r1 - 1) * PWXbytes / 64;

	/* 11: B_i <-- H(B_i) */
	salsa20(&B[i * 16], ctx->salsa20_rounds);

#if 1 /* No-op with our current pwxform settings, but do it to make sure */
	/* 12: for i = i + 1 to 2r - 1 do */
	for (i++; i < 2 * r; i++) {
		/* 13: B_i <-- H(B_i xor B_{i-1}) */
		blkxor(&B[i * 16], &B[(i - 1) * 16], 16);
		salsa20(&B[i * 16], ctx->salsa20_rounds);
	}
#endif
}

/**
 * integerify(B, r):
 * Return the result of parsing B_{2r-1} as a little-endian integer.
 */
static uint32_t integerify(const uint32_t *B, size_t r)
{
/*
 * Our 32-bit words are in host byte order.  Also, they are SIMD-shuffled, but
 * we only care about the least significant 32 bits anyway.
 */
	const uint32_t *X = &B[(2 * r - 1) * 16];
	return X[0];
}

/**
 * p2floor(x):
 * Largest power of 2 not greater than argument.
 */
static uint32_t p2floor(uint32_t x)
{
	uint32_t y;
	/* Repeatedly clear the lowest set bit until one bit remains */
	while ((y = x & (x - 1)))
		x = y;
	return x;
}

/**
 * wrap(x, i):
 * Wrap x to the range 0 to i-1.
 */
static uint32_t wrap(uint32_t x, uint32_t i)
{
	uint32_t n = p2floor(i);
	return (x & (n - 1)) + (i - n);
}

/**
 * smix1(B, r, N, V, X, ctx):
 * Compute first loop of B = SMix_r(B, N).
 The input B must be 128r bytes in
 * length; the temporary storage V must be 128rN bytes in length; the temporary
 * storage X must be 128r bytes in length.
 */
static void smix1(uint32_t *B, size_t r, uint32_t N,
    uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx)
{
	size_t s = 32 * r; /* words per 128r-byte block */
	uint32_t i, j;
	size_t k;

	/* 1: X <-- B (convert from little-endian, undoing the SIMD shuffle) */
	for (k = 0; k < 2 * r; k++)
		for (i = 0; i < 16; i++)
			X[k * 16 + i] = le32dec(&B[k * 16 + (i * 5 % 16)]);

	/* v1.0 only: derive blocks 1..r-1 from block 0 via blockmix */
	if (ctx->version != YESPOWER_0_5) {
		for (k = 1; k < r; k++) {
			blkcpy(&X[k * 32], &X[(k - 1) * 32], 32);
			blockmix_pwxform(&X[k * 32], ctx, 1);
		}
	}

	/* 2: for i = 0 to N - 1 do */
	for (i = 0; i < N; i++) {
		/* 3: V_i <-- X */
		blkcpy(&V[i * s], X, s);

		if (i > 1) {
			/* j <-- Wrap(Integerify(X), i) */
			j = wrap(integerify(X, r), i);

			/* X <-- X xor V_j */
			blkxor(X, &V[j * s], s);
		}

		/* 4: X <-- H(X) -- plain salsa20 blockmix when filling the
		 * S-boxes themselves (V aliases ctx->S in that call) */
		if (V != ctx->S)
			blockmix_pwxform(X, ctx, r);
		else
			blockmix_salsa(X, ctx->salsa20_rounds);
	}

	/* B' <-- X (convert back to little-endian, re-applying the shuffle) */
	for (k = 0; k < 2 * r; k++)
		for (i = 0; i < 16; i++)
			le32enc(&B[k * 16 + (i * 5 % 16)], X[k * 16 + i]);
}

/**
 * smix2(B, r, N, Nloop, V, X, ctx):
 * Compute second loop of B = SMix_r(B, N).  The input B must be 128r bytes in
 * length; the temporary storage V must be 128rN bytes in length; the temporary
 * storage X must be 128r bytes in length.  The value N must be a power of 2
 * greater than 1.
 */
static void smix2(uint32_t *B, size_t r, uint32_t N, uint32_t Nloop,
    uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx)
{
	size_t s = 32 * r; /* words per 128r-byte block */
	uint32_t i, j;
	size_t k;

	/* X <-- B (convert from little-endian, undoing the SIMD shuffle) */
	for (k = 0; k < 2 * r; k++)
		for (i = 0; i < 16; i++)
			X[k * 16 + i] = le32dec(&B[k * 16 + (i * 5 % 16)]);

	/* 6: for i = 0 to N - 1 do */
	for (i = 0; i < Nloop; i++) {
		/* 7: j <-- Integerify(X) mod N */
		j = integerify(X, r) & (N - 1);

		/* 8.1: X <-- X xor V_j */
		blkxor(X, &V[j * s], s);
		/* V_j <-- X (skip the write-back on the short read-only pass) */
		if (Nloop != 2)
			blkcpy(&V[j * s], X, s);

		/* 8.2: X <-- H(X) */
		blockmix_pwxform(X, ctx, r);
	}

	/* 10: B' <-- X (convert back to little-endian, re-applying the shuffle) */
	for (k = 0; k < 2 * r; k++)
		for (i = 0; i < 16; i++)
			le32enc(&B[k * 16 + (i * 5 % 16)], X[k * 16 + i]);
}

/**
 * smix(B, r, N, p, t, V, X, ctx):
 * Compute B = SMix_r(B, N).  The input B must be 128rp bytes in length; the
 * temporary storage V must be 128rN bytes in length; the temporary storage
 * X must be 128r bytes in length.  The value N must be a power of 2 and at
 * least 16.
 */
static void smix(uint32_t *B, size_t r, uint32_t N,
    uint32_t *V, uint32_t *X, pwxform_ctx_t *ctx)
{
	uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */
	uint32_t Nloop_rw = Nloop_all;

	Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */
	if (ctx->version == YESPOWER_0_5) {
		Nloop_rw &= ~(uint32_t)1; /* round down to even */
	} else {
		Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */
	}

	/* Seed the pwxform S-boxes from B, then run the two SMix loops */
	smix1(B, 1, ctx->Sbytes / 128, ctx->S, X, ctx);
	smix1(B, r, N, V, X, ctx);
	smix2(B, r, N, Nloop_rw /* must be > 2 */, V, X, ctx);
	smix2(B, r, N, Nloop_all - Nloop_rw /* 0 or 2 */, V, X, ctx);
}

/**
 * yespower(local, src, srclen, params, dst):
 * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target".
 *
 * Return 0 on success; or -1 on error.
 */
int yespower(yespower_local_t *local,
    const uint8_t *src, size_t srclen,
    const yespower_params_t *params, yespower_binary_t *dst)
{
	yespower_version_t version = params->version;
	uint32_t N = params->N;
	uint32_t r = params->r;
	const uint8_t *pers = params->pers;
	size_t perslen = params->perslen;
	int retval = -1;
	size_t B_size, V_size;
	uint32_t *B, *V, *X, *S;
	pwxform_ctx_t ctx;
	uint32_t sha256[8];

	/* Pre-fill dst with 0xff so a failed call can never pass "< target" */
	memset(dst, 0xff, sizeof(*dst));

	/* Sanity-check parameters */
	if ((version != YESPOWER_0_5 && version != YESPOWER_1_0) ||
	    N < 1024 || N > 512 * 1024 || r < 8 || r > 32 ||
	    (N & (N - 1)) != 0 || r < rmin ||
	    (!pers && perslen)) {
		errno = EINVAL;
		return -1;
	}

	/* Allocate memory */
	B_size = (size_t)128 * r;
	V_size = B_size * N;
	if ((V = malloc(V_size)) == NULL)
		return -1;
	if ((B = malloc(B_size)) == NULL)
		goto free_V;
	if ((X = malloc(B_size)) == NULL)
		goto free_B;
	ctx.version = version;
	if (version == YESPOWER_0_5) {
		ctx.salsa20_rounds = 8;
		ctx.PWXrounds = PWXrounds_0_5;
		ctx.Swidth = Swidth_0_5;
		ctx.Sbytes = 2 * Swidth_to_Sbytes1(ctx.Swidth);
	} else {
		ctx.salsa20_rounds = 2;
		ctx.PWXrounds = PWXrounds_1_0;
		ctx.Swidth = Swidth_1_0;
		ctx.Sbytes = 3 * Swidth_to_Sbytes1(ctx.Swidth);
	}
	if ((S = malloc(ctx.Sbytes)) == NULL)
		goto free_X;
	ctx.S = S;
	ctx.S0 = (uint32_t (*)[2])S;
	ctx.S1 = ctx.S0 + (1 << ctx.Swidth) * PWXsimple;
	ctx.S2 = ctx.S1 + (1 << ctx.Swidth) * PWXsimple;
	ctx.Smask = Swidth_to_Smask(ctx.Swidth);
	ctx.w = 0;

	SHA256_Buf(src, srclen, (uint8_t *)sha256);

	/* v1.0 uses pers (or empty) as the PBKDF2 salt instead of src */
	if (version != YESPOWER_0_5) {
		if (pers) {
			src = pers;
			srclen = perslen;
		} else {
			srclen = 0;
		}
	}

	/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
	PBKDF2_SHA256((uint8_t *)sha256, sizeof(sha256),
	    src, srclen, 1, (uint8_t *)B, B_size);

	blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0]));

	/* 3: B_i <-- MF(B_i, N) */
	smix(B, r, N, V, X, &ctx);

	if (version == YESPOWER_0_5) {
		/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
		PBKDF2_SHA256((uint8_t *)sha256, sizeof(sha256),
		    (uint8_t *)B, B_size, 1, (uint8_t *)dst, sizeof(*dst));

		/* Optional personalization post-processing (v0.5 only) */
		if (pers) {
			HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen,
			    (uint8_t *)sha256);
			SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst);
		}
	} else {
		HMAC_SHA256_Buf((uint8_t *)B + B_size - 64, 64,
		    sha256, sizeof(sha256), (uint8_t *)dst);
	}

	/* Success! */
	retval = 0;

	/* Free memory */
	free(S);
free_X:
	free(X);
free_B:
	free(B);
free_V:
	free(V);

	return retval;
}

int yespower_tls(const uint8_t *src, size_t srclen,
    const yespower_params_t *params, yespower_binary_t *dst)
{
/* The reference implementation doesn't use thread-local storage */
	return yespower(NULL, src, srclen, params, dst);
}

int yespower_init_local(yespower_local_t *local)
{
/* The reference implementation doesn't use the local structure */
	local->base = local->aligned = NULL;
	local->base_size = local->aligned_size = 0;
	return 0;
}

int yespower_free_local(yespower_local_t *local)
{
/* The reference implementation frees its memory in yespower() */
	(void)local; /* unused */
	return 0;
}
diff --git a/yespower-1.0.1/yespower.h b/yespower-1.0.1/yespower.h
new file mode 100644
index 000000000..b388d7217
--- /dev/null
+++ b/yespower-1.0.1/yespower.h
@@ -0,0 +1,130 @@
/*-
 * Copyright 2009 Colin Percival
 * Copyright 2013-2018 Alexander Peslyak
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _YESPOWER_H_ +#define _YESPOWER_H_ + +#include +#include /* for size_t */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Internal type used by the memory allocator. Please do not use it directly. + * Use yespower_local_t instead. + */ +typedef struct { + void *base, *aligned; + size_t base_size, aligned_size; +} yespower_region_t; + +/** + * Type for thread-local (RAM) data structure. + */ +typedef yespower_region_t yespower_local_t; + +/* + * Type for yespower algorithm version numbers. + */ +typedef enum { YESPOWER_0_5 = 5, YESPOWER_1_0 = 10 } yespower_version_t; + +/** + * yespower parameters combined into one struct. 
+ */ +typedef struct { + yespower_version_t version; + uint32_t N, r; + const uint8_t *pers; + size_t perslen; +} yespower_params_t; + +/** + * A 256-bit yespower hash. + */ +typedef struct { + unsigned char uc[32]; +} yespower_binary_t; + +/** + * yespower_init_local(local): + * Initialize the thread-local (RAM) data structure. Actual memory allocation + * is currently fully postponed until a call to yespower(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yespower_init_local(yespower_local_t *local); + +/** + * yespower_free_local(local): + * Free memory that may have been allocated for an initialized thread-local + * (RAM) data structure. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yespower_free_local(yespower_local_t *local); + +/** + * yespower(local, src, srclen, params, dst): + * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". + * local is the thread-local data structure, allowing to preserve and reuse a + * memory allocation across calls, thereby reducing processing overhead. + * + * Return 0 on success; or -1 on error. + * + * local must be initialized with yespower_init_local(). + * + * MT-safe as long as local and dst are local to the thread. + */ +extern int yespower(yespower_local_t *local, + const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst); + +/** + * yespower_tls(src, srclen, params, dst): + * Compute yespower(src[0 .. srclen - 1], N, r), to be checked for "< target". + * The memory allocation is maintained internally using thread-local storage. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as dst is local to the thread. + */ +extern int yespower_tls(const uint8_t *src, size_t srclen, + const yespower_params_t *params, yespower_binary_t *dst); + +#ifdef __cplusplus +} +#endif + +#endif /* !_YESPOWER_H_ */