diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake index 8e4f5e43a1..fd1a928fc9 100644 --- a/.CMake/alg_support.cmake +++ b/.CMake/alg_support.cmake @@ -171,6 +171,7 @@ option(OQS_ENABLE_SIG_MAYO "Enable mayo algorithm family" ON) cmake_dependent_option(OQS_ENABLE_SIG_mayo_1 "" ON "OQS_ENABLE_SIG_MAYO" OFF) cmake_dependent_option(OQS_ENABLE_SIG_mayo_2 "" ON "OQS_ENABLE_SIG_MAYO" OFF) cmake_dependent_option(OQS_ENABLE_SIG_mayo_3 "" ON "OQS_ENABLE_SIG_MAYO" OFF) +cmake_dependent_option(OQS_ENABLE_SIG_mayo_5 "" ON "OQS_ENABLE_SIG_MAYO" OFF) ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ENABLE_BY_ALG_END if((OQS_MINIMAL_BUILD STREQUAL "ON")) @@ -521,6 +522,12 @@ if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS)) endif() endif() +if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux") +if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS)) + cmake_dependent_option(OQS_ENABLE_SIG_mayo_5_avx2 "" ON "OQS_ENABLE_SIG_mayo_5" OFF) +endif() +endif() + ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ENABLE_BY_ALG_CONDITIONAL_END # Set XKCP (Keccak) required for Sphincs AVX2 code even if OpenSSL3 SHA3 is used: diff --git a/README.md b/README.md index 436192deec..f1350bd9d7 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ All names other than `ML-KEM` and `ML-DSA` are subject to change. 
`liboqs` makes - **CRYSTALS-Dilithium**: Dilithium2, Dilithium3, Dilithium5 - **Falcon**: Falcon-512, Falcon-1024, Falcon-padded-512, Falcon-padded-1024 -- **MAYO**: MAYO\_1, MAYO\_2, MAYO\_3 +- **MAYO**: MAYO\_1, MAYO\_2, MAYO\_3, MAYO\_5† - **ML-DSA**: ML-DSA-44-ipd (alias: ML-DSA-44), ML-DSA-65-ipd (alias: ML-DSA-65), ML-DSA-87-ipd (alias: ML-DSA-87) - **SPHINCS+-SHA2**: SPHINCS+-SHA2-128f-simple, SPHINCS+-SHA2-128s-simple, SPHINCS+-SHA2-192f-simple, SPHINCS+-SHA2-192s-simple, SPHINCS+-SHA2-256f-simple, SPHINCS+-SHA2-256s-simple - **SPHINCS+-SHAKE**: SPHINCS+-SHAKE-128f-simple, SPHINCS+-SHAKE-128s-simple, SPHINCS+-SHAKE-192f-simple, SPHINCS+-SHAKE-192s-simple, SPHINCS+-SHAKE-256f-simple, SPHINCS+-SHAKE-256s-simple diff --git a/docs/algorithms/sig/mayo.md b/docs/algorithms/sig/mayo.md index 3f2205c1ac..1bf05fd44f 100644 --- a/docs/algorithms/sig/mayo.md +++ b/docs/algorithms/sig/mayo.md @@ -17,6 +17,7 @@ | MAYO\_1 | NA | EUF-CMA | 1 | 1168 | 24 | 321 | | MAYO\_2 | NA | EUF-CMA | 1 | 5488 | 24 | 180 | | MAYO\_3 | NA | EUF-CMA | 3 | 2656 | 32 | 577 | +| MAYO\_5 | NA | EUF-CMA | 5 | 5008 | 40 | 838 | ## MAYO\_1 implementation characteristics @@ -47,6 +48,15 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**. Are implementations chosen based on runtime CPU feature detection? **Yes**. +## MAYO\_5 implementation characteristics + +| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? 
| +|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------| +| [Primary Source](#primary-source) | opt | All | All | None | True | True | False | +| [Primary Source](#primary-source) | avx2 | x86\_64 | Darwin,Linux | AVX2 | True | True | True | + +Are implementations chosen based on runtime CPU feature detection? **Yes**. + ## Explanation of Terms - **Large Stack Usage**: Implementations identified as having such may cause failures when running in threads or in constrained environments. \ No newline at end of file diff --git a/docs/algorithms/sig/mayo.yml b/docs/algorithms/sig/mayo.yml index 6395da707f..4cbd20d7ff 100644 --- a/docs/algorithms/sig/mayo.yml +++ b/docs/algorithms/sig/mayo.yml @@ -110,3 +110,34 @@ parameter-sets: no-secret-dependent-branching-claimed: true no-secret-dependent-branching-checked-by-valgrind: true large-stack-usage: false +- name: MAYO_5 + claimed-nist-level: 5 + claimed-security: EUF-CMA + length-public-key: 5008 + length-secret-key: 40 + length-signature: 838 + implementations-switch-on-runtime-cpu-features: true + implementations: + - upstream: primary-upstream + upstream-id: opt + supported-platforms: all + common-crypto: + - SHA3: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: false + - upstream: primary-upstream + upstream-id: avx2 + supported-platforms: + - architecture: x86_64 + operating_systems: + - Darwin + - Linux + required_flags: + - avx2 + common-crypto: + - SHA3: liboqs + - AES: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: true diff --git a/docs/cbom.json b/docs/cbom.json index e7b1e38c9b..1b2f5fca93 100644 --- a/docs/cbom.json +++ b/docs/cbom.json @@ 
-1,23 +1,23 @@ { "bomFormat": "CBOM", "specVersion": "1.4-cbom-1.0", - "serialNumber": "urn:uuid:0366b737-0d62-4de3-92e9-adfae8c1326a", + "serialNumber": "urn:uuid:dbce4160-9dc5-450c-b83c-0cb0345d7045", "version": 1, "metadata": { - "timestamp": "2024-05-08T14:51:11.690367", + "timestamp": "2024-05-24T18:35:02.761477", "component": { "type": "library", - "bom-ref": "pkg:github/open-quantum-safe/liboqs@248c78af58129f2d59faed9cace2567c82d1b992", + "bom-ref": "pkg:github/open-quantum-safe/liboqs@580d494efeee25edd354eaa2e0fe6581d552b827", "name": "liboqs", - "version": "248c78af58129f2d59faed9cace2567c82d1b992" + "version": "580d494efeee25edd354eaa2e0fe6581d552b827" } }, "components": [ { "type": "library", - "bom-ref": "pkg:github/open-quantum-safe/liboqs@248c78af58129f2d59faed9cace2567c82d1b992", + "bom-ref": "pkg:github/open-quantum-safe/liboqs@580d494efeee25edd354eaa2e0fe6581d552b827", "name": "liboqs", - "version": "248c78af58129f2d59faed9cace2567c82d1b992" + "version": "580d494efeee25edd354eaa2e0fe6581d552b827" }, { "type": "crypto-asset", @@ -1659,6 +1659,46 @@ "nistQuantumSecurityLevel": 3 } }, + { + "type": "crypto-asset", + "bom-ref": "alg:MAYO_5:generic", + "name": "MAYO", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "variant": "MAYO_5", + "primitive": "signature", + "implementationLevel": "softwarePlainRam", + "cryptoFunctions": [ + "keygen", + "sign", + "verify" + ], + "implementationPlatform": "generic" + }, + "nistQuantumSecurityLevel": 5 + } + }, + { + "type": "crypto-asset", + "bom-ref": "alg:MAYO_5:x86_64", + "name": "MAYO", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "variant": "MAYO_5", + "primitive": "signature", + "implementationLevel": "softwarePlainRam", + "cryptoFunctions": [ + "keygen", + "sign", + "verify" + ], + "implementationPlatform": "x86_64" + }, + "nistQuantumSecurityLevel": 5 + } + }, { "type": "crypto-asset", "bom-ref": "alg:ML-DSA-44-ipd:generic", @@ -2288,7 
+2328,7 @@ ], "dependencies": [ { - "ref": "pkg:github/open-quantum-safe/liboqs@248c78af58129f2d59faed9cace2567c82d1b992", + "ref": "pkg:github/open-quantum-safe/liboqs@580d494efeee25edd354eaa2e0fe6581d552b827", "dependsOn": [ "alg:BIKE-L1:x86_64", "alg:BIKE-L3:x86_64", @@ -2372,6 +2412,8 @@ "alg:MAYO_2:x86_64", "alg:MAYO_3:generic", "alg:MAYO_3:x86_64", + "alg:MAYO_5:generic", + "alg:MAYO_5:x86_64", "alg:ML-DSA-44-ipd:generic", "alg:ML-DSA-44-ipd:x86_64", "alg:ML-DSA-65-ipd:generic", @@ -3016,6 +3058,21 @@ ], "dependencyType": "uses" }, + { + "ref": "alg:MAYO_5:generic", + "dependsOn": [ + "alg:sha3" + ], + "dependencyType": "uses" + }, + { + "ref": "alg:MAYO_5:x86_64", + "dependsOn": [ + "alg:sha3", + "alg:aes" + ], + "dependencyType": "uses" + }, { "ref": "alg:ML-DSA-44-ipd:generic", "dependsOn": [ diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml index 37653dcbcb..35428af275 100644 --- a/scripts/copy_from_upstream/copy_from_upstream.yml +++ b/scripts/copy_from_upstream/copy_from_upstream.yml @@ -329,3 +329,8 @@ sigs: pqclean_scheme: mayo_3 pretty_name_full: MAYO_3 signed_msg_order: sig_then_msg + - + scheme: "5" + pqclean_scheme: mayo_5 + pretty_name_full: MAYO_5 + signed_msg_order: sig_then_msg diff --git a/src/common/aes/aes_ossl.c b/src/common/aes/aes_ossl.c index 4b78129f9d..c7dc5b9445 100644 --- a/src/common/aes/aes_ossl.c +++ b/src/common/aes/aes_ossl.c @@ -68,7 +68,7 @@ static void AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_ static void AES128_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) { EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new()); - assert(ctr_ctx != NULL); + OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL"); uint8_t iv_ctr[16]; if (iv_len == 12) { memcpy(iv_ctr, iv, 12); @@ -94,11 +94,12 @@ static void AES128_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const voi static void 
AES128_CTR_inc_init(const uint8_t *key, void **schedule) { *schedule = malloc(sizeof(struct key_schedule)); + OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL"); + struct key_schedule *ks = (struct key_schedule *) *schedule; EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)(); - assert(ctr_ctx != NULL); + OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL"); - OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL"); ks->for_ECB = 0; ks->ctx = ctr_ctx; memcpy(ks->key, key, 16); @@ -139,11 +140,12 @@ static void AES256_ECB_load_schedule(const uint8_t *key, void **schedule) { static void AES256_CTR_inc_init(const uint8_t *key, void **schedule) { *schedule = malloc(sizeof(struct key_schedule)); + OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL"); + struct key_schedule *ks = (struct key_schedule *) *schedule; EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)(); - assert(ctr_ctx != NULL); + OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL"); - OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL"); ks->for_ECB = 0; ks->ctx = ctr_ctx; memcpy(ks->key, key, 32); @@ -190,7 +192,7 @@ static void AES256_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_ static void AES256_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) { EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)(); - assert(ctr_ctx != NULL); + OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL"); uint8_t iv_ctr[16]; if (iv_len == 12) { memcpy(iv_ctr, iv, 12); diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake index d7648a60e9..8550bdd65d 100644 --- a/src/oqsconfig.h.cmake +++ b/src/oqsconfig.h.cmake @@ -197,4 +197,6 @@ #cmakedefine OQS_ENABLE_SIG_mayo_2_avx2 1 #cmakedefine OQS_ENABLE_SIG_mayo_3 1 #cmakedefine OQS_ENABLE_SIG_mayo_3_avx2 1 +#cmakedefine OQS_ENABLE_SIG_mayo_5 1 +#cmakedefine OQS_ENABLE_SIG_mayo_5_avx2 1 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ALG_ENABLE_DEFINES_END diff --git a/src/sig/mayo/CMakeLists.txt b/src/sig/mayo/CMakeLists.txt index 612390dd2d..86b5ebaab1 100644 --- 
a/src/sig/mayo/CMakeLists.txt +++ b/src/sig/mayo/CMakeLists.txt @@ -59,4 +59,22 @@ if(OQS_ENABLE_SIG_mayo_3_avx2) set(_MAYO_OBJS ${_MAYO_OBJS} $<TARGET_OBJECTS:mayo_3_avx2>) endif() +if(OQS_ENABLE_SIG_mayo_5) + add_library(mayo_5_opt OBJECT sig_mayo_5.c pqmayo_mayo_5_opt/api.c pqmayo_mayo_5_opt/arithmetic.c pqmayo_mayo_5_opt/mayo.c pqmayo_mayo_5_opt/params.c) + target_compile_options(mayo_5_opt PUBLIC -DMAYO_VARIANT=MAYO_5 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT) + target_include_directories(mayo_5_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo_5_opt) + target_include_directories(mayo_5_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(mayo_5_opt PUBLIC -DMAYO_VARIANT=MAYO_5 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT) + set(_MAYO_OBJS ${_MAYO_OBJS} $<TARGET_OBJECTS:mayo_5_opt>) +endif() + +if(OQS_ENABLE_SIG_mayo_5_avx2) + add_library(mayo_5_avx2 OBJECT pqmayo_mayo_5_avx2/api.c pqmayo_mayo_5_avx2/arithmetic.c pqmayo_mayo_5_avx2/mayo.c pqmayo_mayo_5_avx2/params.c) + target_include_directories(mayo_5_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo_5_avx2) + target_include_directories(mayo_5_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(mayo_5_avx2 PRIVATE -mavx2) + target_compile_options(mayo_5_avx2 PUBLIC -DMAYO_VARIANT=MAYO_5 -DMAYO_BUILD_TYPE_AVX2 -DMAYO_AVX -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT) + set(_MAYO_OBJS ${_MAYO_OBJS} $<TARGET_OBJECTS:mayo_5_avx2>) +endif() + set(MAYO_OBJS ${_MAYO_OBJS} PARENT_SCOPE) diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/LICENSE b/src/sig/mayo/pqmayo_mayo_5_avx2/LICENSE new file mode 100644 index 0000000000..8f71f43fee --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_avx2/LICENSE @@ -0,0 +1,202 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/NOTICE b/src/sig/mayo/pqmayo_mayo_5_avx2/NOTICE new file mode 100644 index 0000000000..53da47c21c --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_avx2/NOTICE @@ -0,0 +1,13 @@ +Copyright 2023 the MAYO team. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
\ No newline at end of file diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/aes_ctr.h b/src/sig/mayo/pqmayo_mayo_5_avx2/aes_ctr.h new file mode 100644 index 0000000000..8e41c30f2c --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_avx2/aes_ctr.h @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef AESCTR_H +#define AESCTR_H + +#include <stddef.h> +#include <stdint.h> + +void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output); +#define AES_ECB_encrypt AES_256_ECB + +#ifdef ENABLE_AESNI +int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +#define AES_128_CTR AES_128_CTR_NI +#else +#include <aes.h> +static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen) { + (void) inputByteLen; + uint8_t iv[12] = { 0 }; + aes128ctr_prf(output, outputByteLen, input, iv); + return (int) outputByteLen; +} +#endif + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/api.c b/src/sig/mayo/pqmayo_mayo_5_avx2/api.c new file mode 100644 index 0000000000..f2e861e9c1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_avx2/api.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include <api.h> +#include <mayo.h> + +#ifdef ENABLE_PARAMS_DYNAMIC +#define MAYO_PARAMS &MAYO_5 +#else +#define MAYO_PARAMS 0 +#endif + +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { + return mayo_keypair(MAYO_PARAMS, pk, sk); +} + +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk) { + return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk); +} + +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *sk) { + return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk); +} + +int +crypto_sign_open(unsigned char *m, size_t 
*mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk) { + return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk); +} + +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk) { + if (siglen != CRYPTO_BYTES) + return -1; + return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk); +} + diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/api.h b/src/sig/mayo/pqmayo_mayo_5_avx2/api.h new file mode 100644 index 0000000000..fa1ebbb790 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_avx2/api.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef api_h +#define api_h + +#include <mayo.h> + +#define CRYPTO_SECRETKEYBYTES 40 +#define CRYPTO_PUBLICKEYBYTES 5008 +#define CRYPTO_BYTES 838 + +#define CRYPTO_ALGNAME "MAYO_5" + +#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair) +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk); + +#define crypto_sign MAYO_NAMESPACE(crypto_sign) +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk); + +#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature) +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *sk); + +#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open) +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk); + +#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify) +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk); + +#endif /* api_h */ + diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic.c b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic.c new file mode 100644 index 0000000000..de09954c8a --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: Apache-2.0 + 
+// NOTE(review): the header names of the #include directives below are missing
+// in this copy of the patch; restore them from upstream before building.
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef ENABLE_CT_TESTING
+#include
+#endif
+
+// Fold a size x size matrix of m-vectors into packed upper-triangular form.
+// For every r <= c the entry in(r, c) is copied with m_vec_copy and, when
+// r != c, in(c, r) is added on top with m_vec_add. Each entry occupies
+// m_legs * 2 uint64_t words; results are stored in `out` one after another in
+// row-major order of the upper triangle.
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+    mayo12_m_upper(m_legs, in, out, size);
+#else
+    int m_vecs_stored = 0;
+    for (int r = 0; r < size; r++) {
+        for (int c = r; c < size; c++) {
+            m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored );
+            if (r != c) {
+                m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored );
+            }
+            m_vecs_stored ++;
+        }
+    }
+#endif
+}
+
+// Accumulate (P1 + P1^T) * O into acc.
+//   P1  : packed upper-triangular matrix, one m-vector per entry (r, c), r <= c,
+//         with v = n - o rows
+//   O   : v x o matrix of field elements, one per byte, row-major
+//   acc : v x o matrix of m-vectors, row-major, updated in place
+// Diagonal entries of P1 are skipped: each off-diagonal entry (r, c) is added
+// once into row r (scaled by O[c][k]) and once into row c (scaled by O[r][k]).
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    (void) p;
+    mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    (void) p;
+    mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    (void) p;
+    mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+    #ifndef MAYO_VARIANT
+    const int m_legs = PARAM_m(p) / 32;
+    #else
+    (void) p;
+    #endif
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_n(p) - PARAM_o(p);
+
+    // index of the current entry in the packed upper-triangular P1
+    int bs_mat_entries_used = 0;
+    for (int r = 0; r < param_v; r++) {
+        for (int c = r; c < param_v; c++) {
+            if(c==r) {
+                bs_mat_entries_used += 1;
+                continue;
+            }
+            for (int k = 0; k < param_o; k += 1) {
+
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k));
+                vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[r * param_o + k], acc + 4 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96(P1 + 6 * bs_mat_entries_used, O[c * param_o + k], acc + 6 * (r * param_o + k));
+                vec_mul_add_96(P1 + 6 * bs_mat_entries_used, O[r * param_o + k], acc + 6 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128(P1 + 8 * bs_mat_entries_used, O[c * param_o + k], acc + 8 * (r * param_o + k));
+                vec_mul_add_128(P1 + 8 * bs_mat_entries_used, O[r * param_o + k], acc + 8 * (c * param_o + k));
+#else
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k));
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k));
+#endif
+
+            }
+            bs_mat_entries_used += 1;
+        }
+    }
+#endif
+}
+
+// Signing helper: M = V * L and Y = V * P1 * V^T.
+// The generic path first forms Pv = P1 * V^T in a local buffer and then
+// multiplies by V. All helpers used here are named mul_add_*, so M and Y are
+// presumably accumulated into rather than overwritten - confirm that callers
+// pass them zeroed.
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) {
+    (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mul_add_mat_x_m_mat(param_m / 32, V, L, M,
+                        param_k, param_n - param_o, param_o);
+
+    mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1);
+
+    mul_add_mat_x_m_mat(param_m / 32, V, Pv,
+                        Y, param_k, param_n - param_o,
+                        param_k);
+#endif
+}
+
+// Key generation helper: adds P1 * O into P1O_P2, then adds O^T * P1O_P2 into P3.
+// The name and the header comment suggest P1O_P2 holds P2 on entry, so that
+// P3 ends up as O^T * (P1*O + P2) - confirm against the caller.
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) {
+    (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_n = PARAM_n(p);
+    const int m_legs = PARAM_m(p) / 32;
+    mul_add_m_upper_triangular_mat_x_mat(m_legs, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1);
+    mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o);
+#endif
+}
+
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+// compute S * PS  = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+//                               [         P3*S2 = P2 ]
+//
+// In the AVX2 build the k x n signature matrix S is first split column-wise
+// into S1 (first v columns) and S2 (remaining o columns); the build fails
+// with an explicit #error for any M_MAX without an AVX2 kernel.
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                        const int m, const int v, const int o, const int k, uint64_t *SPS) {
+    (void) m;
+#if MAYO_AVX
+    const int n = o + v;
+
+    /* Old approach which is constant time but doesn't have to be */
+    unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+    unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+    unsigned char *s1_write = S1;
+    unsigned char *s2_write = S2;
+
+    for (int r=0; r < k; r++)
+    {
+        for (int c = 0; c < n; c++)
+        {
+            if(c < v){
+                *(s1_write++) = S[r*n + c];
+            } else {
+                *(s2_write++) = S[r*n + c];
+            }
+        }
+    }
+
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+
+#if M_MAX == 64
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 96
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 128
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+    mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS);
+    mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#else
+#error "m_calculate_PS_SPS: no AVX2 implementation for this M_MAX"
+#endif
+#else
+    const int n = o + v;
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+    mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+    mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// sample a solution x to Ax = y, with r used as randomness
+// require:
+// - A is a matrix with m rows and k*o+1 columns (values in the last column are
+//   not important, they will be overwritten by y) in row major order
+// - y is a vector with m elements
+// - r and x are k*o bytes long
+// return: 1 on success, 0 on failure
+// NOTE(review): rows of A are addressed both with a stride of k*o+1 and with
+// A_cols below; presumably callers always pass A_cols == k*o+1 - confirm.
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+                    const unsigned char *y, const unsigned char *r,
+                    unsigned char *x, int k, int o, int m, int A_cols) {
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    unsigned char finished;
+    int col_upper_bound;
+    unsigned char correct_column;
+
+    // x <- r
+    for (int i = 0; i < k * o; i++) {
+        x[i] = r[i];
+    }
+
+    // compute Ar;
+    unsigned char Ar[M_MAX];
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = 0; // clear last col of A
+    }
+    mat_mul(A, r, Ar, k * o + 1, m, 1);
+
+    // move y - Ar to last column of matrix A
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+    }
+
+    EF(A, m, k * o + 1);
+
+    // check if last row of A (excluding the last entry of y) is zero
+    unsigned char full_rank = 0;
+    for (int i = 0; i < A_cols - 1; i++) {
+        full_rank |= A[(m - 1) * A_cols + i];
+    }
+
+// It is okay to leak if we need to restart or not
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+    if (full_rank == 0) {
+        return 0;
+    }
+
+    // back substitution in constant time
+    // the index of the first nonzero entry in each row is secret, which makes
+    // things less efficient
+
+    for (int row = m - 1; row >= 0; row--) {
+        finished = 0;
+        col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o);
+        // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32}
+
+        for (int col = row; col <= col_upper_bound; col++) {
+
+            // Compare two chars in constant time.
+            // Returns 0x00 if the byte arrays are equal, 0xff otherwise.
+            correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+
+            unsigned char u = correct_column & A[row * A_cols + A_cols - 1];
+            x[col] ^= u;
+
+            // update the last column of the rows above, eight rows per step:
+            // pack their entries of column `col` into one word, scale by u,
+            // and XOR the products back nibble by nibble
+            for (int i = 0; i < row; i += 8) {
+                uint64_t tmp = ( (uint64_t) A[ i    * A_cols + col] <<  0) ^ ( (uint64_t) A[(i+1) * A_cols + col] <<  8)
+                             ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+                             ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+                             ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+
+                tmp = mul_fx8(u, tmp);
+
+                A[ i    * A_cols + A_cols - 1] ^= (tmp      ) & 0xf;
+                A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+                A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+                A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+                A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+                A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+                A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+                A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+            }
+
+            finished = finished | correct_column;
+        }
+    }
+    return 1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic.h b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include
+#include
+#include
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define TARGET_BIG_ENDIAN
+#endif
+
+#if defined(MAYO_AVX) && (M_MAX == 64)
+    #include
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+    #include
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+    #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64))
+    #include
+    #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+    #include
+    #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+    #include
+#endif
+
+// Calculate P3 = O^T * (P1*O + P2) in KeyGen
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// Calculate Upper in KeyGen
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// Calculate acc = (P1+P1^T)*O in expand_sk
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc);
+
+// Calculate M=V*L and Y=V*P1*V^T in Sign
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sample solution in Sign
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Calculate SPS = S*P*S^T in Verify
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                        const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_128.h
new file mode 100644
index 0000000000..27b367e940
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_128.h
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+// A vector is 8 uint64_t words (16 four-bit elements per word). vec_copy_128: out = in.
+static
+inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
+    out[0] = in[0];
+    out[1] = in[1];
+    out[2] = in[2];
+    out[3] = in[3];
+    out[4] = in[4];
+    out[5] = in[5];
+    out[6] = in[6];
+    out[7] = in[7];
+}
+
+// acc ^= in: element-wise addition of two vectors (addition in Z_2[x] is XOR)
+static
+inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
+    acc[0] ^= in[0];
+    acc[1] ^= in[1];
+    acc[2] ^= in[2];
+    acc[3] ^= in[3];
+    acc[4] ^= in[4];
+    acc[5] ^= in[5];
+    acc[6] ^= in[6];
+    acc[7] ^= in[7];
+}
+// acc += x * in: every element of `in` is multiplied by the field element x (0x2)
+inline
+static void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
+    for(int i=0;i<8;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x2);
+    }
+}
+inline // acc += x^-1 * in: 0x9 is x^3 + 1, and x * (x^3 + 1) = x^4 + x = 1 modulo x^4 + x + 1
+static void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
+    for(int i=0;i<8;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x9);
+    }
+}
+// acc += a * in for a single field element a (passed straight to gf16v_mul_u64)
+static
+inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for(int i=0; i < 8;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], a);
+    }
+}
+// bins holds 16 vectors, bin i at word offset i * 8; out = sum over i = 1..15 of i * bins[i]. Bins are modified in place; bin 0 is not read.
+static
+    inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
+    // each step folds one bin into another whose index differs by a factor x or x^-1, until only bin 1 is left
+    m_vec_mul_add_x_inv_128(bins + 5 * 8, bins + 10 * 8);
+    m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8);
+    m_vec_mul_add_x_inv_128(bins + 10 * 8, bins + 7 * 8);
+    m_vec_mul_add_x_128(bins + 12 * 8, bins + 6 * 8);
+    m_vec_mul_add_x_inv_128(bins + 7 * 8, bins + 14 * 8);
+    m_vec_mul_add_x_128(bins + 6 * 8, bins + 3 * 8);
+    m_vec_mul_add_x_inv_128(bins + 14 * 8, bins + 15 * 8);
+    m_vec_mul_add_x_128(bins + 3 * 8, bins + 8 * 8);
+    m_vec_mul_add_x_inv_128(bins + 15 * 8, bins + 13 * 8);
+    m_vec_mul_add_x_128(bins + 8 * 8, bins + 4 * 8);
+    m_vec_mul_add_x_inv_128(bins + 13 * 8, bins + 9 * 8);
+    m_vec_mul_add_x_128(bins + 4 * 8, bins + 2 * 8);
+    m_vec_mul_add_x_inv_128(bins + 9 * 8, bins + 1 * 8);
+    m_vec_mul_add_x_128(bins + 2 * 8, bins + 1 * 8);
+    vec_copy_128(bins + 8, out); // bin 1 starts at word offset 1 * 8
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_64.h
new file mode 100644
index 0000000000..9f7535c878
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_64.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+// A vector is 4 uint64_t words (16 four-bit elements per word). vec_copy_64: out = in.
+static
+inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+    out[0] = in[0];
+    out[1] = in[1];
+    out[2] = in[2];
+    out[3] = in[3];
+}
+// acc ^= in: element-wise addition of two vectors (addition in Z_2[x] is XOR)
+static
+inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+    acc[0] ^= in[0];
+    acc[1] ^= in[1];
+    acc[2] ^= in[2];
+    acc[3] ^= in[3];
+}
+// acc += a * in for a single field element a (passed straight to gf16v_mul_u64)
+static
+inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for(int i=0; i < 4;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], a);
+    }
+}
+// acc += x * in: every element of `in` is multiplied by the field element x (0x2)
+inline
+static void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+    for(int i=0;i<4;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x2);
+    }
+}
+inline // acc += x^-1 * in: 0x9 is x^3 + 1, and x * (x^3 + 1) = x^4 + x = 1 modulo x^4 + x + 1
+static void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+    for(int i=0;i<4;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x9);
+    }
+}
+// bins holds 16 vectors, bin i at word offset i * 4; out = sum over i = 1..15 of i * bins[i]. Bins are modified in place; bin 0 is not read.
+static
+inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+    // each step folds one bin into another whose index differs by a factor x or x^-1, until only bin 1 is left
+    m_vec_mul_add_x_inv_64(bins + 5 * 4, bins + 10 * 4);
+    m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4);
+    m_vec_mul_add_x_inv_64(bins + 10 * 4, bins + 7 * 4);
+    m_vec_mul_add_x_64(bins + 12 * 4, bins + 6 * 4);
+    m_vec_mul_add_x_inv_64(bins + 7 * 4, bins + 14 * 4);
+    m_vec_mul_add_x_64(bins + 6 * 4, bins + 3 * 4);
+    m_vec_mul_add_x_inv_64(bins + 14 * 4, bins + 15 * 4);
+    m_vec_mul_add_x_64(bins + 3 * 4, bins + 8 * 4);
+    m_vec_mul_add_x_inv_64(bins + 15 * 4, bins + 13 * 4);
+    m_vec_mul_add_x_64(bins + 8 * 4, bins + 4 * 4);
+    m_vec_mul_add_x_inv_64(bins + 13 * 4, bins + 9 * 4);
+    m_vec_mul_add_x_64(bins + 4 * 4, bins + 2 * 4);
+    m_vec_mul_add_x_inv_64(bins + 9 * 4, bins + 1 * 4);
+    m_vec_mul_add_x_64(bins + 2 * 4, bins + 1 * 4);
+    vec_copy_64(bins + 4, out); // bin 1 starts at word offset 1 * 4
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_96.h
new file mode 100644
index 0000000000..86359679fb
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_96.h
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+// A vector is 6 uint64_t words (16 four-bit elements per word). vec_copy_96: out = in.
+static
+inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+    out[0] = in[0];
+    out[1] = in[1];
+    out[2] = in[2];
+    out[3] = in[3];
+    out[4] = in[4];
+    out[5] = in[5];
+}
+// acc ^= in: element-wise addition of two vectors (addition in Z_2[x] is XOR)
+static
+inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+    acc[0] ^= in[0];
+    acc[1] ^= in[1];
+    acc[2] ^= in[2];
+    acc[3] ^= in[3];
+    acc[4] ^= in[4];
+    acc[5] ^= in[5];
+}
+// acc += x * in: every element of `in` is multiplied by the field element x (0x2)
+inline
+static void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+    for(int i=0;i<6;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x2);
+    }
+}
+// acc += x^-1 * in: 0x9 is x^3 + 1, and x * (x^3 + 1) = x^4 + x = 1 modulo x^4 + x + 1
+inline
+static void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+    for(int i=0;i<6;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x9);
+    }
+}
+// acc += a * in for a single field element a (passed straight to gf16v_mul_u64)
+static
+inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for(int i=0; i < 6;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], a);
+    }
+}
+// bins holds 16 vectors, bin i at word offset i * 6; out = sum over i = 1..15 of i * bins[i]. Bins are modified in place; bin 0 is not read.
+static
+inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+    // each step folds one bin into another whose index differs by a factor x or x^-1, until only bin 1 is left
+    m_vec_mul_add_x_inv_96(bins + 5 * 6, bins + 10 * 6);
+    m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6);
+    m_vec_mul_add_x_inv_96(bins + 10 * 6, bins + 7 * 6);
+    m_vec_mul_add_x_96(bins + 12 * 6, bins + 6 * 6);
+    m_vec_mul_add_x_inv_96(bins + 7 * 6, bins + 14 * 6);
+    m_vec_mul_add_x_96(bins + 6 * 6, bins + 3 * 6);
+    m_vec_mul_add_x_inv_96(bins + 14 * 6, bins + 15 * 6);
+    m_vec_mul_add_x_96(bins + 3 * 6, bins + 8 * 6);
+    m_vec_mul_add_x_inv_96(bins + 15 * 6, bins + 13 * 6);
+    m_vec_mul_add_x_96(bins + 8 * 6, bins + 4 * 6);
+    m_vec_mul_add_x_inv_96(bins + 13 * 6, bins + 9 * 6);
+    m_vec_mul_add_x_96(bins + 4 * 6, bins + 2 * 6);
+    m_vec_mul_add_x_inv_96(bins + 9 * 6, bins + 1 * 6);
+    m_vec_mul_add_x_96(bins + 2 * 6, bins + 1 * 6);
+    vec_copy_96(bins + 6, out); // bin 1 starts at word offset 1 * 6
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_common.h
new file mode 100644
index 0000000000..eeb13dc0bd
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_common.h
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include
+#include
+
+#define K_OVER_2 ((K_MAX+1)/2)
+// Four rows of 32 bytes: the products j * 1, j * x, j * x^2 and j * x^3 for j = 0..15, each row holding its 16 values twice. NOTE(review): identifiers starting with a double underscore are reserved for the implementation.
+static const unsigned char __gf16_mulbase[128] __attribute__((aligned(32))) = {
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+    0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d,
+    0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09,
+    0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01, 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01
+};
+
+//
+// generate multiplication table for '4-bit' variable 'b'. Taken from OV paper!
+// +static inline __m256i tbl32_gf16_multab2( uint8_t b ) { + + __m256i bx = _mm256_set1_epi16( b & 0xf ); + __m256i b1 = _mm256_srli_epi16( bx, 1 ); + + const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0)); + const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1)); + const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2)); + const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3)); + + __m256i mask_1 = _mm256_set1_epi16(1); + __m256i mask_4 = _mm256_set1_epi16(4); + __m256i mask_0 = _mm256_setzero_si256(); + + return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) ) + ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) ) + ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) ) + ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) ); +} + +static inline __m256i linear_transform_8x8_256b( __m256i tab_l, __m256i tab_h, __m256i v, __m256i mask_f ) { + return _mm256_shuffle_epi8(tab_l, v & mask_f)^_mm256_shuffle_epi8(tab_h, _mm256_srli_epi16(v, 4)&mask_f); +} + +static inline __m256i gf16v_mul_avx2( __m256i a, uint8_t b ) { + __m256i multab_l = tbl32_gf16_multab2( b ); + __m256i multab_h = _mm256_slli_epi16( multab_l, 4 ); + + return linear_transform_8x8_256b( multab_l, multab_h, a, _mm256_set1_epi8(0xf) ); +} + +static +inline void mayo_O_multabs_avx2(const unsigned char *O, __m256i *O_multabs){ + // build multiplication tables + for (size_t r = 0; r < V_MAX; r++) + { + for (size_t c = 0; c < O_MAX; c+=2) + { + O_multabs[O_MAX/2*r + c/2] = tbl32_gf16_multab2(O[O_MAX*r + c]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(O[O_MAX*r + c + 1]), 4); + } + } +} + + +static +inline void mayo_V_multabs_avx2(const unsigned char *V, __m256i *V_multabs){ + // build multiplication tables + size_t r; + for (size_t c = 0; c < V_MAX; c++) + { + for (r = 0; r+1 < K_MAX; r+= 2) + { + V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]) ^ 
_mm256_slli_epi16(tbl32_gf16_multab2(V[V_MAX*(r+1) + c]), 4); + } +#if K_MAX % 2 == 1 + V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]); +#endif + } +} + +static const unsigned char mayo_gf16_mul[512] __attribute__((aligned(32))) = { + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, + 0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d, + 0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d, + 0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02, + 0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02, + 0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09, + 0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09, + 0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06, + 0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06, + 0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04, + 0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04, + 0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b, + 0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b, + 0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01, + 0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01, + 0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e, + 0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e, + 0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 
0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c, + 0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c, + 0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03, + 0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03, + 0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08, + 0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08, + 0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07, + 0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07, + 0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05, + 0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05, + 0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a, + 0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a}; + + +static +inline void mayo_S1_multabs_avx2(const unsigned char *S1, __m256i *S1_multabs) { + size_t r; + for (size_t c = 0; c < V_MAX; c++) + { + for (r = 0; r+1 < K_MAX; r+= 2) + { + S1_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c])) + ^ _mm256_slli_epi16(_mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*(r+1) + c])), 4); + } +#if K_MAX % 2 == 1 + S1_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c])); +#endif + } +} + +static +inline void mayo_S2_multabs_avx2(const unsigned char *S2, __m256i *S2_multabs) { + // build multiplication tables + size_t r; + for (size_t c = 0; c < O_MAX; c++) + { + for (r = 0; r+1 < K_MAX; r+= 2) + { + S2_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*r + c])) + ^ _mm256_slli_epi16(_mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*(r+1) + c])), 4); + } +#if K_MAX % 2 == 1 + S2_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i 
*)(mayo_gf16_mul + 32*S2[O_MAX*r + c])) ; +#endif + } +} + +static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) { + uint64_t mask_msb = 0x8888888888888888ULL; + uint64_t a_msb; + uint64_t a64 = a; + uint64_t b32 = b; + uint64_t r64 = a64 * (b32 & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 1) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 2) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 3) & 1); + + return r64; +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/echelon_form.h b/src/sig/mayo/pqmayo_mayo_5_avx2/echelon_form.h new file mode 100644 index 0000000000..fa69de0ab2 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_avx2/echelon_form.h @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + + +#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y)) + + +// +// generate multiplication table for '4-bit' variable 'b'. From https://eprint.iacr.org/2023/059/. 
+// The returned table holds b * j (product in GF(16)) at byte j of each 128-bit lane, j = 0..15; same construction as tbl32_gf16_multab2 in arithmetic_common.h.
+static inline __m256i tbl32_gf16_multab( uint8_t b ) {
+    __m256i bx = _mm256_set1_epi16( b & 0xf );
+    __m256i b1 = _mm256_srli_epi16( bx, 1 );
+    // the four 32-byte rows of __gf16_mulbase: multiples of 0x1, 0x2, 0x4 and 0x8
+    const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0));
+    const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1));
+    const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2));
+    const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3));
+    // masks for testing bits 0 and 2 of bx and, through b1 = bx >> 1, bits 1 and 3 of b
+    __m256i mask_1 = _mm256_set1_epi16(1);
+    __m256i mask_4 = _mm256_set1_epi16(4);
+    __m256i mask_0 = _mm256_setzero_si256();
+    // XOR together the rows selected by the set bits of b
+    return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) )
+           ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) )
+           ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) )
+           ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) );
+}
+
+/* put matrix in row echelon form with ones on first nonzero entries in constant time*/
+static inline void EF(unsigned char *A, int _nrows, int _ncols) {
+    // the runtime dimensions are ignored: A is always treated as M_MAX rows of K_MAX * O_MAX + 1 bytes, row-major
+    (void) _nrows;
+    (void) _ncols;
+    // NOTE(review): the #defines below are never #undef'd, so they stay visible to everything that follows this header
+    #define nrows M_MAX
+    #define ncols (K_MAX * O_MAX + 1)
+    // a row occupies a whole number of 32-byte registers; the data sits at the end of the row, unused bytes at the front
+    #define AVX_REGS_PER_ROW ((K_MAX * O_MAX + 1 + 31) / 32)
+    #define MAX_COLS (AVX_REGS_PER_ROW * 32)
+
+    __m256i _pivot_row[AVX_REGS_PER_ROW];
+    __m256i A_avx[AVX_REGS_PER_ROW* M_MAX];
+
+    unsigned char* pivot_row_bytes = (unsigned char*) _pivot_row;
+    unsigned char* A_bytes = (unsigned char*) A_avx;
+
+    // load A in the tail of AVX2 registers
+    for (int i = 0; i < nrows; i++) {
+        for (int j = 0; j < ncols; j++)
+        {
+            A_bytes[i*MAX_COLS + (MAX_COLS - ncols) + j] = A[ i*ncols + j ];
+        }
+    }
+
+    // pivot row is secret, pivot col is not
+    unsigned char inverse;
+    int pivot_row = 0;
+    int pivot_col = MAYO_MAX(MAX_COLS - ncols,0);
+    for (; pivot_col < MAX_COLS-128; pivot_col++) { // same textually included body in all five loops; only the column bound differs
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-96; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-64; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-32; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+
+    // write the matrix A back
+    for (int i = 0; i < nrows; i++) {
+        for (int j = 0; j < ncols; j++) {
+            A[i * ncols + j] = A_bytes[i*AVX_REGS_PER_ROW*32 + (MAX_COLS - ncols) + j];
+        }
+    }
+    mayo_secure_clear(_pivot_row, AVX_REGS_PER_ROW * 32);
+    mayo_secure_clear(A_avx, AVX_REGS_PER_ROW * 32 * nrows);
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/echelon_form_loop.h b/src/sig/mayo/pqmayo_mayo_5_avx2/echelon_form_loop.h
new file mode 100644
index 0000000000..b8b29741c4
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/echelon_form_loop.h
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+// Not a standalone header: this is the body of one pivot-column step, included inside the loops of EF() and using EF's locals and macros.
+int pivot_col_rounded = pivot_col/32; // index of the 32-byte register that contains pivot_col; registers before it are not touched in this step
+
+int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - MAX_COLS);
+int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col - MAX_COLS + ncols);
+/* the pivot row is guaranteed to be between these lower and upper bounds if A has full rank*/
+
+/* zero out pivot row */
+for (int i = pivot_col_rounded; i < AVX_REGS_PER_ROW; i++) {
+    _pivot_row[i] = _mm256_set1_epi8(0);
+}
+
+/* try to get a pivot row in constant time: every candidate row is XORed in under a mask instead of being selected by a branch */
+unsigned char pivot = 0;
+uint32_t pivot_is_zero = -1;
+for (int row = pivot_row_lower_bound;
+        row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+    uint32_t is_pivot_row = ~ct_compare_32(row, pivot_row);
+    uint32_t below_pivot_row = ct_is_greater_than(row, pivot_row);
+    __m256i mask = _mm256_set1_epi32( is_pivot_row | (below_pivot_row & pivot_is_zero) ); // take row pivot_row itself, and rows below it only while the pivot entry is still zero
+    for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+        _pivot_row[j] ^= mask & A_avx[row * AVX_REGS_PER_ROW + j];
+    }
+    pivot = pivot_row_bytes[pivot_col];
+    pivot_is_zero = ~ct_compare_32((int) pivot, 0);
+}
+
+/* multiply pivot row by inverse of pivot */
+inverse = inverse_f(pivot);
+__m256i inverse_multab = tbl32_gf16_multab(inverse);
+
+for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) { + _pivot_row[j] = _mm256_shuffle_epi8(inverse_multab, _pivot_row[j]); +} + +/* conditionally write pivot row to the correct row, if there is a nonzero pivot */ +/* eliminate entries below pivot */ +for (int row = pivot_row_lower_bound; row < nrows; row++) { + unsigned char below_pivot = (unsigned char) (ct_is_greater_than(row, pivot_row)); + unsigned char elt_to_elim = A_bytes[row*AVX_REGS_PER_ROW*32 + pivot_col]; + + __m256i multab = tbl32_gf16_multab(below_pivot & elt_to_elim); + if (row <= pivot_row_upper_bound) { + __m256i mask = _mm256_set1_epi32(~ct_compare_32(row, pivot_row) & ~pivot_is_zero); + for (int col = pivot_col_rounded; col < AVX_REGS_PER_ROW; col++) { + A_avx[row*AVX_REGS_PER_ROW + col] = _mm256_blendv_epi8(A_avx[row*AVX_REGS_PER_ROW + col], _pivot_row[col], mask) ^ + _mm256_shuffle_epi8(multab, _pivot_row[col]); + } + } else { + for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) { + A_avx[row*AVX_REGS_PER_ROW + j] ^= _mm256_shuffle_epi8(multab, _pivot_row[j]); + } + } +} + +pivot_row += (-(int32_t)(~pivot_is_zero)); + diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/mayo.c b/src/sig/mayo/pqmayo_mayo_5_avx2/mayo.c new file mode 100644 index 0000000000..ce1ccd4c71 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_avx2/mayo.c @@ -0,0 +1,656 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +#define MAYO_MIN(x, y) (((x) < (y)) ? 
(x) : (y)) +#define PK_PRF AES_128_CTR + +static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) { + int i; + for (i = 0; i < mdeclen / 2; ++i) { + *mdec++ = m[i] & 0xf; + *mdec++ = m[i] >> 4; + } + + if (mdeclen % 2 == 1) { + *mdec++ = m[i] & 0x0f; + } +} + +static void encode(const unsigned char *m, unsigned char *menc, int mlen) { + int i; + for (i = 0; i < mlen / 2; ++i, m += 2) { + menc[i] = (*m) | (*(m + 1) << 4); + } + + if (mlen % 2 == 1) { + menc[i] = (*m); + } +} + +static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){ + #ifndef ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *vPv = _vPv; + uint64_t temp[M_MAX/16] = {0}; + unsigned char *temp_bytes = (unsigned char *) temp; + int k = 0; + for (int i = PARAM_k(p) - 1; i >= 0 ; i--) { + for (int j = i; j < PARAM_k(p); j++) { + // multiply by X (shift up 4 bits) + unsigned char top = temp[k] >> 60; + temp[k] <<= 4; + k--; + for(; k>=0; k--){ + temp[k+1] ^= temp[k] >> 60; + temp[k] <<= 4; + } + // reduce mod f(X) + for (int jj = 0; jj < F_TAIL_LEN; jj++) { + if(jj%2 == 0){ +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#else + temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#endif + } + else { +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#else + temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#endif + } + } + + // extract from vPv and add + for(k=0; k < PARAM_m(p)/16; k ++){ + temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]); + } + k--; + } + } + + // add to y + for (int i = 0; i < PARAM_m(p); i+=2) + { +#ifdef TARGET_BIG_ENDIAN + y[i] = t[i] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4); +#else + 
y[i] = t[i] ^ (temp_bytes[i/2] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4); +#endif + + } +} + +static void transpose_16x16_nibbles(uint64_t *M){ + static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f; + static const uint64_t even_bytes = 0x00ff00ff00ff00ff; + static const uint64_t even_2bytes = 0x0000ffff0000ffff; + static const uint64_t even_half = 0x00000000ffffffff; + + for (size_t i = 0; i < 16; i+=2) + { + uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles; + M[i ] ^= t << 4; + M[i+1] ^= t; + } + + for (size_t i = 0; i < 16; i+=4) + { + uint64_t t0 = ((M[i ] >> 8) ^ M[i+2]) & even_bytes; + uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes; + M[i ] ^= (t0 << 8); + M[i+1] ^= (t1 << 8); + M[i+2] ^= t0; + M[i+3] ^= t1; + } + + for (size_t i = 0; i < 4; i++) + { + uint64_t t0 = ((M[i ] >> 16) ^ M[i+ 4]) & even_2bytes; + uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes; + + M[i ] ^= t0 << 16; + M[i+ 8] ^= t1 << 16; + M[i+ 4] ^= t0; + M[i+12] ^= t1; + } + + for (size_t i = 0; i < 8; i++) + { + uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half; + M[i ] ^= t << 32; + M[i+8] ^= t; + } +} + +#define MAYO_M_OVER_8 ((M_MAX + 7) / 8) + +static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){ + #ifndef ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *VtL = _VtL; + int bits_to_shift = 0; + int words_to_shift = 0; + uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0}; + size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; + const uint64_t *Mi, *Mj; + + for (int i = 0; i <= PARAM_k(p) - 1; ++i) { + for (int j = PARAM_k(p) - 1; j >= i; --j) { + // add the M_i and M_j to A, shifted "down" by l positions + Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[ PARAM_o(p) * i + c + (k + words_to_shift + 
1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + + if (i != j) { + Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + } + + bits_to_shift += 4; + if(bits_to_shift == 64){ + words_to_shift ++; + bits_to_shift = 0; + } + } + } + + for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16) + { + transpose_16x16_nibbles(A + c); + } + + unsigned char tab[F_TAIL_LEN*4] = {0}; + for (size_t i = 0; i < F_TAIL_LEN; i++) + { + tab[4*i] = mul_f(PARAM_f_tail(p)[i],1); + tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2); + tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4); + tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8); + } + + uint64_t low_bit_in_nibble = 0x1111111111111111; + + for (size_t c = 0; c < A_width; c+= 16) + { + for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++) + { + size_t pos = (r/16)*A_width + c + (r%16); + uint64_t t0 = A[pos] & low_bit_in_nibble; + uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble; + uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble; + uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble; + for (size_t t = 0; t < F_TAIL_LEN; t++) + { + A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3]; + } + } + } + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) + A[i] = BSWAP64(A[i]); +#endif + + for (int r = 0; r < PARAM_m(p); r+=16) + { + for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16) + { + for (size_t i = 0; i < 16; i++) + { + decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c)); + } + } + 
} +} + +// Public API + +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) { + int ret = 0; + + ret = mayo_keypair_compact(p, pk, sk); + if (ret != MAYO_OK) { + goto err; + } + +err: + return ret; +} + +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data + unsigned char y[M_MAX]; // secret data + unsigned char salt[SALT_BYTES_MAX]; // not secret data + unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX], + Vdec[N_MINUS_O_MAX * K_MAX]; // secret data + unsigned char A[M_MAX * (K_MAX * O_MAX + 1)]; // secret data + unsigned char x[K_MAX * N_MAX]; // not secret data + unsigned char r[K_MAX * O_MAX + 1] = { 0 }; // secret data + unsigned char s[K_MAX * N_MAX]; // not secret data + const unsigned char *seed_sk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data + alignas(32) sk_t sk; // secret data + unsigned char Ox[N_MINUS_O_MAX]; // secret data + // unsigned char Mdigest[DIGEST_BYTES]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1]; + unsigned char *ctrbyte; + unsigned char *vi; + + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_v_bytes = PARAM_v_bytes(p); + const int param_r_bytes = PARAM_r_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P2_bytes = PARAM_P2_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_A_cols = PARAM_A_cols(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + ret = mayo_expand_sk(p, csk, &sk); + if (ret != MAYO_OK) { + goto err; + } + + seed_sk 
= csk; + decode(sk.o, O, (param_n - param_o) * param_o); + + // hash message + shake256(tmp, param_digest_bytes, m, mlen); + + uint64_t *P1 = sk.p; + uint64_t *L = P1 + (param_P1_bytes/8); + alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0}; + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + L[i] = BSWAP64(L[i]); + } +#endif + + // choose the randomizer + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(tmp + param_digest_bytes, param_salt_bytes); + #else + if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // hashing to salt + memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk, + param_sk_seed_bytes); + shake256(salt, param_salt_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret +#endif + + // hashing to t + memcpy(tmp + param_digest_bytes, salt, param_salt_bytes); + ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes; + + shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); + + decode(tenc, t, param_m); // may not be necessary + + for (int ctr = 0; ctr <= 255; ++ctr) { + *ctrbyte = (unsigned char)ctr; + + shake256(V, param_k * param_v_bytes + param_r_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1); + + // decode the v_i vectors + for (int i = 0; i <= param_k - 1; ++i) { + decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o), + param_n - param_o); + } + + // compute all the V * L matrices. + // compute all the V * P1 * V^T matrices. 
+ alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0}; + V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y); + + compute_rhs(p, Y, t, y); + compute_A(p, Mtmp, A); + + decode(V + param_k * param_v_bytes, r, + param_k * + param_o); + if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) { + break; + } else { + memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t)); + } + } + + // NOTE(review): s is declared without an initializer; the loop below is expected to write all param_k * param_n entries of s - confirm + // TODO: optimize this? + for (int i = 0; i <= param_k - 1; ++i) { + vi = Vdec + i * (param_n - param_o); + mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1); + mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1); + memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o); + } + encode(s, sig, param_n * param_k); + memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes); + *siglen = param_sig_bytes; +err: + mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX); + mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX); + mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1)); + mayo_secure_clear(r, K_MAX * O_MAX + 1); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(&sk, sizeof(sk_t)); + mayo_secure_clear(Ox, N_MINUS_O_MAX); + mayo_secure_clear(tmp, + DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1); + return ret; +} + +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + const int param_sig_bytes = PARAM_sig_bytes(p); + size_t siglen = param_sig_bytes; + ret = mayo_sign_signature(p, sm, &siglen, m, mlen, csk); + if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes) + goto err; + + memmove(sm + param_sig_bytes, m, mlen); + *smlen = siglen + mlen; +err: + return ret; +} + +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk) { + const int param_sig_bytes = 
PARAM_sig_bytes(p); + if (smlen < (size_t)param_sig_bytes) { + return MAYO_ERR; + } + int result = mayo_verify(p, sm + param_sig_bytes, smlen - param_sig_bytes, sm, + pk); + + if (result == MAYO_OK) { + *mlen = smlen - param_sig_bytes; + memmove(m, sm + param_sig_bytes, *mlen); + } + + return result; +} + +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk) { + int ret = MAYO_OK; + unsigned char *seed_sk = csk; + unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8]; + alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0}; + + unsigned char *seed_pk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_m = PARAM_m(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + // seed_sk $←- B^(sk_seed bytes) + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(seed_sk, param_sk_seed_bytes); + #else + if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // S ← shake256(seedsk, pk seed bytes + O bytes) + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + // seed_pk ← s[0 : pk_seed_bytes] + seed_pk = S; + + // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes]) + decode(S + param_pk_seed_bytes, O, param_v * param_o); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + + int m_legs = param_m / 32; + + 
uint64_t *P1 = P; + uint64_t *P1O_P2 = P + (param_P1_bytes / 8); + + // compute P3 = O^t * (P1*O + P2) + Ot_times_P1O_P2(p, P1, O, P1O_P2, P3); + + // store seed_pk in cpk + memcpy(cpk, seed_pk, param_pk_seed_bytes); + + alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8]; + + // compute Upper(P3) and store in cpk + m_upper(m_legs, P3, P3_upper, param_o); + + memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes); + + +#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL) +err: +#endif + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2); + return ret; +} + +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *pk) { + #ifdef MAYO_VARIANT + (void)p; + #endif + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + PK_PRF(pk, param_P1_bytes + param_P2_bytes, cpk, param_pk_seed_bytes); + pk += param_P1_bytes + param_P2_bytes; + memmove(pk, cpk + param_pk_seed_bytes, param_P3_bytes); + return MAYO_OK; +} + +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *sk) { + int ret = MAYO_OK; + unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + uint64_t *P = sk->p; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_o = PARAM_o(p); + const int param_v = PARAM_v(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + const unsigned char *seed_sk = csk; + unsigned char *seed_pk = S; + + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + decode(S + param_pk_seed_bytes, O, + param_v * param_o); // O = S + PK_SEED_BYTES; + +#ifdef ENABLE_CT_TESTING + 
VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + uint64_t *P2 = P + (param_P1_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + uint64_t *P1 = P; + // compute L_i = (P1 + P1^t)*O + P2 + uint64_t *L = P2; + P1P1t_times_O(p, P1, O, L); + + // write to sk + memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + return ret; +} + +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *cpk) { + unsigned char tEnc[M_BYTES_MAX]; + unsigned char t[M_MAX]; + unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X) + unsigned char s[K_MAX * N_MAX]; + alignas (64) uint64_t pk[EPK_BYTES_MAX / 8]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX]; + + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P3_bytes = PARAM_P3_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk); + if (ret != MAYO_OK) { + return MAYO_ERR; + } + + uint64_t *P1 = pk; + uint64_t *P2 = pk + 
(param_P1_bytes / 8); + uint64_t *P3 = P2 + (param_P2_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + P2[i] = BSWAP64(P2[i]); + } + for (int i = 0; i < param_P3_bytes / 8; ++i) { + P3[i] = BSWAP64(P3[i]); + } +#endif + + // hash m + shake256(tmp, param_digest_bytes, m, mlen); + + // compute t + memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes, + param_salt_bytes); + shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); + decode(tEnc, t, param_m); + + // decode s + decode(sig, s, param_k * param_n); + + // Compute S*P*S^T + alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0}; + + m_calculate_PS_SPS(P1, P2, P3, s, param_m, + param_v, param_o, param_k, SPS); + + // combine the vectors in SPS and reduce mod f(X) + compute_rhs(p, SPS, y, y); + + if (memcmp(y, t, param_m) == 0) { + return MAYO_OK; // good signature + } + return MAYO_ERR; // bad signature +} + diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/mayo.h b/src/sig/mayo/pqmayo_mayo_5_avx2/mayo.h new file mode 100644 index 0000000000..1de4bf2a33 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_avx2/mayo.h @@ -0,0 +1,435 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MAYO_H +#define MAYO_H + +#include +#include + +#define F_TAIL_LEN 5 +#define F_TAIL_64 \ + { 8, 0, 2, 8, 0 } // f(z) = z^64 + x^3*z^3 + x*z^2 + x^3 +#define F_TAIL_96 \ + { 2, 2, 0, 2, 0 } // f(z) = z^96 + x*z^3 + x*z + x +#define F_TAIL_128 \ + { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3 + x^3*z + x^2 + +#define MAYO_1_name "MAYO_1" +#define MAYO_1_n 66 +#define MAYO_1_m 64 +#define MAYO_1_o 8 +#define MAYO_1_v (MAYO_1_n - MAYO_1_o) +#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1) +#define MAYO_1_k 9 +#define MAYO_1_q 16 +#define MAYO_1_m_bytes 32 +#define MAYO_1_O_bytes 232 +#define MAYO_1_v_bytes 29 +#define MAYO_1_r_bytes 36 +#define MAYO_1_P1_bytes 54752 +#define 
MAYO_1_P2_bytes 14848 +#define MAYO_1_P3_bytes 1152 +#define MAYO_1_csk_bytes 24 +#define MAYO_1_esk_bytes 69856 +#define MAYO_1_cpk_bytes 1168 +#define MAYO_1_epk_bytes 70752 +#define MAYO_1_sig_bytes 321 +#define MAYO_1_f_tail F_TAIL_64 +#define MAYO_1_f_tail_arr f_tail_64 +#define MAYO_1_salt_bytes 24 +#define MAYO_1_digest_bytes 32 +#define MAYO_1_pk_seed_bytes 16 +#define MAYO_1_sk_seed_bytes 24 + +#define MAYO_2_name "MAYO_2" +#define MAYO_2_n 78 +#define MAYO_2_m 64 +#define MAYO_2_o 18 +#define MAYO_2_v (MAYO_2_n - MAYO_2_o) +#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1) +#define MAYO_2_k 4 +#define MAYO_2_q 16 +#define MAYO_2_m_bytes 32 +#define MAYO_2_O_bytes 540 +#define MAYO_2_v_bytes 30 +#define MAYO_2_r_bytes 36 +#define MAYO_2_P1_bytes 58560 +#define MAYO_2_P2_bytes 34560 +#define MAYO_2_P3_bytes 5472 +#define MAYO_2_csk_bytes 24 +#define MAYO_2_esk_bytes 93684 +#define MAYO_2_cpk_bytes 5488 +#define MAYO_2_epk_bytes 98592 +#define MAYO_2_sig_bytes 180 +#define MAYO_2_f_tail F_TAIL_64 +#define MAYO_2_f_tail_arr f_tail_64 +#define MAYO_2_salt_bytes 24 +#define MAYO_2_digest_bytes 32 +#define MAYO_2_pk_seed_bytes 16 +#define MAYO_2_sk_seed_bytes 24 + +#define MAYO_3_name "MAYO_3" +#define MAYO_3_n 99 +#define MAYO_3_m 96 +#define MAYO_3_o 10 +#define MAYO_3_v (MAYO_3_n - MAYO_3_o) +#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1) +#define MAYO_3_k 11 +#define MAYO_3_q 16 +#define MAYO_3_m_bytes 48 +#define MAYO_3_O_bytes 445 +#define MAYO_3_v_bytes 45 +#define MAYO_3_r_bytes 55 +#define MAYO_3_P1_bytes 192240 +#define MAYO_3_P2_bytes 42720 +#define MAYO_3_P3_bytes 2640 +#define MAYO_3_csk_bytes 32 +#define MAYO_3_esk_bytes 235437 +#define MAYO_3_cpk_bytes 2656 +#define MAYO_3_epk_bytes 237600 +#define MAYO_3_sig_bytes 577 +#define MAYO_3_f_tail F_TAIL_96 +#define MAYO_3_f_tail_arr f_tail_96 +#define MAYO_3_salt_bytes 32 +#define MAYO_3_digest_bytes 48 +#define MAYO_3_pk_seed_bytes 16 +#define MAYO_3_sk_seed_bytes 32 + +#define MAYO_5_name "MAYO_5" 
+#define MAYO_5_n 133 +#define MAYO_5_m 128 +#define MAYO_5_o 12 +#define MAYO_5_v (MAYO_5_n - MAYO_5_o) +#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1) +#define MAYO_5_k 12 +#define MAYO_5_q 16 +#define MAYO_5_m_bytes 64 +#define MAYO_5_O_bytes 726 +#define MAYO_5_v_bytes 61 +#define MAYO_5_r_bytes 72 +#define MAYO_5_P1_bytes 472384 +#define MAYO_5_P2_bytes 92928 +#define MAYO_5_P3_bytes 4992 +#define MAYO_5_csk_bytes 40 +#define MAYO_5_esk_bytes 566078 +#define MAYO_5_cpk_bytes 5008 +#define MAYO_5_epk_bytes 570304 +#define MAYO_5_sig_bytes 838 +#define MAYO_5_f_tail F_TAIL_128 +#define MAYO_5_f_tail_arr f_tail_128 +#define MAYO_5_salt_bytes 40 +#define MAYO_5_digest_bytes 64 +#define MAYO_5_pk_seed_bytes 16 +#define MAYO_5_sk_seed_bytes 40 + +#define PARAM_JOIN2_(a, b) a##_##b +#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b) +#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end) + +#if defined(MAYO_VARIANT) +#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c +#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c) +#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s) + +#if defined(MAYO_BUILD_TYPE_REF) +#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s) +#elif defined(MAYO_BUILD_TYPE_OPT) +#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s) +#elif defined(MAYO_BUILD_TYPE_AVX2) +#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s) +#else +#error "Build type not known" +#endif + +#else +#define MAYO_NAMESPACE(s) s +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define NAME_MAX mayo5 +#define N_MAX 133 +#define M_MAX 128 +#define O_MAX 18 +#define V_MAX 121 +#define K_MAX 12 +#define Q_MAX 16 +#define PK_SEED_BYTES_MAX 16 +#define SK_SEED_BYTES_MAX 40 +#define SALT_BYTES_MAX 40 +#define DIGEST_BYTES_MAX 64 +#define O_BYTES_MAX 726 +#define V_BYTES_MAX 61 +#define R_BYTES_MAX 72 +#define P1_BYTES_MAX 472384 +#define P2_BYTES_MAX 92928 +#define P3_BYTES_MAX 5472 +#define SIG_BYTES_MAX 838 +#define CPK_BYTES_MAX 5488 +#define CSK_BYTES_MAX 40 +#define ESK_BYTES_MAX 566078 +#define 
EPK_BYTES_MAX 570304 +#define N_MINUS_O_MAX 121 +#define M_BYTES_MAX 64 +#elif defined(MAYO_VARIANT) +#define M_MAX PARAM_NAME(m) +#define N_MAX PARAM_NAME(n) +#define O_MAX PARAM_NAME(o) +#define V_MAX PARAM_NAME(v) +#define K_MAX PARAM_NAME(k) +#define Q_MAX PARAM_NAME(q) +#define M_BYTES_MAX PARAM_NAME(m_bytes) +#define O_BYTES_MAX PARAM_NAME(O_bytes) +#define V_BYTES_MAX PARAM_NAME(v_bytes) +#define R_BYTES_MAX PARAM_NAME(r_bytes) +#define P1_BYTES_MAX PARAM_NAME(P1_bytes) +#define P2_BYTES_MAX PARAM_NAME(P2_bytes) +#define P3_BYTES_MAX PARAM_NAME(P3_bytes) +#define SIG_BYTES_MAX PARAM_NAME(sig_bytes) +#define CSK_BYTES_MAX PARAM_NAME(csk_bytes) +#define ESK_BYTES_MAX PARAM_NAME(esk_bytes) +#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes) +#define EPK_BYTES_MAX PARAM_NAME(epk_bytes) +#define N_MINUS_O_MAX (N_MAX - O_MAX) +#define SALT_BYTES_MAX PARAM_NAME(salt_bytes) +#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes) +#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes) +#define SK_SEED_BYTES_MAX SALT_BYTES_MAX +#else +#error "Parameter not specified" +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define PARAM_name(p) (p->name) +#define PARAM_m(p) (p->m) +#define PARAM_n(p) (p->n) +#define PARAM_o(p) (p->o) +#define PARAM_v(p) (p->n - p->o) +#define PARAM_A_cols(p) (p->k * p->o + 1) +#define PARAM_k(p) (p->k) +#define PARAM_q(p) (p->q) +#define PARAM_m_bytes(p) (p->m_bytes) +#define PARAM_O_bytes(p) (p->O_bytes) +#define PARAM_v_bytes(p) (p->v_bytes) +#define PARAM_r_bytes(p) (p->r_bytes) +#define PARAM_P1_bytes(p) (p->P1_bytes) +#define PARAM_P2_bytes(p) (p->P2_bytes) +#define PARAM_P3_bytes(p) (p->P3_bytes) +#define PARAM_csk_bytes(p) (p->csk_bytes) +#define PARAM_esk_bytes(p) (p->esk_bytes) +#define PARAM_cpk_bytes(p) (p->cpk_bytes) +#define PARAM_epk_bytes(p) (p->epk_bytes) +#define PARAM_sig_bytes(p) (p->sig_bytes) +#define PARAM_f_tail(p) (p->f_tail) +#define PARAM_salt_bytes(p) (p->salt_bytes) +#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes) +#define 
PARAM_digest_bytes(p) (p->digest_bytes) +#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes) +#elif defined(MAYO_VARIANT) +#define PARAM_name(p) PARAM_NAME(name) +#define PARAM_m(p) PARAM_NAME(m) +#define PARAM_n(p) PARAM_NAME(n) +#define PARAM_o(p) PARAM_NAME(o) +#define PARAM_v(p) PARAM_NAME(v) +#define PARAM_A_cols(p) PARAM_NAME(A_cols) +#define PARAM_k(p) PARAM_NAME(k) +#define PARAM_q(p) PARAM_NAME(q) +#define PARAM_m_bytes(p) PARAM_NAME(m_bytes) +#define PARAM_O_bytes(p) PARAM_NAME(O_bytes) +#define PARAM_v_bytes(p) PARAM_NAME(v_bytes) +#define PARAM_r_bytes(p) PARAM_NAME(r_bytes) +#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes) +#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes) +#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes) +#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes) +#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes) +#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes) +#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes) +#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes) +static const unsigned char f_tail[] = PARAM_NAME(f_tail); +#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes) +#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes) +#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes) +#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes) +#define PARAM_f_tail(p) f_tail +#else +#error "Parameter not specified" +#endif + +/** + * Struct defining MAYO parameters + */ +typedef struct { + int m; + int n; + int o; + int k; + int q; + const unsigned char *f_tail; + int m_bytes; + int O_bytes; + int v_bytes; + int r_bytes; + int R_bytes; + int P1_bytes; + int P2_bytes; + int P3_bytes; + int csk_bytes; + int esk_bytes; + int cpk_bytes; + int epk_bytes; + int sig_bytes; + int salt_bytes; + int sk_seed_bytes; + int digest_bytes; + int pk_seed_bytes; + const char *name; +} mayo_params_t; + +typedef struct sk_t { + uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; + uint8_t o[O_BYTES_MAX]; +} sk_t; + +/** + * MAYO parameter sets + */ +#ifdef ENABLE_PARAMS_DYNAMIC +extern 
const mayo_params_t MAYO_1; +extern const mayo_params_t MAYO_2; +extern const mayo_params_t MAYO_3; +extern const mayo_params_t MAYO_5; +#endif + +/** + * Status codes + */ +#define MAYO_OK 0 +#define MAYO_ERR 1 + +/** + * Mayo keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible for allocating sufficient memory to hold pk and sk. + * + * @param[in] p Mayo parameter set + * @param[out] pk Mayo public key + * @param[out] sk Mayo secret key + * @return int status code + */ +#define mayo_keypair MAYO_NAMESPACE(mayo_keypair) +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk); + +#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature) +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk); + +/** + * MAYO signature generation. + * + * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec. + * The key provided is a compacted secret key. + * The caller is responsible for allocating sufficient memory to hold sm. + * + * @param[in] p Mayo parameter set + * @param[out] sm Signature concatenated with message + * @param[out] smlen Pointer to the length of sm + * @param[in] m Message to be signed + * @param[in] mlen Message length + * @param[in] sk Compacted secret key + * @return int status code + */ +#define mayo_sign MAYO_NAMESPACE(mayo_sign) +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *sk); + +/** + * Mayo open signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, the original message is stored in m. + * The key provided is a compact public key. + * The caller is responsible for allocating sufficient memory to hold m. 
+ * + * @param[in] p Mayo parameter set + * @param[out] m Message stored if verification succeeds + * @param[out] mlen Pointer to the length of m + * @param[in] sm Signature concatenated with message + * @param[in] smlen Length of sm + * @param[in] pk Compacted public key + * @return int status code + */ +#define mayo_open MAYO_NAMESPACE(mayo_open) +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk); + +/** + * Mayo compact keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. + * + * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and + * cpk are compact representations of a Mayo secret key and public key + * + * @param[in] p Mayo parameter set + * @param[out] cpk Mayo compacted public key + * @param[out] csk Mayo compacted secret key + * @return int status code + */ +#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact) +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk); + +/** + * Mayo expand public key. + * + * The implementation corresponds to Mayo.expandPK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold epk. + * + * @param[in] p Mayo parameter set + * @param[in] cpk Compacted public key. + * @param[out] epk Expanded public key. + * @return int return code + */ +#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk) +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *epk); + +/** + * Mayo expand secret key. + * + * The implementation corresponds to Mayo.expandSK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold esk. + * + * @param[in] p Mayo parameter set + * @param[in] csk Compacted secret key. + * @param[out] esk Expanded secret key. 
+ * @return int return code + */ +#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk) +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *esk); + +/** + * Mayo verify signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1. + * The key provided is a compact public key. + * + * @param[in] p Mayo parameter set + * @param[in] m Message whose signature is verified + * @param[in] mlen Length of m + * @param[in] sig Signature + * @param[in] pk Compacted public key + * @return int 0 if verification succeeded, 1 otherwise. + */ +#define mayo_verify MAYO_NAMESPACE(mayo_verify) +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *pk); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/mem.h b/src/sig/mayo/pqmayo_mayo_5_avx2/mem.h new file mode 100644 index 0000000000..2aa2196e2e --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_avx2/mem.h @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MEM_H +#define MEM_H +#include +#include + +#if defined(__GNUC__) || defined(__clang__) +#define BSWAP32(i) __builtin_bswap32((i)) +#define BSWAP64(i) __builtin_bswap64((i)) +#else +#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24)) +#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32)) +#endif + +// a > b -> b - a is negative +// returns 0xFFFFFFFF if true, 0x00000000 if false +static inline uint32_t ct_is_greater_than(int a, int b) { + int32_t diff = b - a; + return (uint32_t) (diff >> (8*sizeof(uint32_t)-1)); +} + +// a > b -> b - a is negative +// returns 0xFFFFFFFF if true, 0x00000000 if false +static inline uint64_t ct_64_is_greater_than(int a, int b) { + int64_t diff = ((int64_t) b) - ((int64_t) a); + return (uint64_t) (diff >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00000000, else 0xFFFFFFFF 
+static inline uint32_t ct_compare_32(int a, int b) { + return (uint32_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF +static inline uint64_t ct_compare_64(int a, int b) { + return (uint64_t)((-(int64_t)(a ^ b)) >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00, else 0xFF +static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) { + return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +#include +/** + * Clears and frees allocated memory. + * + * @param[out] mem Memory to be cleared and freed. + * @param size Size of memory to be cleared and freed. + */ +static inline void mayo_secure_free(void *mem, size_t size) { + OQS_MEM_secure_free(mem, size); +} + +/** + * Clears memory. + * + * @param[out] mem Memory to be cleared. + * @param size Size of memory to be cleared. + */ +static inline void mayo_secure_clear(void *mem, size_t size) { + OQS_MEM_cleanse(mem, size); +} + +#endif diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/params.c b/src/sig/mayo/pqmayo_mayo_5_avx2/params.c new file mode 100644 index 0000000000..043d4cedc1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_avx2/params.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +static const unsigned char f_tail_64[] = F_TAIL_64; +static const unsigned char f_tail_96[] = F_TAIL_96; +static const unsigned char f_tail_128[] = F_TAIL_128; + +#define MAYO_GEN_PARAMS(nm) \ + const mayo_params_t nm = { \ + .m = PARAM_JOIN2(nm, m), \ + .n = PARAM_JOIN2(nm, n), \ + .o = PARAM_JOIN2(nm, o), \ + .k = PARAM_JOIN2(nm, k), \ + .q = PARAM_JOIN2(nm, q), \ + .f_tail = PARAM_JOIN2(nm, f_tail_arr), \ + .m_bytes = PARAM_JOIN2(nm, m_bytes), \ + .O_bytes = PARAM_JOIN2(nm, O_bytes), \ + .v_bytes = PARAM_JOIN2(nm, v_bytes), \ + .r_bytes = PARAM_JOIN2(nm, r_bytes), \ + .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \ + .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \ + .P3_bytes = PARAM_JOIN2(nm, 
P3_bytes), \ + .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \ + .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \ + .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \ + .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \ + .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \ + .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \ + .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \ + .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \ + .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \ + .name = #nm \ + }; + +MAYO_GEN_PARAMS(MAYO_1); +MAYO_GEN_PARAMS(MAYO_2); +MAYO_GEN_PARAMS(MAYO_3); +MAYO_GEN_PARAMS(MAYO_5); +#endif + diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_128.h b/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_128.h new file mode 100644 index 0000000000..27b416adce --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_128.h @@ -0,0 +1,524 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SHUFFLE_ARITHMETIC_128_H +#define SHUFFLE_ARITHMETIC_128_H + +#include +#include +#include +#include + + +// P1*0 -> P1: v x v, O: v x o +static +inline void mayo_5_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t c = r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + cols_used ++; + __m256i in_odd1 = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= 
_mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[(2*r*O_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[(2*r*O_MAX) + 2*k + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); + acc[(2*r*O_MAX) + 2*k + 2] ^= temp[2*k + 1] ^ t0; + acc[(2*r*O_MAX) + 2*k + 3] ^= temp[2*k + 3] ^ t1; + } + } +} + +static +inline void mayo_5_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){ + const __m256i *P1O_P2 = (__m256i *) _P1O_P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1); + temp[2*k + 3] ^= 
_mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); + acc[2*(k*O_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k + 3] ^ t1; + } + } +} + + +static +inline void mayo_5_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + cols_used += 1; + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*pos); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*pos + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + pos += (V_MAX -c - 1); + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd0 = 
_mm256_loadu_si256(P1 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k )); + __m256i acc1 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 1)); + __m256i acc2 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 2)); + __m256i acc3 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 3)); + + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k), acc0 ^ temp[2*k] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 2), acc2 ^ temp[2*k+1] ^ t0); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 1), acc1 ^ temp[2*k+2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 3), acc3 ^ temp[2*k+3] ^ t1); + } + } +} + + +static +inline void mayo_5_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *L = (__m256i *) _L; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i 
temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*k*O_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*k*O_MAX + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +static +inline void mayo_5_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do 
multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + +static +inline void mayo_5_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *Pv = (__m256i *) _Pv; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + 
size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*k*K_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*k*K_MAX + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +// P2*S2 -> P2: v x o, S2: o x k +static +inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + const __m256i 
*P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t P1_cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + + + // P1 * S1 + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*P1_cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*P1_cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + P1_cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1); + } + } + + // P2 * S2 + for (c=0; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ 
_mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular +static +inline void mayo_5_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P3 = (__m256i *) _P3; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < O_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + + for (c=r; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P3 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P3 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], 
in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +static +inline void mayo_5_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){ + mayo_5_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc); +} + +static +inline void mayo_5_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){ + const __m256i *PS2 = (__m256i *) _PS2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < O_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0); 
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +#undef K_OVER_2 +#endif + diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_64.h b/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_64.h new file mode 100644 index 0000000000..defff86f8f --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_64.h @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SHUFFLE_ARITHMETIC_64_H +#define SHUFFLE_ARITHMETIC_64_H + +#include +#include +#include +#include + +// P1*0 -> P1: v x v, O: v x o +static +inline void mayo_12_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and 
accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + for (size_t c = r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*O_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*O_MAX) + k + 1] ^= temp[k+1] ^ t; + } + } +} + + +static +inline void mayo_12_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){ + + const __m256i *P1O_P2 = (__m256i *) _P1O_P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(P1O_P2 + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t; + } + } +} + +static +inline void mayo_12_P1P1t_times_O(const uint64_t *_P1, const 
unsigned char *O, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + cols_used += 1; + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + pos); + pos += (V_MAX -c - 1); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256(acc + (r*O_MAX + k )); + __m256i acc1 = _mm256_loadu_si256(acc + (r*O_MAX + k + 1)); + + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + + _mm256_storeu_si256(acc + (r*O_MAX + k ), acc0 ^ temp[k ] ^ _mm256_slli_epi16(t,4)); + _mm256_storeu_si256(acc + (r*O_MAX + k + 1), acc1 ^ temp[k+1] ^ t); + } + } +} + + +static +inline void mayo_12_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *L = (__m256i *) _L; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = 
_mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(L + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*O_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +static +inline void mayo_12_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *Pv = (__m256i *) _Pv; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(Pv + r*K_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to 
normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +static +inline void mayo_12_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +// P1*S1 -> P1: v x v, S1: v x k // P1 upper triangular +// same as mayo_12_P1_times_Vt_avx2 +static +inline void mayo_12_P1_times_S1_avx2(const uint64_t *_P1, __m256i *S1_multabs, uint64_t *_acc){ + 
mayo_12_P1_times_Vt_avx2(_P1, S1_multabs, _acc); +} + +static +inline void mayo_12_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){ + mayo_12_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc); +} + +static +inline void mayo_12_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){ + const __m256i *PS2 = (__m256i *) _PS2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + for (size_t r = 0; r < O_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(PS2 + r*K_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +// P2*S2 -> P2: v x o, S2: o x k +static +inline void mayo_12_P2_times_S2_avx2(const uint64_t *_P2, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for 
one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=0; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P2 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +// P2*S2 -> P2: v x o, S2: o x k +static +inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + const __m256i *P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t P1_cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + + // P1 * S1 + for (c=r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + P1_cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + P1_cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even); + } + } + + 
// P2 * S2 + for (c=0; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P2 + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular +static +inline void mayo_12_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P3 = (__m256i *) _P3; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < O_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=r; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P3 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= 
temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +static inline +void mayo12_m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { + (void) size; + int m_vecs_stored = 0; + + for (int r = 0; r < O_MAX; ++r) { + const __m256i* _in = (const __m256i*) (in + m_legs * 2 * (r * size + r)); + __m256i* _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored); + _out[0] = _in[0]; + m_vecs_stored++; + for (int c = r + 1; c < O_MAX; ++c) { + const __m256i* _in2 = (const __m256i*) (in + m_legs * 2 * (r * size + c)); + const __m256i* _in3 = (const __m256i*) (in + m_legs * 2 * (c * size + r)); + _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored); + _out[0] = _in2[0] ^ _in3[0]; + m_vecs_stored++; + } + } +} + + +#undef K_OVER_2 +#endif + diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_96.h b/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_96.h new file mode 100644 index 0000000000..9b3a69d567 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_96.h @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SHUFFLE_ARITHMETIC_96_H +#define SHUFFLE_ARITHMETIC_96_H + +#include +#include +#include +#include + + +// P1*0 -> P1: v x v, O: v x o +static +inline void mayo_3_P1_times_O_avx2(const uint64_t *P1, __m256i *O_multabs, uint64_t *acc){ + + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t c = r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = 
// XOR-accumulates a product of the O multiplication tables with P1O_P2 into acc.
//
// P1O_P2     input, indexed as V_MAX rows x O_MAX columns of vectors, each vector
//            occupying 6 uint64 words.
// O_multabs  lookup tables, O_MAX/2 vectors per row r of the input; used as the
//            table operand of _mm256_shuffle_epi8.
//            NOTE(review): presumably GF(16) multiples of two adjacent O entries
//            per table - confirm against the multab generator.
// acc        output, indexed as (k*O_MAX + c) with 6 uint64 words per vector;
//            updated in place by XOR.
//
// Each 6-word vector is handled as two overlapping 256-bit windows, at word
// offsets 0 and 2. The loop over k steps by 2, so O_MAX must be even.
static
inline void mayo_3_Ot_times_P1O_P2_avx2(const uint64_t *P1O_P2, __m256i *O_multabs, uint64_t *acc){

    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);

    for (size_t c = 0; c < O_MAX; c++)
    {
        // do multiplications for one column c of the input and accumulate results in temporary format
        // temp holds 4 vectors per table index: {low, high nibbles} x {window 0, window 1}
        __m256i temp[2*O_MAX] = {0};
        for (size_t r = 0; r < V_MAX; r++)
        {
            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6)); // first 64 field elements
            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of each byte
            in_odd0 &= low_nibble_mask;                                         // low nibble of each byte
            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96)
            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
            in_odd1 &= low_nibble_mask;

            for (size_t k = 0; k < O_MAX; k+=2)
            {
                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0);
                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
            }
        }

        // convert to normal format and add to accumulator
        for (size_t k = 0; k < O_MAX; k+=2)
        {
            // All four accumulator windows are loaded before any store: the
            // offset-0 and offset-2 windows of one vector overlap in words 2..3.
            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));

            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;

            // The offset-2 store follows the offset-0 store and rewrites words 2..3.
            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),     acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
        }
    }
}
*acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + cols_used ++; + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + pos += (V_MAX -c - 1); + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); 
// XOR-accumulates a product of the V multiplication tables with L into acc.
//
// L          input, indexed as V_MAX rows x O_MAX columns of vectors, 6 uint64
//            words each.
// V_multabs  lookup tables, K_OVER_2 vectors per input row r; used as the table
//            operand of _mm256_shuffle_epi8.
//            NOTE(review): K_OVER_2 is defined outside this function; presumably
//            (K_MAX+1)/2 - confirm.
// acc        output, indexed as (k*O_MAX + c) for k < K_MAX, 6 words per vector;
//            updated in place by XOR.
//
// Each 6-word vector is handled as two overlapping 256-bit windows (word
// offsets 0 and 2).
static
inline void mayo_3_Vt_times_L_avx2(const uint64_t *L, const __m256i *V_multabs, uint64_t *acc){

    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);

    size_t k; // declared here because the odd-K_MAX tail reuses its final value
    for (size_t c = 0; c < O_MAX; c++)
    {
        // do multiplications for one column c of L and accumulate results in temporary format
        // temp holds 4 vectors per table index: {low, high nibbles} x {window 0, window 1}
        __m256i temp[K_OVER_2*2*2] = {0};
        for (size_t r = 0; r < V_MAX; r++)
        {
            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6)); // first 64 field elements
            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of each byte
            in_odd0 &= low_nibble_mask;                                         // low nibble of each byte
            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96)
            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
            in_odd1 &= low_nibble_mask;

            for (k = 0; k < K_OVER_2; k++)
            {
                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
            }
        }

        // convert to normal format and add to accumulator
        // (k now counts output rows in steps of 2, so temp[2*k..2*k+3] is the
        //  same group that was written as temp[4*(k/2)..] above)
        for (k = 0; k+1 < K_MAX; k+=2)
        {
            // Load every window before storing: offsets 0 and 2 overlap in words 2..3.
            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));

            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;

            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),     acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
        }
#if K_MAX % 2 == 1
        // Odd K_MAX: k == K_MAX-1 here; only the first row of the last pair exists.
        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));

        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;

        _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),     acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
        _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
#endif
    }
}
low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k,c; + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (c = r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ 
// XOR-accumulates a product of the V multiplication tables with Pv into acc.
//
// Pv         input, indexed as V_MAX rows x K_MAX columns of vectors, 6 uint64
//            words each.
// V_multabs  lookup tables, K_OVER_2 vectors per input row r; used as the table
//            operand of _mm256_shuffle_epi8.
//            NOTE(review): K_OVER_2 is defined outside this function; presumably
//            (K_MAX+1)/2 - confirm.
// acc        output, indexed as (k*K_MAX + c) for k, c < K_MAX, 6 words per
//            vector; updated in place by XOR.
//
// Each 6-word vector is handled as two overlapping 256-bit windows (word
// offsets 0 and 2).
static
inline void mayo_3_Vt_times_Pv_avx2(const uint64_t *Pv, const __m256i *V_multabs, uint64_t *acc){
    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);

    size_t k; // declared here because the odd-K_MAX tail reuses its final value
    for (size_t c = 0; c < K_MAX; c++)
    {
        // do multiplications for one column c of Pv and accumulate results in temporary format
        // temp holds 4 vectors per table index: {low, high nibbles} x {window 0, window 1}
        __m256i temp[K_OVER_2*2*2] = {0};
        for (size_t r = 0; r < V_MAX; r++)
        {
            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6)); // first 64 field elements
            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of each byte
            in_odd0 &= low_nibble_mask;                                         // low nibble of each byte
            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96)
            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
            in_odd1 &= low_nibble_mask;

            for (k = 0; k < K_OVER_2; k++)
            {
                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
            }
        }

        // convert to normal format and add to accumulator
        // (k now counts output rows in steps of 2, so temp[2*k..2*k+3] is the
        //  same group that was written as temp[4*(k/2)..] above)
        for (k = 0; k+1 < K_MAX; k+=2)
        {
            // Load every window before storing: offsets 0 and 2 overlap in words 2..3.
            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));

            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;

            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6),     acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
        }
#if K_MAX % 2 == 1
        // Odd K_MAX: k == K_MAX-1 here; only the first row of the last pair exists.
        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));

        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;

        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6),     acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
#endif
    }
}
+ __m256i temp[K_OVER_2*2*2] = {0}; + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + P1_cols_used++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1); + } + } + + // P2 times S2 + for (c=0; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k 
// P3*S2 -> P3: o x o (upper triangular), S2: o x k
//
// XOR-accumulates the product into acc, one row r of P3 at a time.
//
// P3          input; row r only supplies columns r..O_MAX-1, read sequentially
//             (cols_used), 6 uint64 words per vector.
// S2_multabs  lookup tables, K_OVER_2 vectors per column c; used as the table
//             operand of _mm256_shuffle_epi8.
//             NOTE(review): K_OVER_2 is defined outside this function; presumably
//             (K_MAX+1)/2 - confirm.
// acc         output, indexed as (r*K_MAX + k), 6 words per vector; updated in
//             place by XOR.
//
// Each 6-word vector is handled as two overlapping 256-bit windows (word
// offsets 0 and 2).
static
inline void mayo_3_P3_times_S2_avx2(const uint64_t *P3, __m256i *S2_multabs, uint64_t *acc){
    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);

    size_t k,c; // k is declared here because the odd-K_MAX tail reuses its final value
    size_t cols_used = 0; // index of the next unread P3 entry
    for (size_t r = 0; r < O_MAX; r++)
    {
        // do multiplications for one row and accumulate results in temporary format
        // temp holds 4 vectors per table index: {low, high nibbles} x {window 0, window 1}
        __m256i temp[K_OVER_2*2*2] = {0};
        for (c=r; c < O_MAX; c++)
        {
            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6)); // first 64 field elements
            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of each byte
            in_odd0 &= low_nibble_mask;                                         // low nibble of each byte
            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96)
            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
            in_odd1 &= low_nibble_mask;
            cols_used++;

            for (k = 0; k < K_OVER_2; k++)
            {
                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
            }
        }

        // convert to normal format and add to accumulator
        // (k now counts output columns in steps of 2, so temp[2*k..2*k+3] is the
        //  same group that was written as temp[4*(k/2)..] above)
        for (k = 0; k+1 < K_MAX; k+=2)
        {
            // Load every window before storing: offsets 0 and 2 overlap in words 2..3.
            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));

            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;

            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),     acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
        }
#if K_MAX % 2 == 1
        // Odd K_MAX: k == K_MAX-1 here; only the first column of the last pair exists.
        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));

        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;

        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),     acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
#endif
    }
}
// GF(16) multiplication, field defined as GF(2)[x] / (x^4 + x + 1).
// Only the low four bits of `a` select partial products.
static inline unsigned char mul_f(unsigned char a, unsigned char b) {
    // Carry-less multiply: XOR together b shifted by each set bit of a's low nibble.
    unsigned int prod = 0;
    for (unsigned int bit = 1; bit <= 8; bit <<= 1) {
        prod ^= (a & bit) * b;
    }
    prod &= 0xff; // same truncation an unsigned char accumulator applies

    // Reduce: x^4 = x + 1, so fold the high nibble back in at shifts 4 and 3.
    const unsigned int overflow = prod & 0xf0;
    return (unsigned char) ((prod ^ (overflow >> 4) ^ (overflow >> 3)) & 0x0f);
}

// Same multiplication applied to eight field elements at once, one element in
// the low nibble of each byte of `b`. The result keeps one element per byte.
static inline uint64_t mul_fx8(unsigned char a, uint64_t b) {
    // Carry-less multiply in every byte lane simultaneously.
    uint64_t prod = 0;
    for (unsigned int bit = 1; bit <= 8; bit <<= 1) {
        prod ^= (uint64_t) (a & bit) * b;
    }

    // Per-lane reduction mod x^4 + x + 1.
    const uint64_t overflow = prod & 0xf0f0f0f0f0f0f0f0;
    return (prod ^ (overflow >> 4) ^ (overflow >> 3)) & 0x0f0f0f0f0f0f0f0f;
}

// GF(16) addition (characteristic 2, so this is XOR).
static inline unsigned char add_f(unsigned char a, unsigned char b) {
    return a ^ b;
}

// GF(16) subtraction; identical to addition in characteristic 2.
static inline unsigned char sub_f(unsigned char a, unsigned char b) {
    return a ^ b;
}

// GF(16) negation; every element is its own negative.
static inline unsigned char neg_f(unsigned char a) {
    return a;
}

// GF(16) inverse computed as a^14 through a fixed multiplication chain.
// Returns 0 for a == 0.
// NOTE(review): presumably arithmetic is used instead of a 16-entry lookup
// table to avoid input-dependent memory access - confirm.
static inline unsigned char inverse_f(unsigned char a) {
    const unsigned char sq   = mul_f(a, a);      // a^2
    const unsigned char quad = mul_f(sq, sq);    // a^4
    const unsigned char oct  = mul_f(quad, quad); // a^8
    const unsigned char six  = mul_f(sq, quad);  // a^6

    return mul_f(oct, six);                      // a^14
}

// Dot product over GF(16) of a[0..n-1] with every m-th element of b
// (b[0], b[m], b[2m], ...).
static inline unsigned char lincomb(const unsigned char *a,
                                    const unsigned char *b, int n, int m) {
    unsigned char sum = 0;
    for (int i = 0; i < n; ++i) {
        sum = add_f(mul_f(a[i], b[i * m]), sum);
    }
    return sum;
}

// c = a * b over GF(16); a is row_a x colrow_ab, b is colrow_ab x col_b,
// c is row_a x col_b, all row-major.
static inline void mat_mul(const unsigned char *a, const unsigned char *b,
                           unsigned char *c, int colrow_ab, int row_a, int col_b) {
    for (int i = 0; i < row_a; ++i) {
        const unsigned char *a_row = a + i * colrow_ab;
        unsigned char *c_row = c + i * col_b;
        for (int j = 0; j < col_b; ++j) {
            c_row[j] = lincomb(a_row, b + j, colrow_ab, col_b);
        }
    }
}

// c = a + b over GF(16) for row-major m x n matrices.
static inline void mat_add(const unsigned char *a, const unsigned char *b,
                           unsigned char *c, int m, int n) {
    for (int i = 0; i < m; ++i) {
        const int row = i * n;
        for (int j = 0; j < n; ++j) {
            c[row + j] = add_f(a[row + j], b[row + j]);
        }
    }
}

// Copies one m-vector (m_legs * 2 uint64 words) from in to out.
static
inline void m_vec_copy(int m_legs, const uint64_t *in,
                       uint64_t *out) {
    const int words = m_legs * 2;
    for (int w = 0; w < words; w++) {
        out[w] = in[w];
    }
}

// XORs one m-vector (m_legs * 2 uint64 words) from in into acc.
static
inline void m_vec_add(int m_legs, const uint64_t *in,
                      uint64_t *acc) {
    const int words = m_legs * 2;
    for (int w = 0; w < words; w++) {
        acc[w] ^= in[w];
    }
}
// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1)
// gf16 := gf2[x]/(x^4+x+1)

// Multiplies each of the eight nibble-packed GF(16) elements in `a` by the
// scalar held in the low four bits of `b` (higher bits of `b` are ignored).
static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) {
    const uint32_t scalar = b;
    uint32_t shifted = a;                      // a * x^i in every nibble
    uint32_t result = shifted * (scalar & 1);  // contribution of bit 0

    for (unsigned int bit = 1; bit < 4; bit++) {
        // Multiply every nibble by x: drop the x^3 coefficient, shift left,
        // and add x + 1 (0b0011) wherever that coefficient was set.
        const uint32_t high = shifted & 0x88888888;
        shifted = ((shifted ^ high) << 1) ^ ((high >> 3) * 3);
        result ^= shifted * ((scalar >> bit) & 1);
    }

    return result;
}
// Thin NIST-style API wrappers: each forwards to the corresponding mayo_*
// routine, passing MAYO_PARAMS as the parameter-set argument. Return values are
// passed through unchanged.
// NOTE(review): presumably 0 means success for all mayo_* calls - confirm
// against their declarations.

// Generates a key pair into pk / sk via mayo_keypair.
int
crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
    return mayo_keypair(MAYO_PARAMS, pk, sk);
}

// Signs m (mlen bytes) with sk via mayo_sign, writing the output to sm and its
// length to *smlen.
int
crypto_sign(unsigned char *sm, size_t *smlen,
            const unsigned char *m, size_t mlen,
            const unsigned char *sk) {
    return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk);
}

// Produces a signature over m (mlen bytes) with sk via mayo_sign_signature,
// writing it to sig and its length to *siglen.
int
crypto_sign_signature(unsigned char *sig,
                      size_t *siglen, const unsigned char *m,
                      size_t mlen, const unsigned char *sk) {
    return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk);
}

// Opens sm (smlen bytes) with pk via mayo_open, writing the output to m and
// its length to *mlen.
int
crypto_sign_open(unsigned char *m, size_t *mlen,
                 const unsigned char *sm, size_t smlen,
                 const unsigned char *pk) {
    return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk);
}

// Verifies sig over m (mlen bytes) with pk via mayo_verify.
// Returns -1 without calling mayo_verify when siglen != CRYPTO_BYTES.
int
crypto_sign_verify(const unsigned char *sig, size_t siglen,
                   const unsigned char *m, size_t mlen,
                   const unsigned char *pk) {
    // Reject signatures of the wrong length up front.
    if (siglen != CRYPTO_BYTES)
        return -1;
    // Note the argument order expected by mayo_verify: message first, then signature.
    return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk);
}
unsigned char *sk); + +#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open) +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk); + +#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify) +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk); + +#endif /* api_h */ + diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic.c b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic.c new file mode 100644 index 0000000000..de09954c8a --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { +#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64) + mayo12_m_upper(m_legs, in, out, size); +#else + int m_vecs_stored = 0; + for (int r = 0; r < size; r++) { + for (int c = r; c < size; c++) { + m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored ); + if (r != c) { + m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored ); + } + m_vecs_stored ++; + } + } +#endif +} + +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){ +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + (void) p; + mayo_12_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + (void) p; + mayo_3_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + (void) p; + mayo_5_P1P1t_times_O(P1, O, acc); +#else + #ifndef MAYO_VARIANT + const int m_legs = PARAM_m(p) / 32; + #else + (void) p; + #endif + const int param_o = PARAM_o(p); + const int param_v = PARAM_n(p) - PARAM_o(p);; + + int + bs_mat_entries_used = 0; + for (int r = 0; r < param_v; r++) { + for 
(int c = r; c < param_v; c++) { + if(c==r) { + bs_mat_entries_used += 1; + continue; + } + for (int k = 0; k < param_o; k += 1) { + +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k)); + vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k], acc + 4 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k], acc + 6 * (r * param_o + k)); + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k], acc + 6 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k], acc + 8 * (r * param_o + k)); + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k], acc + 8 * (c * param_o + k)); +#else + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k)); + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k)); +#endif + + } + bs_mat_entries_used += 1; + } + } +#endif +} + +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_12_Vt_times_L_avx2(L, V_multabs, M); + mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_3_Vt_times_L_avx2(L, V_multabs, M); + mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_3_Vt_times_Pv_avx2(Pv, 
V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_5_Vt_times_L_avx2(L, V_multabs, M); + mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mul_add_mat_x_m_mat(param_m / 32, V, L, M, + param_k, param_n - param_o, param_o); + + mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1); + + mul_add_mat_x_m_mat(param_m / 32, V, Pv, + Y, param_k, param_n - param_o, + param_k); +#endif +} + +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int m_legs = PARAM_m(p) / 32; + 
mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1); + mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o); +#endif +} + + +// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2] +// [ 0 P3 ] [S2] [ P3*S2] +// compute S * PS = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ] +// [ P3*S2 = P2 ] +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS) { + (void) m; +#if MAYO_AVX + const int n = o + v; + + /* Old approach which is constant time but doesn't have to be */ + unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K + unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K + unsigned char *s1_write = S1; + unsigned char *s2_write = S2; + + for (int r=0; r < k; r++) + { + for (int c = 0; c < n; c++) + { + if(c < v){ + *(s1_write++) = S[r*n + c]; + } else { + *(s2_write++) = S[r*n + c]; + } + } + } + + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + +#if M_MAX == 64 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 96 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + 
mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 128 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS); + mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#else + NOT IMPLEMENTED +#endif +#else + const int n = o + v; + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS); + mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS); +#endif +} + + +// sample a solution x to Ax = y, with r used as randomness +// require: +// - A is a matrix with m rows and k*o+1 collumns (values in the last collum are +// not important, they will be overwritten by y) in row major order +// - y is a vector with m elements +// - r and x are k*o bytes long +// return: 1 on success, 0 on failure +int sample_solution(const mayo_params_t *p, unsigned char *A, + const unsigned char *y, const unsigned char *r, + unsigned char *x, int k, int o, int m, int A_cols) { + #ifdef MAYO_VARIANT + (void) p; + #endif + unsigned char finished; + int col_upper_bound; + unsigned char correct_column; + + // x <- r + for (int i = 0; i < k * o; i++) { + x[i] = r[i]; + } + + // compute Ar; + unsigned char Ar[M_MAX]; + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = 0; // clear last col of A + } + mat_mul(A, r, Ar, k * o + 1, m, 1); + + // move y - Ar to last column of matrix A + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]); + } + + EF(A, m, k * o + 1); + + // check if last row 
of A (excluding the last entry of y) is zero + unsigned char full_rank = 0; + for (int i = 0; i < A_cols - 1; i++) { + full_rank |= A[(m - 1) * A_cols + i]; + } + +// It is okay to leak if we need to restart or not +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1); +#endif + + if (full_rank == 0) { + return 0; + } + + // back substitution in constant time + // the index of the first nonzero entry in each row is secret, which makes + // things less efficient + + for (int row = m - 1; row >= 0; row--) { + finished = 0; + col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o); + // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32} + + for (int col = row; col <= col_upper_bound; col++) { + + // Compare two chars in constant time. + // Returns 0x00 if the byte arrays are equal, 0xff otherwise. + correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished; + + unsigned char u = correct_column & A[row * A_cols + A_cols - 1]; + x[col] ^= u; + + for (int i = 0; i < row; i += 8) { + uint64_t tmp = ( (uint64_t) A[ i * A_cols + col] << 0) ^ ( (uint64_t) A[(i+1) * A_cols + col] << 8) + ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24) + ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40) + ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56); + + tmp = mul_fx8(u, tmp); + + A[ i * A_cols + A_cols - 1] ^= (tmp ) & 0xf; + A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf; + A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf; + A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf; + A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf; + A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf; + A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf; + A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf; + } + + finished = finished | correct_column; + } + } + return 1; +} + diff --git 
a/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic.h b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic.h new file mode 100644 index 0000000000..c2fb4fe334 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic.h @@ -0,0 +1,62 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_H +#define ARITHMETIC_H + +#include +#include +#include + +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define TARGET_BIG_ENDIAN +#endif + +#if defined(MAYO_AVX) && (M_MAX == 64) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 96) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 128) + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128)) + #include +#endif + +// Calculate P3 = O^T * (P1*O + P2) in KeyGen +#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2) +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3); + +// Calculate Upper in KeyGen +#define m_upper MAYO_NAMESPACE(m_upper) +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size); + +// Calculate acc = (P1+P1^T)*O in expand_sk +#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O) +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc); + +// Calculate M=V*L and Y=V*P1*V^T in Sign +#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt) +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y); + +// Sample solution in Sign +#define sample_solution MAYO_NAMESPACE(sample_solution) +int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned 
char *x, int k, int o, int m, int A_cols); + +// Calculate SPS = S*P*S^T in Verify +#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS) +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_128.h new file mode 100644 index 0000000000..418c308e2f --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_128.h @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_128_H +#define ARITHMETIC_128_H + +#include +#include +#include + +// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_128(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; + out[6] = in[6]; + out[7] = in[7]; +} + + +static +inline void vec_add_128(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; + acc[6] ^= in[6]; + acc[7] ^= in[7]; +} + +inline +static void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) { + uint64_t mask_msb = 0x8888888888888888ULL; + for(int i=0; i < 8;i++){ + uint64_t t = in[i] & mask_msb; + acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); + } +} + +inline +static void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) { + uint64_t mask_lsb = 0x1111111111111111ULL; + for(int i=0; i < 8;i++){ + uint64_t t = in[i] & mask_lsb; + acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); + } +} + +static +inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) { + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < 8;i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * 
((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +static +inline void multiply_bins_128(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_128(bins + 5 * 8, bins + 10 * 8); + m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8); + m_vec_mul_add_x_inv_128(bins + 10 * 8, bins + 7 * 8); + m_vec_mul_add_x_128(bins + 12 * 8, bins + 6 * 8); + m_vec_mul_add_x_inv_128(bins + 7 * 8, bins + 14 * 8); + m_vec_mul_add_x_128(bins + 6 * 8, bins + 3 * 8); + m_vec_mul_add_x_inv_128(bins + 14 * 8, bins + 15 * 8); + m_vec_mul_add_x_128(bins + 3 * 8, bins + 8 * 8); + m_vec_mul_add_x_inv_128(bins + 15 * 8, bins + 13 * 8); + m_vec_mul_add_x_128(bins + 8 * 8, bins + 4 * 8); + m_vec_mul_add_x_inv_128(bins + 13 * 8, bins + 9 * 8); + m_vec_mul_add_x_128(bins + 4 * 8, bins + 2 * 8); + m_vec_mul_add_x_inv_128(bins + 9 * 8, bins + 1 * 8); + m_vec_mul_add_x_128(bins + 2 * 8, bins + 1 * 8); + vec_copy_128(bins + 8, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_64.h new file mode 100644 index 0000000000..a70b7a3118 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_64.h @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_64_H +#define ARITHMETIC_64_H + +#include +#include + +// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_64(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; +} + +static +inline void vec_add_64(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; +} + +static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) { + uint64_t mask_msb = 0x8888888888888888ULL; + uint64_t a_msb; + uint64_t a64 = a; + uint64_t b32 = b; + uint64_t r64 = a64 * (b32 & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 
^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 1) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 2) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 3) & 1); + + return r64; +} + +static inline uint32_t mul_table(uint8_t b){ + uint32_t x = ((uint32_t) b) * 0x08040201; + + uint32_t high_nibble_mask = 0xf0f0f0f0; + + uint32_t high_half = x & high_nibble_mask; + return (x ^ (high_half >> 4) ^ (high_half >> 3)); +} + +static +inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) { + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < 4;i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +static +inline void vec_mul_add_u64(const int legs, const uint64_t *in, unsigned char a, uint64_t *acc) { + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < legs; i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +inline +static void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) { + uint64_t mask_msb = 0x8888888888888888ULL; + for(int i=0; i < 4;i++){ + uint64_t t = in[i] & mask_msb; + acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); + } +} + +inline +static void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) { + uint64_t mask_lsb = 0x1111111111111111ULL; + for(int i=0; i < 4;i++){ + uint64_t t = in[i] & mask_lsb; + acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); + } +} + +static +inline void 
multiply_bins_64(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_64(bins + 5 * 4, bins + 10 * 4); + m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4); + m_vec_mul_add_x_inv_64(bins + 10 * 4, bins + 7 * 4); + m_vec_mul_add_x_64(bins + 12 * 4, bins + 6 * 4); + m_vec_mul_add_x_inv_64(bins + 7 * 4, bins + 14 * 4); + m_vec_mul_add_x_64(bins + 6 * 4, bins + 3 * 4); + m_vec_mul_add_x_inv_64(bins + 14 * 4, bins + 15 * 4); + m_vec_mul_add_x_64(bins + 3 * 4, bins + 8 * 4); + m_vec_mul_add_x_inv_64(bins + 15 * 4, bins + 13 * 4); + m_vec_mul_add_x_64(bins + 8 * 4, bins + 4 * 4); + m_vec_mul_add_x_inv_64(bins + 13 * 4, bins + 9 * 4); + m_vec_mul_add_x_64(bins + 4 * 4, bins + 2 * 4); + m_vec_mul_add_x_inv_64(bins + 9 * 4, bins + 1 * 4); + m_vec_mul_add_x_64(bins + 2 * 4, bins + 1 * 4); + vec_copy_64(bins + 4, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_96.h new file mode 100644 index 0000000000..a38f89e454 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_96.h @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_96_H +#define ARITHMETIC_96_H + +#include +#include +#include + +// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_96(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; +} + +static +inline void vec_add_96(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; +} + +inline +static void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) { + uint64_t mask_msb = 0x8888888888888888ULL; + for(int i=0; i < 6;i++){ + uint64_t t = in[i] & mask_msb; + acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); + } +} + +inline +static void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) { + uint64_t 
mask_lsb = 0x1111111111111111ULL; + for(int i=0; i < 6;i++){ + uint64_t t = in[i] & mask_lsb; + acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); + } +} + +static +inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) { + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < 6;i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +static +inline void multiply_bins_96(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_96(bins + 5 * 6, bins + 10 * 6); + m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6); + m_vec_mul_add_x_inv_96(bins + 10 * 6, bins + 7 * 6); + m_vec_mul_add_x_96(bins + 12 * 6, bins + 6 * 6); + m_vec_mul_add_x_inv_96(bins + 7 * 6, bins + 14 * 6); + m_vec_mul_add_x_96(bins + 6 * 6, bins + 3 * 6); + m_vec_mul_add_x_inv_96(bins + 14 * 6, bins + 15 * 6); + m_vec_mul_add_x_96(bins + 3 * 6, bins + 8 * 6); + m_vec_mul_add_x_inv_96(bins + 15 * 6, bins + 13 * 6); + m_vec_mul_add_x_96(bins + 8 * 6, bins + 4 * 6); + m_vec_mul_add_x_inv_96(bins + 13 * 6, bins + 9 * 6); + m_vec_mul_add_x_96(bins + 4 * 6, bins + 2 * 6); + m_vec_mul_add_x_inv_96(bins + 9 * 6, bins + 1 * 6); + m_vec_mul_add_x_96(bins + 2 * 6, bins + 1 * 6); + vec_copy_96(bins + 6, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_common.h new file mode 100644 index 0000000000..d337bc238c --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_common.h @@ -0,0 +1,469 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_COMMON_H +#define ARITHMETIC_COMMON_H + +#include + +#ifndef MAYO_VARIANT +static void m_multiply_bins(const int m_legs, uint64_t *bins, uint64_t *out) { + + m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 12 * m_legs * 2); + m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 
3 * m_legs * 2); + + m_vec_add(m_legs, bins + 14 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 14 * m_legs * 2, bins + 6 * m_legs * 2); + + m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 10 * m_legs * 2); + m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 7 * m_legs * 2); + + m_vec_add(m_legs, bins + 12 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 12 * m_legs * 2, bins + 4 * m_legs * 2); + + m_vec_add(m_legs, bins + 11 * m_legs * 2, bins + 9 * m_legs * 2); + m_vec_add(m_legs, bins + 11 * m_legs * 2, bins + 2 * m_legs * 2); + + m_vec_add(m_legs, bins + 10 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 10 * m_legs * 2, bins + 2 * m_legs * 2); + + m_vec_add(m_legs, bins + 9 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 9 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_add(m_legs, bins + 7 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_add(m_legs, bins + 7 * m_legs * 2, bins + 3 * m_legs * 2); + + m_vec_add(m_legs, bins + 6 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_add(m_legs, bins + 6 * m_legs * 2, bins + 2 * m_legs * 2); + + m_vec_add(m_legs, bins + 5 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_add(m_legs, bins + 5 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_add(m_legs, bins + 3 * m_legs * 2, bins + 2 * m_legs * 2); + m_vec_add(m_legs, bins + 3 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_mul_add_x(m_legs, bins + 8 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_mul_add_x(m_legs, bins + 4 * m_legs * 2, bins + 2 * m_legs * 2); + m_vec_mul_add_x(m_legs, bins + 2 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_copy(m_legs, bins + 1 * m_legs * 2, out); +} +#endif + +// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2] +// [ 0 P3 ] [S2] [ P3*S2] +static inline void mayo_generic_m_calculate_PS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *PS) { + + const int n = o + v; +#if 
defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || M_MAX == 128) + (void)m; +#else + const int m_legs = m / 32; +#endif + + /* Old approach which is constant time but doesn't have to be + unsigned char S1[V_MAX*K_MAX]; + unsigned char S2[O_MAX*K_MAX]; + unsigned char *s1_write = S1; + unsigned char *s2_write = S2; + for (int r=0; r < k; r++) + { + for (int c = 0; c < n; c++) + { + if(c < v){ + *(s1_write++) = S[r*n + c]; + } else { + *(s2_write++) = S[r*n + c]; + } + } + } + + mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, S1, PS, v, v, k, 1); // P1 * S1 + mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P2, S2, PS, v, o, k, 0); // P2 * S2 + mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P3, S2, PS + v*k*m_legs*4, o, o, k, 1); // P3 * S2. + */ + + // use more stack efficient version for MAYO_3 and MAYO_5 + #if (defined(HAVE_STACKEFFICIENT) || defined(PQM4)) && N_MAX > 78 + uint64_t accumulator[M_MAX * N_MAX] = {0}; + int P1_used; + int P3_used; + for (int col = 0; col < k; col++) { + for(unsigned int i = 0; i < sizeof(accumulator)/8; i++) { + accumulator[i] = 0; + } + P1_used = 0; + for (int row = 0; row < v; row++) { + for (int j = row; j < v; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(P1 + P1_used * 4, accumulator + ( row * 16 + S[col * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(P1 + P1_used * 6, accumulator + ( row * 16 + S[col * n + j] ) * 6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(P1 + P1_used * 8, accumulator + ( row * 16 + S[col * n + j] ) * 8); +#else + bitsliced_m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 ); +#endif + P1_used ++; + } + + for (int j = 0; j < o; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(P2 + (row * o + j)*4, accumulator + ( row * 16 + S[(col * n) + j + v] )*4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(P2 + (row * o + j)*6, accumulator + ( row * 16 + 
S[(col * n) + j + v] )*6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(P2 + (row * o + j)*8, accumulator + ( row * 16 + S[(col * n) + j + v] )*8 ); +#else + bitsliced_m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( row * 16 + S[(col * n) + j + v] )*m_legs * 2 ); +#endif + } + } + + P3_used = 0; + for (int row = v; row < n; row++) { + for (int j = row; j < n; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(P3 + P3_used * 4, accumulator + ( row * 16 + S[col * n + j] )*4); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(P3 + P3_used * 6, accumulator + ( row * 16 + S[col * n + j] )*6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(P3 + P3_used * 8, accumulator + ( row * 16 + S[col * n + j] )*8); +#else + bitsliced_m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 ); +#endif + P3_used ++; + } + } + + for (int row = 0; row < n; row++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + multiply_bins_64(accumulator + row * 16 * 4, PS + (row * k + col) * 4); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + multiply_bins_96(accumulator + row * 16 * 6, PS + (row * k + col) * 6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + multiply_bins_128(accumulator + row * 16 * 8, PS + (row * k + col) * 8); +#else + bitsliced_m_multiply_bins(m_legs, accumulator + row * 16 * m_legs * 2, PS + (row * k + col) * m_legs * 2); +#endif + } + } + + #else + + alignas (32) uint64_t accumulator[M_MAX * K_MAX * N_MAX] = {0}; + int P1_used = 0; + for (int row = 0; row < v; row++) { + for (int j = row; j < v; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9) + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, 
accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4) + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11) + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 ); +#elif 
defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12) + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 ); +#else + for (int col = 0; col < k; col++) { + m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 ); + } +#endif + P1_used ++; + } + + + for (int j = 0; j < o; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9) + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator 
+ ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4) + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11) + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + 
j + v] ) * 6 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12) + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 11) * 16 + S[(11 * n) + j + v] ) * 8 ); +#else + for (int col = 0; col < k; col++) { + m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( (row * k + col) * 16 + S[(col * n) + j + v] )*m_legs * 2 ); + } +#endif + } + } + + int P3_used = 0; + for (int row = v; row < n; row++) { + for (int j = row; j < n; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9) + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 
16 + S[3 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4) + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11) + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 
128) && (K_MAX == 12) + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 ); +#else + for (int col = 0; col < k; col++) { + m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 ); + } +#endif + P3_used ++; + } + } + + // multiply stuff according to the bins of the accumulator and add to PS. 
+ int i = 0; + while (i < n * k) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + multiply_bins_64(accumulator + i * 16 * 4, PS + i * 4); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + multiply_bins_96(accumulator + i * 16 * 6, PS + i * 6); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + multiply_bins_128(accumulator + i * 16 * 8, PS + i * 8); + i++; +#else + m_multiply_bins(m/32, accumulator + i * 16 * (m/32) * 2, PS + i * (m/32) * 2); + i++; +#endif + } + + #endif +} + + +static inline void mayo_generic_m_calculate_SPS(const uint64_t *PS, const unsigned char *S, int m, int k, int n, uint64_t *SPS){ + alignas (32) uint64_t accumulator[M_MAX*K_MAX*K_MAX] = {0}; + #if !defined(MAYO_VARIANT) + const int m_legs = m/32; + #else + (void) m; + #endif + for (int row = 0; row < k; row++) { + for (int j = 0; j < n; j++) { + for (int col = 0; col < k; col += 1) { + #if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(PS + (j * k + col) * 4, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 4 ); + #elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(PS + (j * k + col) * 6, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 6 ); + #elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(PS + (j * k + col) * 8, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 8 ); + #else + m_vec_add(m_legs, PS + (j * k + col) * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[row * n + j] )*m_legs * 2 ); + #endif + } + } + } + + // multiply stuff according to the bins of the accumulator and add to PS. 
+ int i = 0; + while (i < k*k) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + multiply_bins_64(accumulator + i * 16 * 4, SPS + i * 4); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + multiply_bins_96(accumulator + i * 16 * 6, SPS + i * 6); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + multiply_bins_128(accumulator + i * 16 * 8, SPS + i * 8); + i++; +#else + m_multiply_bins(m_legs, accumulator + i * 16 * m_legs * 2, SPS + i * m_legs * 2); + i++; +#endif + } +} + + +// multiplies m (possibly upper triangular) matrices with a single matrix and adds result to acc +static inline void mul_add_m_upper_triangular_mat_x_mat(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_cols, int triangular) { + + int bs_mat_entries_used = 0; + for (int r = 0; r < bs_mat_rows; r++) { + for (int c = triangular * r; c < bs_mat_cols; c++) { + for (int k = 0; k < mat_cols; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 4 * (r * mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 6 * (r * mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 8 * (r * mat_cols + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[c * mat_cols + k], acc + m_legs * 2 * (r * mat_cols + k)); +#endif + } + bs_mat_entries_used += 1; + } + } +} + +// multiplies m (possibly upper triangular) matrices with the transpose of a single matrix and adds result to acc +static inline void mul_add_m_upper_triangular_mat_x_mat_trans(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_rows, int triangular) { + + int bs_mat_entries_used 
// Multiplies the transpose of a single matrix with m matrices and adds the
// result to acc.
//   mat    - mat_rows x mat_cols matrix of scalars, row-major
//   bs_mat - mat_rows x bs_mat_cols matrix of m-vectors, row-major
//   acc    - mat_cols x bs_mat_cols matrix of m-vectors, row-major
static inline void mul_add_mat_trans_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) {
    // number of uint64_t words per m-vector
#if defined(MAYO_VARIANT) && (M_MAX == 64)
    (void) m_legs;
    const int words = 4;
#elif defined(MAYO_VARIANT) && (M_MAX == 96)
    (void) m_legs;
    const int words = 6;
#elif defined(MAYO_VARIANT) && (M_MAX == 128)
    (void) m_legs;
    const int words = 8;
#else
    const int words = m_legs * 2;
#endif

    for (int r = 0; r < mat_cols; r++) {
        for (int c = 0; c < mat_rows; c++) {
            for (int k = 0; k < bs_mat_cols; k++) {
                const uint64_t *src = bs_mat + words * (c * bs_mat_cols + k);
                uint64_t *dst = acc + words * (r * bs_mat_cols + k);
                // scalar is read from column r of row c, i.e. mat is used transposed
#if defined(MAYO_VARIANT) && (M_MAX == 64)
                vec_mul_add_64(src, mat[c * mat_cols + r], dst);
#elif defined(MAYO_VARIANT) && (M_MAX == 96)
                vec_mul_add_96(src, mat[c * mat_cols + r], dst);
#elif defined(MAYO_VARIANT) && (M_MAX == 128)
                vec_mul_add_128(src, mat[c * mat_cols + r], dst);
#else
                m_vec_mul_add(m_legs, src, mat[c * mat_cols + r], dst);
#endif
            }
        }
    }
}

// Multiplies a single matrix with m matrices and adds the result to acc.
//   mat    - mat_rows x mat_cols matrix of scalars, row-major
//   bs_mat - mat_cols x bs_mat_cols matrix of m-vectors, row-major
//   acc    - mat_rows x bs_mat_cols matrix of m-vectors, row-major
static inline void mul_add_mat_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) {
    // number of uint64_t words per m-vector
#if defined(MAYO_VARIANT) && (M_MAX == 64)
    (void) m_legs;
    const int words = 4;
#elif defined(MAYO_VARIANT) && (M_MAX == 96)
    (void) m_legs;
    const int words = 6;
#elif defined(MAYO_VARIANT) && (M_MAX == 128)
    (void) m_legs;
    const int words = 8;
#else
    const int words = m_legs * 2;
#endif

    for (int r = 0; r < mat_rows; r++) {
        for (int c = 0; c < mat_cols; c++) {
            for (int k = 0; k < bs_mat_cols; k++) {
                const uint64_t *src = bs_mat + words * (c * bs_mat_cols + k);
                uint64_t *dst = acc + words * (r * bs_mat_cols + k);
#if defined(MAYO_VARIANT) && (M_MAX == 64)
                vec_mul_add_64(src, mat[r * mat_cols + c], dst);
#elif defined(MAYO_VARIANT) && (M_MAX == 96)
                vec_mul_add_96(src, mat[r * mat_cols + c], dst);
#elif defined(MAYO_VARIANT) && (M_MAX == 128)
                vec_mul_add_128(src, mat[r * mat_cols + c], dst);
#else
                m_vec_mul_add(m_legs, src, mat[r * mat_cols + c], dst);
#endif
            }
        }
    }
}
// Returns the 4-bit element number `index` of a packed vector: 16 elements per
// uint64_t, element 0 in the least significant nibble of in[0].
static inline unsigned char
m_extract_element(const uint64_t *in, int index) {
    const uint64_t word = in[index / 16];
    const int shift = (index % 16) * 4;

    return (word >> shift) & 0xF;
}

// Maps the logical byte number b of a packed vector to its position in memory.
// On big-endian targets the bytes inside every 8-byte word are mirrored so that
// the uint64_t view of the buffer holds the same value as on little-endian.
static inline int
ef_packed_byte_pos(int b) {
#ifdef TARGET_BIG_ENDIAN
    return (((b + 8) / 8) * 8) - 1 - b % 8;
#else
    return b;
#endif
}

// Packs ncols elements (one per byte of `in`) two per byte into `out`: the
// even-numbered element lands in the low nibble, the odd-numbered one in the
// high nibble. With an odd ncols the last byte receives only the final element.
// Bytes of `out` beyond the packed data are left untouched.
static inline void
ef_pack_m_vec(const unsigned char *in, uint64_t *out, int ncols) {
    unsigned char *out8 = (unsigned char *)out;
    const int full_bytes = ncols / 2;

    for (int b = 0; b < full_bytes; b++) {
        out8[ef_packed_byte_pos(b)] = in[2 * b] | (in[2 * b + 1] << 4);
    }
    if (ncols % 2 == 1) {
        out8[ef_packed_byte_pos(full_bytes)] = in[2 * full_bytes];
    }
}

// Inverse of ef_pack_m_vec for whole words: expands `legs` uint64_t words into
// legs * 16 elements, one per byte of `out`.
static inline void
ef_unpack_m_vec(int legs, const uint64_t *in, unsigned char *out) {
    const unsigned char *in8 = (const unsigned char *)in;
    const int nbytes = legs * 8;

    for (int b = 0; b < nbytes; b++) {
        const unsigned char packed = in8[ef_packed_byte_pos(b)];
        out[2 * b] = packed & 0xF;
        out[2 * b + 1] = packed >> 4;
    }
}
guaranteed to be between these lower and upper bounds if + // A has full rank + + // zero out pivot row + for (int i = 0; i < row_len; i++) { + _pivot_row[i] = 0; + _pivot_row2[i] = 0; + } + + // try to get a pivot row in constant time + unsigned char pivot = 0; + uint64_t pivot_is_zero = -1; + for (int row = pivot_row_lower_bound; + row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) { + + uint64_t is_pivot_row = ~ct_compare_64(row, pivot_row); + uint64_t below_pivot_row = ct_64_is_greater_than(row, pivot_row); + + for (int j = 0; j < row_len; j++) { + _pivot_row[j] ^= (is_pivot_row | (below_pivot_row & pivot_is_zero)) & + packed_A[row * row_len + j]; + } + pivot = m_extract_element(_pivot_row, pivot_col); + pivot_is_zero = ~ct_compare_64((int) pivot, 0); + } + + // multiply pivot row by inverse of pivot + inverse = inverse_f(pivot); + vec_mul_add_u64(row_len, _pivot_row, inverse, _pivot_row2); + + // conditionally write pivot row to the correct row, if there is a nonzero + // pivot + for (int row = pivot_row_lower_bound; row <= pivot_row_upper_bound; row++) { + uint64_t do_copy = ~ct_compare_64(row, pivot_row) & ~pivot_is_zero; + uint64_t do_not_copy = ~do_copy; + for (int col = 0; col < row_len; col++) { + packed_A[row * row_len + col] = + (do_not_copy & packed_A[row * row_len + col]) + + (do_copy & _pivot_row2[col]); + } + } + + // eliminate entries below pivot + for (int row = pivot_row_lower_bound; row < nrows; row++) { + unsigned char below_pivot = (row > pivot_row); + unsigned char elt_to_elim = m_extract_element(packed_A + row * row_len, pivot_col); + + vec_mul_add_u64(row_len, _pivot_row2, below_pivot * elt_to_elim, + packed_A + row * row_len); + } + + pivot_row += (-(int64_t)(~pivot_is_zero)); + } + + unsigned char temp[(O_MAX * K_MAX + 1 + 15)]; + + // unbitslice the matrix A + for (int i = 0; i < nrows; i++) { + ef_unpack_m_vec(row_len, packed_A + i * row_len, temp); + for (int j = 0; j < ncols; j++) { + A[i * ncols + j] = temp[j]; + } + } + 
// Unpacks mdeclen nibbles from m into mdec, one nibble per output byte.
// The low nibble of each input byte comes first, then the high nibble; for an
// odd mdeclen only the low nibble of the last input byte is used.
static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) {
    const int full_bytes = mdeclen / 2;

    for (int b = 0; b < full_bytes; b++) {
        mdec[2 * b] = m[b] & 0xf;
        mdec[2 * b + 1] = m[b] >> 4;
    }

    if (mdeclen % 2 == 1) {
        mdec[2 * full_bytes] = m[full_bytes] & 0x0f;
    }
}

// Packs mlen nibbles (one per byte of m) two per byte into menc: even-numbered
// input goes to the low nibble, odd-numbered input to the high nibble. For an
// odd mlen the last output byte holds the final input byte as-is.
// Inputs are not masked, so callers are expected to pass values below 16.
static void encode(const unsigned char *m, unsigned char *menc, int mlen) {
    const int pairs = mlen / 2;

    for (int b = 0; b < pairs; b++) {
        menc[b] = m[2 * b] | (m[2 * b + 1] << 4);
    }

    if (mlen % 2 == 1) {
        menc[pairs] = m[2 * pairs];
    }
}
// Exchanges the bits of *lo selected by (mask << shift) with the bits of *hi
// selected by mask (classic delta swap between two words).
static inline void nibble_delta_swap(uint64_t *lo, uint64_t *hi, unsigned shift, uint64_t mask) {
    const uint64_t delta = ((*lo >> shift) ^ *hi) & mask;
    *lo ^= delta << shift;
    *hi ^= delta;
}

// Transposes, in place, a 16x16 matrix of 4-bit entries stored as 16 words:
// entry (r, c) is nibble c of M[r]. Works in four passes that swap blocks of
// 1, 2, 4 and 8 nibbles between rows 1, 2, 4 and 8 apart.
static void transpose_16x16_nibbles(uint64_t *M){
    const uint64_t every_other_nibble = 0x0f0f0f0f0f0f0f0fULL;
    const uint64_t every_other_byte   = 0x00ff00ff00ff00ffULL;
    const uint64_t every_other_2bytes = 0x0000ffff0000ffffULL;
    const uint64_t lower_half         = 0x00000000ffffffffULL;

    // 1x1 blocks: rows r and r+1
    for (size_t r = 0; r < 16; r += 2) {
        nibble_delta_swap(&M[r], &M[r + 1], 4, every_other_nibble);
    }

    // 2x2 blocks: rows r and r+2
    for (size_t r = 0; r < 16; r += 4) {
        nibble_delta_swap(&M[r],     &M[r + 2], 8, every_other_byte);
        nibble_delta_swap(&M[r + 1], &M[r + 3], 8, every_other_byte);
    }

    // 4x4 blocks: rows r and r+4
    for (size_t r = 0; r < 4; r++) {
        nibble_delta_swap(&M[r],     &M[r + 4],  16, every_other_2bytes);
        nibble_delta_swap(&M[r + 8], &M[r + 12], 16, every_other_2bytes);
    }

    // 8x8 blocks: rows r and r+8
    for (size_t r = 0; r < 8; r++) {
        nibble_delta_swap(&M[r], &M[r + 8], 32, lower_half);
    }
}
ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *VtL = _VtL; + int bits_to_shift = 0; + int words_to_shift = 0; + uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0}; + size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; + const uint64_t *Mi, *Mj; + + for (int i = 0; i <= PARAM_k(p) - 1; ++i) { + for (int j = PARAM_k(p) - 1; j >= i; --j) { + // add the M_i and M_j to A, shifted "down" by l positions + Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + + if (i != j) { + Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + } + + bits_to_shift += 4; + if(bits_to_shift == 64){ + words_to_shift ++; + bits_to_shift = 0; + } + } + } + + for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16) + { + transpose_16x16_nibbles(A + c); + } + + unsigned char tab[F_TAIL_LEN*4] = {0}; + for (size_t i = 0; i < F_TAIL_LEN; i++) + { + tab[4*i] = mul_f(PARAM_f_tail(p)[i],1); + tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2); + tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4); + tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8); + } + + uint64_t low_bit_in_nibble = 0x1111111111111111; + + for (size_t c = 0; c < A_width; c+= 16) + { + for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++) + { + size_t pos = (r/16)*A_width + c + (r%16); + uint64_t t0 = A[pos] & low_bit_in_nibble; + 
uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble; + uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble; + uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble; + for (size_t t = 0; t < F_TAIL_LEN; t++) + { + A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3]; + } + } + } + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) + A[i] = BSWAP64(A[i]); +#endif + + for (int r = 0; r < PARAM_m(p); r+=16) + { + for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16) + { + for (size_t i = 0; i < 16; i++) + { + decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c)); + } + } + } +} + +// Public API + +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) { + int ret = 0; + + ret = mayo_keypair_compact(p, pk, sk); + if (ret != MAYO_OK) { + goto err; + } + +err: + return ret; +} + +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data + unsigned char y[M_MAX]; // secret data + unsigned char salt[SALT_BYTES_MAX]; // not secret data + unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX], + Vdec[N_MINUS_O_MAX * K_MAX]; // secret data + unsigned char A[M_MAX * (K_MAX * O_MAX + 1)]; // secret data + unsigned char x[K_MAX * N_MAX]; // not secret data + unsigned char r[K_MAX * O_MAX + 1] = { 0 }; // secret data + unsigned char s[K_MAX * N_MAX]; // not secret data + const unsigned char *seed_sk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data + alignas(32) sk_t sk; // secret data + unsigned char Ox[N_MINUS_O_MAX]; // secret data + // unsigned char Mdigest[DIGEST_BYTES]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1]; + unsigned char *ctrbyte; + unsigned char *vi; + + const int 
param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_v_bytes = PARAM_v_bytes(p); + const int param_r_bytes = PARAM_r_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P2_bytes = PARAM_P2_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_A_cols = PARAM_A_cols(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + ret = mayo_expand_sk(p, csk, &sk); + if (ret != MAYO_OK) { + goto err; + } + + seed_sk = csk; + decode(sk.o, O, (param_n - param_o) * param_o); + + // hash message + shake256(tmp, param_digest_bytes, m, mlen); + + uint64_t *P1 = sk.p; + uint64_t *L = P1 + (param_P1_bytes/8); + alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0}; + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + L[i] = BSWAP64(L[i]); + } +#endif + + // choose the randomizer + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(tmp + param_digest_bytes, param_salt_bytes); + #else + if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // hashing to salt + memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk, + param_sk_seed_bytes); + shake256(salt, param_salt_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret +#endif + + // hashing to t + memcpy(tmp + param_digest_bytes, salt, param_salt_bytes); + ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes; + + shake256(tenc, param_m_bytes, tmp, 
param_digest_bytes + param_salt_bytes); + + decode(tenc, t, param_m); // may not be necessary + + for (int ctr = 0; ctr <= 255; ++ctr) { + *ctrbyte = (unsigned char)ctr; + + shake256(V, param_k * param_v_bytes + param_r_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1); + + // decode the v_i vectors + for (int i = 0; i <= param_k - 1; ++i) { + decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o), + param_n - param_o); + } + + // compute all the V * L matrices. + // compute all the V * P1 * V^T matrices. + alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0}; + V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y); + + compute_rhs(p, Y, t, y); + compute_A(p, Mtmp, A); + + decode(V + param_k * param_v_bytes, r, + param_k * + param_o); + if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) { + break; + } else { + memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t)); + } + } + + // s is already 0 + // TODO: optimize this? 
+ for (int i = 0; i <= param_k - 1; ++i) { + vi = Vdec + i * (param_n - param_o); + mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1); + mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1); + memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o); + } + encode(s, sig, param_n * param_k); + memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes); + *siglen = param_sig_bytes; +err: + mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX); + mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX); + mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1)); + mayo_secure_clear(r, K_MAX * O_MAX + 1); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(&sk, sizeof(sk_t)); + mayo_secure_clear(Ox, N_MINUS_O_MAX); + mayo_secure_clear(tmp, + DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1); + return ret; +} + +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + const int param_sig_bytes = PARAM_sig_bytes(p); + size_t siglen = param_sig_bytes; + ret = mayo_sign_signature(p, sm, &siglen, m, mlen, csk); + if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes) + goto err; + + memmove(sm + param_sig_bytes, m, mlen); + *smlen = siglen + mlen; +err: + return ret; +} + +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk) { + const int param_sig_bytes = PARAM_sig_bytes(p); + if (smlen < (size_t)param_sig_bytes) { + return MAYO_ERR; + } + int result = mayo_verify(p, sm + param_sig_bytes, smlen - param_sig_bytes, sm, + pk); + + if (result == MAYO_OK) { + *mlen = smlen - param_sig_bytes; + memmove(m, sm + param_sig_bytes, *mlen); + } + + return result; +} + +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk) { + int ret = MAYO_OK; + unsigned char *seed_sk = csk; + unsigned char 
S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8]; + alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0}; + + unsigned char *seed_pk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_m = PARAM_m(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + // seed_sk $←- B^(sk_seed bytes) + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(seed_sk, param_sk_seed_bytes); + #else + if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // S ← shake256(seedsk, pk seed bytes + O bytes) + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + // seed_pk ← s[0 : pk_seed_bytes] + seed_pk = S; + + // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes]) + decode(S + param_pk_seed_bytes, O, param_v * param_o); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + + int m_legs = param_m / 32; + + uint64_t *P1 = P; + uint64_t *P1O_P2 = P + (param_P1_bytes / 8); + + // compute P3 = O^t * (P1*O + P2) + Ot_times_P1O_P2(p, P1, O, P1O_P2, P3); + + // store seed_pk in cpk + memcpy(cpk, seed_pk, param_pk_seed_bytes); + + alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8]; + + // compute Upper(P3) and store in cpk + m_upper(m_legs, P3, P3_upper, param_o); + + memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes); + + +#if !defined(PQM4) && 
!defined(HAVE_RANDOMBYTES_NORETVAL) +err: +#endif + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2); + return ret; +} + +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *pk) { + #ifdef MAYO_VARIANT + (void)p; + #endif + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + PK_PRF(pk, param_P1_bytes + param_P2_bytes, cpk, param_pk_seed_bytes); + pk += param_P1_bytes + param_P2_bytes; + memmove(pk, cpk + param_pk_seed_bytes, param_P3_bytes); + return MAYO_OK; +} + +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *sk) { + int ret = MAYO_OK; + unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + uint64_t *P = sk->p; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_o = PARAM_o(p); + const int param_v = PARAM_v(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + const unsigned char *seed_sk = csk; + unsigned char *seed_pk = S; + + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + decode(S + param_pk_seed_bytes, O, + param_v * param_o); // O = S + PK_SEED_BYTES; + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + uint64_t *P2 = P + (param_P1_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + uint64_t *P1 = P; + // compute L_i = (P1 
+ P1^t)*O + P2 + uint64_t *L = P2; + P1P1t_times_O(p, P1, O, L); + + // write to sk + memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + return ret; +} + +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *cpk) { + unsigned char tEnc[M_BYTES_MAX]; + unsigned char t[M_MAX]; + unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X) + unsigned char s[K_MAX * N_MAX]; + alignas (64) uint64_t pk[EPK_BYTES_MAX / 8]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX]; + + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P3_bytes = PARAM_P3_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk); + if (ret != MAYO_OK) { + return MAYO_ERR; + } + + uint64_t *P1 = pk; + uint64_t *P2 = pk + (param_P1_bytes / 8); + uint64_t *P3 = P2 + (param_P2_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + P2[i] = BSWAP64(P2[i]); + } + for (int i = 0; i < param_P3_bytes / 8; ++i) { + P3[i] = BSWAP64(P3[i]); + } +#endif + + // hash m + shake256(tmp, param_digest_bytes, m, mlen); + + // compute t + memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - 
param_salt_bytes, + param_salt_bytes); + shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); + decode(tEnc, t, param_m); + + // decode s + decode(sig, s, param_k * param_n); + + // Compute S*P*S^T + alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0}; + + m_calculate_PS_SPS(P1, P2, P3, s, param_m, + param_v, param_o, param_k, SPS); + + // combine the vectors in SPS and reduce mod f(X) + compute_rhs(p, SPS, y, y); + + if (memcmp(y, t, param_m) == 0) { + return MAYO_OK; // good signature + } + return MAYO_ERR; // bad signature +} + diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/mayo.h b/src/sig/mayo/pqmayo_mayo_5_opt/mayo.h new file mode 100644 index 0000000000..1de4bf2a33 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_opt/mayo.h @@ -0,0 +1,435 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MAYO_H +#define MAYO_H + +#include +#include + +#define F_TAIL_LEN 5 +#define F_TAIL_64 \ + { 8, 0, 2, 8, 0 } // f(z) = z^64 + x^3*z^3 + x*z^2 + x^3 +#define F_TAIL_96 \ + { 2, 2, 0, 2, 0 } // f(z) = z^96 + x*z^3 + x*z + x +#define F_TAIL_128 \ + { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3 + x^3*z + x^2 + +#define MAYO_1_name "MAYO_1" +#define MAYO_1_n 66 +#define MAYO_1_m 64 +#define MAYO_1_o 8 +#define MAYO_1_v (MAYO_1_n - MAYO_1_o) +#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1) +#define MAYO_1_k 9 +#define MAYO_1_q 16 +#define MAYO_1_m_bytes 32 +#define MAYO_1_O_bytes 232 +#define MAYO_1_v_bytes 29 +#define MAYO_1_r_bytes 36 +#define MAYO_1_P1_bytes 54752 +#define MAYO_1_P2_bytes 14848 +#define MAYO_1_P3_bytes 1152 +#define MAYO_1_csk_bytes 24 +#define MAYO_1_esk_bytes 69856 +#define MAYO_1_cpk_bytes 1168 +#define MAYO_1_epk_bytes 70752 +#define MAYO_1_sig_bytes 321 +#define MAYO_1_f_tail F_TAIL_64 +#define MAYO_1_f_tail_arr f_tail_64 +#define MAYO_1_salt_bytes 24 +#define MAYO_1_digest_bytes 32 +#define MAYO_1_pk_seed_bytes 16 +#define MAYO_1_sk_seed_bytes 24 + +#define MAYO_2_name "MAYO_2" +#define MAYO_2_n 78 +#define MAYO_2_m 64 
+#define MAYO_2_o 18 +#define MAYO_2_v (MAYO_2_n - MAYO_2_o) +#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1) +#define MAYO_2_k 4 +#define MAYO_2_q 16 +#define MAYO_2_m_bytes 32 +#define MAYO_2_O_bytes 540 +#define MAYO_2_v_bytes 30 +#define MAYO_2_r_bytes 36 +#define MAYO_2_P1_bytes 58560 +#define MAYO_2_P2_bytes 34560 +#define MAYO_2_P3_bytes 5472 +#define MAYO_2_csk_bytes 24 +#define MAYO_2_esk_bytes 93684 +#define MAYO_2_cpk_bytes 5488 +#define MAYO_2_epk_bytes 98592 +#define MAYO_2_sig_bytes 180 +#define MAYO_2_f_tail F_TAIL_64 +#define MAYO_2_f_tail_arr f_tail_64 +#define MAYO_2_salt_bytes 24 +#define MAYO_2_digest_bytes 32 +#define MAYO_2_pk_seed_bytes 16 +#define MAYO_2_sk_seed_bytes 24 + +#define MAYO_3_name "MAYO_3" +#define MAYO_3_n 99 +#define MAYO_3_m 96 +#define MAYO_3_o 10 +#define MAYO_3_v (MAYO_3_n - MAYO_3_o) +#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1) +#define MAYO_3_k 11 +#define MAYO_3_q 16 +#define MAYO_3_m_bytes 48 +#define MAYO_3_O_bytes 445 +#define MAYO_3_v_bytes 45 +#define MAYO_3_r_bytes 55 +#define MAYO_3_P1_bytes 192240 +#define MAYO_3_P2_bytes 42720 +#define MAYO_3_P3_bytes 2640 +#define MAYO_3_csk_bytes 32 +#define MAYO_3_esk_bytes 235437 +#define MAYO_3_cpk_bytes 2656 +#define MAYO_3_epk_bytes 237600 +#define MAYO_3_sig_bytes 577 +#define MAYO_3_f_tail F_TAIL_96 +#define MAYO_3_f_tail_arr f_tail_96 +#define MAYO_3_salt_bytes 32 +#define MAYO_3_digest_bytes 48 +#define MAYO_3_pk_seed_bytes 16 +#define MAYO_3_sk_seed_bytes 32 + +#define MAYO_5_name "MAYO_5" +#define MAYO_5_n 133 +#define MAYO_5_m 128 +#define MAYO_5_o 12 +#define MAYO_5_v (MAYO_5_n - MAYO_5_o) +#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1) +#define MAYO_5_k 12 +#define MAYO_5_q 16 +#define MAYO_5_m_bytes 64 +#define MAYO_5_O_bytes 726 +#define MAYO_5_v_bytes 61 +#define MAYO_5_r_bytes 72 +#define MAYO_5_P1_bytes 472384 +#define MAYO_5_P2_bytes 92928 +#define MAYO_5_P3_bytes 4992 +#define MAYO_5_csk_bytes 40 +#define MAYO_5_esk_bytes 566078 +#define 
MAYO_5_cpk_bytes 5008 +#define MAYO_5_epk_bytes 570304 +#define MAYO_5_sig_bytes 838 +#define MAYO_5_f_tail F_TAIL_128 +#define MAYO_5_f_tail_arr f_tail_128 +#define MAYO_5_salt_bytes 40 +#define MAYO_5_digest_bytes 64 +#define MAYO_5_pk_seed_bytes 16 +#define MAYO_5_sk_seed_bytes 40 + +#define PARAM_JOIN2_(a, b) a##_##b +#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b) +#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end) + +#if defined(MAYO_VARIANT) +#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c +#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c) +#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s) + +#if defined(MAYO_BUILD_TYPE_REF) +#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s) +#elif defined(MAYO_BUILD_TYPE_OPT) +#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s) +#elif defined(MAYO_BUILD_TYPE_AVX2) +#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s) +#else +#error "Build type not known" +#endif + +#else +#define MAYO_NAMESPACE(s) s +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define NAME_MAX mayo5 +#define N_MAX 133 +#define M_MAX 128 +#define O_MAX 18 +#define V_MAX 121 +#define K_MAX 12 +#define Q_MAX 16 +#define PK_SEED_BYTES_MAX 16 +#define SK_SEED_BYTES_MAX 40 +#define SALT_BYTES_MAX 40 +#define DIGEST_BYTES_MAX 64 +#define O_BYTES_MAX 726 +#define V_BYTES_MAX 61 +#define R_BYTES_MAX 72 +#define P1_BYTES_MAX 472384 +#define P2_BYTES_MAX 92928 +#define P3_BYTES_MAX 5472 +#define SIG_BYTES_MAX 838 +#define CPK_BYTES_MAX 5488 +#define CSK_BYTES_MAX 40 +#define ESK_BYTES_MAX 566078 +#define EPK_BYTES_MAX 570304 +#define N_MINUS_O_MAX 121 +#define M_BYTES_MAX 64 +#elif defined(MAYO_VARIANT) +#define M_MAX PARAM_NAME(m) +#define N_MAX PARAM_NAME(n) +#define O_MAX PARAM_NAME(o) +#define V_MAX PARAM_NAME(v) +#define K_MAX PARAM_NAME(k) +#define Q_MAX PARAM_NAME(q) +#define M_BYTES_MAX PARAM_NAME(m_bytes) +#define O_BYTES_MAX PARAM_NAME(O_bytes) +#define V_BYTES_MAX PARAM_NAME(v_bytes) +#define R_BYTES_MAX PARAM_NAME(r_bytes) +#define P1_BYTES_MAX 
PARAM_NAME(P1_bytes) +#define P2_BYTES_MAX PARAM_NAME(P2_bytes) +#define P3_BYTES_MAX PARAM_NAME(P3_bytes) +#define SIG_BYTES_MAX PARAM_NAME(sig_bytes) +#define CSK_BYTES_MAX PARAM_NAME(csk_bytes) +#define ESK_BYTES_MAX PARAM_NAME(esk_bytes) +#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes) +#define EPK_BYTES_MAX PARAM_NAME(epk_bytes) +#define N_MINUS_O_MAX (N_MAX - O_MAX) +#define SALT_BYTES_MAX PARAM_NAME(salt_bytes) +#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes) +#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes) +#define SK_SEED_BYTES_MAX SALT_BYTES_MAX +#else +#error "Parameter not specified" +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define PARAM_name(p) (p->name) +#define PARAM_m(p) (p->m) +#define PARAM_n(p) (p->n) +#define PARAM_o(p) (p->o) +#define PARAM_v(p) (p->n - p->o) +#define PARAM_A_cols(p) (p->k * p->o + 1) +#define PARAM_k(p) (p->k) +#define PARAM_q(p) (p->q) +#define PARAM_m_bytes(p) (p->m_bytes) +#define PARAM_O_bytes(p) (p->O_bytes) +#define PARAM_v_bytes(p) (p->v_bytes) +#define PARAM_r_bytes(p) (p->r_bytes) +#define PARAM_P1_bytes(p) (p->P1_bytes) +#define PARAM_P2_bytes(p) (p->P2_bytes) +#define PARAM_P3_bytes(p) (p->P3_bytes) +#define PARAM_csk_bytes(p) (p->csk_bytes) +#define PARAM_esk_bytes(p) (p->esk_bytes) +#define PARAM_cpk_bytes(p) (p->cpk_bytes) +#define PARAM_epk_bytes(p) (p->epk_bytes) +#define PARAM_sig_bytes(p) (p->sig_bytes) +#define PARAM_f_tail(p) (p->f_tail) +#define PARAM_salt_bytes(p) (p->salt_bytes) +#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes) +#define PARAM_digest_bytes(p) (p->digest_bytes) +#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes) +#elif defined(MAYO_VARIANT) +#define PARAM_name(p) PARAM_NAME(name) +#define PARAM_m(p) PARAM_NAME(m) +#define PARAM_n(p) PARAM_NAME(n) +#define PARAM_o(p) PARAM_NAME(o) +#define PARAM_v(p) PARAM_NAME(v) +#define PARAM_A_cols(p) PARAM_NAME(A_cols) +#define PARAM_k(p) PARAM_NAME(k) +#define PARAM_q(p) PARAM_NAME(q) +#define PARAM_m_bytes(p) PARAM_NAME(m_bytes) +#define 
PARAM_O_bytes(p) PARAM_NAME(O_bytes) +#define PARAM_v_bytes(p) PARAM_NAME(v_bytes) +#define PARAM_r_bytes(p) PARAM_NAME(r_bytes) +#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes) +#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes) +#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes) +#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes) +#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes) +#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes) +#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes) +#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes) +static const unsigned char f_tail[] = PARAM_NAME(f_tail); +#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes) +#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes) +#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes) +#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes) +#define PARAM_f_tail(p) f_tail +#else +#error "Parameter not specified" +#endif + +/** + * Struct defining MAYO parameters + */ +typedef struct { + int m; + int n; + int o; + int k; + int q; + const unsigned char *f_tail; + int m_bytes; + int O_bytes; + int v_bytes; + int r_bytes; + int R_bytes; + int P1_bytes; + int P2_bytes; + int P3_bytes; + int csk_bytes; + int esk_bytes; + int cpk_bytes; + int epk_bytes; + int sig_bytes; + int salt_bytes; + int sk_seed_bytes; + int digest_bytes; + int pk_seed_bytes; + const char *name; +} mayo_params_t; + +typedef struct sk_t { + uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; + uint8_t o[O_BYTES_MAX]; +} sk_t; + +/** + * MAYO parameter sets + */ +#ifdef ENABLE_PARAMS_DYNAMIC +extern const mayo_params_t MAYO_1; +extern const mayo_params_t MAYO_2; +extern const mayo_params_t MAYO_3; +extern const mayo_params_t MAYO_5; +#endif + +/** + * Status codes + */ +#define MAYO_OK 0 +#define MAYO_ERR 1 + +/** + * Mayo keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. 
+ * + * @param[in] p Mayo parameter set + * @param[out] pk Mayo public key + * @param[out] sk Mayo secret key + * @return int status code + */ +#define mayo_keypair MAYO_NAMESPACE(mayo_keypair) +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk); + +#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature) +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk); + +/** + * MAYO signature generation. + * + * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec. + * The key provided is a compact secret key. + * The caller is responsible to allocate sufficient memory to hold sm. + * + * @param[in] p Mayo parameter set + * @param[out] sm Signature concatenated with message + * @param[out] smlen Pointer to the length of sm + * @param[in] m Message to be signed + * @param[in] mlen Message length + * @param[in] sk Compacted secret key + * @return int status code + */ +#define mayo_sign MAYO_NAMESPACE(mayo_sign) +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *sk); + +/** + * Mayo open signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, the original message is stored in m. + * The key provided is a compact public key. + * The caller is responsible to allocate sufficient memory to hold m. 
+ * + * @param[in] p Mayo parameter set + * @param[out] m Message stored if verification succeeds + * @param[out] mlen Pointer to the length of m + * @param[in] sm Signature concatenated with message + * @param[in] smlen Length of sm + * @param[in] pk Compacted public key + * @return int status code + */ +#define mayo_open MAYO_NAMESPACE(mayo_open) +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk); + +/** + * Mayo compact keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. + * + * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and + * cpk are compact representations of a Mayo secret key and public key + * + * @param[in] p Mayo parameter set + * @param[out] cpk Mayo compacted public key + * @param[out] csk Mayo compacted secret key + * @return int status code + */ +#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact) +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk); + +/** + * Mayo expand public key. + * + * The implementation corresponds to Mayo.expandPK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold epk. + * + * @param[in] p Mayo parameter set + * @param[in] cpk Compacted public key. + * @param[out] epk Expanded public key. + * @return int return code + */ +#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk) +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *epk); + +/** + * Mayo expand secret key. + * + * The implementation corresponds to Mayo.expandSK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold esk. + * + * @param[in] p Mayo parameter set + * @param[in] csk Compacted secret key. + * @param[out] esk Expanded secret key. 
+ * @return int return code + */ +#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk) +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *esk); + +/** + * Mayo verify signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1. + * The key provided is a compact public key. + * + * @param[in] p Mayo parameter set + * @param[in] m Message whose signature is verified + * @param[in] mlen Message length + * @param[in] sig Signature + * @param[in] pk Compacted public key + * @return int 0 if verification succeeded, 1 otherwise. + */ +#define mayo_verify MAYO_NAMESPACE(mayo_verify) +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *pk); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/mem.h b/src/sig/mayo/pqmayo_mayo_5_opt/mem.h new file mode 100644 index 0000000000..2aa2196e2e --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_opt/mem.h @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MEM_H +#define MEM_H +#include +#include + +#if defined(__GNUC__) || defined(__clang__) +#define BSWAP32(i) __builtin_bswap32((i)) +#define BSWAP64(i) __builtin_bswap64((i)) +#else +#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24)) +#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32)) +#endif + +// a > b -> b - a is negative +// returns 0xFFFFFFFF if true, 0x00000000 if false +static inline uint32_t ct_is_greater_than(int a, int b) { + int32_t diff = b - a; + return (uint32_t) (diff >> (8*sizeof(uint32_t)-1)); +} + +// a > b -> b - a is negative +// returns 0xFFFFFFFFFFFFFFFF if true, 0x0000000000000000 if false +static inline uint64_t ct_64_is_greater_than(int a, int b) { + int64_t diff = ((int64_t) b) - ((int64_t) a); + return (uint64_t) (diff >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00000000, else 0xFFFFFFFF 
+static inline uint32_t ct_compare_32(int a, int b) { + return (uint32_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF +static inline uint64_t ct_compare_64(int a, int b) { + return (uint64_t)((-(int64_t)(a ^ b)) >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00, else 0xFF +static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) { + return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +#include +/** + * Clears and frees allocated memory. + * + * @param[out] mem Memory to be cleared and freed. + * @param size Size of memory to be cleared and freed. + */ +static inline void mayo_secure_free(void *mem, size_t size) { + OQS_MEM_secure_free(mem, size); +} + +/** + * Clears memory. + * + * @param[out] mem Memory to be cleared. + * @param size Size of memory to be cleared. + */ +static inline void mayo_secure_clear(void *mem, size_t size) { + OQS_MEM_cleanse(mem, size); +} + +#endif diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/params.c b/src/sig/mayo/pqmayo_mayo_5_opt/params.c new file mode 100644 index 0000000000..043d4cedc1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_opt/params.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +static const unsigned char f_tail_64[] = F_TAIL_64; +static const unsigned char f_tail_96[] = F_TAIL_96; +static const unsigned char f_tail_128[] = F_TAIL_128; + +#define MAYO_GEN_PARAMS(nm) \ + const mayo_params_t nm = { \ + .m = PARAM_JOIN2(nm, m), \ + .n = PARAM_JOIN2(nm, n), \ + .o = PARAM_JOIN2(nm, o), \ + .k = PARAM_JOIN2(nm, k), \ + .q = PARAM_JOIN2(nm, q), \ + .f_tail = PARAM_JOIN2(nm, f_tail_arr), \ + .m_bytes = PARAM_JOIN2(nm, m_bytes), \ + .O_bytes = PARAM_JOIN2(nm, O_bytes), \ + .v_bytes = PARAM_JOIN2(nm, v_bytes), \ + .r_bytes = PARAM_JOIN2(nm, r_bytes), \ + .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \ + .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \ + .P3_bytes = PARAM_JOIN2(nm, 
P3_bytes), \ + .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \ + .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \ + .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \ + .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \ + .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \ + .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \ + .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \ + .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \ + .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \ + .name = #nm \ + }; + +MAYO_GEN_PARAMS(MAYO_1); +MAYO_GEN_PARAMS(MAYO_2); +MAYO_GEN_PARAMS(MAYO_3); +MAYO_GEN_PARAMS(MAYO_5); +#endif + diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo_5_opt/simple_arithmetic.h new file mode 100644 index 0000000000..ce7580f4c1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo_5_opt/simple_arithmetic.h @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SIMPLE_ARITHMETIC_H +#define SIMPLE_ARITHMETIC_H + +// GF(16) multiplication mod x^4 + x + 1 +static inline unsigned char mul_f(unsigned char a, unsigned char b) { + // carryless multiply + unsigned char p; + p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + unsigned char top_p = p & 0xf0; + unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f; + return out; +} + +static inline uint64_t mul_fx8(unsigned char a, uint64_t b) { + // carryless multiply + uint64_t p; + p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0; + uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f; + return out; +} + +// GF(16) addition +static inline unsigned char add_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) subtraction +static inline unsigned char sub_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) negation +static inline unsigned char neg_f(unsigned char a) { + return a; +} + +static inline unsigned char 
inverse_f(unsigned char a) { + // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5, + // 10, 4, 3, 8}; return table[a & 15]; + + unsigned char a2 = mul_f(a, a); + unsigned char a4 = mul_f(a2, a2); + unsigned char a8 = mul_f(a4, a4); + unsigned char a6 = mul_f(a2, a4); + unsigned char a14 = mul_f(a8, a6); + + return a14; +} + +static inline unsigned char lincomb(const unsigned char *a, + const unsigned char *b, int n, int m) { + unsigned char ret = 0; + for (int i = 0; i < n; ++i, b += m) { + ret = add_f(mul_f(a[i], *b), ret); + } + return ret; +} + +static inline void mat_mul(const unsigned char *a, const unsigned char *b, + unsigned char *c, int colrow_ab, int row_a, int col_b) { + for (int i = 0; i < row_a; ++i, a += colrow_ab) { + for (int j = 0; j < col_b; ++j, ++c) { + *c = lincomb(a, b + j, colrow_ab, col_b); + } + } +} + +static inline void mat_add(const unsigned char *a, const unsigned char *b, + unsigned char *c, int m, int n) { + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + *(c + i * n + j) = add_f(*(a + i * n + j), *(b + i * n + j)); + } + } +} + +static +inline void m_vec_copy(int m_legs, const uint64_t *in, + uint64_t *out) { + for (int i = 0; i < m_legs * 2; i++) { + out[i] = in[i]; + } +} + +static +inline void m_vec_add(int m_legs, const uint64_t *in, + uint64_t *acc) { + for (int i = 0; i < m_legs * 2; i++) { + acc[i] ^= in[i]; + } +} + +// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1) +// gf16 := gf2[x]/(x^4+x+1) + +static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) { + uint32_t a_msb; + uint32_t a32 = a; + uint32_t b32 = b; + uint32_t r32 = a32 * (b32 & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 1) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 
2) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 3) & 1); + + return r32; + +} + +static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < m_legs*2;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) { + for(int i=0;i + +#include + +#if defined(OQS_ENABLE_SIG_mayo_5) + +OQS_SIG *OQS_SIG_mayo_5_new(void) { + + OQS_SIG *sig = malloc(sizeof(OQS_SIG)); + if (sig == NULL) { + return NULL; + } + sig->method_name = OQS_SIG_alg_mayo_5; + sig->alg_version = "https://github.com/PQCMayo/MAYO-C/tree/nibbling-mayo"; + + sig->claimed_nist_level = 5; + sig->euf_cma = true; + + sig->length_public_key = OQS_SIG_mayo_5_length_public_key; + sig->length_secret_key = OQS_SIG_mayo_5_length_secret_key; + sig->length_signature = OQS_SIG_mayo_5_length_signature; + + sig->keypair = OQS_SIG_mayo_5_keypair; + sig->sign = OQS_SIG_mayo_5_sign; + sig->verify = OQS_SIG_mayo_5_verify; + + return sig; +} + +extern int pqmayo_MAYO_5_opt_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); +extern int pqmayo_MAYO_5_opt_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); +extern int pqmayo_MAYO_5_opt_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); + +#if defined(OQS_ENABLE_SIG_mayo_5_avx2) +extern int pqmayo_MAYO_5_avx2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); +extern int pqmayo_MAYO_5_avx2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); +extern int pqmayo_MAYO_5_avx2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); +#endif + +OQS_API OQS_STATUS OQS_SIG_mayo_5_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if 
defined(OQS_ENABLE_SIG_mayo_5_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_5_avx2_crypto_sign_keypair(public_key, secret_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_keypair(public_key, secret_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_keypair(public_key, secret_key); +#endif +} + +OQS_API OQS_STATUS OQS_SIG_mayo_5_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_SIG_mayo_5_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_5_avx2_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); +#endif +} + +OQS_API OQS_STATUS OQS_SIG_mayo_5_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key) { +#if defined(OQS_ENABLE_SIG_mayo_5_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_5_avx2_crypto_sign_verify(signature, signature_len, message, message_len, public_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_verify(signature, signature_len, message, message_len, 
public_key); +#endif +} + +#endif diff --git a/src/sig/sig.c b/src/sig/sig.c index 6722ca2931..bab752c607 100644 --- a/src/sig/sig.c +++ b/src/sig/sig.c @@ -42,7 +42,8 @@ OQS_API const char *OQS_SIG_alg_identifier(size_t i) { OQS_SIG_alg_sphincs_shake_256s_simple, OQS_SIG_alg_mayo_1, OQS_SIG_alg_mayo_2, - OQS_SIG_alg_mayo_3,///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALG_IDENTIFIER_END + OQS_SIG_alg_mayo_3, + OQS_SIG_alg_mayo_5,///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALG_IDENTIFIER_END }; if (i >= OQS_SIG_algs_length) { return NULL; @@ -256,6 +257,13 @@ OQS_API int OQS_SIG_alg_is_enabled(const char *method_name) { #else return 0; #endif + + } else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_5)) { +#ifdef OQS_ENABLE_SIG_mayo_5 + return 1; +#else + return 0; +#endif ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ENABLED_CASE_END } else { return 0; @@ -463,6 +471,13 @@ OQS_API OQS_SIG *OQS_SIG_new(const char *method_name) { #else return NULL; #endif + + } else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_5)) { +#ifdef OQS_ENABLE_SIG_mayo_5 + return OQS_SIG_mayo_5_new(); +#else + return NULL; +#endif ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_NEW_CASE_END // EDIT-WHEN-ADDING-SIG } else { diff --git a/src/sig/sig.h b/src/sig/sig.h index c545489b4e..f847104aa5 100644 --- a/src/sig/sig.h +++ b/src/sig/sig.h @@ -88,12 +88,14 @@ extern "C" { #define OQS_SIG_alg_mayo_2 "MAYO_2" /** Algorithm identifier for MAYO_3 */ #define OQS_SIG_alg_mayo_3 "MAYO_3" +/** Algorithm identifier for MAYO_5 */ +#define OQS_SIG_alg_mayo_5 "MAYO_5" ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALG_IDENTIFIER_END // EDIT-WHEN-ADDING-SIG ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALGS_LENGTH_START /** Number of algorithm identifiers above. 
*/ -#define OQS_SIG_algs_length 28 +#define OQS_SIG_algs_length 29 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALGS_LENGTH_END /** diff --git a/tests/KATs/sig/kats.json b/tests/KATs/sig/kats.json index 1da20fcfa8..9e5f229090 100644 --- a/tests/KATs/sig/kats.json +++ b/tests/KATs/sig/kats.json @@ -39,6 +39,10 @@ "all": "f66b95dda153b7df00610aa018f0644146e7e564b33562c51bb088c40fb0dcb2", "single": "dbc49f4fdfa0de69d416051215cb53c042c4a329d325452d079f3734b7467a6b" }, + "MAYO_5": { + "single": "f2c1c69045c7d15e714a04119965e8a7007ef54f9293158587560227c97b237d", + "all": "7b230c2626f57159a243d8dfc69c62cb94dd0f179dd2b4f2ef3606deb6404477" + }, "ML-DSA-44": { "all": "183bc0c4398ade4fc17b6a7d876b82545a96331139a4f27269c95664b8c483f9", "single": "e6f3ec4dc0b02dd3bcbbc6b105190e1890ca0bb3f802e2b571f0d70f3993a2e1" diff --git a/tests/kat_sig.c b/tests/kat_sig.c index b420b3809b..876a720aaf 100644 --- a/tests/kat_sig.c +++ b/tests/kat_sig.c @@ -302,6 +302,16 @@ OQS_STATUS combine_message_signature(uint8_t **signed_msg, size_t *signed_msg_le memcpy(*signed_msg, signature, signature_len); memcpy(*signed_msg + signature_len, msg, msg_len); return OQS_SUCCESS; + } else if (0 == strcmp(sig->method_name, "MAYO_5")) { + // signed_msg = signature || msg + *signed_msg_len = signature_len + msg_len; + *signed_msg = malloc(*signed_msg_len); + if (*signed_msg == NULL) { + return OQS_ERROR; + } + memcpy(*signed_msg, signature, signature_len); + memcpy(*signed_msg + signature_len, msg, msg_len); + return OQS_SUCCESS; ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_COMBINE_MESSAGE_SIGNATURE_END } else { return OQS_ERROR; diff --git a/tests/test_sig.c b/tests/test_sig.c index eb3ab0af12..094cc6d1cb 100644 --- a/tests/test_sig.c +++ b/tests/test_sig.c @@ -224,17 +224,30 @@ int main(int argc, char **argv) { OQS_STATUS rc; #if OQS_USE_PTHREADS #define MAX_LEN_SIG_NAME_ 64 - pthread_t thread; - struct thread_data td; - td.alg_name = alg_name; - int trc = pthread_create(&thread, NULL, test_wrapper, &td); - if (trc) { - 
fprintf(stderr, "ERROR: Creating pthread\n"); - OQS_destroy(); - return EXIT_FAILURE; + // don't run MAYO_5 in threads because of large stack usage + char no_thread_sig_patterns[][MAX_LEN_SIG_NAME_] = {"MAYO_5"}; + int test_in_thread = 1; + for (size_t i = 0 ; i < sizeof(no_thread_sig_patterns) / MAX_LEN_SIG_NAME_; ++i) { + if (strstr(alg_name, no_thread_sig_patterns[i]) != NULL) { + test_in_thread = 0; + break; + } + } + if (test_in_thread) { + pthread_t thread; + struct thread_data td; + td.alg_name = alg_name; + int trc = pthread_create(&thread, NULL, test_wrapper, &td); + if (trc) { + fprintf(stderr, "ERROR: Creating pthread\n"); + OQS_destroy(); + return EXIT_FAILURE; + } + pthread_join(thread, NULL); + rc = td.rc; + } else { + rc = sig_test_correctness(alg_name); } - pthread_join(thread, NULL); - rc = td.rc; #else rc = sig_test_correctness(alg_name); #endif