diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake
index 8e4f5e43a1..fd1a928fc9 100644
--- a/.CMake/alg_support.cmake
+++ b/.CMake/alg_support.cmake
@@ -171,6 +171,7 @@ option(OQS_ENABLE_SIG_MAYO "Enable mayo algorithm family" ON)
 cmake_dependent_option(OQS_ENABLE_SIG_mayo_1 "" ON "OQS_ENABLE_SIG_MAYO" OFF)
 cmake_dependent_option(OQS_ENABLE_SIG_mayo_2 "" ON "OQS_ENABLE_SIG_MAYO" OFF)
 cmake_dependent_option(OQS_ENABLE_SIG_mayo_3 "" ON "OQS_ENABLE_SIG_MAYO" OFF)
+cmake_dependent_option(OQS_ENABLE_SIG_mayo_5 "" ON "OQS_ENABLE_SIG_MAYO" OFF)
 ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ENABLE_BY_ALG_END
 
 if((OQS_MINIMAL_BUILD STREQUAL "ON"))
@@ -521,6 +522,12 @@ if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
 endif()
 endif()
 
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux") # the mayo_5 AVX2 option is only defined when the system name matches Darwin or Linux
+if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS)) # ...and only for x86_64 distribution builds or when AVX2 instructions are enabled
+    cmake_dependent_option(OQS_ENABLE_SIG_mayo_5_avx2 "" ON "OQS_ENABLE_SIG_mayo_5" OFF) # defaults to ON while OQS_ENABLE_SIG_mayo_5 is true; forced OFF otherwise
+endif()
+endif()
+
 ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ENABLE_BY_ALG_CONDITIONAL_END
 
 # Set XKCP (Keccak) required for Sphincs AVX2 code even if OpenSSL3 SHA3 is used:
diff --git a/README.md b/README.md
index 436192deec..f1350bd9d7 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ All names other than `ML-KEM` and `ML-DSA` are subject to change. `liboqs` makes
 <!--- OQS_TEMPLATE_FRAGMENT_LIST_SIGS_START -->
 - **CRYSTALS-Dilithium**: Dilithium2, Dilithium3, Dilithium5
 - **Falcon**: Falcon-512, Falcon-1024, Falcon-padded-512, Falcon-padded-1024
-- **MAYO**: MAYO\_1, MAYO\_2, MAYO\_3
+- **MAYO**: MAYO\_1, MAYO\_2, MAYO\_3, MAYO\_5†
 - **ML-DSA**: ML-DSA-44-ipd (alias: ML-DSA-44), ML-DSA-65-ipd (alias: ML-DSA-65), ML-DSA-87-ipd (alias: ML-DSA-87)
 - **SPHINCS+-SHA2**: SPHINCS+-SHA2-128f-simple, SPHINCS+-SHA2-128s-simple, SPHINCS+-SHA2-192f-simple, SPHINCS+-SHA2-192s-simple, SPHINCS+-SHA2-256f-simple, SPHINCS+-SHA2-256s-simple
 - **SPHINCS+-SHAKE**: SPHINCS+-SHAKE-128f-simple, SPHINCS+-SHAKE-128s-simple, SPHINCS+-SHAKE-192f-simple, SPHINCS+-SHAKE-192s-simple, SPHINCS+-SHAKE-256f-simple, SPHINCS+-SHAKE-256s-simple
diff --git a/docs/algorithms/sig/mayo.md b/docs/algorithms/sig/mayo.md
index 3f2205c1ac..1bf05fd44f 100644
--- a/docs/algorithms/sig/mayo.md
+++ b/docs/algorithms/sig/mayo.md
@@ -17,6 +17,7 @@
 |     MAYO\_1     | NA                    | EUF-CMA          |                    1 |                      1168 |                        24 |                      321 |
 |     MAYO\_2     | NA                    | EUF-CMA          |                    1 |                      5488 |                        24 |                      180 |
 |     MAYO\_3     | NA                    | EUF-CMA          |                    3 |                      2656 |                        32 |                      577 |
+|     MAYO\_5     | NA                    | EUF-CMA          |                    5 |                      5008 |                        40 |                      838 |
 
 ## MAYO\_1 implementation characteristics
 
@@ -47,6 +48,15 @@ Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
 Are implementations chosen based on runtime CPU feature detection? **Yes**.
 
+## MAYO\_5 implementation characteristics
+
+|       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
+|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
+| [Primary Source](#primary-source) | opt                      | All                         | All                             | None                    | True                               | True                                           | False                |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Darwin,Linux                    | AVX2                    | True                               | True                                           | True                 |
+
+Are implementations chosen based on runtime CPU feature detection? **Yes**.
+
 ## Explanation of Terms
 
 - **Large Stack Usage**: Implementations identified as having such may cause failures when running in threads or in constrained environments.
\ No newline at end of file
diff --git a/docs/algorithms/sig/mayo.yml b/docs/algorithms/sig/mayo.yml
index 6395da707f..4cbd20d7ff 100644
--- a/docs/algorithms/sig/mayo.yml
+++ b/docs/algorithms/sig/mayo.yml
@@ -110,3 +110,34 @@ parameter-sets:
     no-secret-dependent-branching-claimed: true
     no-secret-dependent-branching-checked-by-valgrind: true
     large-stack-usage: false
+- name: MAYO_5
+  claimed-nist-level: 5
+  claimed-security: EUF-CMA
+  length-public-key: 5008
+  length-secret-key: 40
+  length-signature: 838
+  implementations-switch-on-runtime-cpu-features: true
+  implementations:
+  - upstream: primary-upstream
+    upstream-id: opt
+    supported-platforms: all
+    common-crypto:
+    - SHA3: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: true
+    large-stack-usage: false
+  - upstream: primary-upstream
+    upstream-id: avx2
+    supported-platforms:
+    - architecture: x86_64
+      operating_systems:
+      - Darwin
+      - Linux
+      required_flags:
+      - avx2
+    common-crypto:
+    - SHA3: liboqs
+    - AES: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: true
+    large-stack-usage: true
diff --git a/docs/cbom.json b/docs/cbom.json
index e7b1e38c9b..1b2f5fca93 100644
--- a/docs/cbom.json
+++ b/docs/cbom.json
@@ -1,23 +1,23 @@
 {
   "bomFormat": "CBOM",
   "specVersion": "1.4-cbom-1.0",
-  "serialNumber": "urn:uuid:0366b737-0d62-4de3-92e9-adfae8c1326a",
+  "serialNumber": "urn:uuid:dbce4160-9dc5-450c-b83c-0cb0345d7045",
   "version": 1,
   "metadata": {
-    "timestamp": "2024-05-08T14:51:11.690367",
+    "timestamp": "2024-05-24T18:35:02.761477",
     "component": {
       "type": "library",
-      "bom-ref": "pkg:github/open-quantum-safe/liboqs@248c78af58129f2d59faed9cace2567c82d1b992",
+      "bom-ref": "pkg:github/open-quantum-safe/liboqs@580d494efeee25edd354eaa2e0fe6581d552b827",
       "name": "liboqs",
-      "version": "248c78af58129f2d59faed9cace2567c82d1b992"
+      "version": "580d494efeee25edd354eaa2e0fe6581d552b827"
     }
   },
   "components": [
     {
       "type": "library",
-      "bom-ref": "pkg:github/open-quantum-safe/liboqs@248c78af58129f2d59faed9cace2567c82d1b992",
+      "bom-ref": "pkg:github/open-quantum-safe/liboqs@580d494efeee25edd354eaa2e0fe6581d552b827",
       "name": "liboqs",
-      "version": "248c78af58129f2d59faed9cace2567c82d1b992"
+      "version": "580d494efeee25edd354eaa2e0fe6581d552b827"
     },
     {
       "type": "crypto-asset",
@@ -1659,6 +1659,46 @@
         "nistQuantumSecurityLevel": 3
       }
     },
+    {
+      "type": "crypto-asset",
+      "bom-ref": "alg:MAYO_5:generic",
+      "name": "MAYO",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "variant": "MAYO_5",
+          "primitive": "signature",
+          "implementationLevel": "softwarePlainRam",
+          "cryptoFunctions": [
+            "keygen",
+            "sign",
+            "verify"
+          ],
+          "implementationPlatform": "generic"
+        },
+        "nistQuantumSecurityLevel": 5
+      }
+    },
+    {
+      "type": "crypto-asset",
+      "bom-ref": "alg:MAYO_5:x86_64",
+      "name": "MAYO",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "variant": "MAYO_5",
+          "primitive": "signature",
+          "implementationLevel": "softwarePlainRam",
+          "cryptoFunctions": [
+            "keygen",
+            "sign",
+            "verify"
+          ],
+          "implementationPlatform": "x86_64"
+        },
+        "nistQuantumSecurityLevel": 5
+      }
+    },
     {
       "type": "crypto-asset",
       "bom-ref": "alg:ML-DSA-44-ipd:generic",
@@ -2288,7 +2328,7 @@
   ],
   "dependencies": [
     {
-      "ref": "pkg:github/open-quantum-safe/liboqs@248c78af58129f2d59faed9cace2567c82d1b992",
+      "ref": "pkg:github/open-quantum-safe/liboqs@580d494efeee25edd354eaa2e0fe6581d552b827",
       "dependsOn": [
         "alg:BIKE-L1:x86_64",
         "alg:BIKE-L3:x86_64",
@@ -2372,6 +2412,8 @@
         "alg:MAYO_2:x86_64",
         "alg:MAYO_3:generic",
         "alg:MAYO_3:x86_64",
+        "alg:MAYO_5:generic",
+        "alg:MAYO_5:x86_64",
         "alg:ML-DSA-44-ipd:generic",
         "alg:ML-DSA-44-ipd:x86_64",
         "alg:ML-DSA-65-ipd:generic",
@@ -3016,6 +3058,21 @@
       ],
       "dependencyType": "uses"
     },
+    {
+      "ref": "alg:MAYO_5:generic",
+      "dependsOn": [
+        "alg:sha3"
+      ],
+      "dependencyType": "uses"
+    },
+    {
+      "ref": "alg:MAYO_5:x86_64",
+      "dependsOn": [
+        "alg:sha3",
+        "alg:aes"
+      ],
+      "dependencyType": "uses"
+    },
     {
       "ref": "alg:ML-DSA-44-ipd:generic",
       "dependsOn": [
diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml
index 37653dcbcb..35428af275 100644
--- a/scripts/copy_from_upstream/copy_from_upstream.yml
+++ b/scripts/copy_from_upstream/copy_from_upstream.yml
@@ -329,3 +329,8 @@ sigs:
         pqclean_scheme: mayo_3
         pretty_name_full: MAYO_3
         signed_msg_order: sig_then_msg
+      -
+        scheme: "5"
+        pqclean_scheme: mayo_5
+        pretty_name_full: MAYO_5
+        signed_msg_order: sig_then_msg
diff --git a/src/common/aes/aes_ossl.c b/src/common/aes/aes_ossl.c
index 4b78129f9d..c7dc5b9445 100644
--- a/src/common/aes/aes_ossl.c
+++ b/src/common/aes/aes_ossl.c
@@ -68,7 +68,7 @@ static void AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_
 
 static void AES128_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
 	EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new());
-	assert(ctr_ctx != NULL);
+	OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL");
 	uint8_t iv_ctr[16];
 	if (iv_len == 12) {
 		memcpy(iv_ctr, iv, 12);
@@ -94,11 +94,12 @@ static void AES128_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const voi
 
 static void AES128_CTR_inc_init(const uint8_t *key, void **schedule) {
 	*schedule = malloc(sizeof(struct key_schedule));
+	OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL");
+
 	struct key_schedule *ks = (struct key_schedule *) *schedule;
 	EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)();
-	assert(ctr_ctx != NULL);
+	OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL");
 
-	OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL");
 	ks->for_ECB = 0;
 	ks->ctx = ctr_ctx;
 	memcpy(ks->key, key, 16);
@@ -139,11 +140,12 @@ static void AES256_ECB_load_schedule(const uint8_t *key, void **schedule) {
 
 static void AES256_CTR_inc_init(const uint8_t *key, void **schedule) {
 	*schedule = malloc(sizeof(struct key_schedule));
+	OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL");
+
 	struct key_schedule *ks = (struct key_schedule *) *schedule;
 	EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)();
-	assert(ctr_ctx != NULL);
+	OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL");
 
-	OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL");
 	ks->for_ECB = 0;
 	ks->ctx = ctr_ctx;
 	memcpy(ks->key, key, 32);
@@ -190,7 +192,7 @@ static void AES256_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_
 
 static void AES256_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
 	EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)();
-	assert(ctr_ctx != NULL);
+	OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL");
 	uint8_t iv_ctr[16];
 	if (iv_len == 12) {
 		memcpy(iv_ctr, iv, 12);
diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake
index d7648a60e9..8550bdd65d 100644
--- a/src/oqsconfig.h.cmake
+++ b/src/oqsconfig.h.cmake
@@ -197,4 +197,6 @@
 #cmakedefine OQS_ENABLE_SIG_mayo_2_avx2 1
 #cmakedefine OQS_ENABLE_SIG_mayo_3 1
 #cmakedefine OQS_ENABLE_SIG_mayo_3_avx2 1
+#cmakedefine OQS_ENABLE_SIG_mayo_5 1
+#cmakedefine OQS_ENABLE_SIG_mayo_5_avx2 1
 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ALG_ENABLE_DEFINES_END
diff --git a/src/sig/mayo/CMakeLists.txt b/src/sig/mayo/CMakeLists.txt
index 612390dd2d..86b5ebaab1 100644
--- a/src/sig/mayo/CMakeLists.txt
+++ b/src/sig/mayo/CMakeLists.txt
@@ -59,4 +59,22 @@ if(OQS_ENABLE_SIG_mayo_3_avx2)
     set(_MAYO_OBJS ${_MAYO_OBJS} $<TARGET_OBJECTS:mayo_3_avx2>)
 endif()
 
+if(OQS_ENABLE_SIG_mayo_5) # "opt" build of MAYO_5: object library made from sig_mayo_5.c plus the pqmayo_mayo_5_opt sources
+    add_library(mayo_5_opt OBJECT sig_mayo_5.c pqmayo_mayo_5_opt/api.c pqmayo_mayo_5_opt/arithmetic.c pqmayo_mayo_5_opt/mayo.c pqmayo_mayo_5_opt/params.c)
+    target_compile_options(mayo_5_opt PUBLIC -DMAYO_VARIANT=MAYO_5 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT)
+    target_include_directories(mayo_5_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo_5_opt)
+    target_include_directories(mayo_5_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+    target_compile_options(mayo_5_opt PUBLIC -DMAYO_VARIANT=MAYO_5 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT) # NOTE(review): identical to the target_compile_options call three lines above; appears redundant
+    set(_MAYO_OBJS ${_MAYO_OBJS} $<TARGET_OBJECTS:mayo_5_opt>) # append this target's object files to the running _MAYO_OBJS list
+endif()
+
+if(OQS_ENABLE_SIG_mayo_5_avx2) # AVX2 build of MAYO_5: separate object library made from the pqmayo_mayo_5_avx2 sources
+    add_library(mayo_5_avx2 OBJECT pqmayo_mayo_5_avx2/api.c pqmayo_mayo_5_avx2/arithmetic.c pqmayo_mayo_5_avx2/mayo.c pqmayo_mayo_5_avx2/params.c)
+    target_include_directories(mayo_5_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo_5_avx2)
+    target_include_directories(mayo_5_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+    target_compile_options(mayo_5_avx2 PRIVATE -mavx2) # only this target's sources are compiled with -mavx2
+    target_compile_options(mayo_5_avx2 PUBLIC -DMAYO_VARIANT=MAYO_5 -DMAYO_BUILD_TYPE_AVX2 -DMAYO_AVX -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT) # MAYO_AVX is the macro tested by the "#if ... MAYO_AVX" guards in arithmetic.c
+    set(_MAYO_OBJS ${_MAYO_OBJS} $<TARGET_OBJECTS:mayo_5_avx2>) # append this target's object files to the running _MAYO_OBJS list
+endif()
+
 set(MAYO_OBJS ${_MAYO_OBJS} PARENT_SCOPE)
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/LICENSE b/src/sig/mayo/pqmayo_mayo_5_avx2/LICENSE
new file mode 100644
index 0000000000..8f71f43fee
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/NOTICE b/src/sig/mayo/pqmayo_mayo_5_avx2/NOTICE
new file mode 100644
index 0000000000..53da47c21c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/NOTICE
@@ -0,0 +1,13 @@
+Copyright 2023 the MAYO team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/aes_ctr.h b/src/sig/mayo/pqmayo_mayo_5_avx2/aes_ctr.h
new file mode 100644
index 0000000000..8e41c30f2c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/aes_ctr.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef AESCTR_H
+#define AESCTR_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
+#define AES_ECB_encrypt AES_256_ECB
+
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+                   const unsigned char *input, size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+                      const unsigned char *input, size_t inputByteLen);
+#define AES_128_CTR AES_128_CTR_NI
+#else
+#include <aes.h>
+static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, // variant used when ENABLE_AESNI is not defined: wraps aes128ctr_prf
+                const unsigned char *input, size_t inputByteLen) {
+    (void) inputByteLen; // not used by this variant; the parameter list mirrors AES_128_CTR_NI
+    uint8_t iv[12] = { 0 }; // 12-byte IV, all zero
+    aes128ctr_prf(output, outputByteLen, input, iv); // NOTE(review): input is presumably the AES-128 key -- confirm against the aes128ctr_prf declaration
+    return (int) outputByteLen; // reports the requested length, narrowed from size_t to int
+}
+#endif
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/api.c b/src/sig/mayo/pqmayo_mayo_5_avx2/api.c
new file mode 100644
index 0000000000..f2e861e9c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/api.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <api.h>
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define MAYO_PARAMS &MAYO_5
+#else
+#define MAYO_PARAMS 0
+#endif
+
+int // thin wrapper: returns whatever mayo_keypair returns
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+    return mayo_keypair(MAYO_PARAMS, pk, sk); // MAYO_PARAMS is &MAYO_5 under ENABLE_PARAMS_DYNAMIC, otherwise 0
+}
+
+int // thin wrapper: returns whatever mayo_sign returns
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk) {
+    return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk); // arguments are passed through unchanged, with MAYO_PARAMS prepended
+}
+
+int // thin wrapper: returns whatever mayo_sign_signature returns
+crypto_sign_signature(unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk) {
+    return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk); // arguments are passed through unchanged, with MAYO_PARAMS prepended
+}
+
+int // thin wrapper: returns whatever mayo_open returns
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk) {
+    return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk); // arguments are passed through unchanged, with MAYO_PARAMS prepended
+}
+
+int // checks the signature length, then returns whatever mayo_verify returns
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                   const unsigned char *m, size_t mlen,
+                   const unsigned char *pk) {
+    if (siglen != CRYPTO_BYTES) // only signatures of exactly CRYPTO_BYTES bytes are accepted
+        return -1; // wrong length: fail without calling mayo_verify
+    return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk); // note the argument order: message and its length first, then the signature
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/api.h b/src/sig/mayo/pqmayo_mayo_5_avx2/api.h
new file mode 100644
index 0000000000..fa1ebbb790
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/api.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <mayo.h>
+
+#define CRYPTO_SECRETKEYBYTES 40
+#define CRYPTO_PUBLICKEYBYTES 5008
+#define CRYPTO_BYTES 838
+
+#define CRYPTO_ALGNAME "MAYO_5"
+
+#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair)
+int // key generation; NOTE(review): pk/sk are presumably CRYPTO_PUBLICKEYBYTES / CRYPTO_SECRETKEYBYTES long -- confirm
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+#define crypto_sign MAYO_NAMESPACE(crypto_sign)
+int // signing; writes to sm and *smlen, reads message m of mlen bytes and key sk
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk);
+
+#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature)
+int // signing; writes to sig and *siglen, reads message m of mlen bytes and key sk
+crypto_sign_signature(unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open)
+int // opening; writes to m and *mlen, reads sm of smlen bytes and key pk
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk);
+
+#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify)
+int // verification; all inputs are read-only (sig/siglen, m/mlen, pk)
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                   const unsigned char *m, size_t mlen,
+                   const unsigned char *pk);
+
+#endif /* api_h */
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic.c b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic.c
new file mode 100644
index 0000000000..de09954c8a
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <arithmetic_common.h>
+#include <mem.h>
+#include <echelon_form.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+    mayo12_m_upper(m_legs, in, out, size);
+#else
+    // Emit the upper triangle of the size x size input row by row; every entry spans 2*m_legs limbs.
+    const int limbs = m_legs * 2;
+    uint64_t *dst = out;
+    for (int row = 0; row < size; row++) {
+        for (int col = row; col < size; col++, dst += limbs) {
+            m_vec_copy(m_legs, in + limbs * (row * size + col), dst);
+            if (row == col) { continue; } // diagonal entries are copied unchanged
+            m_vec_add(m_legs, in + limbs * (col * size + row), dst); // fold the mirrored entry (col,row) into (row,col)
+        }
+    }
+#endif
+}
+
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    (void) p;
+    mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    (void) p;
+    mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    (void) p;
+    mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+    // Generic path: XOR-accumulates (P1 + P1^T) * O into acc; P1 is a packed upper triangle, O is row-major with param_o columns.
+    #ifndef MAYO_VARIANT
+    const int m_legs = PARAM_m(p) / 32;
+    #else
+    (void) p;
+    #endif
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_n(p) - PARAM_o(p);
+
+    int bs_mat_entries_used = 0; // index of the current entry in the packed upper triangle of P1
+    for (int r = 0; r < param_v; r++) {
+        for (int c = r; c < param_v; c++) {
+            if(c==r) {
+                // diagonal of P1 + P1^T cancels in characteristic 2: skip it, but keep the index in step
+                bs_mat_entries_used += 1;
+                continue;
+            }
+            for (int k = 0; k < param_o; k += 1) {
+                // entry (r,c) feeds output row r through O[c][k] and, by symmetry, output row c through O[r][k]
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k));
+                vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k],  acc + 4 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k],  acc + 6 * (r * param_o + k));
+                vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k],  acc + 6 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k],  acc + 8 * (r * param_o + k));
+                vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k],  acc + 8 * (c * param_o + k));
+#else
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k));
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k));
+#endif
+
+            }
+            bs_mat_entries_used += 1;
+        }
+    }
+#endif
+}
+
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) {
+    (void) p; // the AVX2 branches never touch p; this silences the unused-parameter warning
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX]; // shuffle tables for V, filled by mayo_V_multabs_avx2 below
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; // zeroed scratch, written by the P1_times_Vt kernel and read by Vt_times_Pv
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX]; // same sequence as above with the mayo_3_* kernels
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX]; // same sequence as above with the mayo_5_* kernels
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    // Generic path. NOTE(review): the mul_add_* helpers presumably accumulate into their output, which is why Pv starts zeroed - confirm.
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mul_add_mat_x_m_mat(param_m / 32, V, L, M,
+        param_k, param_n - param_o, param_o);
+    // Pv receives the product of P1 (upper-triangular, per the helper name) with V transposed.
+    mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1);
+    // Y receives V times Pv, a k x k block of m-vectors per the dimensions passed.
+    mul_add_mat_x_m_mat(param_m / 32, V, Pv,
+        Y, param_k, param_n - param_o,
+        param_k);
+#endif
+}
+
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) {
+    (void) p; // the AVX2 branches never touch p; this silences the unused-parameter warning
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i O_multabs[O_MAX/2*V_MAX]; // shuffle tables for O, two columns per entry (see mayo_O_multabs_avx2)
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2); // updates P1O_P2 in place (NOTE(review): presumably it enters holding P2 - confirm)
+    mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); // reads the updated P1O_P2, writes P3
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i O_multabs[O_MAX/2*V_MAX]; // same sequence as above with the mayo_3_* kernels
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i O_multabs[O_MAX/2*V_MAX]; // same sequence as above with the mayo_5_* kernels
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int m_legs = PARAM_m(p) / 32; // same value as param_m/32 used in the next call
+    mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1); // writes into P1O_P2
+    mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o); // reads P1O_P2, writes into P3
+#endif
+}
+
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+// compute S * PS  = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+//                               [         P3*S2 = P2 ]
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS) {
+    (void) m;
+#if MAYO_AVX
+    const int n = o + v;
+
+    /* Old approach which is constant time but doesn't have to be */
+    unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+    unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+    unsigned char *s1_write = S1;
+    unsigned char *s2_write = S2;
+
+    for (int r=0; r < k; r++)
+    {
+        for (int c = 0; c < n; c++)
+        {
+            if(c < v){
+                *(s1_write++) = S[r*n + c];
+            } else {
+                *(s2_write++) = S[r*n + c];
+            }
+        }
+    }
+
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+
+#if M_MAX == 64
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 96
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 128
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+    mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS);
+    mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#else 
+    NOT IMPLEMENTED
+#endif
+#else
+    const int n = o + v;
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+    mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+    mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// Sample a solution x to Ax = y, with r used as randomness.
+// Requirements:
+// - A is a row-major matrix with m rows and A_cols = k*o+1 columns (both values are used as
+//   the row stride); the last column is cleared, then overwritten with y - A*r. A is modified.
+// - y is a vector with m elements
+// - r and x are k*o bytes long
+// Returns 1 on success, 0 if the echelon form of A ends in an all-zero row (restart needed).
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+                           const unsigned char *y, const unsigned char *r,
+                           unsigned char *x, int k, int o, int m, int A_cols) {
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    unsigned char finished;
+    int col_upper_bound;
+    unsigned char correct_column;
+
+    // x <- r
+    for (int i = 0; i < k * o; i++) {
+        x[i] = r[i];
+    }
+
+    // compute Ar (the last column is zeroed first so that it does not contribute)
+    unsigned char Ar[M_MAX];
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = 0; // clear last col of A
+    }
+    mat_mul(A, r, Ar, k * o + 1, m, 1);
+
+    // move y - Ar to last column of matrix A
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+    }
+
+    EF(A, m, k * o + 1);
+
+    // check if last row of A (excluding the last entry of y) is zero;
+    unsigned char full_rank = 0; // OR of the row's entries: nonzero means the row is not all zero
+    for (int i = 0; i < A_cols - 1; i++) {
+        full_rank |= A[(m - 1) * A_cols + i];
+    }
+
+// It is okay to leak if we need to restart or not
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+    if (full_rank == 0) {
+        return 0;
+    }
+
+    // back substitution in constant time
+    // the index of the first nonzero entry in each row is secret, which makes
+    // things less efficient
+
+    for (int row = m - 1; row >= 0; row--) {
+        finished = 0;
+        col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o);
+        // the first nonzero entry in this row is between row and col_upper_bound with probability at least ~1-q^{-32}
+
+        for (int col = row; col <= col_upper_bound; col++) {
+
+            // ct_compare_8 compares two bytes in constant time: 0x00 if they are equal, 0xff otherwise.
+            // correct_column is therefore 0xff only at the first nonzero entry of the row.
+            correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+
+            unsigned char u = correct_column & A[row * A_cols + A_cols - 1];
+            x[col] ^= u;
+            // add u * (column col) to the last column, eight rows per iteration, bytes packed into one uint64_t
+            for (int i = 0; i < row; i += 8) {
+                uint64_t tmp = ( (uint64_t) A[ i    * A_cols + col] <<  0) ^ ( (uint64_t) A[(i+1) * A_cols + col] <<  8)
+                             ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+                             ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+                             ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+                // NOTE(review): the last group of eight can reach rows >= row; this presumably relies on m being a multiple of 8 - confirm
+                tmp = mul_fx8(u, tmp);
+
+                A[ i    * A_cols + A_cols - 1] ^= (tmp      ) & 0xf;
+                A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+                A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+                A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+                A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+                A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+                A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+                A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+            }
+
+            finished = finished | correct_column;
+        }
+    }
+    return 1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic.h b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <stdint.h>
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define TARGET_BIG_ENDIAN // set when the compiler reports a big-endian target
+#endif
+
+#if defined(MAYO_AVX) && (M_MAX == 64) // NOTE(review): tests defined(MAYO_AVX), while arithmetic.c tests the value of MAYO_AVX
+    #include <shuffle_arithmetic_64.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+    #include <shuffle_arithmetic_96.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+    #include <shuffle_arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64)) // without MAYO_VARIANT all three widths are pulled in
+    #include <arithmetic_64.h>
+    #include <arithmetic_96.h> // NOTE(review): also included for M_MAX == 64; not clear from here whether that is needed
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+    #include <arithmetic_96.h>
+    #include <arithmetic_128.h> // NOTE(review): also included for M_MAX == 96; not clear from here whether that is needed
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+    #include <arithmetic_128.h>
+#endif
+
+// Calculate P3 = O^T * (P1*O + P2) in KeyGen; P1O_P2 is written first and then read back.
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// Calculate Upper in KeyGen: entry (r,c) with c > r becomes in[r][c] + in[c][r], the diagonal is copied; output is the packed upper triangle.
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// Calculate acc = (P1+P1^T)*O in expand_sk (the generic path XOR-accumulates into acc).
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc);
+
+// Calculate M=V*L and Y=V*P1*V^T in Sign
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sample solution in Sign: returns 1 on success, 0 when the caller has to restart. A is modified in place.
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Calculate SPS = S*P*S^T in Verify, with P passed as its blocks P1, P2 and P3
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_128.h
new file mode 100644
index 0000000000..27b367e940
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_128.h
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy a packed vector of 128 GF(16) elements from in to out.
+static inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
+    // 128 four-bit elements occupy 8 uint64_t limbs (16 elements per limb).
+    const int limb_count = 8;
+
+    // Limbs are copied lowest index first, one full 64-bit word at a time,
+    // so exactly limb_count words of in are read and of out are written.
+    for (int limb = 0; limb < limb_count; limb++) {
+        out[limb] = in[limb];
+    }
+}
+
+
+// Add a packed vector of 128 GF(16) elements into acc.
+static inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
+    // 128 four-bit elements occupy 8 uint64_t limbs (16 elements per limb).
+    const int limb_count = 8;
+
+    // Addition in a field of characteristic 2 is a plain XOR of the packed
+    // words; limbs are processed lowest index first.
+    for (int limb = 0; limb < limb_count; limb++) {
+        acc[limb] ^= in[limb];
+    }
+}
+
+// acc += x * in for a packed vector of 128 GF(16) elements (8 limbs); 0x2 is the field element x.
+static inline void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 8; ++limb) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], 0x2);
+    }
+}
+// acc += x^-1 * in for a packed vector of 128 GF(16) elements (8 limbs); 0x9 = x^3 + 1 is the inverse of x modulo x^4+x+1.
+static inline void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 8; ++limb) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], 0x9);
+    }
+}
+
+// acc += a * in for a packed vector of 128 GF(16) elements (8 limbs); a is a single field element.
+static inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int limb = 0; limb < 8; ++limb) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], a);
+    }
+}
+
+static 
+    inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
+    // Computes out = sum over a = 1..15 of a * bins[a] in GF(16), where bins[a] is the a-th group of 8 limbs; bins[0] is not read and bins is overwritten. Order matters: each call folds one bin into the next along a chain ending in bins[1].
+    m_vec_mul_add_x_inv_128(bins +  5 * 8, bins +  10 * 8); // 5 = 10 * x^-1, so bin 5 scaled by x^-1 joins bin 10
+    m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8);      // 11 = 12 * x, so bin 11 scaled by x joins bin 12
+    m_vec_mul_add_x_inv_128(bins +  10 * 8, bins +  7 * 8);
+    m_vec_mul_add_x_128(bins + 12 * 8, bins +  6 * 8);
+    m_vec_mul_add_x_inv_128(bins +  7 * 8, bins +  14 * 8);
+    m_vec_mul_add_x_128(bins +  6 * 8, bins +  3 * 8);
+    m_vec_mul_add_x_inv_128(bins +  14 * 8, bins +  15 * 8);
+    m_vec_mul_add_x_128(bins +  3 * 8, bins +  8 * 8);
+    m_vec_mul_add_x_inv_128(bins +  15 * 8, bins +  13 * 8);
+    m_vec_mul_add_x_128(bins +  8 * 8, bins +  4 * 8);
+    m_vec_mul_add_x_inv_128(bins +  13 * 8, bins +  9 * 8);
+    m_vec_mul_add_x_128(bins +  4 * 8, bins +  2 * 8);
+    m_vec_mul_add_x_inv_128(bins +   9 * 8, bins +  1 * 8);
+    m_vec_mul_add_x_128(bins +  2 * 8, bins +  1 * 8);
+    vec_copy_128(bins + 8, out); // bins + 8 is bin 1, which now holds the full sum
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_64.h
new file mode 100644
index 0000000000..9f7535c878
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_64.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy a packed vector of 64 GF(16) elements (4 uint64_t limbs) from in to out.
+static inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+    // Limbs are copied lowest index first.
+    for (int limb = 0; limb < 4; limb++) {
+        out[limb] = in[limb];
+    }
+}
+
+// Add a packed vector of 64 GF(16) elements (4 uint64_t limbs) into acc.
+static inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+    // Addition in characteristic 2 is an XOR of the packed words.
+    for (int limb = 0; limb < 4; limb++) {
+        acc[limb] ^= in[limb];
+    }
+}
+
+// acc += a * in for a packed vector of 64 GF(16) elements (4 limbs); a is a single field element.
+static inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int limb = 0; limb < 4; ++limb) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], a);
+    }
+}
+
+// acc += x * in for a packed vector of 64 GF(16) elements (4 limbs); 0x2 is the field element x.
+static inline void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 4; ++limb) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], 0x2);
+    }
+}
+// acc += x^-1 * in for a packed vector of 64 GF(16) elements (4 limbs); 0x9 = x^3 + 1 is the inverse of x modulo x^4+x+1.
+static inline void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 4; ++limb) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], 0x9);
+    }
+}
+
+static 
+inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+    // Computes out = sum over a = 1..15 of a * bins[a] in GF(16), where bins[a] is the a-th group of 4 limbs; bins[0] is not read and bins is overwritten. Order matters: each call folds one bin into the next along a chain ending in bins[1].
+    m_vec_mul_add_x_inv_64(bins +  5 * 4, bins +  10 * 4); // 5 = 10 * x^-1, so bin 5 scaled by x^-1 joins bin 10
+    m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4);      // 11 = 12 * x, so bin 11 scaled by x joins bin 12
+    m_vec_mul_add_x_inv_64(bins +  10 * 4, bins +  7 * 4);
+    m_vec_mul_add_x_64(bins + 12 * 4, bins +  6 * 4);
+    m_vec_mul_add_x_inv_64(bins +  7 * 4, bins +  14 * 4);
+    m_vec_mul_add_x_64(bins +  6 * 4, bins +  3 * 4);
+    m_vec_mul_add_x_inv_64(bins +  14 * 4, bins +  15 * 4);
+    m_vec_mul_add_x_64(bins +  3 * 4, bins +  8 * 4);
+    m_vec_mul_add_x_inv_64(bins +  15 * 4, bins +  13 * 4);
+    m_vec_mul_add_x_64(bins +  8 * 4, bins +  4 * 4);
+    m_vec_mul_add_x_inv_64(bins +  13 * 4, bins +  9 * 4);
+    m_vec_mul_add_x_64(bins +  4 * 4, bins +  2 * 4);
+    m_vec_mul_add_x_inv_64(bins +   9 * 4, bins +  1 * 4);
+    m_vec_mul_add_x_64(bins +  2 * 4, bins +  1 * 4);
+    vec_copy_64(bins + 4, out); // bins + 4 is bin 1, which now holds the full sum
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_96.h
new file mode 100644
index 0000000000..86359679fb
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_96.h
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy a packed vector of 96 GF(16) elements from in to out.
+static inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+    // 96 four-bit elements occupy 6 uint64_t limbs (16 elements per limb).
+    const int limb_count = 6;
+
+    for (int limb = 0; limb < limb_count; limb++) {
+        out[limb] = in[limb];
+    }
+}
+
+// Add a packed vector of 96 GF(16) elements into acc (XOR, characteristic 2).
+static inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+    // 96 four-bit elements occupy 6 uint64_t limbs (16 elements per limb).
+    const int limb_count = 6;
+
+    for (int limb = 0; limb < limb_count; limb++) {
+        acc[limb] ^= in[limb];
+    }
+}
+
+// acc += x * in for a packed vector of 96 GF(16) elements (6 limbs); 0x2 is the field element x.
+static inline void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 6; ++limb) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], 0x2);
+    }
+}
+
+// acc += x^-1 * in for a packed vector of 96 GF(16) elements (6 limbs); 0x9 = x^3 + 1 is the inverse of x modulo x^4+x+1.
+static inline void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 6; ++limb) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], 0x9);
+    }
+}
+
+// acc += a * in for a packed vector of 96 GF(16) elements (6 limbs); a is a single field element.
+static inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int limb = 0; limb < 6; ++limb) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], a);
+    }
+}
+
+static 
+inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+    // Computes out = sum over a = 1..15 of a * bins[a] in GF(16), where bins[a] is the a-th group of 6 limbs; bins[0] is not read and bins is overwritten. Order matters: each call folds one bin into the next along a chain ending in bins[1].
+    m_vec_mul_add_x_inv_96(bins +  5 * 6, bins +  10 * 6); // 5 = 10 * x^-1, so bin 5 scaled by x^-1 joins bin 10
+    m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6);      // 11 = 12 * x, so bin 11 scaled by x joins bin 12
+    m_vec_mul_add_x_inv_96(bins +  10 * 6, bins +  7 * 6);
+    m_vec_mul_add_x_96(bins + 12 * 6, bins +  6 * 6);
+    m_vec_mul_add_x_inv_96(bins +  7 * 6, bins +  14 * 6);
+    m_vec_mul_add_x_96(bins +  6 * 6, bins +  3 * 6);
+    m_vec_mul_add_x_inv_96(bins +  14 * 6, bins +  15 * 6);
+    m_vec_mul_add_x_96(bins +  3 * 6, bins +  8 * 6);
+    m_vec_mul_add_x_inv_96(bins +  15 * 6, bins +  13 * 6);
+    m_vec_mul_add_x_96(bins +  8 * 6, bins +  4 * 6);
+    m_vec_mul_add_x_inv_96(bins +  13 * 6, bins +  9 * 6);
+    m_vec_mul_add_x_96(bins +  4 * 6, bins +  2 * 6);
+    m_vec_mul_add_x_inv_96(bins +   9 * 6, bins +  1 * 6);
+    m_vec_mul_add_x_96(bins +  2 * 6, bins +  1 * 6);
+    vec_copy_96(bins + 6, out); // bins + 6 is bin 1, which now holds the full sum
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_common.h
new file mode 100644
index 0000000000..eeb13dc0bd
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/arithmetic_common.h
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include <stdint.h>
+#include <immintrin.h>
+
+#define K_OVER_2 ((K_MAX+1)/2) // K_MAX / 2 rounded up: table entries per column when two rows share one __m256i
+
+static const unsigned char __gf16_mulbase[128] __attribute__((aligned(32))) = { // four 32-byte rows: the tables i -> b*i (i = 0..15) for b = 1, 2, 4, 8, each stored twice so both 128-bit halves of a register match
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+    0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d,
+    0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09,
+    0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01, 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01
+};
+
+// Generate the multiplication table for the 4-bit field element 'b': byte i (0..15) of each
+// 128-bit half of the result is b*i in GF(16). It is the XOR of the rows of __gf16_mulbase
+// selected by the set bits of b, chosen with masks rather than branches. Taken from OV paper!
+static inline __m256i tbl32_gf16_multab2( uint8_t b ) {
+    // bx broadcasts the low nibble of b to every 16-bit lane; b1 = bx >> 1 moves bits 1 and 3 of b to positions 0 and 2
+    __m256i bx = _mm256_set1_epi16( b & 0xf );
+    __m256i b1 = _mm256_srli_epi16( bx, 1 );
+
+    const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0));
+    const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1));
+    const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2));
+    const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3));
+
+    __m256i mask_1  = _mm256_set1_epi16(1);
+    __m256i mask_4  = _mm256_set1_epi16(4);
+    __m256i mask_0  = _mm256_setzero_si256();
+    // (value & mask) > 0 gives an all-ones lane when the tested bit is set, so each row is kept or zeroed as a whole
+    return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) )
+           ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) )
+           ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) )
+           ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) );
+}
+
+static inline __m256i linear_transform_8x8_256b( __m256i tab_l, __m256i tab_h, __m256i v, __m256i mask_f ) { // per byte of v: tab_l[low nibble] ^ tab_h[high nibble]
+    return _mm256_xor_si256(_mm256_shuffle_epi8(tab_l, _mm256_and_si256(v, mask_f)), _mm256_shuffle_epi8(tab_h, _mm256_and_si256(_mm256_srli_epi16(v, 4), mask_f)));
+}
+
+static inline __m256i gf16v_mul_avx2( __m256i a, uint8_t b ) {
+    // Table for b applied to low nibbles, and the same table shifted left by 4 for high nibbles.
+    const __m256i lo_tab = tbl32_gf16_multab2( b );
+    const __m256i hi_tab = _mm256_slli_epi16( lo_tab, 4 );
+    return linear_transform_8x8_256b( lo_tab, hi_tab, a, _mm256_set1_epi8(0xf) );
+}
+
+// Build multiplication tables for O (V_MAX rows of O_MAX entries), two columns per __m256i:
+// the even column's table in the low nibbles, the odd column's table shifted into the high nibbles.
+static inline void mayo_O_multabs_avx2(const unsigned char *O, __m256i *O_multabs){
+    for (size_t row = 0; row < V_MAX; row++) {
+        const unsigned char *O_row = O + O_MAX*row;
+        __m256i *tabs_row = O_multabs + O_MAX/2*row;
+        for (size_t pair = 0; 2*pair < O_MAX; pair++) {
+            tabs_row[pair] = tbl32_gf16_multab2(O_row[2*pair]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(O_row[2*pair + 1]), 4);
+        }
+    }
+}
+
+
+static 
+inline void mayo_V_multabs_avx2(const unsigned char *V, __m256i *V_multabs){
+    // Build multiplication tables for V (K_MAX rows of V_MAX entries): per column c, rows r and r+1 share one __m256i (row r in the low nibbles, row r+1 in the high nibbles).
+    size_t r; // declared outside the loops: after the pair loop it indexes the unpaired last row when K_MAX is odd
+    for (size_t c = 0; c < V_MAX; c++)
+    {
+        for (r = 0; r+1 < K_MAX; r+= 2)
+        {
+            V_multabs[K_OVER_2*c +  r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(V[V_MAX*(r+1) + c]), 4);
+        }
+#if K_MAX % 2 == 1
+        V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]); // last row has no partner: high nibbles stay zero
+#endif
+    }
+}
+
+static const unsigned char mayo_gf16_mul[512] __attribute__((aligned(32))) = { // 16 rows of 32 bytes: row b is the table i -> b*i (i = 0..15) in GF(16), stored twice; indexed as mayo_gf16_mul + 32*b
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, 
+    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+    0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d, 
+    0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d,
+    0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02, 
+    0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02,
+    0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09, 
+    0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09,
+    0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06, 
+    0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06,
+    0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04, 
+    0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04,
+    0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b, 
+    0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b,
+    0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01, 
+    0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01,
+    0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e, 
+    0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e,
+    0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c, 
+    0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c,
+    0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03, 
+    0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03,
+    0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08, 
+    0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08,
+    0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07, 
+    0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07,
+    0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05, 
+    0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05,
+    0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a, 
+    0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a};
+
+
+// Build multiplication tables for S1 (K_MAX rows of V_MAX entries) by lookup in mayo_gf16_mul; entries must be 0..15, a larger value would index past the 512-byte table.
+static inline void mayo_S1_multabs_avx2(const unsigned char *S1, __m256i *S1_multabs) {
+    size_t r; // declared outside the loops: after the pair loop it indexes the unpaired last row when K_MAX is odd
+    for (size_t c = 0; c < V_MAX; c++)
+    {
+        for (r = 0; r+1 < K_MAX; r+= 2)
+        {
+            S1_multabs[K_OVER_2*c +  r/2] = _mm256_load_si256((const __m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c]))
+                                          ^ _mm256_slli_epi16(_mm256_load_si256((const __m256i *)(mayo_gf16_mul + 32*S1[V_MAX*(r+1) + c])), 4);
+        }
+#if K_MAX % 2 == 1
+        S1_multabs[K_OVER_2*c +  r/2] = _mm256_load_si256((const __m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c]));
+#endif
+    }
+}
+
+// Build multiplication tables for S2 (K_MAX rows of O_MAX entries) by lookup in mayo_gf16_mul; entries must be 0..15, a larger value would index past the 512-byte table.
+static inline void mayo_S2_multabs_avx2(const unsigned char *S2, __m256i *S2_multabs) {
+    // per column c, rows r and r+1 share one __m256i: row r in the low nibbles, row r+1 shifted into the high nibbles
+    size_t r; // declared outside the loops: after the pair loop it indexes the unpaired last row when K_MAX is odd
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        for (r = 0; r+1 < K_MAX; r+= 2)
+        {
+            S2_multabs[K_OVER_2*c +  r/2] = _mm256_load_si256((const __m256i *)(mayo_gf16_mul + 32*S2[O_MAX*r + c]))
+                                          ^ _mm256_slli_epi16(_mm256_load_si256((const __m256i *)(mayo_gf16_mul + 32*S2[O_MAX*(r+1) + c])), 4);
+        }
+#if K_MAX % 2 == 1
+        S2_multabs[K_OVER_2*c +  r/2] = _mm256_load_si256((const __m256i *)(mayo_gf16_mul + 32*S2[O_MAX*r + c]));
+#endif
+    }
+}
+
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) {
+    // Multiply each of the 16 nibbles of a (elements of GF(2)[x]/(x^4+x+1))
+    // by the single field element b, using shift-and-add over bits 0..3 of b.
+    // Only arithmetic on the bits of b is used: there is no branch or table
+    // lookup that depends on a or b.
+    const uint64_t top_bits = 0x8888888888888888ULL; // x^3 coefficient of every nibble
+    const uint64_t scalar = b;
+    uint64_t shifted = a;                             // a * x^bit, reduced
+    uint64_t product = shifted * (scalar & 1);        // contribution of bit 0
+
+    for (int bit = 1; bit <= 3; bit++) {
+        // Multiply every nibble by x: peel off the x^3 coefficients so the
+        // shift cannot spill into the neighbouring nibble, shift left by
+        // one, then add x^4 = x + 1 (binary 0011) wherever a coefficient
+        // was peeled off.
+        const uint64_t overflow = shifted & top_bits;
+        shifted ^= overflow;
+        shifted = (shifted << 1) ^ ((overflow >> 3) * 3);
+
+        // Accumulate when this bit of b is set (multiplication by 0 or 1).
+        product ^= shifted * ((scalar >> bit) & 1);
+    }
+    return product;
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/echelon_form.h b/src/sig/mayo/pqmayo_mayo_5_avx2/echelon_form.h
new file mode 100644
index 0000000000..fa69de0ab2
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/echelon_form.h
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <immintrin.h>
+#include <stdint.h>
+
+
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y)) // larger of x and y; each argument is expanded twice, so pass side-effect-free expressions
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y)) // smaller of x and y; same double-evaluation caveat
+
+
+//
+// generate multiplication table for '4-bit' variable 'b'. From https://eprint.iacr.org/2023/059/.
+// The result is the XOR of the __gf16_mulbase tables selected by the set bits of b, chosen with compare masks instead of branches.
+static inline __m256i tbl32_gf16_multab( uint8_t b ) {
+    __m256i bx = _mm256_set1_epi16( b & 0xf ); // low nibble of b broadcast to every 16-bit lane
+    __m256i b1 = _mm256_srli_epi16( bx, 1 ); // same value shifted right by one, so bits 1 and 3 of b sit at positions 0 and 2
+
+    const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0)); // four consecutive 32-byte tables (aligned loads); presumably table i holds the products with x^i -- __gf16_mulbase is defined outside this file
+    const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1));
+    const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2));
+    const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3));
+
+    __m256i mask_1  = _mm256_set1_epi16(1);
+    __m256i mask_4  = _mm256_set1_epi16(4);
+    __m256i mask_0  = _mm256_setzero_si256();
+
+    return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) ) // bit 0 of b selects tab0 (the &, ^ operators on __m256i rely on compiler vector extensions)
+           ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) ) // bit 1 of b selects tab1
+           ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) ) // bit 2 of b selects tab2
+           ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) ); // bit 3 of b selects tab3
+}
+
+/* put matrix in row echelon form with ones on first nonzero entries in constant time*/
+static inline void EF(unsigned char *A, int _nrows, int _ncols) { // A holds one field element per byte, row-major, and is transformed in place; the dimensions are fixed at compile time.
+
+    (void) _nrows; // the runtime dimensions are ignored: M_MAX x (K_MAX * O_MAX + 1) is always used
+    (void) _ncols;
+
+    #define nrows M_MAX // NOTE(review): these function-local macros are not #undef'd in this function, so they stay defined for the rest of the translation unit
+    #define ncols (K_MAX * O_MAX + 1)
+
+    #define AVX_REGS_PER_ROW ((K_MAX * O_MAX + 1 + 31) / 32) // 32-byte registers needed to hold one row of ncols bytes
+    #define MAX_COLS (AVX_REGS_PER_ROW * 32) // padded row length in bytes
+
+    __m256i _pivot_row[AVX_REGS_PER_ROW]; // scratch copy of the current pivot row
+    __m256i A_avx[AVX_REGS_PER_ROW* M_MAX]; // padded working copy of A; NOTE(review): not zero-initialised, so the MAX_COLS - ncols leading bytes of every row stay uninitialised (they are processed by the loop body but never copied back)
+
+    unsigned char* pivot_row_bytes = (unsigned char*) _pivot_row; // byte views of the two register arrays
+    unsigned char* A_bytes = (unsigned char*) A_avx;
+
+    // load A in the tail of AVX2 registers
+    for (int i = 0; i < nrows; i++) {
+        for (int j = 0; j < ncols; j++)
+        {
+            A_bytes[i*MAX_COLS + (MAX_COLS - ncols) + j] = A[ i*ncols + j ]; // right-align each row so its last element sits in the last byte of the padded row
+        }
+    }
+
+    // pivot row is secret, pivot col is not
+    unsigned char inverse; // assigned inside echelon_form_loop.h
+    int pivot_row = 0;
+    int pivot_col = MAYO_MAX(MAX_COLS - ncols,0); // first padded column that holds real data
+    for (; pivot_col < MAX_COLS-128; pivot_col++) { // the same body is included five times over consecutive 32-column segments; presumably so the compiler can specialise on pivot_col/32 -- TODO confirm
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-96; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-64; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-32; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+
+    // write the matrix A back
+    for (int i = 0; i < nrows; i++) {
+        for (int j = 0; j < ncols; j++) {
+            A[i * ncols + j] = A_bytes[i*AVX_REGS_PER_ROW*32 + (MAX_COLS - ncols) + j]; // AVX_REGS_PER_ROW*32 equals MAX_COLS, so this mirrors the load indexing above
+        }
+    }
+    mayo_secure_clear(_pivot_row, AVX_REGS_PER_ROW * 32); // wipe the working copies, which hold secret-dependent data
+    mayo_secure_clear(A_avx, AVX_REGS_PER_ROW * 32 * nrows);
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/echelon_form_loop.h b/src/sig/mayo/pqmayo_mayo_5_avx2/echelon_form_loop.h
new file mode 100644
index 0000000000..b8b29741c4
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/echelon_form_loop.h
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+
+int pivot_col_rounded = pivot_col/32; // index of the 32-byte register that contains pivot_col; registers before it are skipped
+// NOTE: this fragment is textually #included inside the pivot-column loops of EF() and uses pivot_col, pivot_row, inverse, _pivot_row, pivot_row_bytes, A_avx, A_bytes, nrows, ncols, MAX_COLS and AVX_REGS_PER_ROW from that scope.
+int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - MAX_COLS); // nrows minus the number of columns still to process, clamped at 0
+int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col - MAX_COLS + ncols); // unpadded column index, clamped to the last row
+/* the pivot row is guaranteed to be between these lower and upper bounds if A has full rank*/
+
+/* zero out pivot row */
+for (int i = pivot_col_rounded; i < AVX_REGS_PER_ROW; i++) {
+    _pivot_row[i] = _mm256_set1_epi8(0);
+}
+
+/* try to get a pivot row in constant time */
+unsigned char pivot = 0;
+uint32_t pivot_is_zero = -1; // all-ones until a nonzero pivot element has been accumulated
+for (int row = pivot_row_lower_bound;
+        row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) { // scans at most 32 rows past the upper bound
+    uint32_t is_pivot_row = ~ct_compare_32(row, pivot_row); // presumably ct_compare_32 returns 0 on equality and all-ones otherwise, making this an all-ones mask when row == pivot_row -- helper defined elsewhere
+    uint32_t below_pivot_row = ct_is_greater_than(row, pivot_row); // presumably all-ones when row > pivot_row
+    __m256i mask = _mm256_set1_epi32( is_pivot_row | (below_pivot_row & pivot_is_zero) ); // take the row at pivot_row, plus every later row for as long as the pivot element is still zero
+    for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+        _pivot_row[j] ^= mask & A_avx[row * AVX_REGS_PER_ROW + j];
+    }
+    pivot = pivot_row_bytes[pivot_col]; // re-read the candidate pivot element after each accumulated row
+    pivot_is_zero = ~ct_compare_32((int) pivot, 0);
+}
+
+/* multiply pivot row by inverse of pivot */
+inverse = inverse_f(pivot); // inverse_f is defined elsewhere; presumably the GF(16) inverse
+__m256i inverse_multab = tbl32_gf16_multab(inverse);
+
+for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+    _pivot_row[j] = _mm256_shuffle_epi8(inverse_multab, _pivot_row[j]); // per-byte table lookup: each element is replaced by the table entry it indexes
+}
+
+/* conditionally write pivot row to the correct row, if there is a nonzero pivot */
+/* eliminate entries below pivot */
+for (int row = pivot_row_lower_bound; row < nrows; row++) {
+    unsigned char below_pivot =  (unsigned char) (ct_is_greater_than(row, pivot_row)); // mask truncated to one byte (presumably 0xFF or 0x00)
+    unsigned char elt_to_elim = A_bytes[row*AVX_REGS_PER_ROW*32 + pivot_col]; // this row's entry in the pivot column
+
+    __m256i multab = tbl32_gf16_multab(below_pivot & elt_to_elim); // table for the entry if row is below the pivot row, otherwise the table for 0
+    if (row <= pivot_row_upper_bound) { // this branch depends only on the loop index and bounds derived from pivot_col, not on the secret pivot_row
+        __m256i mask = _mm256_set1_epi32(~ct_compare_32(row, pivot_row) & ~pivot_is_zero); // select the normalised pivot row only when row == pivot_row and a nonzero pivot was found
+        for (int col = pivot_col_rounded; col < AVX_REGS_PER_ROW; col++) { 
+            A_avx[row*AVX_REGS_PER_ROW + col] = _mm256_blendv_epi8(A_avx[row*AVX_REGS_PER_ROW + col], _pivot_row[col], mask) ^
+                                                    _mm256_shuffle_epi8(multab, _pivot_row[col]); // add (entry * pivot row) to cancel the entry in the pivot column
+        }
+    } else {
+        for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+            A_avx[row*AVX_REGS_PER_ROW + j] ^= _mm256_shuffle_epi8(multab, _pivot_row[j]); // rows past the upper bound can never be the pivot row, so only elimination is needed
+        }
+    }
+}
+
+pivot_row += (-(int32_t)(~pivot_is_zero)); // adds 1 when ~pivot_is_zero is all-ones (nonzero pivot found) and 0 otherwise, without branching
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/mayo.c b/src/sig/mayo/pqmayo_mayo_5_avx2/mayo.c
new file mode 100644
index 0000000000..ce1ccd4c71
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/mayo.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mem.h>
+#include <mayo.h>
+#include <randombytes.h>
+#include <aes_ctr.h>
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <fips202.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y)) // smaller of x and y; each argument is expanded twice, so pass side-effect-free expressions
+#define PK_PRF AES_128_CTR // function used below to expand the public seed into P1 and P2
+
+static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) { // Unpacks mdeclen 4-bit values from m into mdec, one value per output byte, low nibble first.
+    int i;
+    for (i = 0; i < mdeclen / 2; ++i) {
+        *mdec++ = m[i] & 0xf; // low nibble becomes the even-indexed element
+        *mdec++ = m[i] >> 4; // high nibble becomes the odd-indexed element
+    }
+
+    if (mdeclen % 2 == 1) {
+        *mdec++ = m[i] & 0x0f; // odd length: the last element is the low nibble of one further input byte; its high nibble is ignored
+    }
+}
+
+static void encode(const unsigned char *m, unsigned char *menc, int mlen) { // Packs mlen values from m into menc, two per byte, with the first value of each pair in the low nibble.
+    int i;
+    for (i = 0; i < mlen / 2; ++i, m += 2) {
+        menc[i] = (*m) | (*(m + 1) << 4); // no masking is applied, so the inputs must already be in the range 0..15
+    }
+
+    if (mlen % 2 == 1) {
+        menc[i] = (*m); // odd length: the last value is stored alone in a final byte
+    }
+}
+
+static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){ // Folds the k x k blocks of _vPv into one m-nibble vector (multiplying by X and reducing mod f(X) between blocks) and writes y[i] = t[i] XOR that vector.
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+
+    const uint64_t *vPv = _vPv;
+    uint64_t temp[M_MAX/16] = {0}; // accumulator: 16 nibbles per 64-bit word
+    unsigned char *temp_bytes = (unsigned char *) temp; // byte view used for the reduction and the final unpacking
+    int k = 0; // word index into temp (not the MAYO parameter k)
+    for (int i = PARAM_k(p) - 1; i >= 0 ; i--) {
+        for (int j = i; j < PARAM_k(p); j++) {
+            // multiply by X (shift up 4 bits)
+            unsigned char top = temp[k] >> 60; // nibble shifted out of word k; on the very first pass temp is all zero, so starting at k = 0 is harmless
+            temp[k] <<= 4;
+            k--;
+            for(; k>=0; k--){
+                temp[k+1] ^= temp[k] >> 60; // carry the top nibble of each lower word into the word above
+                temp[k] <<= 4;
+            }
+            // reduce mod f(X)
+            for (int jj = 0; jj < F_TAIL_LEN; jj++) {
+                if(jj%2 == 0){ // even positions are the low nibble of byte jj/2
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#endif
+                }
+                else { // odd positions are the high nibble of byte jj/2
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#endif
+                }
+            }
+
+            // extract from vPv and add
+            for(k=0; k < PARAM_m(p)/16; k ++){
+                temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]); // block (i,j) plus, off the diagonal, its mirror block (j,i)
+            }
+            k--; // leave k at the most significant word for the next shift
+        }
+    }
+
+    // add to y
+    for (int i = 0; i < PARAM_m(p); i+=2) // two outputs per accumulator byte
+    {
+#ifdef TARGET_BIG_ENDIAN
+        y[i]   = t[i]   ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+        y[i]   = t[i]   ^ (temp_bytes[i/2] & 0xF); // t[i] is read before y[i] is written, so calling with y == t (as mayo_verify does) is fine
+        y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4);
+#endif
+
+    }
+}
+
+static void transpose_16x16_nibbles(uint64_t *M){ // In-place transform of 16 consecutive words (16 nibbles each) using four masked-swap stages at 4-, 8-, 16- and 32-bit granularity; per the name, a 16x16 nibble transpose.
+    static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f;
+    static const uint64_t even_bytes   = 0x00ff00ff00ff00ff;
+    static const uint64_t even_2bytes  = 0x0000ffff0000ffff;
+    static const uint64_t even_half    = 0x00000000ffffffff;
+
+    for (size_t i = 0; i < 16; i+=2) // stage 1: exchange nibbles between adjacent words i and i+1
+    {
+        uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles; 
+        M[i  ] ^= t << 4;
+        M[i+1] ^= t;
+    }
+
+    for (size_t i = 0; i < 16; i+=4) // stage 2: exchange bytes between words two apart, two pairs per iteration
+    {
+        uint64_t t0 = ((M[i  ] >> 8) ^ M[i+2]) & even_bytes; 
+        uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes; 
+        M[i  ] ^= (t0 << 8);
+        M[i+1] ^= (t1 << 8);
+        M[i+2] ^= t0;
+        M[i+3] ^= t1;
+    }
+    
+    for (size_t i = 0; i < 4; i++) // stage 3: exchange 16-bit groups between words four apart (pairs i/i+4 and i+8/i+12)
+    {
+        uint64_t t0 = ((M[i  ] >> 16) ^ M[i+ 4]) & even_2bytes;
+        uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes;
+
+        M[i   ] ^= t0 << 16;
+        M[i+ 8] ^= t1 << 16;
+        M[i+ 4] ^= t0;
+        M[i+12] ^= t1;
+    }
+    
+    for (size_t i = 0; i < 8; i++) // stage 4: exchange 32-bit halves between words eight apart
+    {
+        uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half; 
+        M[i  ] ^= t << 32;
+        M[i+8] ^= t;
+    }
+}
+
+#define MAYO_M_OVER_8 ((M_MAX + 7) / 8) // M_MAX / 8 rounded up; sizes the row dimension of the scratch matrix in compute_A
+
+static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){ // Builds the linear system matrix from the blocks in _VtL and writes it to A_out, one element per byte with row stride PARAM_A_cols(p).
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+    
+    const uint64_t *VtL = _VtL;
+    int bits_to_shift = 0; // current shift, split into a sub-word bit count ...
+    int words_to_shift = 0; // ... and a whole-word count
+    uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0}; // scratch matrix, zero-initialised; its width is rounded up to a multiple of 16
+    size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16;
+    const uint64_t *Mi, *Mj;
+
+    for (int i = 0; i <= PARAM_k(p) - 1; ++i) {
+        for (int j = PARAM_k(p) - 1; j >= i; --j) {
+            // add the M_i and M_j to A, shifted "down" by l positions
+            Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p);
+            for (int c = 0; c < PARAM_o(p); c++) {
+                for (int k = 0; k < PARAM_m(p)/16; k++)
+                {
+                    A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift; // M_j goes into column block i; note c*PARAM_m(p)/16 parses as (c*m)/16
+                    if(bits_to_shift > 0){ // guard avoids the undefined shift by 64 when bits_to_shift is 0
+                        A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); // bits shifted out spill into the next word row
+                    }
+                }
+            }
+
+            if (i != j) {
+                Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p);
+                for (int c = 0; c < PARAM_o(p); c++) {
+                    for (int k = 0; k < PARAM_m(p)/16; k++)
+                    {
+                        A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift; // symmetric term: M_i goes into column block j
+                        if(bits_to_shift > 0){
+                            A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                        }
+                    }
+                }
+            }
+
+            bits_to_shift += 4; // each (i, j) pair moves the shift on by one nibble
+            if(bits_to_shift == 64){
+                words_to_shift ++;
+                bits_to_shift = 0;
+            }
+        }
+    }
+
+    for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16) // run the 16-word nibble transform over every 16-word tile spanning m + k(k+1)/2 rows
+    {
+        transpose_16x16_nibbles(A + c);
+    }
+
+    unsigned char tab[F_TAIL_LEN*4] = {0}; // products of each f_tail coefficient with 1, 2, 4 and 8
+    for (size_t i = 0; i < F_TAIL_LEN; i++)
+    {
+        tab[4*i]   = mul_f(PARAM_f_tail(p)[i],1);
+        tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2);
+        tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4);
+        tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8);
+    }
+
+    uint64_t low_bit_in_nibble = 0x1111111111111111;
+    
+    for (size_t c = 0; c < A_width; c+= 16)
+    {
+        for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++) // fold every row at index >= m back into rows r-m .. r-m+F_TAIL_LEN-1
+        {
+            size_t pos = (r/16)*A_width + c + (r%16);
+            uint64_t t0 =  A[pos]       & low_bit_in_nibble; // split each nibble of the row into its four bits
+            uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble;
+            uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble;
+            uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble;
+            for (size_t t = 0; t < F_TAIL_LEN; t++)
+            {
+                A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3]; // a 0/1 bit times a table value < 16 stays inside its nibble; (r+t)%16 matches (r+t-m)%16 only when m is a multiple of 16
+            }
+        }
+    }
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) 
+        A[i] = BSWAP64(A[i]);
+#endif
+
+    for (int r = 0; r < PARAM_m(p); r+=16)
+    {
+        for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16)
+        {
+            for (size_t i = 0; i < 16; i++)
+            {
+                decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c)); // unpack up to 16 nibbles per word into row r+i; the last column of A_out is not written here
+            }   
+        }
+    }
+}
+
+// Public API
+
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) { // Generates a key pair by delegating to mayo_keypair_compact and returns its status code.
+    int ret = 0;
+
+    ret = mayo_keypair_compact(p, pk, sk);
+    if (ret != MAYO_OK) {
+        goto err;
+    }
+
+err: // both paths reach this label; ret carries the result either way
+    return ret;
+}
+
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, // Signs message m (mlen bytes) with compact secret key csk; writes the signature to sig and its length to *siglen.
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    int ret = MAYO_OK;
+    unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data
+    unsigned char y[M_MAX];                    // secret data; NOTE(review): unlike the other secret buffers, y is not cleared at the end of this function
+    unsigned char salt[SALT_BYTES_MAX];        // not secret data
+    unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX],
+             Vdec[N_MINUS_O_MAX * K_MAX];                 // secret data
+    unsigned char A[M_MAX * (K_MAX * O_MAX + 1)];   // secret data
+    unsigned char x[K_MAX * N_MAX];                       // not secret data
+    unsigned char r[K_MAX * O_MAX + 1] = { 0 };           // secret data
+    unsigned char s[K_MAX * N_MAX];                       // not secret data
+    const unsigned char *seed_sk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data
+    alignas(32) sk_t sk;                    // secret data
+    unsigned char Ox[N_MINUS_O_MAX];        // secret data
+    // unsigned char Mdigest[DIGEST_BYTES];
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1]; // hash input buffer: digest || salt || seed_sk || counter byte
+    unsigned char *ctrbyte;
+    unsigned char *vi;
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_v_bytes = PARAM_v_bytes(p);
+    const int param_r_bytes = PARAM_r_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_A_cols = PARAM_A_cols(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    ret = mayo_expand_sk(p, csk, &sk);
+    if (ret != MAYO_OK) {
+        goto err;
+    }
+
+    seed_sk = csk; // the compact secret key is used directly as the seed
+    decode(sk.o, O, (param_n - param_o) * param_o);
+
+    // hash message
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    uint64_t *P1 = sk.p;
+    uint64_t *L  = P1 + (param_P1_bytes/8); // L follows P1 inside sk.p (mayo_expand_sk stores it where P2 was)
+    alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0};
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        L[i] = BSWAP64(L[i]);
+    }
+#endif
+
+    // choose the randomizer
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(tmp + param_digest_bytes, param_salt_bytes);
+    #else
+    if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // hashing to salt
+    memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk,
+           param_sk_seed_bytes);
+    shake256(salt, param_salt_bytes, tmp,
+             param_digest_bytes + param_salt_bytes + param_sk_seed_bytes); // salt = SHAKE256(digest || randomizer || seed_sk)
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret
+#endif
+
+    // hashing to t
+    memcpy(tmp + param_digest_bytes, salt, param_salt_bytes); // overwrite the randomizer with the derived salt; seed_sk stays in place after it
+    ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes;
+
+    shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); // t depends only on digest || salt
+
+    decode(tenc, t, param_m); // may not be necessary
+
+    for (int ctr = 0; ctr <= 255; ++ctr) { // retry with a fresh counter byte until sample_solution succeeds (at most 256 attempts)
+        *ctrbyte = (unsigned char)ctr;
+
+        shake256(V, param_k * param_v_bytes + param_r_bytes, tmp,
+                 param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1);
+
+        // decode the v_i vectors
+        for (int i = 0; i <= param_k - 1; ++i) {
+            decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o),
+                   param_n - param_o);
+        }
+
+        // compute all the V * L matrices.
+        // compute all the V * P1 * V^T matrices.
+        alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0};
+        V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y);
+
+        compute_rhs(p, Y, t, y);
+        compute_A(p, Mtmp, A);
+
+        decode(V + param_k * param_v_bytes, r, // r is taken from the bytes of V that follow the k encoded v_i vectors
+               param_k *
+               param_o);
+        if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) {
+            break;
+        } else {
+            memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t)); // reset the accumulator before the next attempt
+        }
+    }
+    // NOTE(review): if all 256 attempts fail, execution still falls through, builds a signature from whatever x holds and returns MAYO_OK.
+    // s is not zero-initialised; each n-element row is written below (presumably the first n-o entries by mat_add, the last o by memcpy)
+    // TODO: optimize this?
+    for (int i = 0; i <= param_k - 1; ++i) {
+        vi = Vdec + i * (param_n - param_o);
+        mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1);
+        mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1);
+        memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o);
+    }
+    encode(s, sig, param_n * param_k);
+    memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes); // the salt occupies the last salt_bytes of the signature
+    *siglen = param_sig_bytes;
+err: // reached on success as well; wipes the secret-dependent buffers
+    mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX);
+    mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX);
+    mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1));
+    mayo_secure_clear(r, K_MAX * O_MAX + 1);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(&sk, sizeof(sk_t));
+    mayo_secure_clear(Ox, N_MINUS_O_MAX);
+    mayo_secure_clear(tmp,
+                      DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1);
+    return ret;
+}
+
+int mayo_sign(const mayo_params_t *p, unsigned char *sm, // Produces the signed message sm = signature || m and stores its total length in *smlen.
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    int ret = MAYO_OK;
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    size_t siglen = param_sig_bytes;
+    ret = mayo_sign_signature(p, sm, &siglen, m, mlen, csk); // the signature is written to the start of sm
+    if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes) // NOTE(review): on a siglen mismatch ret may still be MAYO_OK, so the caller would see success with *smlen unset
+        goto err;
+
+    memmove(sm + param_sig_bytes, m, mlen); // NOTE(review): m is moved only after the signature was written, so an m that overlaps the first sig_bytes of sm is already overwritten -- confirm whether in-place use is supported
+    *smlen = siglen + mlen;
+err:
+    return ret;
+}
+
+int mayo_open(const mayo_params_t *p, unsigned char *m, // Verifies a signed message sm = signature || message and, on success, copies the message to m and its length to *mlen.
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk) {
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    if (smlen < (size_t)param_sig_bytes) { // too short to contain a signature
+        return MAYO_ERR;
+    }
+    int result = mayo_verify(p, sm + param_sig_bytes, smlen - param_sig_bytes, sm, // message part first, then the signature at the start of sm
+                             pk);
+
+    if (result == MAYO_OK) {
+        *mlen = smlen - param_sig_bytes;
+        memmove(m, sm + param_sig_bytes, *mlen); // memmove, so m may overlap sm; on failure m and *mlen are left untouched
+    }
+
+    return result;
+}
+
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, // Generates a compact key pair: csk receives a random seed, cpk receives seed_pk followed by Upper(P3).
+                         unsigned char *csk) {
+    int ret = MAYO_OK;
+    unsigned char *seed_sk = csk; // the compact secret key is just the seed, written in place
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; // SHAKE output: seed_pk followed by the encoded O
+    alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8];
+    alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0};
+
+    unsigned char *seed_pk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    // seed_sk $←- B^(sk_seed bytes)
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(seed_sk, param_sk_seed_bytes);
+    #else
+    if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // S ← shake256(seedsk, pk seed bytes + O bytes)
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    // seed_pk ← s[0 : pk_seed_bytes]
+    seed_pk = S;
+
+    // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes])
+    decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+
+    int m_legs = param_m / 32;
+
+    uint64_t *P1 = P;
+    uint64_t *P1O_P2 = P + (param_P1_bytes / 8); // points at the P2 part of P; presumably overwritten in place with P1*O + P2 by the call below
+
+    // compute P3 = O^t * (P1*O + P2)
+    Ot_times_P1O_P2(p, P1, O, P1O_P2, P3);
+
+    // store seed_pk in cpk
+    memcpy(cpk, seed_pk, param_pk_seed_bytes);
+
+    alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8];
+
+    // compute Upper(P3) and store in cpk
+    m_upper(m_legs, P3, P3_upper, param_o);
+    
+    memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes);
+
+
+#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL)
+err: // the label exists only in builds where randombytes can fail, matching the only goto above
+#endif
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); // NOTE(review): S (which holds the encoded O) and P are not cleared here, whereas mayo_expand_sk clears S -- confirm this is intended
+    mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2); // byte count: (O_MAX * O_MAX * M_MAX / 16) words of 8 bytes each
+    return ret;
+}
+
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, // Expands compact public key cpk into pk as P1 || P2 (regenerated from the seed) followed by the stored P3 bytes.
+                   unsigned char *pk) {
+    #ifdef MAYO_VARIANT
+    (void)p;
+    #endif
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    PK_PRF(pk, param_P1_bytes + param_P2_bytes, cpk, param_pk_seed_bytes); // the seed is the first pk_seed_bytes of cpk
+    pk += param_P1_bytes + param_P2_bytes;
+    memmove(pk, cpk + param_pk_seed_bytes, param_P3_bytes); // P3 is copied verbatim from cpk
+    return MAYO_OK; // always succeeds
+}
+
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, // Expands compact secret key csk into sk: sk->p receives P1 followed by L, and sk->o receives the encoded O.
+                   sk_t *sk) {
+    int ret = MAYO_OK; // never changed below, so this function always returns MAYO_OK
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; // SHAKE output: seed_pk followed by the encoded O
+    uint64_t *P = sk->p;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_v(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    const unsigned char *seed_sk = csk;
+    unsigned char *seed_pk = S;
+
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    decode(S + param_pk_seed_bytes, O,
+           param_v * param_o); // O = S + PK_SEED_BYTES;
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    uint64_t *P2 = P + (param_P1_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+    
+    uint64_t *P1 = P;
+    // compute L_i = (P1 + P1^t)*O + P2
+    uint64_t *L = P2; // L aliases P2, so the result replaces P2 inside sk->p
+    P1P1t_times_O(p, P1, O, L);
+
+    // write to sk
+    memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes); // O is stored in its packed form, as produced by SHAKE
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { // second swap undoes the one above, so sk->p keeps its original byte order
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+
+    mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    return ret;
+}
+
+int mayo_verify(const mayo_params_t *p, const unsigned char *m, // Verifies signature sig on message m under compact public key cpk; returns MAYO_OK if valid, MAYO_ERR otherwise.
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *cpk) {
+    unsigned char tEnc[M_BYTES_MAX];
+    unsigned char t[M_MAX];
+    unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X)
+    unsigned char s[K_MAX * N_MAX];
+    alignas (64) uint64_t pk[EPK_BYTES_MAX / 8]; // expanded public key: P1 || P2 || P3
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX]; // hash input buffer: digest || salt
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk);
+    if (ret != MAYO_OK) {
+        return MAYO_ERR;
+    }
+
+    uint64_t *P1 = pk;
+    uint64_t *P2 = pk + (param_P1_bytes / 8);
+    uint64_t *P3 = P2 + (param_P2_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        P2[i] = BSWAP64(P2[i]);
+    }
+    for (int i = 0; i < param_P3_bytes / 8; ++i) {
+        P3[i] = BSWAP64(P3[i]);
+    }
+#endif
+
+    // hash m
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    // compute t
+    memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes, // the salt is the last salt_bytes of the signature
+           param_salt_bytes);
+    shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+    decode(tEnc, t, param_m);
+
+    // decode s
+    decode(sig, s, param_k * param_n);
+
+    // Compute S*P*S^T
+    alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+    m_calculate_PS_SPS(P1, P2, P3, s, param_m,
+                             param_v, param_o, param_k, SPS);
+
+    // combine the vectors in SPS and reduce mod f(X)
+    compute_rhs(p, SPS, y, y); // y is all zero and passed as both t and the output, so the result is the evaluated value itself
+    
+    if (memcmp(y, t, param_m) == 0) { // compare the evaluation with the target derived from the message hash and salt
+        return MAYO_OK; // good signature
+    }
+    return MAYO_ERR; // bad signature
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/mayo.h b/src/sig/mayo/pqmayo_mayo_5_avx2/mayo.h
new file mode 100644
index 0000000000..1de4bf2a33
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/mayo.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MAYO_H
+#define MAYO_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define F_TAIL_LEN 5 // every F_TAIL_* initializer below lists this many coefficients
+#define F_TAIL_64                                                              \
+  { 8, 0, 2, 8, 0 } // f(z) =  z^64         + x^3*z^3 + x*z^2         + x^3
+#define F_TAIL_96                                                              \
+  { 2, 2, 0, 2, 0 } // f(z) =  z^96         +   x*z^3         + x*z   + x
+#define F_TAIL_128                                                             \
+  { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3         + x^3*z + x^2
+
+#define MAYO_1_name "MAYO_1" // constants below are picked up through PARAM_NAME() when MAYO_VARIANT is MAYO_1
+#define MAYO_1_n 66
+#define MAYO_1_m 64
+#define MAYO_1_o 8
+#define MAYO_1_v (MAYO_1_n - MAYO_1_o) // 58
+#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1) // 73; MAYO_1_k is defined on the next line, which is fine because macros expand at the point of use
+#define MAYO_1_k 9
+#define MAYO_1_q 16
+#define MAYO_1_m_bytes 32
+#define MAYO_1_O_bytes 232
+#define MAYO_1_v_bytes 29
+#define MAYO_1_r_bytes 36
+#define MAYO_1_P1_bytes 54752
+#define MAYO_1_P2_bytes 14848
+#define MAYO_1_P3_bytes 1152
+#define MAYO_1_csk_bytes 24
+#define MAYO_1_esk_bytes 69856 // numerically sk_seed_bytes + O_bytes + P1_bytes + P2_bytes
+#define MAYO_1_cpk_bytes 1168 // numerically pk_seed_bytes + P3_bytes
+#define MAYO_1_epk_bytes 70752 // numerically P1_bytes + P2_bytes + P3_bytes
+#define MAYO_1_sig_bytes 321
+#define MAYO_1_f_tail F_TAIL_64
+#define MAYO_1_f_tail_arr f_tail_64 // array defined in params.c under ENABLE_PARAMS_DYNAMIC
+#define MAYO_1_salt_bytes 24
+#define MAYO_1_digest_bytes 32
+#define MAYO_1_pk_seed_bytes 16
+#define MAYO_1_sk_seed_bytes 24
+
+#define MAYO_2_name "MAYO_2" // constants below are picked up through PARAM_NAME() when MAYO_VARIANT is MAYO_2
+#define MAYO_2_n 78
+#define MAYO_2_m 64
+#define MAYO_2_o 18
+#define MAYO_2_v (MAYO_2_n - MAYO_2_o) // 60
+#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1) // 73; MAYO_2_k is defined on the next line, which is fine because macros expand at the point of use
+#define MAYO_2_k 4
+#define MAYO_2_q 16
+#define MAYO_2_m_bytes 32
+#define MAYO_2_O_bytes 540
+#define MAYO_2_v_bytes 30
+#define MAYO_2_r_bytes 36
+#define MAYO_2_P1_bytes 58560
+#define MAYO_2_P2_bytes 34560
+#define MAYO_2_P3_bytes 5472
+#define MAYO_2_csk_bytes 24
+#define MAYO_2_esk_bytes 93684 // numerically sk_seed_bytes + O_bytes + P1_bytes + P2_bytes
+#define MAYO_2_cpk_bytes 5488 // numerically pk_seed_bytes + P3_bytes
+#define MAYO_2_epk_bytes 98592 // numerically P1_bytes + P2_bytes + P3_bytes
+#define MAYO_2_sig_bytes 180
+#define MAYO_2_f_tail F_TAIL_64
+#define MAYO_2_f_tail_arr f_tail_64 // array defined in params.c under ENABLE_PARAMS_DYNAMIC
+#define MAYO_2_salt_bytes 24
+#define MAYO_2_digest_bytes 32
+#define MAYO_2_pk_seed_bytes 16
+#define MAYO_2_sk_seed_bytes 24
+
+#define MAYO_3_name "MAYO_3" // constants below are picked up through PARAM_NAME() when MAYO_VARIANT is MAYO_3
+#define MAYO_3_n 99
+#define MAYO_3_m 96
+#define MAYO_3_o 10
+#define MAYO_3_v (MAYO_3_n - MAYO_3_o) // 89
+#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1) // 111; MAYO_3_k is defined on the next line, which is fine because macros expand at the point of use
+#define MAYO_3_k 11
+#define MAYO_3_q 16
+#define MAYO_3_m_bytes 48
+#define MAYO_3_O_bytes 445
+#define MAYO_3_v_bytes 45
+#define MAYO_3_r_bytes 55
+#define MAYO_3_P1_bytes 192240
+#define MAYO_3_P2_bytes 42720
+#define MAYO_3_P3_bytes 2640
+#define MAYO_3_csk_bytes 32
+#define MAYO_3_esk_bytes 235437 // numerically sk_seed_bytes + O_bytes + P1_bytes + P2_bytes
+#define MAYO_3_cpk_bytes 2656 // numerically pk_seed_bytes + P3_bytes
+#define MAYO_3_epk_bytes 237600 // numerically P1_bytes + P2_bytes + P3_bytes
+#define MAYO_3_sig_bytes 577
+#define MAYO_3_f_tail F_TAIL_96
+#define MAYO_3_f_tail_arr f_tail_96 // array defined in params.c under ENABLE_PARAMS_DYNAMIC
+#define MAYO_3_salt_bytes 32
+#define MAYO_3_digest_bytes 48
+#define MAYO_3_pk_seed_bytes 16
+#define MAYO_3_sk_seed_bytes 32
+
+#define MAYO_5_name "MAYO_5" // constants below are picked up through PARAM_NAME() when MAYO_VARIANT is MAYO_5
+#define MAYO_5_n 133
+#define MAYO_5_m 128
+#define MAYO_5_o 12
+#define MAYO_5_v (MAYO_5_n - MAYO_5_o) // 121
+#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1) // 145; MAYO_5_k is defined on the next line, which is fine because macros expand at the point of use
+#define MAYO_5_k 12
+#define MAYO_5_q 16
+#define MAYO_5_m_bytes 64
+#define MAYO_5_O_bytes 726
+#define MAYO_5_v_bytes 61
+#define MAYO_5_r_bytes 72
+#define MAYO_5_P1_bytes 472384
+#define MAYO_5_P2_bytes 92928
+#define MAYO_5_P3_bytes 4992
+#define MAYO_5_csk_bytes 40
+#define MAYO_5_esk_bytes 566078 // numerically sk_seed_bytes + O_bytes + P1_bytes + P2_bytes
+#define MAYO_5_cpk_bytes 5008 // numerically pk_seed_bytes + P3_bytes
+#define MAYO_5_epk_bytes 570304 // numerically P1_bytes + P2_bytes + P3_bytes
+#define MAYO_5_sig_bytes 838
+#define MAYO_5_f_tail F_TAIL_128
+#define MAYO_5_f_tail_arr f_tail_128 // array defined in params.c under ENABLE_PARAMS_DYNAMIC
+#define MAYO_5_salt_bytes 40
+#define MAYO_5_digest_bytes 64
+#define MAYO_5_pk_seed_bytes 16
+#define MAYO_5_sk_seed_bytes 40
+
+#define PARAM_JOIN2_(a, b) a##_##b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b) // extra level so the arguments are macro-expanded before ## pastes them
+#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end) // e.g. MAYO_5_n when MAYO_VARIANT is MAYO_5
+// MAYO_NAMESPACE(s): pqmayo_<MAYO_VARIANT>_<ref|opt|avx2>_s when MAYO_VARIANT is defined, plain s otherwise.
+#if defined(MAYO_VARIANT)
+#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s)
+
+#if defined(MAYO_BUILD_TYPE_REF)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(MAYO_BUILD_TYPE_OPT)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(MAYO_BUILD_TYPE_AVX2)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s)
+#else
+#error "Build type not known"
+#endif
+
+#else // no MAYO_VARIANT: public symbols keep their unprefixed names
+#define MAYO_NAMESPACE(s) s
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC // buffer-sizing limits: the largest value of each quantity over MAYO_1/2/3/5
+#define NAME_MAX mayo5 // NOTE(review): same identifier as POSIX NAME_MAX from <limits.h>; may clash if that header is included - confirm
+#define N_MAX 133
+#define M_MAX 128
+#define O_MAX 18 // MAYO_2_o
+#define V_MAX 121
+#define K_MAX 12
+#define Q_MAX 16
+#define PK_SEED_BYTES_MAX 16
+#define SK_SEED_BYTES_MAX 40
+#define SALT_BYTES_MAX 40
+#define DIGEST_BYTES_MAX 64
+#define O_BYTES_MAX 726
+#define V_BYTES_MAX 61
+#define R_BYTES_MAX 72
+#define P1_BYTES_MAX 472384
+#define P2_BYTES_MAX 92928
+#define P3_BYTES_MAX 5472 // MAYO_2_P3_bytes
+#define SIG_BYTES_MAX 838
+#define CPK_BYTES_MAX 5488 // MAYO_2_cpk_bytes
+#define CSK_BYTES_MAX 40
+#define ESK_BYTES_MAX 566078
+#define EPK_BYTES_MAX 570304
+#define N_MINUS_O_MAX 121
+#define M_BYTES_MAX 64
+#elif defined(MAYO_VARIANT) // single-variant build: every limit is that variant's own constant
+#define M_MAX PARAM_NAME(m)
+#define N_MAX PARAM_NAME(n)
+#define O_MAX PARAM_NAME(o)
+#define V_MAX PARAM_NAME(v)
+#define K_MAX PARAM_NAME(k)
+#define Q_MAX PARAM_NAME(q)
+#define M_BYTES_MAX PARAM_NAME(m_bytes)
+#define O_BYTES_MAX PARAM_NAME(O_bytes)
+#define V_BYTES_MAX PARAM_NAME(v_bytes)
+#define R_BYTES_MAX PARAM_NAME(r_bytes)
+#define P1_BYTES_MAX PARAM_NAME(P1_bytes)
+#define P2_BYTES_MAX PARAM_NAME(P2_bytes)
+#define P3_BYTES_MAX PARAM_NAME(P3_bytes)
+#define SIG_BYTES_MAX PARAM_NAME(sig_bytes)
+#define CSK_BYTES_MAX PARAM_NAME(csk_bytes)
+#define ESK_BYTES_MAX PARAM_NAME(esk_bytes)
+#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes)
+#define EPK_BYTES_MAX PARAM_NAME(epk_bytes)
+#define N_MINUS_O_MAX (N_MAX - O_MAX)
+#define SALT_BYTES_MAX PARAM_NAME(salt_bytes)
+#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes)
+#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes)
+#define SK_SEED_BYTES_MAX PARAM_NAME(sk_seed_bytes) // taken from its own constant (was aliased to SALT_BYTES_MAX; equal for all four sets)
+#else
+#error "Parameter not specified"
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC // run-time selection: read each value from the mayo_params_t that p points to
+#define PARAM_name(p) ((p)->name)
+#define PARAM_m(p) ((p)->m)
+#define PARAM_n(p) ((p)->n)
+#define PARAM_o(p) ((p)->o)
+#define PARAM_v(p) ((p)->n - (p)->o)
+#define PARAM_A_cols(p) ((p)->k * (p)->o + 1)
+#define PARAM_k(p) ((p)->k)
+#define PARAM_q(p) ((p)->q)
+#define PARAM_m_bytes(p) ((p)->m_bytes)
+#define PARAM_O_bytes(p) ((p)->O_bytes)
+#define PARAM_v_bytes(p) ((p)->v_bytes)
+#define PARAM_r_bytes(p) ((p)->r_bytes)
+#define PARAM_P1_bytes(p) ((p)->P1_bytes)
+#define PARAM_P2_bytes(p) ((p)->P2_bytes)
+#define PARAM_P3_bytes(p) ((p)->P3_bytes)
+#define PARAM_csk_bytes(p) ((p)->csk_bytes)
+#define PARAM_esk_bytes(p) ((p)->esk_bytes)
+#define PARAM_cpk_bytes(p) ((p)->cpk_bytes)
+#define PARAM_epk_bytes(p) ((p)->epk_bytes)
+#define PARAM_sig_bytes(p) ((p)->sig_bytes)
+#define PARAM_f_tail(p) ((p)->f_tail)
+#define PARAM_salt_bytes(p) ((p)->salt_bytes)
+#define PARAM_sk_seed_bytes(p) ((p)->sk_seed_bytes)
+#define PARAM_digest_bytes(p) ((p)->digest_bytes)
+#define PARAM_pk_seed_bytes(p) ((p)->pk_seed_bytes)
+#elif defined(MAYO_VARIANT) // compile-time selection: p is ignored and each accessor is a constant of MAYO_VARIANT
+#define PARAM_name(p) PARAM_NAME(name)
+#define PARAM_m(p) PARAM_NAME(m)
+#define PARAM_n(p) PARAM_NAME(n)
+#define PARAM_o(p) PARAM_NAME(o)
+#define PARAM_v(p) PARAM_NAME(v)
+#define PARAM_A_cols(p) PARAM_NAME(A_cols)
+#define PARAM_k(p) PARAM_NAME(k)
+#define PARAM_q(p) PARAM_NAME(q)
+#define PARAM_m_bytes(p) PARAM_NAME(m_bytes)
+#define PARAM_O_bytes(p) PARAM_NAME(O_bytes)
+#define PARAM_v_bytes(p) PARAM_NAME(v_bytes)
+#define PARAM_r_bytes(p) PARAM_NAME(r_bytes)
+#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes)
+#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes)
+#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes)
+#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes)
+#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes)
+#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes)
+#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes)
+#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes)
+static const unsigned char f_tail[] = PARAM_NAME(f_tail); // static in a header: every including translation unit gets its own copy
+#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes)
+#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes)
+#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes)
+#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes)
+#define PARAM_f_tail(p) f_tail
+#else
+#error "Parameter not specified"
+#endif
+
+/**
+ * Run-time description of one MAYO parameter set; read through the PARAM_*(p) macros when ENABLE_PARAMS_DYNAMIC is defined.
+ */
+typedef struct {
+    int m;
+    int n;
+    int o;
+    int k;
+    int q;
+    const unsigned char *f_tail; // points at an F_TAIL_LEN-entry coefficient array (f_tail_64/96/128 in params.c)
+    int m_bytes;
+    int O_bytes;
+    int v_bytes;
+    int r_bytes;
+    int R_bytes; // has no PARAM_* accessor and is not set by MAYO_GEN_PARAMS in params.c
+    int P1_bytes;
+    int P2_bytes;
+    int P3_bytes;
+    int csk_bytes;
+    int esk_bytes;
+    int cpk_bytes;
+    int epk_bytes;
+    int sig_bytes;
+    int salt_bytes;
+    int sk_seed_bytes;
+    int digest_bytes;
+    int pk_seed_bytes;
+    const char *name;
+} mayo_params_t;
+
+typedef struct sk_t { // output type of mayo_expand_sk()
+    uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; // P1_BYTES_MAX + P2_BYTES_MAX bytes, held as 64-bit words
+    uint8_t o[O_BYTES_MAX]; // O_BYTES_MAX bytes (presumably the packed matrix O - confirm in mayo.c)
+} sk_t;
+
+/**
+ * MAYO parameter sets (objects defined in params.c; only available with ENABLE_PARAMS_DYNAMIC)
+ */
+#ifdef ENABLE_PARAMS_DYNAMIC
+extern const mayo_params_t MAYO_1;
+extern const mayo_params_t MAYO_2;
+extern const mayo_params_t MAYO_3;
+extern const mayo_params_t MAYO_5;
+#endif
+
+/**
+ * Status codes returned by the mayo_* functions declared below
+ */
+#define MAYO_OK 0 // success (for mayo_verify: signature accepted)
+#define MAYO_ERR 1 // failure (for mayo_verify: signature rejected)
+
+/**
+ * Mayo keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold pk and sk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] pk Mayo public key
+ * @param[out] sk Mayo secret key
+ * @return int status code (presumably MAYO_OK or MAYO_ERR; confirm in the implementation)
+ */
+#define mayo_keypair MAYO_NAMESPACE(mayo_keypair)
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);
+
+#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature) // undocumented here; same parameter shape as mayo_sign but writes to sig/siglen
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk); // presumably emits the signature alone, without the message appended - TODO confirm in mayo.c
+
+/**
+ * MAYO signature generation.
+ *
+ * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
+ * The key provided is a compacted secret key.
+ * The caller is responsible for allocating sufficient memory to hold sm.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code
+ */
+#define mayo_sign MAYO_NAMESPACE(mayo_sign)
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+/**
+ * Mayo open signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeds, the original message is stored in m.
+ * The key provided is a compact public key.
+ * The caller is responsible for allocating sufficient memory to hold m.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code
+ */
+#define mayo_open MAYO_NAMESPACE(mayo_open)
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk);
+
+/**
+ * Mayo compact keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold cpk and csk.
+ *
+ * Outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
+ * cpk are compact representations of a Mayo secret key and public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] cpk Mayo compacted public key
+ * @param[out] csk Mayo compacted secret key
+ * @return int status code
+ */
+#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk);
+
+/**
+ * Mayo expand public key.
+ *
+ * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold epk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] cpk Compacted public key.
+ * @param[out] epk Expanded public key.
+ * @return int return code (compared against MAYO_OK by the verification code)
+ */
+#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *epk);
+
+/**
+ * Mayo expand secret key.
+ *
+ * The implementation corresponds to Mayo.expandSK() in the Mayo spec.
+ * The caller is responsible for providing the sk_t object that esk points to.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] csk Compacted secret key.
+ * @param[out] esk Expanded secret key.
+ * @return int return code
+ */
+#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk)
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *esk);
+
+/**
+ * Mayo verify signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1.
+ * The key provided is a compact public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] m Message whose signature is checked
+ * @param[in] mlen Length of m
+ * @param[in] sig Signature
+ * @param[in] pk Compacted public key
+ * @return int 0 if verification succeeded, 1 otherwise.
+ */
+#define mayo_verify MAYO_NAMESPACE(mayo_verify)
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *pk);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/mem.h b/src/sig/mayo/pqmayo_mayo_5_avx2/mem.h
new file mode 100644
index 0000000000..2aa2196e2e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/mem.h
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MEM_H
+#define MEM_H
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) || defined(__clang__) // byte-order reversal via compiler builtins
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else // portable shift-and-mask fallback; NOTE: these forms evaluate their argument more than once
+#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32))
+#endif
+
+// a > b -> b - a is negative; the difference is taken in 64 bits so it cannot overflow for any int inputs
+// returns 0xFFFFFFFF if true, 0x00000000 if false (branch-free; relies on arithmetic right shift of negative values)
+static inline uint32_t ct_is_greater_than(int a, int b) {
+    int64_t diff = ((int64_t) b) - ((int64_t) a);
+    return (uint32_t) (diff >> (8*sizeof(uint64_t)-1));
+}
+
+// Branch-free a > b test with a 64-bit mask result: the sign bit of (b - a) is smeared across the word.
+// returns 0xFFFFFFFFFFFFFFFF if a > b, 0x0000000000000000 otherwise
+static inline uint64_t ct_64_is_greater_than(int a, int b) {
+    const int64_t delta = (int64_t) b - (int64_t) a;
+    return (uint64_t) (delta >> 63);
+}
+
+// if a == b -> 0x00000000, else 0xFFFFFFFF (XOR is taken as unsigned and widened before negation, so negative operands such as a = -1, b = 0 are handled)
+static inline uint32_t ct_compare_32(int a, int b) {
+    return (uint32_t)((-(int64_t)((uint32_t)a ^ (uint32_t)b)) >> (8*sizeof(uint64_t)-1));
+}
+
+// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF (XOR is taken as unsigned so a negative a ^ b no longer yields 0)
+static inline uint64_t ct_compare_64(int a, int b) {
+    return (uint64_t)((-(int64_t)((uint32_t)a ^ (uint32_t)b)) >> (8*sizeof(uint64_t)-1));
+}
+
+// Byte-wide inequality mask, branch-free: 0x00 when a == b, 0xFF when they differ.
+static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+    return (unsigned char)(((int32_t)0 - (int32_t)(a ^ b)) >> 31);
+}
+
+#include <oqs/common.h>
+/**
+ * Clears and frees allocated memory.
+ * Forwards both arguments unchanged to liboqs' OQS_MEM_secure_free().
+ * @param[out] mem Memory to be cleared and freed.
+ * @param size Size of memory to be cleared and freed.
+ */
+static inline void mayo_secure_free(void *mem, size_t size) {
+    OQS_MEM_secure_free(mem, size);
+}
+
+/**
+ * Clears memory.
+ * Forwards both arguments unchanged to liboqs' OQS_MEM_cleanse(); nothing is freed.
+ * @param[out] mem Memory to be cleared.
+ * @param size Size of memory to be cleared.
+ */
+static inline void mayo_secure_clear(void *mem, size_t size) {
+    OQS_MEM_cleanse(mem, size);
+}
+
+#endif
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/params.c b/src/sig/mayo/pqmayo_mayo_5_avx2/params.c
new file mode 100644
index 0000000000..043d4cedc1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/params.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+static const unsigned char f_tail_64[] = F_TAIL_64;
+static const unsigned char f_tail_96[] = F_TAIL_96;
+static const unsigned char f_tail_128[] = F_TAIL_128;
+// Defines the global parameter object nm from the nm_* macros in mayo.h. R_bytes is not listed and stays zero. No trailing ';' in the macro: each use supplies it.
+#define MAYO_GEN_PARAMS(nm) \
+  const mayo_params_t nm = { \
+    .m = PARAM_JOIN2(nm, m), \
+    .n = PARAM_JOIN2(nm, n), \
+    .o = PARAM_JOIN2(nm, o), \
+    .k = PARAM_JOIN2(nm, k), \
+    .q = PARAM_JOIN2(nm, q), \
+    .f_tail = PARAM_JOIN2(nm, f_tail_arr), \
+    .m_bytes = PARAM_JOIN2(nm, m_bytes), \
+    .O_bytes = PARAM_JOIN2(nm, O_bytes), \
+    .v_bytes = PARAM_JOIN2(nm, v_bytes), \
+    .r_bytes = PARAM_JOIN2(nm, r_bytes), \
+    .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \
+    .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \
+    .P3_bytes = PARAM_JOIN2(nm, P3_bytes), \
+    .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \
+    .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \
+    .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \
+    .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \
+    .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \
+    .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \
+    .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \
+    .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \
+    .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \
+    .name = #nm \
+  }
+
+MAYO_GEN_PARAMS(MAYO_1);
+MAYO_GEN_PARAMS(MAYO_2);
+MAYO_GEN_PARAMS(MAYO_3);
+MAYO_GEN_PARAMS(MAYO_5);
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_128.h b/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_128.h
new file mode 100644
index 0000000000..27b416adce
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_128.h
@@ -0,0 +1,524 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_128_H
+#define SHUFFLE_ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+
+// XOR-accumulates P1*O into _acc -> P1: v x v (only columns c >= r of each row r are read), O: v x o (passed as the tables O_multabs)
+static 
+inline void mayo_5_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){
+    // Every P1 entry and every acc entry spans two consecutive __m256i vectors.
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): acc is dereferenced directly below, which presumably needs _acc 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // index of the next unread __m256i of P1; advances by two per (r, c) entry
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            cols_used ++;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+            // one table vector O_multabs[O_MAX/2*c + k/2] is applied per pair of output columns (k, k+1)
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); 
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator (acc row r: vectors 2k, 2k+1 for column k and 2k+2, 2k+3 for column k+1)
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            acc[(2*r*O_MAX) + 2*k]     ^= temp[2*k]     ^ _mm256_slli_epi16(t0,4);
+            acc[(2*r*O_MAX) + 2*k + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+            acc[(2*r*O_MAX) + 2*k + 2] ^= temp[2*k + 1] ^ t0;
+            acc[(2*r*O_MAX) + 2*k + 3] ^= temp[2*k + 3] ^ t1;
+        }
+    }
+}
+
+static 
+inline void mayo_5_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){ // XOR-accumulates O^T * P1O_P2 into _acc; P1O_P2 is read as v x o, acc is indexed as o x o
+    const __m256i *P1O_P2 = (__m256i *) _P1O_P2;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, which presumably needs _acc 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column c of P1O_P2 and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // tables are selected by the row index r here, so O is applied transposed
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator (output rows k and k+1, column c)
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c + 1]  ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(k*O_MAX) + 2*c]      ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*O_MAX) + 2*c]     ^= temp[2*k+1] ^ t0;
+            acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k + 3] ^ t1;
+        }
+    }
+}
+
+// XOR-accumulates (P1 + P1^T)*O into _acc: each row r combines the entries (c, r) with c < r and (r, c) with c > r; the diagonal entry is skipped
+static
+inline void mayo_5_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){
+
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs); // fills the tables from O; helper is defined outside this file section
+
+    size_t cols_used = 0; // counts upper-triangle entries consumed so far (each entry is two __m256i)
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        cols_used += 1; // step over the diagonal entry (r, r)
+        size_t pos = r; // entry index of (0, r) in the row-by-row upper-triangle layout
+        for (size_t c = 0; c < r; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*pos);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*pos + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            pos += (V_MAX -c - 1); // move from entry (c, r) to entry (c+1, r)
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        for (size_t c = r+1; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k    ));
+            __m256i acc1 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 1));
+            __m256i acc2 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 2));
+            __m256i acc3 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 3));
+            // unlike the sibling routines, acc is read and written with unaligned load/store intrinsics here
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k),     acc0 ^ temp[2*k] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 2), acc2 ^ temp[2*k+1] ^ t0);
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 1), acc1 ^ temp[2*k+2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 3), acc3 ^ temp[2*k+3] ^ t1);
+        }
+    }
+}
+
+// XOR-accumulates V^T * L into _acc -> L is read as v x o, V is passed as the tables V_multabs, acc is indexed as k x o
+static 
+inline void mayo_5_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){
+
+    const __m256i *L = (__m256i *) _L;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, which presumably needs _acc 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // declared at function scope because the K_MAX-odd tail reuses its final value
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column c of L and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // K_OVER_2 comes from outside this section (presumably ceil(K_MAX/2) - confirm)
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator (output rows k and k+1, column c)
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c]     ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c + 1]     ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the last row has no partner, so only its own two vectors are written
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*k*O_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+        acc[2*k*O_MAX + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+// XOR-accumulates P1 * V^T into _acc -> P1: v x v (only columns c >= r of each row r are read), V passed as the tables V_multabs, acc indexed as v x k
+static 
+inline void mayo_5_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
+    size_t k,c;
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, which presumably needs _acc 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // counts upper-triangle entries consumed so far (each entry is two __m256i)
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator (acc row r, columns k and k+1)
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the last column has no partner, so only its own two vectors are written
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+static 
+inline void mayo_5_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){
+    // XOR-accumulates V^T * Pv into _acc: Pv is read as v x k, V is passed as the tables V_multabs, acc is indexed as k x k.
+    const __m256i *Pv = (__m256i *) _Pv;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, which presumably needs _acc 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // declared at function scope because the K_MAX-odd tail reuses its final value
+
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one column c of Pv and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator (output rows k and k+1, column c)
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the last row has no partner, so only its own two vectors are written
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*k*K_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*k*K_MAX + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular, packed row by row), S1: v x k, P2: v x o, S2: o x k; result is XORed into acc (v x k)
+static 
+inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k has function scope because the odd-K_MAX tail reuses its final loop value
+    const __m256i *P1 = (__m256i *) _P1;
+    const __m256i *P2 = (__m256i *) _P2;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // every entry is two consecutive __m256i; P1/P2 are read with unaligned loads, acc is dereferenced as __m256i and must be aligned for that type
+    size_t P1_cols_used = 0; // running index of the next packed P1 entry
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // row r: XOR all table lookups into a temporary layout (per table: 2 halves x low/high input nibble = 4 vectors)
+        __m256i temp[K_OVER_2*2*2] = {0};
+
+
+        // P1 * S1 (only columns c >= r are stored for row r)
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*P1_cols_used); // first half of the entry
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte, moved to bits 0..3
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*P1_cols_used + 1); // second half of the entry
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            P1_cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++) // here k counts tables, K_OVER_2 of them per S1 row c
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // P2 * S2 (P2 is a full V_MAX x O_MAX grid), accumulated into the same temp
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // nibble transpose of each temp pair (A, B): column k gets {low: A.low, high: B.low}, column k+1 gets {low: A.high, high: B.high}; XOR both into acc
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; // B.low ^ A.high, per byte
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; // same for the second half
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the loop above stops at k == K_MAX-1, so only column k of the last pair is written
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+// P3*S2 -> P3: o x o (upper triangular, packed row by row), S2: o x k; result is XORed into acc (o x k)
+static 
+inline void mayo_5_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k has function scope because the odd-K_MAX tail reuses its final loop value
+    const __m256i *P3 = (__m256i *) _P3;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // every entry is two consecutive __m256i; P3 is read with unaligned loads, acc is dereferenced as __m256i and must be aligned for that type
+    size_t cols_used = 0; // running index of the next packed P3 entry
+    for (size_t r = 0; r < O_MAX; r++)
+    {
+        // row r: XOR all table lookups into a temporary layout (per table: 2 halves x low/high input nibble = 4 vectors)
+        __m256i temp[K_OVER_2*2*2] = {0};
+
+        for (c=r; c < O_MAX; c++) // only columns c >= r are stored for row r
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P3 + 2*cols_used); // first half of the entry
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte, moved to bits 0..3
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P3 + 2*cols_used + 1); // second half of the entry
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++) // here k counts tables, K_OVER_2 of them per S2 row c
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // nibble transpose of each temp pair (A, B): column k gets {low: A.low, high: B.low}, column k+1 gets {low: A.high, high: B.high}; XOR both into acc
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; // B.low ^ A.high, per byte
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; // same for the second half
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the loop above stops at k == K_MAX-1, so only column k of the last pair is written
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+static
+inline void mayo_5_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){ // S1^T * PS1, XORed into _acc
+    mayo_5_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc); // pure forwarder: same kernel, with PS1 in the role of Pv and S1's tables in the role of V's
+}
+
+static
+inline void mayo_5_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
+    const __m256i *PS2 = (__m256i *) _PS2;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // function scope on purpose: the odd-K_MAX tail below reuses the value left by the conversion loop
+    // For all k, c < K_MAX: acc[k][c] ^= XOR over r < O_MAX of the S2_multabs table lookups applied to PS2[r][c]. Every entry is two consecutive __m256i; PS2 is read with unaligned loads, acc is dereferenced as __m256i and must be aligned for that type.
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // column c of PS2: XOR the lookups of all O_MAX rows into a temporary layout (per table: 2 halves x low/high input nibble = 4 vectors)
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < O_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c); // first half of entry (r, c)
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte, moved to bits 0..3
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c + 1); // second half of entry (r, c)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++) // here k counts tables, K_OVER_2 of them per row r
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // nibble transpose of each temp pair (A, B): row k gets {low: A.low, high: B.low}, row k+1 gets {low: A.high, high: B.high}; XOR both into acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; // B.low ^ A.high, per byte
+            acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; // same for the second half
+            acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the loop above stops at k == K_MAX-1, so only row k of the last pair is written
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_64.h b/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_64.h
new file mode 100644
index 0000000000..defff86f8f
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_64.h
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_64_H
+#define SHUFFLE_ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+// P1*O -> P1: v x v (upper triangular, packed row by row), O: v x o; result is XORed into acc (v x o)
+static 
+inline void mayo_12_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){
+    // every entry is a single __m256i; P1 is read with unaligned loads, acc is dereferenced as __m256i and must be aligned for that type
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // running index of the next packed P1 entry
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // row r: XOR all table lookups into a temporary layout (per table: one vector for the low and one for the high input nibbles)
+        __m256i temp[O_MAX] = {0};
+        for (size_t c = r; c < V_MAX; c++) // only columns c >= r are stored for row r
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved to bits 0..3
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2) // steps by 2 and touches temp[k+1]: relies on O_MAX being even
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+            }            
+        }
+
+        // nibble transpose of each temp pair (A, B): column k gets {low: A.low, high: B.low}, column k+1 gets {low: A.high, high: B.high}; XOR both into acc
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // B.low ^ A.high, per byte
+            acc[(r*O_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*O_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+    }
+}
+
+
+static 
+inline void mayo_12_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){
+    // For all k, c < O_MAX: acc[k][c] ^= XOR over r < V_MAX of the O_multabs table lookups applied to P1O_P2[r][c]. Every entry is a single __m256i; the input is read with unaligned loads, acc is dereferenced as __m256i and must be aligned for that type.
+    const __m256i *P1O_P2 = (__m256i *) _P1O_P2;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // column c of the input: XOR the lookups of all V_MAX rows into a temporary layout (per table: low-nibble vector, high-nibble vector)
+        __m256i temp[O_MAX] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1O_P2 + r*O_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved to bits 0..3
+            in_odd &= low_nibble_mask; // low nibble of every byte
+
+            for (size_t k = 0; k < O_MAX; k+=2) // steps by 2 and touches temp[k+1]: relies on O_MAX being even
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even);
+            }            
+        }
+
+        // nibble transpose of each temp pair (A, B): row k gets {low: A.low, high: B.low}, row k+1 gets {low: A.high, high: B.high}; XOR both into acc
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // B.low ^ A.high, per byte
+            acc[(k*O_MAX) + c]     ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t;
+        }
+    }
+}
+
+static
+inline void mayo_12_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){
+    // acc[r][k] ^= XOR over all c != r of lookup(O tables of row c, P1 entry at (min(r,c), max(r,c))), i.e. the packed upper-triangular P1 is read as if mirrored across a skipped diagonal. Every entry is a single __m256i; P1 and acc are both accessed with unaligned loads/stores.
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    __m256i O_multabs[O_MAX/2*V_MAX]; // stack-allocated lookup tables, O_MAX/2 per row of O
+    mayo_O_multabs_avx2(O, O_multabs); // table builder defined elsewhere; fills O_multabs from the raw O bytes
+
+    size_t cols_used = 0; // running index into packed P1; equals the start of row r at the top of each iteration
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // row r: XOR all table lookups into a temporary layout (per table: low-nibble vector, high-nibble vector)
+        __m256i temp[O_MAX] = {0};
+        cols_used += 1; // skip the diagonal entry (r, r)
+        size_t pos = r; // packed index of entry (0, r)
+        for (size_t c = 0; c < r; c++) // below-diagonal part of row r, taken from column r of the earlier rows c
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + pos);
+            pos += (V_MAX -c - 1); // step from entry (c, r) to entry (c+1, r): packed row c holds V_MAX-c entries
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved to bits 0..3
+            in_odd &= low_nibble_mask; // low nibble of every byte
+
+            for (size_t k = 0; k < O_MAX; k+=2) // steps by 2 and touches temp[k+1]: relies on O_MAX being even
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+            }            
+        }
+
+        for (size_t c = r+1; c < V_MAX; c++) // above-diagonal part of row r, stored contiguously
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+            in_odd &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+            }            
+        }
+
+        for (size_t k = 0; k < O_MAX; k+=2) // nibble transpose of each temp pair, then XOR into acc
+        {
+            __m256i acc0 = _mm256_loadu_si256(acc + (r*O_MAX + k    ));
+            __m256i acc1 = _mm256_loadu_si256(acc + (r*O_MAX + k + 1));
+
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // temp[k+1].low ^ temp[k].high, per byte
+
+            _mm256_storeu_si256(acc + (r*O_MAX + k    ), acc0 ^ temp[k  ] ^ _mm256_slli_epi16(t,4)); // {low: temp[k].low, high: temp[k+1].low}
+            _mm256_storeu_si256(acc + (r*O_MAX + k + 1), acc1 ^ temp[k+1] ^ t); // {low: temp[k].high, high: temp[k+1].high}
+        }
+    }
+}
+
+
+static 
+inline void mayo_12_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){
+    // For k < K_MAX, c < O_MAX: acc[k][c] ^= XOR over r < V_MAX of the V_multabs table lookups applied to L[r][c]. Every entry is a single __m256i; L is read with unaligned loads, acc is dereferenced as __m256i and must be aligned for that type.
+    const __m256i *L = (__m256i *) _L;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // function scope on purpose: the odd-K_MAX tail below reuses the value left by the conversion loop
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // column c of L: XOR the lookups of all V_MAX rows into a temporary layout (per table: low-nibble vector, high-nibble vector)
+        __m256i temp[K_OVER_2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(L + r*O_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved to bits 0..3
+            in_odd &= low_nibble_mask; // low nibble of every byte
+
+            for (k = 0; k < K_OVER_2; k++) // here k counts tables, K_OVER_2 of them per row r
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even);
+            }            
+        }
+
+        // nibble transpose of each temp pair (A, B): row k gets {low: A.low, high: B.low}, row k+1 gets {low: A.high, high: B.high}; XOR both into acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // B.low ^ A.high, per byte
+            acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the loop above stops at k == K_MAX-1, so only row k of the last pair is written
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+        acc[k*O_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+
+static 
+inline void mayo_12_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){
+    // For all k, c < K_MAX: acc[k][c] ^= XOR over r < V_MAX of the V_multabs table lookups applied to Pv[r][c]. Every entry is a single __m256i; Pv is read with unaligned loads, acc is dereferenced as __m256i and must be aligned for that type.
+    const __m256i *Pv = (__m256i *) _Pv;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // function scope on purpose: the odd-K_MAX tail below reuses the value left by the conversion loop
+
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // column c of Pv: XOR the lookups of all V_MAX rows into a temporary layout (per table: low-nibble vector, high-nibble vector)
+        __m256i temp[K_OVER_2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(Pv + r*K_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved to bits 0..3
+            in_odd &= low_nibble_mask; // low nibble of every byte
+
+            for (k = 0; k < K_OVER_2; k++) // here k counts tables, K_OVER_2 of them per row r
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even);
+            }            
+        }
+
+        // nibble transpose of each temp pair (A, B): row k gets {low: A.low, high: B.low}, row k+1 gets {low: A.high, high: B.high}; XOR both into acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // B.low ^ A.high, per byte
+            acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the loop above stops at k == K_MAX-1, so only row k of the last pair is written
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+        acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+static 
+inline void mayo_12_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
+    size_t k,c; // k has function scope because the odd-K_MAX tail reuses its final loop value
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // For r < V_MAX, k < K_MAX: acc[r][k] ^= XOR over c >= r of the V_multabs table lookups (tables of row c) applied to the packed upper-triangular P1 entry (r, c). Every entry is a single __m256i; P1 is read with unaligned loads, acc is dereferenced as __m256i and must be aligned for that type.
+    size_t cols_used = 0; // running index of the next packed P1 entry
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // row r: XOR all table lookups into a temporary layout (per table: low-nibble vector, high-nibble vector)
+        __m256i temp[K_OVER_2*2] = {0};
+
+        for (c=r; c < V_MAX; c++) // only columns c >= r are stored for row r
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved to bits 0..3
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++) // here k counts tables, K_OVER_2 of them per row c
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even);
+            }            
+        }
+
+        // nibble transpose of each temp pair (A, B): column k gets {low: A.low, high: B.low}, column k+1 gets {low: A.high, high: B.high}; XOR both into acc
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // B.low ^ A.high, per byte
+            acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the loop above stops at k == K_MAX-1, so only column k of the last pair is written
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+        acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+// P1*S1 -> P1: v x v (upper triangular, packed), S1: v x k; result is XORed into _acc
+// Pure forwarder to mayo_12_P1_times_Vt_avx2, with S1's lookup tables passed in place of V's.
+static
+inline void mayo_12_P1_times_S1_avx2(const uint64_t *_P1, __m256i *S1_multabs, uint64_t *_acc){
+    mayo_12_P1_times_Vt_avx2(_P1, S1_multabs, _acc);
+}
+
+static
+inline void mayo_12_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){ // S1^T * PS1, XORed into _acc
+    mayo_12_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc); // pure forwarder: same kernel, with PS1 in the role of Pv and S1's tables in the role of V's
+}
+
+static
+inline void mayo_12_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
+    const __m256i *PS2 = (__m256i *) _PS2;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // function scope on purpose: the odd-K_MAX tail below reuses the value left by the conversion loop
+    // For all k, c < K_MAX: acc[k][c] ^= XOR over r < O_MAX of the S2_multabs table lookups applied to PS2[r][c]. Every entry is a single __m256i; PS2 is read with unaligned loads, acc is dereferenced as __m256i and must be aligned for that type.
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // column c of PS2: XOR the lookups of all O_MAX rows into a temporary layout (per table: low-nibble vector, high-nibble vector)
+        __m256i temp[K_OVER_2*2] = {0};
+        for (size_t r = 0; r < O_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(PS2 + r*K_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved to bits 0..3
+            in_odd &= low_nibble_mask; // low nibble of every byte
+
+            for (k = 0; k < K_OVER_2; k++) // here k counts tables, K_OVER_2 of them per row r
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even);
+            }            
+        }
+
+        // nibble transpose of each temp pair (A, B): row k gets {low: A.low, high: B.low}, row k+1 gets {low: A.high, high: B.high}; XOR both into acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // B.low ^ A.high, per byte
+            acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the loop above stops at k == K_MAX-1, so only row k of the last pair is written
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+        acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+
+// P2*S2 -> P2: v x o (full grid, row-major), S2: o x k; result is XORed into acc (v x k)
+static 
+inline void mayo_12_P2_times_S2_avx2(const uint64_t *_P2, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k has function scope because the odd-K_MAX tail reuses its final loop value
+    const __m256i *P2 = (__m256i *) _P2;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // every entry is a single __m256i; P2 is read with unaligned loads, acc is dereferenced as __m256i and must be aligned for that type
+    size_t cols_used = 0; // running index into P2; equals r*O_MAX + c because every row consumes exactly O_MAX entries
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // row r: XOR all table lookups into a temporary layout (per table: low-nibble vector, high-nibble vector)
+        __m256i temp[K_OVER_2*2] = {0};
+
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P2 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved to bits 0..3
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++) // here k counts tables, K_OVER_2 of them per S2 row c
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+            }            
+        }
+
+        // nibble transpose of each temp pair (A, B): column k gets {low: A.low, high: B.low}, column k+1 gets {low: A.high, high: B.high}; XOR both into acc
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // B.low ^ A.high, per byte
+            acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the loop above stops at k == K_MAX-1, so only column k of the last pair is written
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+        acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular, packed row by row), S1: v x k, P2: v x o, S2: o x k; result is XORed into acc (v x k)
+static 
+inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k has function scope because the odd-K_MAX tail reuses its final loop value
+    const __m256i *P1 = (__m256i *) _P1;
+    const __m256i *P2 = (__m256i *) _P2;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // every entry is a single __m256i; P1/P2 are read with unaligned loads, acc is dereferenced as __m256i and must be aligned for that type
+    size_t P1_cols_used = 0; // running index of the next packed P1 entry
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // row r: XOR all table lookups into a temporary layout (per table: low-nibble vector, high-nibble vector)
+        __m256i temp[K_OVER_2*2] = {0};
+
+
+        // P1 * S1 (only columns c >= r are stored for row r)
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + P1_cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved to bits 0..3
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            P1_cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++) // here k counts tables, K_OVER_2 of them per S1 row c
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even);
+            }            
+        }
+
+        // P2 * S2 (P2 is a full V_MAX x O_MAX grid), accumulated into the same temp
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P2 + r*O_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+            in_odd &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+            }            
+        }
+
+        // nibble transpose of each temp pair (A, B): column k gets {low: A.low, high: B.low}, column k+1 gets {low: A.high, high: B.high}; XOR both into acc
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // B.low ^ A.high, per byte
+            acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the loop above stops at k == K_MAX-1, so only column k of the last pair is written
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+        acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+// P3*S2 -> P3: o x o (upper triangular, packed row by row), S2: o x k; result is XORed into acc (o x k)
+static 
+inline void mayo_12_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k has function scope because the odd-K_MAX tail reuses its final loop value
+    const __m256i *P3 = (__m256i *) _P3;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // every entry is a single __m256i; P3 is read with unaligned loads, acc is dereferenced as __m256i and must be aligned for that type
+    size_t cols_used = 0; // running index of the next packed P3 entry
+    for (size_t r = 0; r < O_MAX; r++)
+    {
+        // row r: XOR all table lookups into a temporary layout (per table: low-nibble vector, high-nibble vector)
+        __m256i temp[K_OVER_2*2] = {0};
+
+        for (c=r; c < O_MAX; c++) // only columns c >= r are stored for row r
+        {
+            __m256i in_odd = _mm256_loadu_si256(P3 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved to bits 0..3
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++) // here k counts tables, K_OVER_2 of them per S2 row c
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+            }            
+        }
+
+        // nibble transpose of each temp pair (A, B): column k gets {low: A.low, high: B.low}, column k+1 gets {low: A.high, high: B.high}; XOR both into acc
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // B.low ^ A.high, per byte
+            acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the loop above stops at k == K_MAX-1, so only column k of the last pair is written
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+        acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+
+static inline
+void mayo12_m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+    // Packs the top-left O_MAX x O_MAX part of a grid with row stride `size` into upper-triangular form, row by row: out gets in[r][r] on the diagonal and in[r][c] ^ in[c][r] for c > r. Entries are m_legs*2 uint64_t apart but exactly one 256-bit vector is moved per entry (so presumably m_legs == 2 -- TODO confirm). Unaligned loads/stores are used, so neither buffer needs 32-byte alignment.
+    int m_vecs_stored = 0; // number of entries already written to out
+
+    for (int r = 0; r < O_MAX; ++r) {
+        const __m256i* _in = (const __m256i*) (in + m_legs * 2 * (r * size + r));
+        __m256i* _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
+        _mm256_storeu_si256(_out, _mm256_loadu_si256(_in)); // diagonal entry is copied unchanged
+        m_vecs_stored++;
+        for (int c = r + 1; c < O_MAX; ++c) {
+            const __m256i* _in2 = (const __m256i*) (in + m_legs * 2 * (r * size + c));
+            const __m256i* _in3 = (const __m256i*) (in + m_legs * 2 * (c * size + r));
+            _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
+            _mm256_storeu_si256(_out, _mm256_loadu_si256(_in2) ^ _mm256_loadu_si256(_in3)); // fold the mirrored entry (c, r) into (r, c)
+            m_vecs_stored++;
+        }
+    }
+}
+
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_96.h b/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_96.h
new file mode 100644
index 0000000000..9b3a69d567
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/shuffle_arithmetic_96.h
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_96_H
+#define SHUFFLE_ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+
+// P1*O -> P1: v x v (only entries with c >= r are read), O: v x o; the v x o result is XORed into acc
+static // 96-element variant: every entry spans 6 uint64_t words, processed as two overlapping 256-bit windows (words 0-3 and words 2-5)
+inline void mayo_3_P1_times_O_avx2(const uint64_t *P1, __m256i *O_multabs, uint64_t *acc){
+    // NOTE(review): O_multabs entries serve as byte lookup tables for _mm256_shuffle_epi8 - presumably produced by mayo_O_multabs_avx2, confirm
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // cols_used counts the P1 entries consumed so far; P1 is read strictly sequentially
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // words 0-3: first 64 field elements
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // words 2-5: last 64 field elements (#32 to #95)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2) // steps by 2 and touches temp[2*k + 3], so temp[2*O_MAX] only suffices when O_MAX is even
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: column k receives the low nibbles of both temp vectors, column k+1 their high nibbles
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6)); // all four loads precede the stores: the two windows of one entry overlap in words 2-3
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask; // nibble-swap helper for window 0
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; // nibble-swap helper for window 1
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static // For every column c of the v x o input P1O_P2, combines its V_MAX entries with the tables of row r and XORs the results into acc[k][c] (acc is o x o, row-major, 6 words per entry).
+inline void mayo_3_Ot_times_P1O_P2_avx2(const uint64_t *P1O_P2, __m256i *O_multabs, uint64_t *acc){
+    // NOTE(review): O_multabs entries serve as byte lookup tables for _mm256_shuffle_epi8 - presumably produced by mayo_O_multabs_avx2, confirm
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one input column and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6)); // words 0-3: first 64 field elements
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6 + 2)); // words 2-5: last 64 field elements (#32 to #95)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (size_t k = 0; k < O_MAX; k+=2) // steps by 2 and touches temp[2*k + 3], so temp[2*O_MAX] only suffices when O_MAX is even
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: row k receives the low nibbles of both temp vectors, row k+1 their high nibbles
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); // all four loads precede the stores: the two windows of one entry overlap in words 2-3
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask; // nibble-swap helper for window 0
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; // nibble-swap helper for window 1
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static // Builds lookup tables from O, then for each row r combines every off-diagonal entry of the symmetrised P1 (entry (c,r) for c < r, entry (r,c) for c > r) with them and XORs the v x o result into acc.
+inline void mayo_3_P1P1t_times_O(const uint64_t *P1, const unsigned char *O, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // O_MAX/2 tables per row of O; filled by mayo_O_multabs_avx2 (defined outside this view)
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    // cols_used is the sequential index into the packed upper-triangular P1 (6 uint64_t words per entry)
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        cols_used ++; // skip the diagonal entry (r, r): it is never loaded
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        size_t pos = r; // packed index of entry (0, r); advanced below to (c, r) for increasing c
+        for (size_t c = 0; c < r; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6)); // words 0-3: first 64 field elements
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6 + 2)); // words 2-5: last 64 field elements (#32 to #95)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            pos += (V_MAX -c - 1); // packed row c holds V_MAX - c entries, so the same column one row down is V_MAX - c - 1 further on
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        for (size_t c = r+1; c < V_MAX; c++) // entries to the right of the diagonal are contiguous in the packed layout
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // words 0-3: first 64 field elements
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // words 2-5: last 64 field elements (#32 to #95)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: column k receives the low nibbles of both temp vectors, column k+1 their high nibbles
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6)); // all four loads precede the stores: the two windows of one entry overlap in words 2-3
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask; // nibble-swap helper for window 0
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; // nibble-swap helper for window 1
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static // For every column c of the v x o input L, combines its V_MAX entries with the tables V_multabs[K_OVER_2*r + k] and XORs the results into acc[k][c] (acc is k x o, row-major, 6 words per entry).
+inline void mayo_3_Vt_times_L_avx2(const uint64_t *L, const __m256i *V_multabs, uint64_t *acc){
+    // NOTE(review): K_OVER_2 is defined outside this view; the odd-K_MAX tail reads temp[2*K_MAX + 1], so presumably K_OVER_2 == (K_MAX + 1) / 2 - confirm
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t k; // declared outside the loops because the odd-K_MAX tail reuses its final value
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one input column and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // 4 vectors per table: {low-nibble, high-nibble} lookups x {window 0, window 1}
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6)); // words 0-3: first 64 field elements
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6 + 2)); // words 2-5: last 64 field elements (#32 to #95)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: row k receives the low nibbles of both temp vectors, row k+1 their high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2) // k here counts output rows, so temp[2*k] is the same slot as temp[4*(k/2)] above
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); // all four loads precede the stores: the two windows of one entry overlap in words 2-3
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop stopped at k == K_MAX - 1; only that row is stored because row k+1 does not exist
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static // For every row r of the packed upper-triangular P1 (entries c >= r, read sequentially), combines its entries with the tables V_multabs[K_OVER_2*c + k] and XORs the results into acc[r][k] (acc is v x k, row-major, 6 words per entry).
+inline void mayo_3_P1_times_Vt_avx2(const uint64_t *P1, __m256i *V_multabs, uint64_t *acc){
+      const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // NOTE(review): K_OVER_2 is defined outside this view; the odd-K_MAX tail reads temp[2*K_MAX + 1], so presumably K_OVER_2 == (K_MAX + 1) / 2 - confirm
+    size_t k,c; // k is declared outside the loops because the odd-K_MAX tail reuses its final value
+    size_t cols_used = 0; // number of P1 entries consumed so far
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // 4 vectors per table: {low-nibble, high-nibble} lookups x {window 0, window 1}
+        for (c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // words 0-3: first 64 field elements
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // words 2-5: last 64 field elements (#32 to #95)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: column k receives the low nibbles of both temp vectors, column k+1 their high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2) // k here counts output columns, so temp[2*k] is the same slot as temp[4*(k/2)] above
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); // all four loads precede the stores: the two windows of one entry overlap in words 2-3
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop stopped at k == K_MAX - 1; only that column is stored because column k+1 does not exist
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static // For every column c of the v x k input Pv, combines its V_MAX entries with the tables V_multabs[K_OVER_2*r + k] and XORs the results into acc[k][c] (acc is k x k, row-major, 6 words per entry).
+inline void mayo_3_Vt_times_Pv_avx2(const uint64_t *Pv, const __m256i *V_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // NOTE(review): K_OVER_2 is defined outside this view; the odd-K_MAX tail reads temp[2*K_MAX + 1], so presumably K_OVER_2 == (K_MAX + 1) / 2 - confirm
+    size_t k; // declared outside the loops because the odd-K_MAX tail reuses its final value
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one input column and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // 4 vectors per table: {low-nibble, high-nibble} lookups x {window 0, window 1}
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6)); // words 0-3: first 64 field elements
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6 + 2)); // words 2-5: last 64 field elements (#32 to #95)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: row k receives the low nibbles of both temp vectors, row k+1 their high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2) // k here counts output rows, so temp[2*k] is the same slot as temp[4*(k/2)] above
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); // all four loads precede the stores: the two windows of one entry overlap in words 2-3
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop stopped at k == K_MAX - 1; only that row is stored because row k+1 does not exist
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+// P1*S1 + P2*S2 -> P1: v x v (only entries with c >= r are read), P2: v x o, S2: o x k; the v x k result is XORed into acc
+static // Both products for one row r share the same temp accumulator, so they are converted and stored only once per row.
+inline void mayo_3_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *P1, const uint64_t *P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // NOTE(review): K_OVER_2 is defined outside this view; the odd-K_MAX tail reads temp[2*K_MAX + 1], so presumably K_OVER_2 == (K_MAX + 1) / 2 - confirm
+    size_t k,c; // k is declared outside the loops because the odd-K_MAX tail reuses its final value
+    size_t P1_cols_used = 0; // number of P1 entries consumed so far; P1 is read strictly sequentially
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        // P1 times S1
+        __m256i temp[K_OVER_2*2*2] = {0}; // 4 vectors per table: {low-nibble, high-nibble} lookups x {window 0, window 1}
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6)); // words 0-3: first 64 field elements
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6 + 2)); // words 2-5: last 64 field elements (#32 to #95)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            P1_cols_used++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // P2 times S2 (P2 is a full v x o matrix, indexed row-major)
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6)); // words 0-3: first 64 field elements
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6 + 2)); // words 2-5: last 64 field elements (#32 to #95)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: column k receives the low nibbles of both temp vectors, column k+1 their high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2) // k here counts output columns, so temp[2*k] is the same slot as temp[4*(k/2)] above
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); // all four loads precede the stores: the two windows of one entry overlap in words 2-3
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop stopped at k == K_MAX - 1; only that column is stored because column k+1 does not exist
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular (only entries with c >= r are read, sequentially); the o x k result is XORed into acc
+static // 96-element variant: every entry spans 6 uint64_t words, processed as two overlapping 256-bit windows (words 0-3 and words 2-5)
+inline void mayo_3_P3_times_S2_avx2(const uint64_t *P3, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // NOTE(review): K_OVER_2 is defined outside this view; the odd-K_MAX tail reads temp[2*K_MAX + 1], so presumably K_OVER_2 == (K_MAX + 1) / 2 - confirm
+    size_t k,c; // k is declared outside the loops because the odd-K_MAX tail reuses its final value
+    size_t cols_used = 0; // number of P3 entries consumed so far
+    for (size_t r = 0; r < O_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // 4 vectors per table: {low-nibble, high-nibble} lookups x {window 0, window 1}
+        for (c=r; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6)); // words 0-3: first 64 field elements
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6 + 2)); // words 2-5: last 64 field elements (#32 to #95)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: column k receives the low nibbles of both temp vectors, column k+1 their high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2) // k here counts output columns, so temp[2*k] is the same slot as temp[4*(k/2)] above
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); // all four loads precede the stores: the two windows of one entry overlap in words 2-3
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop stopped at k == K_MAX - 1; only that column is stored because column k+1 does not exist
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static // Thin alias: forwards its three arguments unchanged to mayo_3_Vt_times_Pv_avx2, which XORs its result into acc.
+inline void mayo_3_S1t_times_PS1_avx2(const uint64_t *PS1, __m256i *S1_multabs, uint64_t *acc){
+    mayo_3_Vt_times_Pv_avx2(PS1, S1_multabs, acc); // PS1 takes the role of Pv, S1_multabs the role of V_multabs
+}
+
+static // For every column c of the o x k input PS2, combines its O_MAX entries with the tables S2_multabs[K_OVER_2*r + k] and XORs the results into acc[k][c] (acc is k x k, row-major, 6 words per entry).
+inline void mayo_3_S2t_times_PS2_avx2(const uint64_t *PS2, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // NOTE(review): K_OVER_2 is defined outside this view; the odd-K_MAX tail reads temp[2*K_MAX + 1], so presumably K_OVER_2 == (K_MAX + 1) / 2 - confirm
+    size_t k; // declared outside the loops because the odd-K_MAX tail reuses its final value
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one input column and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // 4 vectors per table: {low-nibble, high-nibble} lookups x {window 0, window 1}
+        for (size_t r = 0; r < O_MAX; r++) // same scheme as mayo_3_Vt_times_Pv_avx2, except the input has O_MAX rows
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6)); // words 0-3: first 64 field elements
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6 + 2)); // words 2-5: last 64 field elements (#32 to #95)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: row k receives the low nibbles of both temp vectors, row k+1 their high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2) // k here counts output rows, so temp[2*k] is the same slot as temp[4*(k/2)] above
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); // all four loads precede the stores: the two windows of one entry overlap in words 2-3
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop stopped at k == K_MAX - 1; only that row is stored because row k+1 does not exist
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_avx2/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo_5_avx2/simple_arithmetic.h
new file mode 100644
index 0000000000..ce7580f4c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_avx2/simple_arithmetic.h
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SIMPLE_ARITHMETIC_H
+#define SIMPLE_ARITHMETIC_H
+
+// GF(16) multiplication mod x^4 + x + 1; only the low nibble of a is read (b is presumably < 16 as well - confirm at call sites)
+static inline unsigned char mul_f(unsigned char a, unsigned char b) {
+    // carryless multiply: XOR together b*2^i for every set bit i of a
+    unsigned char p;
+    p  = (a & 1)*b;
+    p ^= (a & 2)*b;
+    p ^= (a & 4)*b;
+    p ^= (a & 8)*b;
+
+    // reduce mod x^4 + x + 1: fold bits 4..7 back down using x^4 = x + 1
+    unsigned char top_p = p & 0xf0;
+    unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f; // ">> 4" contributes the "1" term, ">> 3" the "x" term
+    return out;
+}
+
+static inline uint64_t mul_fx8(unsigned char a, uint64_t b) { // byte-lane version of mul_f: multiplies each byte of b by a (each byte presumably holds one element < 16 - confirm)
+    // carryless multiply: XOR together b*2^i for every set bit i of a
+    uint64_t p;
+    p  = (a & 1)*b;
+    p ^= (a & 2)*b;
+    p ^= (a & 4)*b;
+    p ^= (a & 8)*b;
+
+    // reduce mod x^4 + x + 1, independently in every byte lane
+    uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0;
+    uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f; // final mask keeps only the low nibble of each byte
+    return out;
+}
+
+// GF(16) addition: in characteristic 2 this is a bitwise XOR
+static inline unsigned char add_f(unsigned char a, unsigned char b) {
+    return a ^ b;
+}
+
+// GF(16) subtraction: identical to addition (XOR) in characteristic 2
+static inline unsigned char sub_f(unsigned char a, unsigned char b) {
+    return a ^ b;
+}
+
+// GF(16) negation: every element is its own additive inverse, so a is returned unchanged
+static inline unsigned char neg_f(unsigned char a) {
+    return a;
+}
+
+static inline unsigned char inverse_f(unsigned char a) { // returns a^14 computed with mul_f only; for a == 0 the result is 0
+    // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5,
+    // 10, 4, 3, 8}; return table[a & 15];
+
+    unsigned char a2 = mul_f(a, a);
+    unsigned char a4 = mul_f(a2, a2);
+    unsigned char a8 = mul_f(a4, a4);
+    unsigned char a6 = mul_f(a2, a4);
+    unsigned char a14 = mul_f(a8, a6); // a^14 = a^-1 for nonzero a, because a^15 = 1 in GF(16)
+
+    return a14;
+}
+
+static inline unsigned char lincomb(const unsigned char *a,
+                                    const unsigned char *b, int n, int m) { // returns the GF(16) sum over i < n of a[i] * b[i*m]
+    unsigned char ret = 0;
+    for (int i = 0; i < n; ++i, b += m) { // a is read contiguously, b with a stride of m elements
+        ret = add_f(mul_f(a[i], *b), ret);
+    }
+    return ret;
+}
+
+static inline void mat_mul(const unsigned char *a, const unsigned char *b,
+                    unsigned char *c, int colrow_ab, int row_a, int col_b) { // c (row_a x col_b) = a (row_a x colrow_ab) * b (colrow_ab x col_b), all row-major
+    for (int i = 0; i < row_a; ++i, a += colrow_ab) { // a advances one full row per outer iteration
+        for (int j = 0; j < col_b; ++j, ++c) { // c is written sequentially, one entry per (i, j)
+            *c = lincomb(a, b + j, colrow_ab, col_b); // row i of a times column j of b (column stride col_b)
+        }
+    }
+}
+
+static inline void mat_add(const unsigned char *a, const unsigned char *b,
+                    unsigned char *c, int m, int n) { // c = a + b element-wise over m x n row-major matrices
+    for (int i = 0; i < m; ++i) {
+        for (int j = 0; j < n; ++j) {
+            *(c + i * n + j) = add_f(*(a + i * n + j), *(b + i * n + j)); // entry (i, j) lives at offset i*n + j
+        }
+    }
+}
+
+static
+inline void m_vec_copy(int m_legs, const uint64_t *in,
+                                 uint64_t *out) { // copies m_legs*2 64-bit words from in to out
+    for (int i = 0; i < m_legs * 2; i++) {
+        out[i] = in[i];
+    }
+}
+
+static
+inline void m_vec_add(int m_legs, const uint64_t *in,
+                                uint64_t *acc) { // XORs m_legs*2 64-bit words of in into acc
+    for (int i = 0; i < m_legs * 2; i++) {
+        acc[i] ^= in[i];
+    }
+}
+
+// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1)
+// gf16 := gf2[x]/(x^4+x+1)
+
+static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) { // multiplies each of the eight nibbles of a by b; only bits 0..3 of b are used
+    uint32_t a_msb;
+    uint32_t a32 = a;
+    uint32_t b32 = b;
+    uint32_t r32 = a32 * (b32 & 1); // bit 0 of b selects a itself
+
+    a_msb = a32 & 0x88888888; // MSB, 3rd bits
+    a32 ^= a_msb;   // clear MSB
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); // a32 = a * x per nibble: the overflowing x^4 is replaced by x + 1 (= 3)
+    r32 ^= (a32) * ((b32 >> 1) & 1); // bit 1 of b selects a * x
+
+    a_msb = a32 & 0x88888888; // MSB, 3rd bits
+    a32 ^= a_msb;   // clear MSB
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); // a32 = a * x^2 per nibble
+    r32 ^= (a32) * ((b32 >> 2) & 1); // bit 2 of b selects a * x^2
+
+    a_msb = a32 & 0x88888888; // MSB, 3rd bits
+    a32 ^= a_msb;   // clear MSB
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); // a32 = a * x^3 per nibble
+    r32 ^= (a32) * ((b32 >> 3) & 1); // bit 3 of b selects a * x^3
+
+    return r32;
+
+}
+ 
+static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) { // acc ^= a * in over m_legs*2 64-bit words
+    for(int i=0; i < m_legs*2;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], a);        // NOTE(review): gf16v_mul_u64 is not defined in the part of this header shown here - presumably the 64-bit analogue of gf16v_mul_u32 from another header; confirm include order
+    }
+}
+
+static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) { // same as m_vec_mul_add with the fixed scalar 0x2 (the field element x)
+    for(int i=0;i<m_legs*2;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x2); // NOTE(review): gf16v_mul_u64 is declared outside the visible part of this header - confirm
+    }
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/LICENSE b/src/sig/mayo/pqmayo_mayo_5_opt/LICENSE
new file mode 100644
index 0000000000..8f71f43fee
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/NOTICE b/src/sig/mayo/pqmayo_mayo_5_opt/NOTICE
new file mode 100644
index 0000000000..53da47c21c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/NOTICE
@@ -0,0 +1,13 @@
+Copyright 2023 the MAYO team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/aes_ctr.h b/src/sig/mayo/pqmayo_mayo_5_opt/aes_ctr.h
new file mode 100644
index 0000000000..8e41c30f2c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/aes_ctr.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef AESCTR_H
+#define AESCTR_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
+#define AES_ECB_encrypt AES_256_ECB
+
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+                   const unsigned char *input, size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+                      const unsigned char *input, size_t inputByteLen);
+#define AES_128_CTR AES_128_CTR_NI
+#else
+#include <aes.h>
+static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen,
+                const unsigned char *input, size_t inputByteLen) { // non-AESNI fallback: fills output via aes128ctr_prf from <aes.h>
+    (void) inputByteLen; // the length of input is not consulted in this fallback
+    uint8_t iv[12] = { 0 }; // all-zero 12-byte IV
+    aes128ctr_prf(output, outputByteLen, input, iv); // input is presumably the AES-128 key - confirm against aes128ctr_prf
+    return (int) outputByteLen; // reports the number of bytes requested, narrowed to int
+}
+#endif
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/api.c b/src/sig/mayo/pqmayo_mayo_5_opt/api.c
new file mode 100644
index 0000000000..f2e861e9c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/api.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <api.h>
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define MAYO_PARAMS &MAYO_5
+#else
+#define MAYO_PARAMS 0
+#endif
+
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+    return mayo_keypair(MAYO_PARAMS, pk, sk); // thin wrapper; MAYO_PARAMS is &MAYO_5 under ENABLE_PARAMS_DYNAMIC and 0 otherwise
+}
+
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk) {
+    return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk); // thin wrapper; the result of mayo_sign is returned unchanged
+}
+
+int
+crypto_sign_signature(unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk) {
+    return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk); // thin wrapper; the result of mayo_sign_signature is returned unchanged
+}
+
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk) {
+    return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk); // thin wrapper; the result of mayo_open is returned unchanged
+}
+
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                   const unsigned char *m, size_t mlen,
+                   const unsigned char *pk) {
+    if (siglen != CRYPTO_BYTES) // mayo_verify takes no length argument, so wrong-sized signatures are rejected here
+        return -1;
+    return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk); // note the argument order: message first, then signature
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/api.h b/src/sig/mayo/pqmayo_mayo_5_opt/api.h
new file mode 100644
index 0000000000..fa1ebbb790
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/api.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <mayo.h>
+
+#define CRYPTO_SECRETKEYBYTES 40
+#define CRYPTO_PUBLICKEYBYTES 5008
+#define CRYPTO_BYTES 838
+
+#define CRYPTO_ALGNAME "MAYO_5"
+
+#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair)
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+#define crypto_sign MAYO_NAMESPACE(crypto_sign)
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk);
+
+#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature)
+int
+crypto_sign_signature(unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open)
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk);
+
+#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify)
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                   const unsigned char *m, size_t mlen,
+                   const unsigned char *pk);
+
+#endif /* api_h */
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic.c b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic.c
new file mode 100644
index 0000000000..de09954c8a
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <arithmetic_common.h>
+#include <mem.h>
+#include <echelon_form.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { // packs a size x size matrix of m-vectors (m_legs*2 words each) into its upper triangle
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+    mayo12_m_upper(m_legs, in, out, size); // dedicated AVX routine for the M_MAX == 64 parameter sets
+#else
+    int m_vecs_stored = 0; // number of output entries written so far (row-major over c >= r)
+    for (int r = 0; r < size; r++) {
+        for (int c = r; c < size; c++) {
+            m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored ); // out entry = in[r][c]
+            if (r != c) {
+                m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored ); // off-diagonal: add the mirrored entry in[c][r]
+            }
+            m_vecs_stored ++;
+        }
+    }
+#endif
+}
+
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){ // accumulates (P1 + P1^T) * O into acc; P1 is read as a packed upper triangle
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    (void) p;
+    mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    (void) p;   
+    mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    (void) p;   
+    mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+    #ifndef MAYO_VARIANT
+    const int m_legs = PARAM_m(p) / 32; // only the fully generic build needs the runtime vector length
+    #else
+    (void) p;
+    #endif
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_n(p) - PARAM_o(p);
+
+    // position of the current (r, c) entry inside the packed upper-triangular P1
+    int bs_mat_entries_used = 0;
+    for (int r = 0; r < param_v; r++) {
+        for (int c = r; c < param_v; c++) {
+            if(c==r) { // diagonal entries are skipped: they cancel in P1 + P1^T over characteristic 2
+                bs_mat_entries_used += 1;
+                continue;
+            }
+            for (int k = 0; k < param_o; k += 1) {
+                
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k));
+                vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k],  acc + 4 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k],  acc + 6 * (r * param_o + k));
+                vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k],  acc + 6 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k],  acc + 8 * (r * param_o + k));
+                vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k],  acc + 8 * (c * param_o + k));
+#else
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k)); // P1[r][c] * O[c][k] goes to acc[r][k]
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k)); // transposed term: P1[r][c] * O[r][k] goes to acc[c][k]
+#endif
+
+            }
+            bs_mat_entries_used += 1;
+        }
+    }
+#endif
+}
+
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) { // fills M and Y from V, L and P1 (M = V*L, Y = V*P1*V^T per the helper names)
+    (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; // zero-initialised scratch for the intermediate product
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; // zero-initialised scratch for the intermediate product
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; // zero-initialised scratch for the intermediate product
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; // zero-initialised scratch for the intermediate product
+    mul_add_mat_x_m_mat(param_m / 32, V, L, M,
+        param_k, param_n - param_o, param_o); // first product: V with L, written to M
+
+    mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1); // second product: upper-triangular P1 with V transposed, into Pv
+
+    mul_add_mat_x_m_mat(param_m / 32, V, Pv,
+        Y, param_k, param_n - param_o,
+        param_k); // third product: V with Pv, written to Y
+#endif
+}
+
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) { // two chained products: P1 with O into P1O_P2, then O transposed with P1O_P2 into P3
+    (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int m_legs = PARAM_m(p) / 32; // same value as param_m/32 used in the call below
+    mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1); // presumably accumulates P1*O on top of the P2 already held in P1O_P2 - confirm in the helper
+    mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o); // O transposed times the updated P1O_P2, into P3
+#endif
+}
+
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+// compute S * PS  = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+//                               [         P3*S2 = P2 ]
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS) {
+    (void) m;
+#if MAYO_AVX
+    const int n = o + v;
+
+    /* Old approach which is constant time but doesn't have to be */
+    unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+    unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+    unsigned char *s1_write = S1;
+    unsigned char *s2_write = S2;
+
+    for (int r=0; r < k; r++)
+    {
+        for (int c = 0; c < n; c++)
+        {
+            if(c < v){
+                *(s1_write++) = S[r*n + c];
+            } else {
+                *(s2_write++) = S[r*n + c];
+            }
+        }
+    }
+
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+
+#if M_MAX == 64
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 96
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 128
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+    mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS);
+    mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#else 
+    NOT IMPLEMENTED
+#endif
+#else
+    const int n = o + v;
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+    mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+    mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// sample a solution x to Ax = y, with r used as randomness
+// require:
+// - A is a matrix with m rows and k*o+1 columns (values in the last column are
+// not important, they will be overwritten by y) in row major order
+// - y is a vector with m elements
+// - r and x are k*o bytes long
+// return: 1 on success, 0 on failure
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+                           const unsigned char *y, const unsigned char *r,
+                           unsigned char *x, int k, int o, int m, int A_cols) { // NOTE(review): rows are addressed with stride k*o+1 in some places and A_cols in others - presumably A_cols == k*o+1; confirm at the call site
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    unsigned char finished;
+    int col_upper_bound;
+    unsigned char correct_column;
+
+    // x <- r
+    for (int i = 0; i < k * o; i++) {
+        x[i] = r[i];
+    }
+
+    // compute Ar;
+    unsigned char Ar[M_MAX];
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = 0; // clear last col of A
+    }
+    mat_mul(A, r, Ar, k * o + 1, m, 1); // the zeroed last column of A keeps the product from depending on a (k*o)-th entry of r
+
+    // move y - Ar to last column of matrix A
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+    }
+
+    EF(A, m, k * o + 1); // presumably reduces the augmented matrix to echelon form in place - see echelon_form.h
+
+    // check if last row of A (excluding the last entry of y) is zero
+    unsigned char full_rank = 0;
+    for (int i = 0; i < A_cols - 1; i++) {
+        full_rank |= A[(m - 1) * A_cols + i];
+    }
+
+// It is okay to leak if we need to restart or not
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+    if (full_rank == 0) { // last row has no nonzero coefficient: report failure so the caller can retry
+        return 0;
+    }
+
+    // back substitution in constant time
+    // the index of the first nonzero entry in each row is secret, which makes
+    // things less efficient
+
+    for (int row = m - 1; row >= 0; row--) {
+        finished = 0; // becomes 0xff once the first nonzero entry of this row has been handled
+        col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o);
+        // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32}
+
+        for (int col = row; col <= col_upper_bound; col++) {
+
+            // Compare two chars in constant time.
+            // Returns 0x00 if the byte arrays are equal, 0xff otherwise.
+            correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+
+            unsigned char u = correct_column & A[row * A_cols + A_cols - 1]; // right-hand side of this row at its first nonzero column, 0 everywhere else
+            x[col] ^= u;
+
+            for (int i = 0; i < row; i += 8) { // eight rows per step; touches rows up to i+7, which can exceed row - NOTE(review): presumably A has padding rows, confirm at the allocation site
+                uint64_t tmp = ( (uint64_t) A[ i    * A_cols + col] <<  0) ^ ( (uint64_t) A[(i+1) * A_cols + col] <<  8)
+                             ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+                             ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+                             ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+
+                tmp = mul_fx8(u, tmp); // multiply the eight gathered column entries by u in one call
+
+                A[ i    * A_cols + A_cols - 1] ^= (tmp      ) & 0xf;
+                A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+                A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+                A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+                A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+                A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+                A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+                A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+            }
+
+            finished = finished | correct_column;
+        }
+    }
+    return 1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic.h b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <stdint.h>
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define TARGET_BIG_ENDIAN
+#endif
+
+#if defined(MAYO_AVX) && (M_MAX == 64)
+    #include <shuffle_arithmetic_64.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+    #include <shuffle_arithmetic_96.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+    #include <shuffle_arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64))
+    #include <arithmetic_64.h>
+    #include <arithmetic_96.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+    #include <arithmetic_96.h>
+    #include <arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+    #include <arithmetic_128.h>
+#endif
+
+// Calculate P3 = O^T * (P1*O + P2) in KeyGen (NOTE(review): P1O_P2 presumably receives the intermediate P1*O + P2 - confirm)
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// Calculate Upper in KeyGen (NOTE(review): presumably folds a size x size matrix of vectors into upper-triangular form - confirm)
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// Calculate acc = (P1+P1^T)*O in expand_sk; the caller passes the already symmetrised P1+P1^T as P1P1t
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc);
+
+// Calculate M=V*L and Y=V*P1*V^T in Sign (two outputs: M and Y)
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sample solution in Sign (NOTE(review): the int result is presumably a success flag and A is modified in place - confirm)
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Calculate SPS = S*P*S^T in Verify, with P supplied as its blocks P1, P2 and P3
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_128.h
new file mode 100644
index 0000000000..418c308e2f
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_128.h
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_64.h>
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy one 128-element vector (eight 64-bit limbs, 16 nibble-packed GF(16)
+// elements per limb) from `in` to `out`, limb by limb in ascending order.
+//   in  - source, 8 limbs
+//   out - destination, 8 limbs
+static inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
+    const uint64_t *src = in;
+    uint64_t *dst = out;
+    for (int limb = 0; limb < 8; limb++) {
+        dst[limb] = src[limb];
+    }
+}
+
+
+// XOR-accumulate a 128-element vector into `acc`. Addition in GF(16) is XOR,
+// so this is an element-wise vector addition.
+//   in  - addend, 8 limbs
+//   acc - accumulator, 8 limbs, updated in place
+static inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
+    int limb = 0;
+    while (limb < 8) {
+        acc[limb] ^= in[limb];
+        limb++;
+    }
+}
+
+// acc ^= x * in in GF(16) = Z_2[x]/(x^4+x+1), for all 128 nibble-packed elements.
+static inline void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
+    const uint64_t top_bits = 0x8888888888888888ULL; // x^3 coefficient of each nibble
+    for (int limb = 0; limb < 8; limb++) {
+        const uint64_t carry = in[limb] & top_bits;
+        acc[limb] ^= ((carry >> 3) * 3) ^ ((in[limb] ^ carry) << 1); // x^4 -> x + 1
+    }
+}
+
+// acc ^= x^-1 * in in GF(16) = Z_2[x]/(x^4+x+1), for all 128 nibble-packed elements.
+static inline void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
+    const uint64_t low_bits = 0x1111111111111111ULL; // constant coefficient of each nibble
+    for (int limb = 0; limb < 8; limb++) {
+        const uint64_t unit = in[limb] & low_bits;
+        acc[limb] ^= (unit * 9) ^ ((in[limb] ^ unit) >> 1); // 1/x = x^3 + 1 (0b1001)
+    }
+}
+
+// acc ^= a * in for a 128-element vector (eight limbs): every nibble-packed GF(16)
+// element of `in` is multiplied by the scalar `a` (expected < 16) and XORed into `acc`.
+static inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint32_t table = mul_table(a); // low nibble of byte j is a * x^j
+    const uint64_t lsb_mask = 0x1111111111111111ULL;
+    const uint32_t a_x0 = table & 0xff, a_x1 = (table >> 8) & 0xf;
+    const uint32_t a_x2 = (table >> 16) & 0xf, a_x3 = (table >> 24) & 0xf;
+    for (int limb = 0; limb < 8; limb++) {
+        const uint64_t word = in[limb];
+        acc[limb] ^= (word & lsb_mask) * a_x0 ^ ((word >> 1) & lsb_mask) * a_x1
+                   ^ ((word >> 2) & lsb_mask) * a_x2 ^ ((word >> 3) & lsb_mask) * a_x3;
+    }
+}
+
+// out = sum over v in 1..15 of v * bins[v], with bins[v] the 8-limb vector at bins + v * 8.
+// The bins are clobbered and bin 0 is never read. Two independent chains fold into bin 1.
+static inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
+    m_vec_mul_add_x_inv_128(bins +  5 * 8, bins + 10 * 8); // x^-1 chain: 5 -> 10 -> 7 -> 14 -> 15 -> 13 -> 9 -> 1
+    m_vec_mul_add_x_inv_128(bins + 10 * 8, bins +  7 * 8);
+    m_vec_mul_add_x_inv_128(bins +  7 * 8, bins + 14 * 8);
+    m_vec_mul_add_x_inv_128(bins + 14 * 8, bins + 15 * 8);
+    m_vec_mul_add_x_inv_128(bins + 15 * 8, bins + 13 * 8);
+    m_vec_mul_add_x_inv_128(bins + 13 * 8, bins +  9 * 8);
+    m_vec_mul_add_x_inv_128(bins +  9 * 8, bins +  1 * 8);
+    m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8);     // x chain: 11 -> 12 -> 6 -> 3 -> 8 -> 4 -> 2 -> 1
+    m_vec_mul_add_x_128(bins + 12 * 8, bins +  6 * 8);
+    m_vec_mul_add_x_128(bins +  6 * 8, bins +  3 * 8);
+    m_vec_mul_add_x_128(bins +  3 * 8, bins +  8 * 8);
+    m_vec_mul_add_x_128(bins +  8 * 8, bins +  4 * 8);
+    m_vec_mul_add_x_128(bins +  4 * 8, bins +  2 * 8);
+    m_vec_mul_add_x_128(bins +  2 * 8, bins +  1 * 8);
+    vec_copy_128(bins + 1 * 8, out);                       // bin 1 now holds the full sum
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_64.h
new file mode 100644
index 0000000000..a70b7a3118
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_64.h
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy a 64-element vector (four 64-bit limbs, 16 nibble-packed GF(16)
+// elements per limb) from `in` to `out`, in ascending limb order.
+static inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+    for (int limb = 0; limb < 4; limb++) {
+        out[limb] = in[limb];
+    }
+}
+
+// acc ^= in for a 64-element vector (four 64-bit limbs). XOR is addition in
+// GF(16), so this is an element-wise vector addition into `acc`.
+static inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 4; limb++) {
+        acc[limb] ^= in[limb];
+    }
+}
+
+// Multiply each of the 16 GF(16) elements packed in `a` (one per nibble) by the
+// scalar `b`, in Z_2[x]/(x^4+x+1). Only the low four bits of `b` are used.
+// Shift-and-add: bit j of `b` selects a * x^j, which is formed by repeatedly
+// multiplying the running value by x and reducing x^4 to x + 1.
+//   a - 16 packed field elements
+//   b - scalar multiplier (low nibble)
+// Returns the 16 packed products.
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) {
+    const uint64_t top_bits = 0x8888888888888888ULL; // x^3 coefficient of each nibble
+    const uint64_t scalar = b;
+    uint64_t power = a;                  // a * x^j for the current j
+    uint64_t product = power * (scalar & 1);
+
+    for (int j = 1; j < 4; j++) {
+        const uint64_t carry = power & top_bits;
+        // drop the x^3 coefficients, move the rest up one degree, and add
+        // x + 1 (0b0011) into every nibble whose x^3 coefficient was set
+        power = ((power ^ carry) << 1) ^ ((carry >> 3) * 3);
+        // branch-free select: the multiplier below is 0 or 1
+        product ^= power * ((scalar >> j) & 1);
+    }
+
+    return product;
+}
+
+// Build a lookup word for scalar `b` (expected < 16): the low nibble of byte j
+// is b * x^j reduced mod x^4+x+1. High nibbles of bytes 1..3 keep unreduced bits.
+static inline uint32_t mul_table(uint8_t b){
+    const uint32_t spread = 0x08040201u * (uint32_t) b;   // byte j = b << j
+    const uint32_t overflow = spread & 0xf0f0f0f0u;       // coefficients of x^4 and above
+    // x^4 = x + 1: fold each overflow bit back as "+1" (>> 4) and "+x" (>> 3)
+    return spread ^ (overflow >> 4) ^ (overflow >> 3);
+}
+
+// acc ^= a * in for a 64-element vector (four limbs): every nibble-packed GF(16)
+// element of `in` is multiplied by the scalar `a` (expected < 16) and XORed into `acc`.
+static inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint32_t table = mul_table(a); // low nibble of byte j is a * x^j
+    const uint64_t lsb_mask = 0x1111111111111111ULL;
+    const uint32_t a_x0 = table & 0xff, a_x1 = (table >> 8) & 0xf;
+    const uint32_t a_x2 = (table >> 16) & 0xf, a_x3 = (table >> 24) & 0xf;
+    for (int limb = 0; limb < 4; limb++) {
+        const uint64_t word = in[limb];
+        acc[limb] ^= (word & lsb_mask) * a_x0 ^ ((word >> 1) & lsb_mask) * a_x1
+                   ^ ((word >> 2) & lsb_mask) * a_x2 ^ ((word >> 3) & lsb_mask) * a_x3;
+    }
+}
+
+// acc ^= a * in for a vector of `legs` 64-bit limbs: every nibble-packed GF(16)
+// element of `in` is multiplied by the scalar `a` (expected < 16) and XORed into `acc`.
+static inline void vec_mul_add_u64(const int legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint32_t table = mul_table(a); // low nibble of byte j is a * x^j
+    const uint64_t lsb_mask = 0x1111111111111111ULL;
+    const uint32_t a_x0 = table & 0xff, a_x1 = (table >> 8) & 0xf;
+    const uint32_t a_x2 = (table >> 16) & 0xf, a_x3 = (table >> 24) & 0xf;
+    for (int limb = 0; limb < legs; limb++) {
+        const uint64_t word = in[limb];
+        acc[limb] ^= (word & lsb_mask) * a_x0 ^ ((word >> 1) & lsb_mask) * a_x1
+                   ^ ((word >> 2) & lsb_mask) * a_x2 ^ ((word >> 3) & lsb_mask) * a_x3;
+    }
+}
+
+// acc ^= x * in in GF(16) = Z_2[x]/(x^4+x+1), for all 64 nibble-packed elements.
+static inline void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+    const uint64_t top_bits = 0x8888888888888888ULL; // x^3 coefficient of each nibble
+    for (int limb = 0; limb < 4; limb++) {
+        const uint64_t carry = in[limb] & top_bits;
+        acc[limb] ^= ((carry >> 3) * 3) ^ ((in[limb] ^ carry) << 1); // x^4 -> x + 1
+    }
+}
+
+// acc ^= x^-1 * in in GF(16) = Z_2[x]/(x^4+x+1), for all 64 nibble-packed elements.
+static inline void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+    const uint64_t low_bits = 0x1111111111111111ULL; // constant coefficient of each nibble
+    for (int limb = 0; limb < 4; limb++) {
+        const uint64_t unit = in[limb] & low_bits;
+        acc[limb] ^= (unit * 9) ^ ((in[limb] ^ unit) >> 1); // 1/x = x^3 + 1 (0b1001)
+    }
+}
+
+// out = sum over v in 1..15 of v * bins[v], with bins[v] the 4-limb vector at bins + v * 4.
+// The bins are clobbered and bin 0 is never read. Two independent chains fold into bin 1.
+static inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+    m_vec_mul_add_x_inv_64(bins +  5 * 4, bins + 10 * 4); // x^-1 chain: 5 -> 10 -> 7 -> 14 -> 15 -> 13 -> 9 -> 1
+    m_vec_mul_add_x_inv_64(bins + 10 * 4, bins +  7 * 4);
+    m_vec_mul_add_x_inv_64(bins +  7 * 4, bins + 14 * 4);
+    m_vec_mul_add_x_inv_64(bins + 14 * 4, bins + 15 * 4);
+    m_vec_mul_add_x_inv_64(bins + 15 * 4, bins + 13 * 4);
+    m_vec_mul_add_x_inv_64(bins + 13 * 4, bins +  9 * 4);
+    m_vec_mul_add_x_inv_64(bins +  9 * 4, bins +  1 * 4);
+    m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4);     // x chain: 11 -> 12 -> 6 -> 3 -> 8 -> 4 -> 2 -> 1
+    m_vec_mul_add_x_64(bins + 12 * 4, bins +  6 * 4);
+    m_vec_mul_add_x_64(bins +  6 * 4, bins +  3 * 4);
+    m_vec_mul_add_x_64(bins +  3 * 4, bins +  8 * 4);
+    m_vec_mul_add_x_64(bins +  8 * 4, bins +  4 * 4);
+    m_vec_mul_add_x_64(bins +  4 * 4, bins +  2 * 4);
+    m_vec_mul_add_x_64(bins +  2 * 4, bins +  1 * 4);
+    vec_copy_64(bins + 1 * 4, out);                       // bin 1 now holds the full sum
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_96.h
new file mode 100644
index 0000000000..a38f89e454
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_96.h
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_64.h>
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy a 96-element vector (six 64-bit limbs, 16 nibble-packed GF(16) elements
+// per limb) from `in` to `out`, in ascending limb order.
+//   in  - source, 6 limbs
+//   out - destination, 6 limbs
+static inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+    for (int limb = 0; limb < 6; limb++) {
+        out[limb] = in[limb];
+    }
+}
+
+// XOR-accumulate a 96-element vector (six 64-bit limbs) into `acc`. Addition
+// in GF(16) is XOR, so this is an element-wise vector addition.
+//   in  - addend, 6 limbs
+//   acc - accumulator, 6 limbs, updated in place
+static inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 6; limb++) {
+        acc[limb] ^= in[limb];
+    }
+}
+
+// acc ^= x * in in GF(16) = Z_2[x]/(x^4+x+1), for all 96 nibble-packed elements.
+static inline void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+    const uint64_t top_bits = 0x8888888888888888ULL; // x^3 coefficient of each nibble
+    for (int limb = 0; limb < 6; limb++) {
+        const uint64_t carry = in[limb] & top_bits;
+        acc[limb] ^= ((carry >> 3) * 3) ^ ((in[limb] ^ carry) << 1); // x^4 -> x + 1
+    }
+}
+
+// acc ^= x^-1 * in in GF(16) = Z_2[x]/(x^4+x+1), for all 96 nibble-packed elements.
+static inline void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+    const uint64_t low_bits = 0x1111111111111111ULL; // constant coefficient of each nibble
+    for (int limb = 0; limb < 6; limb++) {
+        const uint64_t unit = in[limb] & low_bits;
+        acc[limb] ^= (unit * 9) ^ ((in[limb] ^ unit) >> 1); // 1/x = x^3 + 1 (0b1001)
+    }
+}
+
+// acc ^= a * in for a 96-element vector (six limbs): every nibble-packed GF(16)
+// element of `in` is multiplied by the scalar `a` (expected < 16) and XORed into `acc`.
+static inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint32_t table = mul_table(a); // low nibble of byte j is a * x^j
+    const uint64_t lsb_mask = 0x1111111111111111ULL;
+    const uint32_t a_x0 = table & 0xff, a_x1 = (table >> 8) & 0xf;
+    const uint32_t a_x2 = (table >> 16) & 0xf, a_x3 = (table >> 24) & 0xf;
+    for (int limb = 0; limb < 6; limb++) {
+        const uint64_t word = in[limb];
+        acc[limb] ^= (word & lsb_mask) * a_x0 ^ ((word >> 1) & lsb_mask) * a_x1
+                   ^ ((word >> 2) & lsb_mask) * a_x2 ^ ((word >> 3) & lsb_mask) * a_x3;
+    }
+}
+
+// out = sum over v in 1..15 of v * bins[v], with bins[v] the 6-limb vector at bins + v * 6.
+// The bins are clobbered and bin 0 is never read. Two independent chains fold into bin 1.
+static inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+    m_vec_mul_add_x_inv_96(bins +  5 * 6, bins + 10 * 6); // x^-1 chain: 5 -> 10 -> 7 -> 14 -> 15 -> 13 -> 9 -> 1
+    m_vec_mul_add_x_inv_96(bins + 10 * 6, bins +  7 * 6);
+    m_vec_mul_add_x_inv_96(bins +  7 * 6, bins + 14 * 6);
+    m_vec_mul_add_x_inv_96(bins + 14 * 6, bins + 15 * 6);
+    m_vec_mul_add_x_inv_96(bins + 15 * 6, bins + 13 * 6);
+    m_vec_mul_add_x_inv_96(bins + 13 * 6, bins +  9 * 6);
+    m_vec_mul_add_x_inv_96(bins +  9 * 6, bins +  1 * 6);
+    m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6);     // x chain: 11 -> 12 -> 6 -> 3 -> 8 -> 4 -> 2 -> 1
+    m_vec_mul_add_x_96(bins + 12 * 6, bins +  6 * 6);
+    m_vec_mul_add_x_96(bins +  6 * 6, bins +  3 * 6);
+    m_vec_mul_add_x_96(bins +  3 * 6, bins +  8 * 6);
+    m_vec_mul_add_x_96(bins +  8 * 6, bins +  4 * 6);
+    m_vec_mul_add_x_96(bins +  4 * 6, bins +  2 * 6);
+    m_vec_mul_add_x_96(bins +  2 * 6, bins +  1 * 6);
+    vec_copy_96(bins + 1 * 6, out);                       // bin 1 now holds the full sum
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_common.h
new file mode 100644
index 0000000000..d337bc238c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/arithmetic_common.h
@@ -0,0 +1,469 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include <stdalign.h>
+
+#ifndef MAYO_VARIANT
+// Generic-width bin collapse: out = sum over v in 1..15 of v * bins[v] in GF(16), where
+// bins[v] is the vector starting at bins + v * m_legs * 2. Bins are clobbered; bin 0 is unused.
+// NOTE(review): assumes m_vec_add / m_vec_mul_add_x take (legs, in, acc) and accumulate into acc - confirm.
+static void m_multiply_bins(const int m_legs, uint64_t *bins, uint64_t *out) {
+    const int stride = m_legs * 2; // 64-bit words from one bin to the next
+
+    // Addition in GF(16) is XOR, so a composite index v = a ^ b is pushed down into bins a and b,
+    // highest index first, until only bins 8, 4, 2 and 1 carry data.
+    m_vec_add(m_legs, bins + 15 * stride, bins + 12 * stride); // 15 = 12 ^ 3
+    m_vec_add(m_legs, bins + 15 * stride, bins +  3 * stride);
+    m_vec_add(m_legs, bins + 14 * stride, bins +  8 * stride); // 14 = 8 ^ 6
+    m_vec_add(m_legs, bins + 14 * stride, bins +  6 * stride);
+    m_vec_add(m_legs, bins + 13 * stride, bins + 10 * stride); // 13 = 10 ^ 7
+    m_vec_add(m_legs, bins + 13 * stride, bins +  7 * stride);
+
+    m_vec_add(m_legs, bins + 12 * stride, bins +  8 * stride); // 12 = 8 ^ 4
+    m_vec_add(m_legs, bins + 12 * stride, bins +  4 * stride);
+    m_vec_add(m_legs, bins + 11 * stride, bins +  9 * stride); // 11 = 9 ^ 2
+    m_vec_add(m_legs, bins + 11 * stride, bins +  2 * stride);
+    m_vec_add(m_legs, bins + 10 * stride, bins +  8 * stride); // 10 = 8 ^ 2
+    m_vec_add(m_legs, bins + 10 * stride, bins +  2 * stride);
+    m_vec_add(m_legs, bins +  9 * stride, bins +  8 * stride); // 9 = 8 ^ 1
+    m_vec_add(m_legs, bins +  9 * stride, bins +  1 * stride);
+
+    m_vec_add(m_legs, bins +  7 * stride, bins +  4 * stride); // 7 = 4 ^ 3
+    m_vec_add(m_legs, bins +  7 * stride, bins +  3 * stride);
+    m_vec_add(m_legs, bins +  6 * stride, bins +  4 * stride); // 6 = 4 ^ 2
+    m_vec_add(m_legs, bins +  6 * stride, bins +  2 * stride);
+    m_vec_add(m_legs, bins +  5 * stride, bins +  4 * stride); // 5 = 4 ^ 1
+    m_vec_add(m_legs, bins +  5 * stride, bins +  1 * stride);
+
+    m_vec_add(m_legs, bins +  3 * stride, bins +  2 * stride); // 3 = 2 ^ 1
+    m_vec_add(m_legs, bins +  3 * stride, bins +  1 * stride);
+
+    // 8 = x * 4, 4 = x * 2, 2 = x * 1: fold each power of x into the next lower one.
+    m_vec_mul_add_x(m_legs, bins + 8 * stride, bins + 4 * stride);
+    m_vec_mul_add_x(m_legs, bins + 4 * stride, bins + 2 * stride);
+    m_vec_mul_add_x(m_legs, bins + 2 * stride, bins + 1 * stride);
+
+    m_vec_copy(m_legs, bins + 1 * stride, out);
+}
+#endif
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+static inline void mayo_generic_m_calculate_PS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *PS) {
+    // P1 (v x v) and P3 (o x o) are read as packed upper triangles, P2 as a dense v x o block; each entry is one packed vector.
+    const int n = o + v; // S is read as k rows of n entries; PS receives n * k vectors at index row * k + col
+#if defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || M_MAX == 128)
+    (void)m;
+#else
+    const int m_legs = m / 32;
+#endif
+    // Strategy: XOR every P entry into the bin picked by the matching S entry (assumed 0..15), then collapse the 16 bins.
+    /* Old approach which is constant time but doesn't have to be
+    unsigned char S1[V_MAX*K_MAX];
+    unsigned char S2[O_MAX*K_MAX];
+    unsigned char *s1_write = S1;
+    unsigned char *s2_write = S2;
+    for (int r=0; r < k; r++)
+    {
+        for (int c = 0; c < n; c++)
+        {
+            if(c < v){
+                *(s1_write++) = S[r*n + c];
+            } else {
+                *(s2_write++) = S[r*n + c];
+            }
+        }
+    }
+
+    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, S1, PS, v, v, k, 1); // P1 * S1
+    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P2, S2, PS, v, o, k, 0); // P2 * S2
+    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P3, S2, PS + v*k*m_legs*4, o, o, k, 1); // P3 * S2.
+    */
+
+    // use more stack efficient version for MAYO_3 and MAYO_5
+    #if (defined(HAVE_STACKEFFICIENT) || defined(PQM4)) && N_MAX > 78
+    uint64_t accumulator[M_MAX * N_MAX] = {0}; // 16 bins per row of PS, reused for every column
+    int P1_used;
+    int P3_used;
+    for (int col = 0; col < k; col++) {
+        for(unsigned int i = 0; i < sizeof(accumulator)/8; i++) {
+            accumulator[i] = 0;
+        }
+        P1_used = 0;
+        for (int row = 0; row < v; row++) {
+            for (int j = row; j < v; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P1 + P1_used * 4, accumulator + ( row * 16 + S[col * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P1 + P1_used * 6, accumulator + ( row * 16 + S[col * n + j] ) * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P1 + P1_used * 8, accumulator + ( row * 16 + S[col * n + j] ) * 8);
+#else
+                bitsliced_m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
+#endif
+                P1_used ++;
+            }
+
+            for (int j = 0; j < o; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P2 + (row * o + j)*4, accumulator + ( row * 16 + S[(col * n) + j + v] )*4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P2 + (row * o + j)*6, accumulator + ( row * 16 + S[(col * n) + j + v] )*6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P2 + (row * o + j)*8, accumulator + ( row * 16 + S[(col * n) + j + v] )*8 );
+#else
+                bitsliced_m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( row * 16 + S[(col * n) + j + v] )*m_legs * 2 );
+#endif
+            }
+        }
+        // rows v..n-1 of P are [0 P3]: only the upper-triangular P3 contributes
+        P3_used = 0;
+        for (int row = v; row < n; row++) {
+            for (int j = row; j < n; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P3 + P3_used * 4, accumulator + ( row * 16 + S[col * n + j] )*4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P3 + P3_used * 6, accumulator + ( row * 16 + S[col * n + j] )*6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P3 + P3_used * 8, accumulator + ( row * 16 + S[col * n + j] )*8);
+#else
+                bitsliced_m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
+#endif
+                P3_used ++;
+            }
+        }
+        // collapse the 16 bins of each row into the PS entry for (row, col)
+        for (int row = 0; row < n; row++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+           multiply_bins_64(accumulator + row * 16 * 4, PS + (row * k + col) * 4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+           multiply_bins_96(accumulator + row * 16 * 6, PS + (row * k + col) * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+           multiply_bins_128(accumulator + row * 16 * 8, PS + (row * k + col) * 8);
+#else
+           bitsliced_m_multiply_bins(m_legs, accumulator + row * 16 * m_legs * 2, PS + (row * k + col) * m_legs * 2);
+#endif
+        }
+    }
+
+    #else
+    // default path: one set of 16 bins per (row, col) output, all k columns handled in a single pass over P
+    alignas (32) uint64_t accumulator[M_MAX * K_MAX * N_MAX] = {0};
+    int P1_used = 0; // running index into the packed upper-triangular P1
+    for (int row = 0; row < v; row++) {
+        for (int j = row; j < v; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
+            }
+#endif
+            P1_used ++;
+        }
+
+        // P2 is dense: entry (row, j) sits at index row * o + j and pairs with column j + v of S
+        for (int j = 0; j < o; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 11) * 16 + S[(11 * n) + j + v] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( (row * k + col) * 16 + S[(col * n) + j + v] )*m_legs * 2 );
+            }
+#endif
+        }
+    }
+    // rows v..n-1 of P are [0 P3]: only the upper-triangular P3 contributes
+    int P3_used = 0;
+    for (int row = v; row < n; row++) {
+        for (int j = row; j < n; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
+            }
+#endif
+            P3_used ++;
+        }
+    }
+
+    // multiply stuff according to the bins of the accumulator and add to PS.
+    int i = 0;
+    while (i < n * k) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+        multiply_bins_64(accumulator + i * 16 * 4, PS + i * 4);
+        i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+        multiply_bins_96(accumulator + i * 16 * 6, PS + i * 6);
+        i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+        multiply_bins_128(accumulator + i * 16 * 8, PS + i * 8);
+        i++;
+#else
+        m_multiply_bins(m/32, accumulator + i * 16 * (m/32) * 2, PS + i * (m/32) * 2);
+        i++;
+#endif
+    }
+
+    #endif
+}
+
+// Bins the vectors of PS by the matching scalar of S and folds every cell's 16 bins into SPS via multiply_bins_* (defined elsewhere; presumably this yields S * PS -- confirm there).
+static inline void mayo_generic_m_calculate_SPS(const uint64_t *PS, const unsigned char *S, int m, int k, int  n, uint64_t *SPS){
+    alignas (32) uint64_t accumulator[M_MAX*K_MAX*K_MAX] = {0}; // zeroed scratch: 16 bins per (row, col) cell; assumes every S entry is < 16
+    #if !defined(MAYO_VARIANT)
+    const int m_legs = m/32; // one vector spans m_legs * 2 uint64_t words in the generic indexing below
+    #else
+    (void) m; // unused in this configuration: the fixed-width branches are selected by M_MAX
+    #endif
+    for (int row = 0; row < k; row++) {
+        for (int j = 0; j < n; j++) {
+            for (int col = 0; col < k; col += 1) { // accumulate PS vector (j, col) into bin S[row * n + j] of cell (row, col)
+                #if defined(MAYO_VARIANT) && (M_MAX == 64)
+                    vec_add_64(PS + (j * k + col) * 4, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 4 );
+                #elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                    vec_add_96(PS + (j * k + col) * 6, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 6 );
+                #elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                    vec_add_128(PS + (j * k + col) * 8, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 8 );
+                #else
+                    m_vec_add(m_legs, PS + (j * k + col) * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[row * n + j] )*m_legs * 2 );
+                #endif
+            }
+        }
+    }
+
+    // combine the 16 bins of every accumulator cell and add the result to the matching cell of SPS.
+    int i = 0; // index of the (row, col) cell, row-major over the k x k result
+    while (i < k*k) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+        multiply_bins_64(accumulator + i * 16 * 4, SPS + i * 4);
+        i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+        multiply_bins_96(accumulator + i * 16 * 6, SPS + i * 6);
+        i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+        multiply_bins_128(accumulator + i * 16 * 8, SPS + i * 8);
+        i++;
+#else
+        m_multiply_bins(m_legs, accumulator + i * 16 * m_legs * 2, SPS + i * m_legs * 2);
+        i++;
+#endif
+    }
+}
+
+
+// multiplies m (possibly upper triangular) matrices packed in bs_mat with the single matrix mat (bs_mat_cols x mat_cols, row-major) and adds the result to acc; with triangular == 1 only entries with c >= r are stored in bs_mat
+static inline void mul_add_m_upper_triangular_mat_x_mat(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_cols, int triangular) {
+
+    int bs_mat_entries_used = 0; // index of the next stored bs_mat entry; entries are consumed in row-major order
+    for (int r = 0; r < bs_mat_rows; r++) {
+        for (int c = triangular * r; c < bs_mat_cols; c++) { // triangular == 1 starts each row at its diagonal entry
+            for (int k = 0; k < mat_cols; k += 1) { // acc(r, k) += bs_mat(r, c) * mat[c][k]
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                (void) m_legs;
+                vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 4 * (r * mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                (void) m_legs;
+                vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 6 * (r * mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                (void) m_legs;
+                vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 8 * (r * mat_cols + k));
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[c * mat_cols + k], acc + m_legs * 2 * (r * mat_cols + k));
+#endif
+            }
+            bs_mat_entries_used += 1; // advance once per (r, c) entry, after all mat_cols products
+        }
+    }
+}
+
+// multiplies m (possibly upper triangular) matrices packed in bs_mat with the transpose of the single matrix mat (mat_rows x bs_mat_cols, row-major) and adds the result to acc; with triangular == 1 only entries with c >= r are stored in bs_mat
+static inline void mul_add_m_upper_triangular_mat_x_mat_trans(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_rows, int triangular) {
+
+    int bs_mat_entries_used = 0; // index of the next stored bs_mat entry; entries are consumed in row-major order
+    for (int r = 0; r < bs_mat_rows; r++) {
+        for (int c = triangular * r; c < bs_mat_cols; c++) { // triangular == 1 starts each row at its diagonal entry
+            for (int k = 0; k < mat_rows; k += 1) { // acc(r, k) += bs_mat(r, c) * mat[k][c], i.e. mat is read transposed
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                (void) m_legs;
+                vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 4 * (r * mat_rows + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                (void) m_legs;
+                vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 6 * (r * mat_rows + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                (void) m_legs;
+                vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 8 * (r * mat_rows + k));
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + m_legs * 2 * (r * mat_rows + k));
+#endif
+            }
+            bs_mat_entries_used += 1; // advance once per (r, c) entry, after all mat_rows products
+        }
+    }
+}
+
+// multiplies the transpose of the single matrix mat (mat_rows x mat_cols, row-major) with the m matrices packed in bs_mat (mat_rows x bs_mat_cols) and adds the result (mat_cols x bs_mat_cols) to acc
+static inline void mul_add_mat_trans_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) {
+
+    for (int r = 0; r < mat_cols; r++) { // r runs over columns of mat, which are the rows of the result
+        for (int c = 0; c < mat_rows; c++) {
+            for (int k = 0; k < bs_mat_cols; k += 1) { // acc(r, k) += mat[c][r] * bs_mat(c, k)
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                (void) m_legs;
+                vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 4 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                (void) m_legs;
+                vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 6 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                (void) m_legs;
+                vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 8 * (r * bs_mat_cols + k));
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + m_legs * 2 * (r * bs_mat_cols + k));
+#endif
+            }
+        }
+    }
+}
+
+// multiplies the single matrix mat (mat_rows x mat_cols, row-major) with the m matrices packed in bs_mat (mat_cols x bs_mat_cols) and adds the result (mat_rows x bs_mat_cols) to acc
+static inline void mul_add_mat_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) {
+
+    for (int r = 0; r < mat_rows; r++) {
+        for (int c = 0; c < mat_cols; c++) {
+            for (int k = 0; k < bs_mat_cols; k += 1) { // acc(r, k) += mat[r][c] * bs_mat(c, k)
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                (void) m_legs;
+                vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 4 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                (void) m_legs;
+                vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 6 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                (void) m_legs;
+                vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 8 * (r * bs_mat_cols + k));
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + m_legs * 2 * (r * bs_mat_cols + k));
+#endif
+            }
+        }
+    }
+}
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/echelon_form.h b/src/sig/mayo/pqmayo_mayo_5_opt/echelon_form.h
new file mode 100644
index 0000000000..82505847c9
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/echelon_form.h
@@ -0,0 +1,152 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ECHELON_FORM_H
+#define ECHELON_FORM_H
+
+#include <stdalign.h>
+#include <stdint.h>
+#include <mem.h>
+#include <arithmetic.h>
+
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+// Returns the 4-bit element number `index` of a vector stored 16 elements per uint64_t, lowest nibble first.
+static inline unsigned char
+m_extract_element(const uint64_t *in, int index) {
+    const uint64_t word = in[index / 16];  // the word that holds the element
+    const int shift = (index % 16) * 4;    // bit position of its nibble inside that word
+    const uint64_t nibble = (word >> shift) & 0xF;
+    return nibble;
+}
+// Packs ncols elements of `in` (one per byte, assumed < 16) two per byte into `out`: even-indexed element in the low nibble, the next one in the high nibble.
+static inline void
+ef_pack_m_vec(const unsigned char *in, uint64_t *out, int ncols) {
+    int i; // kept outside the loop: the odd-length tail below reuses its final value
+    unsigned char *out8 = (unsigned char *)out; // byte view of the packed output
+    for(i = 0; i+1 < ncols; i += 2){ // one output byte per complete pair of elements
+#ifdef TARGET_BIG_ENDIAN
+        out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8]  = (in[i+0] << 0) | (in[i+1] << 4); // byte i/2 mirrored within its 8-byte word
+#else
+        out8[i/2]  = (in[i+0] << 0) | (in[i+1] << 4);
+#endif
+    }
+    if (ncols % 2 == 1){ // odd count: the last element is stored alone, high nibble left zero
+#ifdef TARGET_BIG_ENDIAN
+        out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8]  = (in[i+0] << 0);
+#else
+        out8[i/2]  = (in[i+0] << 0);
+#endif
+    }
+}
+// Unpacks `legs` uint64_t words (16 nibbles each) from `in` into legs * 16 bytes of `out`, one nibble per byte, low nibble first.
+static inline void
+ef_unpack_m_vec(int legs, const uint64_t *in, unsigned char *out) {
+    const unsigned char *in8 = (const unsigned char *)in; // byte view of the packed input
+    for(int i = 0; i < legs * 16; i += 2){ // each input byte yields two output elements
+#ifdef TARGET_BIG_ENDIAN
+        out[i]   = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8]) & 0xF; // byte i/2 mirrored within its 8-byte word
+        out[i+1] = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+        out[i]   = (in8[i/2]) & 0xF;
+        out[i+1] = (in8[i/2] >> 4);
+#endif
+    }
+}
+
+
+// put the nrows x ncols matrix A (one element per byte, row-major) in row echelon form
+// with ones on first nonzero entries, in place and *in constant time*
+static inline void EF(unsigned char *A, int nrows, int ncols) {
+
+    alignas (32) uint64_t _pivot_row[(K_MAX * O_MAX + 1 + 15) / 16];  // candidate pivot row, packed
+    alignas (32) uint64_t _pivot_row2[(K_MAX * O_MAX + 1 + 15) / 16]; // pivot row scaled by the inverse of the pivot
+    alignas (32) uint64_t packed_A[((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX] = { 0 }; // zeroed because packing does not write padding nibbles
+
+    int row_len = (ncols + 15) / 16; // uint64_t words per packed row (16 elements per word)
+
+    // nibbleslice the matrix A
+    for (int i = 0; i < nrows; i++) {
+        ef_pack_m_vec(A + i * ncols, packed_A + i * row_len, ncols);
+    }
+
+    // pivot row is secret, pivot col is not
+
+    unsigned char inverse;
+    int pivot_row = 0; // next row that should receive a pivot
+    for (int pivot_col = 0; pivot_col < ncols; pivot_col++) {
+
+        int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - ncols);
+        int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col);
+        // the pivot row is guaranteed to be between these lower and upper bounds if
+        // A has full rank
+
+        // zero out pivot row
+        for (int i = 0; i < row_len; i++) {
+            _pivot_row[i] = 0;
+            _pivot_row2[i] = 0;
+        }
+
+        // try to get a pivot row in constant time
+        unsigned char pivot = 0;
+        uint64_t pivot_is_zero = -1; // all-ones mask while no nonzero pivot has been found
+        for (int row = pivot_row_lower_bound;
+                row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) { // scans at most 32 rows past the upper bound
+
+            uint64_t is_pivot_row = ~ct_compare_64(row, pivot_row); // presumably all-ones when row == pivot_row (ct_compare_64 is defined elsewhere)
+            uint64_t below_pivot_row = ct_64_is_greater_than(row, pivot_row); // presumably all-ones when row > pivot_row
+
+            for (int j = 0; j < row_len; j++) { // add this row if it is the pivot row, or lies below it while the pivot is still zero
+                _pivot_row[j] ^= (is_pivot_row | (below_pivot_row & pivot_is_zero)) &
+                                 packed_A[row * row_len + j];
+            }
+            pivot = m_extract_element(_pivot_row, pivot_col);
+            pivot_is_zero = ~ct_compare_64((int) pivot, 0);
+        }
+
+        // multiply pivot row by inverse of pivot
+        inverse = inverse_f(pivot);
+        vec_mul_add_u64(row_len, _pivot_row, inverse, _pivot_row2); // _pivot_row2 was zeroed above, so it ends up holding the scaled row
+
+        // conditionally write pivot row to the correct row, if there is a nonzero
+        // pivot
+        for (int row = pivot_row_lower_bound; row <= pivot_row_upper_bound; row++) {
+            uint64_t do_copy = ~ct_compare_64(row, pivot_row) & ~pivot_is_zero;
+            uint64_t do_not_copy = ~do_copy;
+            for (int col = 0; col < row_len; col++) { // masked select: the two masks are complementary, so '+' never carries
+                packed_A[row * row_len + col] =
+                    (do_not_copy & packed_A[row * row_len + col]) +
+                    (do_copy & _pivot_row2[col]);
+            }
+        }
+
+        // eliminate entries below pivot
+        for (int row = pivot_row_lower_bound; row < nrows; row++) {
+            unsigned char below_pivot = (row > pivot_row); // NOTE(review): plain comparison on the secret pivot_row; confirm the compiler emits no branch here
+            unsigned char elt_to_elim = m_extract_element(packed_A + row * row_len, pivot_col);
+
+            vec_mul_add_u64(row_len, _pivot_row2, below_pivot * elt_to_elim,
+                                    packed_A + row * row_len);                            // scalar is 0 for rows at or above the pivot row
+        }
+
+        pivot_row += (-(int64_t)(~pivot_is_zero)); // advances by one exactly when a nonzero pivot was found
+    }
+
+    unsigned char temp[(O_MAX * K_MAX + 1 + 15)]; // one unpacked row, padded up to whole words
+
+    // unpack the rows of packed_A back into A, one element per byte
+    for (int i = 0; i < nrows; i++) {
+        ef_unpack_m_vec(row_len, packed_A + i * row_len, temp);
+        for (int j = 0; j < ncols; j++) { // copy only the real columns, dropping the padding
+            A[i * ncols + j] = temp[j];
+        }
+    }
+
+    mayo_secure_clear(temp, K_MAX * O_MAX + 1 + 15); // wipe every scratch buffer that held matrix data
+    mayo_secure_clear(_pivot_row, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+    mayo_secure_clear(_pivot_row2, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+    mayo_secure_clear(packed_A, ((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX * 8);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/mayo.c b/src/sig/mayo/pqmayo_mayo_5_opt/mayo.c
new file mode 100644
index 0000000000..ce1ccd4c71
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/mayo.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mem.h>
+#include <mayo.h>
+#include <randombytes.h>
+#include <aes_ctr.h>
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <fips202.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define PK_PRF AES_128_CTR
+// Unpacks mdeclen nibbles from m (low nibble of each byte first) into mdec, one nibble per output byte.
+static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) {
+    const int full_bytes = mdeclen / 2;
+    for (int b = 0; b < full_bytes; ++b) {
+        mdec[2 * b]     = m[b] & 0xf;
+        mdec[2 * b + 1] = m[b] >> 4;
+    }
+    // an odd count takes one more element from the low nibble of the next byte
+    if (mdeclen % 2 == 1) {
+        mdec[2 * full_bytes] = m[full_bytes] & 0x0f;
+    }
+}
+// Packs mlen elements of m two per output byte: menc[b] = m[2b] | (m[2b+1] << 4); elements are assumed to be < 16.
+static void encode(const unsigned char *m, unsigned char *menc, int mlen) {
+    const int pairs = mlen / 2;
+    for (int b = 0; b < pairs; ++b) {
+        menc[b] = m[2 * b] | (m[2 * b + 1] << 4);
+    }
+    // an odd count stores the final element on its own
+    if (mlen % 2 == 1) {
+        menc[pairs] = m[2 * pairs];
+    }
+}
+// Folds the k x k vectors of _vPv into one m-element vector (shift by one element, reduce with the f_tail coefficients, add the next pair) and writes y[i] = t[i] ^ result[i].
+static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+
+    const uint64_t *vPv = _vPv;
+    uint64_t temp[M_MAX/16] = {0}; // running accumulator, 16 elements per word
+    unsigned char *temp_bytes = (unsigned char *) temp; // byte view used for the reduction and the final unpack
+    int k = 0; // index of the top word of temp; 0 on the first pass, m/16 - 1 afterwards
+    for (int i = PARAM_k(p) - 1; i >= 0 ; i--) {
+        for (int j = i; j < PARAM_k(p); j++) {
+            // multiply by X (shift up 4 bits)
+            unsigned char top = temp[k] >> 60; // element shifted out of the top word, folded back in below
+            temp[k] <<= 4;
+            k--;
+            for(; k>=0; k--){ // walk down the words, carrying each top nibble into the word above
+                temp[k+1] ^= temp[k] >> 60;
+                temp[k] <<= 4;
+            }
+            // reduce mod f(X)
+            for (int jj = 0; jj < F_TAIL_LEN; jj++) { // add top * f_tail[jj] to element jj
+                if(jj%2 == 0){ // even element: low nibble of byte jj/2
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#endif
+                }
+                else { // odd element: high nibble of byte jj/2
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#endif
+                }
+            }
+
+            // extract from vPv and add
+            for(k=0; k < PARAM_m(p)/16; k ++){ // adds entry (i, j) and, off the diagonal, also entry (j, i)
+                temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]);
+            }
+            k--; // leave k on the top word for the next shift
+        }
+    }
+
+    // add to y
+    for (int i = 0; i < PARAM_m(p); i+=2) // two elements per byte of temp
+    {
+#ifdef TARGET_BIG_ENDIAN
+        y[i]   = t[i]   ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+        y[i]   = t[i]   ^ (temp_bytes[i/2] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4);
+#endif
+
+    }
+}
+// Transposes, in place, the 16 x 16 matrix of nibbles held in M[0..15] (one row per word) using four rounds of masked XOR swaps.
+static void transpose_16x16_nibbles(uint64_t *M){
+    static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f; // low nibble of every byte
+    static const uint64_t even_bytes   = 0x00ff00ff00ff00ff; // low byte of every 16-bit group
+    static const uint64_t even_2bytes  = 0x0000ffff0000ffff; // low 16 bits of every 32-bit group
+    static const uint64_t even_half    = 0x00000000ffffffff; // low 32 bits
+
+    for (size_t i = 0; i < 16; i+=2) // round 1: swap the high nibbles of M[i] with the low nibbles of M[i+1]
+    {
+        uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles; 
+        M[i  ] ^= t << 4;
+        M[i+1] ^= t;
+    }
+
+    for (size_t i = 0; i < 16; i+=4) // round 2: swap the high bytes of M[i], M[i+1] with the low bytes of M[i+2], M[i+3]
+    {
+        uint64_t t0 = ((M[i  ] >> 8) ^ M[i+2]) & even_bytes; 
+        uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes; 
+        M[i  ] ^= (t0 << 8);
+        M[i+1] ^= (t1 << 8);
+        M[i+2] ^= t0;
+        M[i+3] ^= t1;
+    }
+    
+    for (size_t i = 0; i < 4; i++) // round 3: swap 16-bit groups between rows i and i+4, and between rows i+8 and i+12
+    {
+        uint64_t t0 = ((M[i  ] >> 16) ^ M[i+ 4]) & even_2bytes;
+        uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes;
+
+        M[i   ] ^= t0 << 16;
+        M[i+ 8] ^= t1 << 16;
+        M[i+ 4] ^= t0;
+        M[i+12] ^= t1;
+    }
+    
+    for (size_t i = 0; i < 8; i++) // round 4: swap the high half of M[i] with the low half of M[i+8]
+    {
+        uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half; 
+        M[i  ] ^= t << 32;
+        M[i+8] ^= t;
+    }
+}
+
+#define MAYO_M_OVER_8 ((M_MAX + 7) / 8)
+// Builds the linear system matrix A_out (m rows, A_cols bytes per row, one element per byte) from the k packed blocks of _VtL: shifted accumulation, nibble transposition, reduction with f_tail, then unpacking.
+static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+    
+    const uint64_t *VtL = _VtL;
+    int bits_to_shift = 0;  // shift inside a word, always a multiple of 4 (one element)
+    int words_to_shift = 0; // whole-word part of the current shift
+    uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0}; // scratch matrix; 16 elements per word
+    size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; // o*k columns rounded up to a multiple of 16
+    const uint64_t *Mi, *Mj;
+
+    for (int i = 0; i <= PARAM_k(p) - 1; ++i) {
+        for (int j = PARAM_k(p) - 1; j >= i; --j) {
+            // add the M_i and M_j to A, shifted "down" by l positions
+            Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p); // block j: o columns of m/16 words each
+            for (int c = 0; c < PARAM_o(p); c++) {
+                for (int k = 0; k < PARAM_m(p)/16; k++)
+                {
+                    A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift;
+                    if(bits_to_shift > 0){ // the bits shifted out spill into the next word row; also avoids a shift by 64
+                        A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                    }
+                }
+            }
+
+            if (i != j) { // off-diagonal pair: also add block i into the columns of block j
+                Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p);
+                for (int c = 0; c < PARAM_o(p); c++) {
+                    for (int k = 0; k < PARAM_m(p)/16; k++)
+                    {
+                        A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift;
+                        if(bits_to_shift > 0){
+                            A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                        }
+                    }
+                }
+            }
+
+            bits_to_shift += 4; // the next (i, j) pair is shifted by one more element
+            if(bits_to_shift == 64){
+                words_to_shift ++;
+                bits_to_shift = 0;
+            }
+        }
+    }
+
+    for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16) // transpose every 16 x 16 nibble tile that can hold data
+    {
+        transpose_16x16_nibbles(A + c);
+    }
+
+    unsigned char tab[F_TAIL_LEN*4] = {0}; // tab[4*i + b] = f_tail[i] * 2^b, one entry per bit of a nibble
+    for (size_t i = 0; i < F_TAIL_LEN; i++)
+    {
+        tab[4*i]   = mul_f(PARAM_f_tail(p)[i],1);
+        tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2);
+        tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4);
+        tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8);
+    }
+
+    uint64_t low_bit_in_nibble = 0x1111111111111111;
+    
+    for (size_t c = 0; c < A_width; c+= 16)
+    {
+        for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++) // fold every row at or beyond m back into rows below m
+        {
+            size_t pos = (r/16)*A_width + c + (r%16); // word holding row r, columns c .. c+15
+            uint64_t t0 =  A[pos]       & low_bit_in_nibble; // bit 0 of every nibble
+            uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble; // bit 1
+            uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble; // bit 2
+            uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble; // bit 3
+            for (size_t t = 0; t < F_TAIL_LEN; t++)
+            {
+                A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3]; // the (r+t)%16 index relies on m being a multiple of 16
+            }
+        }
+    }
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) 
+        A[i] = BSWAP64(A[i]);
+#endif
+
+    for (int r = 0; r < PARAM_m(p); r+=16)
+    {
+        for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16)
+        {
+            for (size_t i = 0; i < 16; i++)
+            {
+                decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c)); // unpack up to 16 elements of row r+i; the last column of A_out is not written here
+            }   
+        }
+    }
+}
+
+// Public API
+
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) {
+    // Key generation entry point. No extra work is done here: the compact
+    // key generator fills both buffers, and its status code is handed back
+    // to the caller unchanged on success and on failure alike.
+    const int status = mayo_keypair_compact(p, pk, sk);
+
+    if (status != MAYO_OK) {
+        return status;
+    }
+    return MAYO_OK;
+}
+// Signs message m with compact secret key csk: writes the encoded solution followed by the salt to sig, sets *siglen, wipes secret scratch buffers, and returns MAYO_OK or MAYO_ERR.
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    int ret = MAYO_OK;
+    unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data
+    unsigned char y[M_MAX];                    // secret data
+    unsigned char salt[SALT_BYTES_MAX];        // not secret data
+    unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX],
+             Vdec[N_MINUS_O_MAX * K_MAX];                 // secret data
+    unsigned char A[M_MAX * (K_MAX * O_MAX + 1)];   // secret data
+    unsigned char x[K_MAX * N_MAX];                       // not secret data
+    unsigned char r[K_MAX * O_MAX + 1] = { 0 };           // secret data
+    unsigned char s[K_MAX * N_MAX];                       // not secret data
+    const unsigned char *seed_sk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data
+    alignas(32) sk_t sk;                    // secret data
+    unsigned char Ox[N_MINUS_O_MAX];        // secret data
+    // unsigned char Mdigest[DIGEST_BYTES];
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1]; // digest || salt || seed_sk || counter byte
+    unsigned char *ctrbyte;
+    unsigned char *vi;
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_v_bytes = PARAM_v_bytes(p);
+    const int param_r_bytes = PARAM_r_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_A_cols = PARAM_A_cols(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    ret = mayo_expand_sk(p, csk, &sk);
+    if (ret != MAYO_OK) {
+        goto err;
+    }
+
+    seed_sk = csk; // the compact secret key is the seed itself
+    decode(sk.o, O, (param_n - param_o) * param_o);
+
+    // hash message
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    uint64_t *P1 = sk.p;
+    uint64_t *L  = P1 + (param_P1_bytes/8); // L follows P1 inside the expanded secret key
+    alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0};
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        L[i] = BSWAP64(L[i]);
+    }
+#endif
+
+    // choose the randomizer
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(tmp + param_digest_bytes, param_salt_bytes);
+    #else
+    if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // hashing to salt
+    memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk,
+           param_sk_seed_bytes);
+    shake256(salt, param_salt_bytes, tmp,
+             param_digest_bytes + param_salt_bytes + param_sk_seed_bytes); // salt = H(digest || randomizer || seed_sk)
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret
+#endif
+
+    // hashing to t
+    memcpy(tmp + param_digest_bytes, salt, param_salt_bytes); // the randomizer in tmp is replaced by the salt
+    ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes; // last byte of the hash input below
+
+    shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); // t depends only on digest || salt
+
+    decode(tenc, t, param_m); // may not be necessary
+
+    for (int ctr = 0; ctr <= 255; ++ctr) { // NOTE(review): if no ctr succeeds, execution falls through with ret == MAYO_OK; confirm intended
+        *ctrbyte = (unsigned char)ctr;
+
+        shake256(V, param_k * param_v_bytes + param_r_bytes, tmp,
+                 param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1);
+
+        // decode the v_i vectors
+        for (int i = 0; i <= param_k - 1; ++i) {
+            decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o),
+                   param_n - param_o);
+        }
+
+        // compute all the V * L matrices.
+        // compute all the V * P1 * V^T matrices.
+        alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0};
+        V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y);
+
+        compute_rhs(p, Y, t, y);
+        compute_A(p, Mtmp, A);
+
+        decode(V + param_k * param_v_bytes, r,
+               param_k *
+               param_o);
+        if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) { // nonzero return ends the search
+            break;
+        } else {
+            memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t)); // reset the accumulator before the next attempt
+        }
+    }
+
+    // s needs no prior zeroing provided mat_add fills the first n - o entries of each row (presumably; it is defined elsewhere); memcpy fills the remaining o
+    // TODO: optimize this?
+    for (int i = 0; i <= param_k - 1; ++i) {
+        vi = Vdec + i * (param_n - param_o);
+        mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1);
+        mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1);
+        memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o);
+    }
+    encode(s, sig, param_n * param_k);
+    memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes); // the salt occupies the tail of the signature
+    *siglen = param_sig_bytes;
+err:
+    mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX); // NOTE(review): y, Y and Mtmp are not wiped here; confirm whether they should be
+    mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX);
+    mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1));
+    mayo_secure_clear(r, K_MAX * O_MAX + 1);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(&sk, sizeof(sk_t));
+    mayo_secure_clear(Ox, N_MINUS_O_MAX);
+    mayo_secure_clear(tmp,
+                      DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1);
+    return ret;
+}
+// Produces the signed message sm = signature || m; *smlen is written only when a full-length signature was produced.
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    const int sig_bytes_int = PARAM_sig_bytes(p);
+    const size_t sig_bytes = (size_t) sig_bytes_int;
+    size_t produced = sig_bytes;
+    const int status = mayo_sign_signature(p, sm, &produced, m, mlen, csk);
+    const int complete = (status == MAYO_OK) && (produced == sig_bytes);
+    if (complete) {
+        memmove(sm + sig_bytes, m, mlen); // append the message; memmove tolerates m overlapping sm
+        *smlen = produced + mlen;
+    }
+    // the signing status is returned unchanged, including when the length check failed
+    return status;
+}
+// Verifies a signed message sm = signature || message; on success copies the message to m and stores its length in *mlen.
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk) {
+    const int sig_bytes_int = PARAM_sig_bytes(p);
+    const size_t sig_bytes = (size_t) sig_bytes_int;
+    if (smlen < sig_bytes) {
+        return MAYO_ERR; // too short to contain a signature at all
+    }
+    const unsigned char *body = sm + sig_bytes; // the message starts right after the signature
+    const size_t body_len = smlen - sig_bytes;
+    const int verdict = mayo_verify(p, body, body_len, sm, pk);
+    if (verdict == MAYO_OK) {
+        *mlen = body_len;
+        memmove(m, body, body_len); // memmove tolerates m overlapping sm
+    }
+    return verdict;
+}
+// Generates a compact key pair: csk receives a fresh random seed, cpk receives seed_pk || Upper(P3); every stack buffer that held secret-derived data is wiped before returning.
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk) {
+    int ret = MAYO_OK;
+    unsigned char *seed_sk = csk; // the compact secret key is just the seed
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; // seed_pk || encoded O; the O part is secret
+    alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8]; // P1 followed by the writable P1*O + P2 buffer
+    alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0};
+
+    unsigned char *seed_pk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    // seed_sk $←- B^(sk_seed bytes)
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(seed_sk, param_sk_seed_bytes);
+    #else
+    if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // S ← shake256(seedsk, pk seed bytes + O bytes)
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    // seed_pk ← s[0 : pk_seed_bytes]
+    seed_pk = S;
+
+    // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes])
+    decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    int m_legs = param_m / 32;
+
+    uint64_t *P1 = P;
+    uint64_t *P1O_P2 = P + (param_P1_bytes / 8); // starts as P2; handed to the call below as a writable buffer
+
+    // compute P3 = O^t * (P1*O + P2)
+    Ot_times_P1O_P2(p, P1, O, P1O_P2, P3);
+
+    // store seed_pk in cpk
+    memcpy(cpk, seed_pk, param_pk_seed_bytes);
+
+    alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8];
+
+    // compute Upper(P3) and store in cpk
+    m_upper(m_legs, P3, P3_upper, param_o);
+    
+    memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes);
+
+#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL)
+err:
+#endif
+    mayo_secure_clear(S, sizeof(S)); // S carries the encoded secret O; mayo_expand_sk wipes its copy the same way
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(P, sizeof(P)); // covers the P1*O + P2 buffer, which depends on the secret O
+    mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2);
+    return ret;
+}
+// Expands a compact public key: P1 || P2 is regenerated from the seed with PK_PRF, and P3 is copied from cpk right after it.
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *pk) {
+    #ifdef MAYO_VARIANT
+    (void)p;
+    #endif
+    const int seed_len = PARAM_pk_seed_bytes(p);
+    const int p1_p2_len = (PARAM_P1_bytes(p)) + (PARAM_P2_bytes(p));
+    const int p3_len = PARAM_P3_bytes(p);
+    unsigned char *p3_dst = pk + p1_p2_len;        // P3 follows P1 || P2 in the expanded key
+    const unsigned char *p3_src = cpk + seed_len;  // P3 follows the seed in the compact key
+    PK_PRF(pk, p1_p2_len, cpk, seed_len);
+    memmove(p3_dst, p3_src, p3_len);
+    return MAYO_OK;
+}
+// Expands the compact secret key csk into sk: sk->p receives P1 followed by L, sk->o receives the encoded O; the local copies of S and O are wiped before returning.
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *sk) {
+    int ret = MAYO_OK; // never changed below, so this function always returns MAYO_OK
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; // seed_pk || encoded O
+    uint64_t *P = sk->p; // P1 and P2 are generated directly inside the output structure
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_v(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    const unsigned char *seed_sk = csk; // the compact secret key is the seed itself
+    unsigned char *seed_pk = S; // seed_pk is the first pk_seed_bytes bytes of S
+
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    decode(S + param_pk_seed_bytes, O,
+           param_v * param_o); // O = S + PK_SEED_BYTES;
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    uint64_t *P2 = P + (param_P1_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { // swap to native word order for the arithmetic below
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+    
+    uint64_t *P1 = P;
+    // compute L_i = (P1 + P1^t)*O + P2
+    uint64_t *L = P2; // L aliases P2, so the result is built in the P2 area
+    P1P1t_times_O(p, P1, O, L);
+
+    // write to sk
+    memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes); // sk->o keeps O in its encoded form
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { // swap back so sk->p is stored in the same byte order as before
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+
+    mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    return ret;
+}
+// Verifies signature sig on message m under compact public key cpk; returns MAYO_OK when the value recomputed from the signature equals the hashed target t, MAYO_ERR otherwise.
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *cpk) {
+    unsigned char tEnc[M_BYTES_MAX];
+    unsigned char t[M_MAX];
+    unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X)
+    unsigned char s[K_MAX * N_MAX];
+    alignas (64) uint64_t pk[EPK_BYTES_MAX / 8]; // expanded public key: P1 || P2 || P3
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX]; // digest || salt
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk);
+    if (ret != MAYO_OK) {
+        return MAYO_ERR;
+    }
+
+    uint64_t *P1 = pk;
+    uint64_t *P2 = pk + (param_P1_bytes / 8);
+    uint64_t *P3 = P2 + (param_P2_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        P2[i] = BSWAP64(P2[i]);
+    }
+    for (int i = 0; i < param_P3_bytes / 8; ++i) {
+        P3[i] = BSWAP64(P3[i]);
+    }
+#endif
+
+    // hash m
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    // compute t
+    memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes,
+           param_salt_bytes); // the salt is the last salt_bytes bytes of the signature
+    shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+    decode(tEnc, t, param_m);
+
+    // decode s
+    decode(sig, s, param_k * param_n);
+
+    // Compute S*P*S^T
+    alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+    m_calculate_PS_SPS(P1, P2, P3, s, param_m,
+                             param_v, param_o, param_k, SPS);
+
+    // combine the vectors in SPS and reduce mod f(X)
+    compute_rhs(p, SPS, y, y); // y is all zero on entry and passed as both t and y, so it receives just the reduced combination
+    
+    if (memcmp(y, t, param_m) == 0) {
+        return MAYO_OK; // good signature
+    }
+    return MAYO_ERR; // bad signature
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/mayo.h b/src/sig/mayo/pqmayo_mayo_5_opt/mayo.h
new file mode 100644
index 0000000000..1de4bf2a33
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/mayo.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MAYO_H
+#define MAYO_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define F_TAIL_LEN 5 // entries per tail below: GF(16) coefficients of z^0..z^4, lowest degree first (8 = x^3, 4 = x^2, 2 = x)
+#define F_TAIL_64                                                              \
+  { 8, 0, 2, 8, 0 } // f(z) =  z^64         + x^3*z^3 + x*z^2         + x^3
+#define F_TAIL_96                                                              \
+  { 2, 2, 0, 2, 0 } // f(z) =  z^96         +   x*z^3         + x*z   + x
+#define F_TAIL_128                                                             \
+  { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3         + x^3*z + x^2
+
+#define MAYO_1_name "MAYO_1"
+#define MAYO_1_n 66
+#define MAYO_1_m 64
+#define MAYO_1_o 8
+#define MAYO_1_v (MAYO_1_n - MAYO_1_o) // n - o = 58
+#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1) // k*o + 1 = 73; MAYO_1_k is defined below, which works because macros expand at the point of use
+#define MAYO_1_k 9
+#define MAYO_1_q 16 // field size: elements are 4-bit GF(16) values
+#define MAYO_1_m_bytes 32 // = m / 2 (two 4-bit elements per byte)
+#define MAYO_1_O_bytes 232 // = v * o / 2
+#define MAYO_1_v_bytes 29 // = v / 2
+#define MAYO_1_r_bytes 36 // = k * o / 2
+#define MAYO_1_P1_bytes 54752 // = m * (v * (v + 1) / 2) / 2
+#define MAYO_1_P2_bytes 14848 // = m * v * o / 2
+#define MAYO_1_P3_bytes 1152 // = m * (o * (o + 1) / 2) / 2
+#define MAYO_1_csk_bytes 24 // same value as sk_seed_bytes
+#define MAYO_1_esk_bytes 69856 // = sk_seed_bytes + O_bytes + P1_bytes + P2_bytes
+#define MAYO_1_cpk_bytes 1168 // = pk_seed_bytes + P3_bytes
+#define MAYO_1_epk_bytes 70752 // = P1_bytes + P2_bytes + P3_bytes
+#define MAYO_1_sig_bytes 321 // = n * k / 2 + salt_bytes
+#define MAYO_1_f_tail F_TAIL_64 // brace initialiser, used when the variant is fixed at compile time
+#define MAYO_1_f_tail_arr f_tail_64 // array name referenced by MAYO_GEN_PARAMS in params.c
+#define MAYO_1_salt_bytes 24
+#define MAYO_1_digest_bytes 32
+#define MAYO_1_pk_seed_bytes 16
+#define MAYO_1_sk_seed_bytes 24
+
+#define MAYO_2_name "MAYO_2"
+#define MAYO_2_n 78
+#define MAYO_2_m 64
+#define MAYO_2_o 18
+#define MAYO_2_v (MAYO_2_n - MAYO_2_o) // n - o = 60
+#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1) // k*o + 1 = 73; MAYO_2_k is defined below, which works because macros expand at the point of use
+#define MAYO_2_k 4
+#define MAYO_2_q 16 // field size: elements are 4-bit GF(16) values
+#define MAYO_2_m_bytes 32 // = m / 2 (two 4-bit elements per byte)
+#define MAYO_2_O_bytes 540 // = v * o / 2
+#define MAYO_2_v_bytes 30 // = v / 2
+#define MAYO_2_r_bytes 36 // = k * o / 2
+#define MAYO_2_P1_bytes 58560 // = m * (v * (v + 1) / 2) / 2
+#define MAYO_2_P2_bytes 34560 // = m * v * o / 2
+#define MAYO_2_P3_bytes 5472 // = m * (o * (o + 1) / 2) / 2
+#define MAYO_2_csk_bytes 24 // same value as sk_seed_bytes
+#define MAYO_2_esk_bytes 93684 // = sk_seed_bytes + O_bytes + P1_bytes + P2_bytes
+#define MAYO_2_cpk_bytes 5488 // = pk_seed_bytes + P3_bytes
+#define MAYO_2_epk_bytes 98592 // = P1_bytes + P2_bytes + P3_bytes
+#define MAYO_2_sig_bytes 180 // = n * k / 2 + salt_bytes
+#define MAYO_2_f_tail F_TAIL_64 // brace initialiser, used when the variant is fixed at compile time
+#define MAYO_2_f_tail_arr f_tail_64 // array name referenced by MAYO_GEN_PARAMS in params.c
+#define MAYO_2_salt_bytes 24
+#define MAYO_2_digest_bytes 32
+#define MAYO_2_pk_seed_bytes 16
+#define MAYO_2_sk_seed_bytes 24
+
+#define MAYO_3_name "MAYO_3"
+#define MAYO_3_n 99
+#define MAYO_3_m 96
+#define MAYO_3_o 10
+#define MAYO_3_v (MAYO_3_n - MAYO_3_o) // n - o = 89
+#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1) // k*o + 1 = 111; MAYO_3_k is defined below, which works because macros expand at the point of use
+#define MAYO_3_k 11
+#define MAYO_3_q 16 // field size: elements are 4-bit GF(16) values
+#define MAYO_3_m_bytes 48 // = m / 2 (two 4-bit elements per byte)
+#define MAYO_3_O_bytes 445 // = v * o / 2
+#define MAYO_3_v_bytes 45 // = v / 2 rounded up (v is odd)
+#define MAYO_3_r_bytes 55 // = k * o / 2
+#define MAYO_3_P1_bytes 192240 // = m * (v * (v + 1) / 2) / 2
+#define MAYO_3_P2_bytes 42720 // = m * v * o / 2
+#define MAYO_3_P3_bytes 2640 // = m * (o * (o + 1) / 2) / 2
+#define MAYO_3_csk_bytes 32 // same value as sk_seed_bytes
+#define MAYO_3_esk_bytes 235437 // = sk_seed_bytes + O_bytes + P1_bytes + P2_bytes
+#define MAYO_3_cpk_bytes 2656 // = pk_seed_bytes + P3_bytes
+#define MAYO_3_epk_bytes 237600 // = P1_bytes + P2_bytes + P3_bytes
+#define MAYO_3_sig_bytes 577 // = n * k / 2 rounded up (545) + salt_bytes
+#define MAYO_3_f_tail F_TAIL_96 // brace initialiser, used when the variant is fixed at compile time
+#define MAYO_3_f_tail_arr f_tail_96 // array name referenced by MAYO_GEN_PARAMS in params.c
+#define MAYO_3_salt_bytes 32
+#define MAYO_3_digest_bytes 48
+#define MAYO_3_pk_seed_bytes 16
+#define MAYO_3_sk_seed_bytes 32
+
+#define MAYO_5_name "MAYO_5"
+#define MAYO_5_n 133
+#define MAYO_5_m 128
+#define MAYO_5_o 12
+#define MAYO_5_v (MAYO_5_n - MAYO_5_o) // n - o = 121
+#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1) // k*o + 1 = 145; MAYO_5_k is defined below, which works because macros expand at the point of use
+#define MAYO_5_k 12
+#define MAYO_5_q 16 // field size: elements are 4-bit GF(16) values
+#define MAYO_5_m_bytes 64 // = m / 2 (two 4-bit elements per byte)
+#define MAYO_5_O_bytes 726 // = v * o / 2
+#define MAYO_5_v_bytes 61 // = v / 2 rounded up (v is odd)
+#define MAYO_5_r_bytes 72 // = k * o / 2
+#define MAYO_5_P1_bytes 472384 // = m * (v * (v + 1) / 2) / 2
+#define MAYO_5_P2_bytes 92928 // = m * v * o / 2
+#define MAYO_5_P3_bytes 4992 // = m * (o * (o + 1) / 2) / 2
+#define MAYO_5_csk_bytes 40 // same value as sk_seed_bytes
+#define MAYO_5_esk_bytes 566078 // = sk_seed_bytes + O_bytes + P1_bytes + P2_bytes
+#define MAYO_5_cpk_bytes 5008 // = pk_seed_bytes + P3_bytes
+#define MAYO_5_epk_bytes 570304 // = P1_bytes + P2_bytes + P3_bytes
+#define MAYO_5_sig_bytes 838 // = n * k / 2 + salt_bytes
+#define MAYO_5_f_tail F_TAIL_128 // brace initialiser, used when the variant is fixed at compile time
+#define MAYO_5_f_tail_arr f_tail_128 // array name referenced by MAYO_GEN_PARAMS in params.c
+#define MAYO_5_salt_bytes 40
+#define MAYO_5_digest_bytes 64
+#define MAYO_5_pk_seed_bytes 16
+#define MAYO_5_sk_seed_bytes 40
+
+#define PARAM_JOIN2_(a, b) a##_##b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b) // extra level so macro arguments are expanded before they are pasted
+#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end) // e.g. MAYO_VARIANT = MAYO_5, end = m  ->  MAYO_5_m
+
+#if defined(MAYO_VARIANT)
+#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s) // e.g. pqmayo_MAYO_5_opt_<s>
+
+#if defined(MAYO_BUILD_TYPE_REF) // exactly one MAYO_BUILD_TYPE_* selects the symbol prefix
+#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(MAYO_BUILD_TYPE_OPT)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(MAYO_BUILD_TYPE_AVX2)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s)
+#else
+#error "Build type not known"
+#endif
+
+#else // MAYO_VARIANT not defined: symbols keep their plain names
+#define MAYO_NAMESPACE(s) s
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC // runtime-selected parameter set: each bound is the largest value over MAYO_1/2/3/5
+#define NAME_MAX mayo5 // NOTE(review): NAME_MAX is also a POSIX <limits.h> macro — confirm no translation unit sees both
+#define N_MAX 133
+#define M_MAX 128
+#define O_MAX 18 // from MAYO_2; every other bound except P3/CPK comes from MAYO_5
+#define V_MAX 121
+#define K_MAX 12
+#define Q_MAX 16
+#define PK_SEED_BYTES_MAX 16
+#define SK_SEED_BYTES_MAX 40
+#define SALT_BYTES_MAX 40
+#define DIGEST_BYTES_MAX 64
+#define O_BYTES_MAX 726
+#define V_BYTES_MAX 61
+#define R_BYTES_MAX 72
+#define P1_BYTES_MAX 472384
+#define P2_BYTES_MAX 92928
+#define P3_BYTES_MAX 5472 // from MAYO_2
+#define SIG_BYTES_MAX 838
+#define CPK_BYTES_MAX 5488 // from MAYO_2
+#define CSK_BYTES_MAX 40
+#define ESK_BYTES_MAX 566078
+#define EPK_BYTES_MAX 570304
+#define N_MINUS_O_MAX 121
+#define M_BYTES_MAX 64
+#elif defined(MAYO_VARIANT) // single parameter set fixed at compile time: bounds are that set's exact values
+#define M_MAX PARAM_NAME(m)
+#define N_MAX PARAM_NAME(n)
+#define O_MAX PARAM_NAME(o)
+#define V_MAX PARAM_NAME(v)
+#define K_MAX PARAM_NAME(k)
+#define Q_MAX PARAM_NAME(q)
+#define M_BYTES_MAX PARAM_NAME(m_bytes)
+#define O_BYTES_MAX PARAM_NAME(O_bytes)
+#define V_BYTES_MAX PARAM_NAME(v_bytes)
+#define R_BYTES_MAX PARAM_NAME(r_bytes)
+#define P1_BYTES_MAX PARAM_NAME(P1_bytes)
+#define P2_BYTES_MAX PARAM_NAME(P2_bytes)
+#define P3_BYTES_MAX PARAM_NAME(P3_bytes)
+#define SIG_BYTES_MAX PARAM_NAME(sig_bytes)
+#define CSK_BYTES_MAX PARAM_NAME(csk_bytes)
+#define ESK_BYTES_MAX PARAM_NAME(esk_bytes)
+#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes)
+#define EPK_BYTES_MAX PARAM_NAME(epk_bytes)
+#define N_MINUS_O_MAX (N_MAX - O_MAX)
+#define SALT_BYTES_MAX PARAM_NAME(salt_bytes)
+#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes)
+#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes)
+#define SK_SEED_BYTES_MAX PARAM_NAME(sk_seed_bytes) // was aliased to SALT_BYTES_MAX; equal for all four sets, but this tracks the right parameter
+#else
+#error "Parameter not specified"
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC // accessors read the mayo_params_t that p points to; p is parenthesised so any pointer expression is safe
+#define PARAM_name(p) ((p)->name)
+#define PARAM_m(p) ((p)->m)
+#define PARAM_n(p) ((p)->n)
+#define PARAM_o(p) ((p)->o)
+#define PARAM_v(p) ((p)->n - (p)->o)
+#define PARAM_A_cols(p) ((p)->k * (p)->o + 1)
+#define PARAM_k(p) ((p)->k)
+#define PARAM_q(p) ((p)->q)
+#define PARAM_m_bytes(p) ((p)->m_bytes)
+#define PARAM_O_bytes(p) ((p)->O_bytes)
+#define PARAM_v_bytes(p) ((p)->v_bytes)
+#define PARAM_r_bytes(p) ((p)->r_bytes)
+#define PARAM_P1_bytes(p) ((p)->P1_bytes)
+#define PARAM_P2_bytes(p) ((p)->P2_bytes)
+#define PARAM_P3_bytes(p) ((p)->P3_bytes)
+#define PARAM_csk_bytes(p) ((p)->csk_bytes)
+#define PARAM_esk_bytes(p) ((p)->esk_bytes)
+#define PARAM_cpk_bytes(p) ((p)->cpk_bytes)
+#define PARAM_epk_bytes(p) ((p)->epk_bytes)
+#define PARAM_sig_bytes(p) ((p)->sig_bytes)
+#define PARAM_f_tail(p) ((p)->f_tail)
+#define PARAM_salt_bytes(p) ((p)->salt_bytes)
+#define PARAM_sk_seed_bytes(p) ((p)->sk_seed_bytes)
+#define PARAM_digest_bytes(p) ((p)->digest_bytes)
+#define PARAM_pk_seed_bytes(p) ((p)->pk_seed_bytes)
+#elif defined(MAYO_VARIANT) // accessors ignore p and expand to the compile-time constants of MAYO_VARIANT
+#define PARAM_name(p) PARAM_NAME(name)
+#define PARAM_m(p) PARAM_NAME(m)
+#define PARAM_n(p) PARAM_NAME(n)
+#define PARAM_o(p) PARAM_NAME(o)
+#define PARAM_v(p) PARAM_NAME(v)
+#define PARAM_A_cols(p) PARAM_NAME(A_cols)
+#define PARAM_k(p) PARAM_NAME(k)
+#define PARAM_q(p) PARAM_NAME(q)
+#define PARAM_m_bytes(p) PARAM_NAME(m_bytes)
+#define PARAM_O_bytes(p) PARAM_NAME(O_bytes)
+#define PARAM_v_bytes(p) PARAM_NAME(v_bytes)
+#define PARAM_r_bytes(p) PARAM_NAME(r_bytes)
+#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes)
+#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes)
+#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes)
+#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes)
+#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes)
+#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes)
+#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes)
+#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes)
+static const unsigned char f_tail[] = PARAM_NAME(f_tail); // per-translation-unit copy of the variant's F_TAIL_* initialiser
+#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes)
+#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes)
+#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes)
+#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes)
+#define PARAM_f_tail(p) f_tail
+#else
+#error "Parameter not specified"
+#endif
+
+/**
+ * Struct defining MAYO parameters; each field mirrors the MAYO_<set>_<field> macro of the same name and is read through PARAM_<field>(p) when ENABLE_PARAMS_DYNAMIC is defined.
+ */
+typedef struct {
+    int m;
+    int n;
+    int o;
+    int k;
+    int q; // field size (16 in every parameter set defined in this header)
+    const unsigned char *f_tail; // points at one of the F_TAIL_* coefficient arrays (F_TAIL_LEN entries)
+    int m_bytes;
+    int O_bytes;
+    int v_bytes;
+    int r_bytes;
+    int R_bytes; // NOTE(review): has no MAYO_<set>_R_bytes macro and is not set by MAYO_GEN_PARAMS in params.c — confirm whether it is still used
+    int P1_bytes;
+    int P2_bytes;
+    int P3_bytes;
+    int csk_bytes; // compact secret key size
+    int esk_bytes; // expanded secret key size
+    int cpk_bytes; // compact public key size
+    int epk_bytes; // expanded public key size
+    int sig_bytes;
+    int salt_bytes;
+    int sk_seed_bytes;
+    int digest_bytes;
+    int pk_seed_bytes;
+    const char *name; // e.g. "MAYO_5"
+} mayo_params_t;
+
+typedef struct sk_t { // output type of mayo_expand_sk()
+    uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; // P1_BYTES_MAX + P2_BYTES_MAX bytes, stored as 64-bit words
+    uint8_t o[O_BYTES_MAX]; // O_BYTES_MAX bytes — presumably the packed matrix O; confirm against mayo.c
+} sk_t;
+
+/**
+ * MAYO parameter sets, declared only for runtime selection (ENABLE_PARAMS_DYNAMIC); params.c defines them with MAYO_GEN_PARAMS.
+ */
+#ifdef ENABLE_PARAMS_DYNAMIC
+extern const mayo_params_t MAYO_1;
+extern const mayo_params_t MAYO_2;
+extern const mayo_params_t MAYO_3;
+extern const mayo_params_t MAYO_5;
+#endif
+
+/**
+ * Status codes returned by the mayo_* functions declared below.
+ */
+#define MAYO_OK 0 // success (mayo_verify: signature accepted)
+#define MAYO_ERR 1 // failure (mayo_verify: signature rejected)
+
+/**
+ * Mayo keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold pk and sk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] pk Mayo public key
+ * @param[out] sk Mayo secret key
+ * @return int status code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_keypair MAYO_NAMESPACE(mayo_keypair) // the linker symbol is prefixed per variant and build type
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);
+
+#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature) // NOTE(review): undocumented; by its parameter names it writes a detached signature to sig and its length to *siglen — confirm against mayo.c
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk);
+
+/**
+ * MAYO signature generation.
+ *
+ * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
+ * The key provided is a compact secret key.
+ * The caller is responsible for allocating sufficient memory to hold sm.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_sign MAYO_NAMESPACE(mayo_sign)
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+/**
+ * Mayo open signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeds, the original message is stored in m.
+ * The key provided is a compact public key.
+ * The caller is responsible for allocating sufficient memory to hold m.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_open MAYO_NAMESPACE(mayo_open)
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk);
+
+/**
+ * Mayo compact keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold cpk and csk.
+ *
+ * Outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
+ * cpk are compact representations of a Mayo secret key and public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] cpk Mayo compacted public key
+ * @param[out] csk Mayo compacted secret key
+ * @return int status code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk);
+
+/**
+ * Mayo expand public key.
+ *
+ * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold epk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] cpk Compacted public key.
+ * @param[out] epk Expanded public key.
+ * @return int return code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *epk);
+
+/**
+ * Mayo expand secret key.
+ *
+ * The implementation corresponds to Mayo.expandSK() in the Mayo spec.
+ * The caller is responsible for allocating the sk_t that esk points to.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] csk Compacted secret key.
+ * @param[out] esk Expanded secret key.
+ * @return int return code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk)
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *esk);
+
+/**
+ * Mayo verify signature.
+ *
+ * The implementation performs Mayo.verify(). Returns 0 (MAYO_OK) if the signature verifies, otherwise 1 (MAYO_ERR).
+ * The key provided is a compact public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] m Message whose signature is checked (read only)
+ * @param[in] mlen Length of m
+ * @param[in] sig Signature
+ * @param[in] pk Compacted public key
+ * @return int 0 if verification succeeded, 1 otherwise.
+ */
+#define mayo_verify MAYO_NAMESPACE(mayo_verify)
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *pk);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/mem.h b/src/sig/mayo/pqmayo_mayo_5_opt/mem.h
new file mode 100644
index 0000000000..2aa2196e2e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/mem.h
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MEM_H
+#define MEM_H
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) || defined(__clang__) // use the compiler's byte-swap builtins when they exist
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else // portable shift-and-mask fallback; both macros evaluate i more than once, so pass side-effect-free expressions
+#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32)) // byte-swaps each 32-bit half and exchanges the halves
+#endif
+
+// a > b -> b - a is negative (branch-free; the difference is formed in 64 bits so it cannot overflow for 32-bit int)
+// returns 0xFFFFFFFF if true, 0x00000000 if false
+static inline uint32_t ct_is_greater_than(int a, int b) {
+    int64_t diff = ((int64_t) b) - ((int64_t) a);
+    return (uint32_t) (diff >> (8*sizeof(int64_t)-1)); // sign bit smeared over the word, then truncated to 32 bits
+}
+
+// a > b -> b - a is negative (computed in int64_t, so the subtraction cannot overflow for 32-bit int)
+// returns 0xFFFFFFFFFFFFFFFF if true, 0x0000000000000000 if false
+static inline uint64_t ct_64_is_greater_than(int a, int b) {
+    int64_t diff = ((int64_t) b) - ((int64_t) a);
+    return (uint64_t) (diff >> (8*sizeof(uint64_t)-1)); // relies on >> of a negative value being an arithmetic shift
+}
+
+static inline uint32_t ct_compare_32(int a, int b) { // if a == b -> 0x00000000, else 0xFFFFFFFF (branch-free)
+    uint32_t x = (uint32_t)a ^ (uint32_t)b; // difference pattern: zero iff a == b; unsigned, so operands of opposite sign are handled too
+    return (uint32_t)(0u - ((x | (0u - x)) >> (8*sizeof(uint32_t)-1))); // x | -x has its top bit set iff x != 0
+}
+
+static inline uint64_t ct_compare_64(int a, int b) { // if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF (branch-free)
+    uint64_t x = (uint64_t)((uint32_t)a ^ (uint32_t)b); // zero-extended difference pattern: zero iff a == b, also for operands of opposite sign
+    return (uint64_t)0 - ((x | ((uint64_t)0 - x)) >> (8*sizeof(uint64_t)-1)); // x | -x has its top bit set iff x != 0
+}
+
+// if a == b -> 0x00, else 0xFF (branch-free: a ^ b lies in 0..255, so its negation is negative exactly when a != b)
+static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+    return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); // relies on >> of a negative value being an arithmetic shift
+}
+
+#include <oqs/common.h>
+/**
+ * Clears and frees allocated memory by forwarding to OQS_MEM_secure_free().
+ *
+ * @param[out] mem Memory to be cleared and freed.
+ * @param size Size of memory to be cleared and freed.
+ */
+static inline void mayo_secure_free(void *mem, size_t size) {
+    OQS_MEM_secure_free(mem, size);
+}
+
+/**
+ * Clears memory by forwarding to OQS_MEM_cleanse(); the buffer is not freed.
+ *
+ * @param[out] mem Memory to be cleared.
+ * @param size Size of memory to be cleared.
+ */
+static inline void mayo_secure_clear(void *mem, size_t size) {
+    OQS_MEM_cleanse(mem, size);
+}
+
+#endif
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/params.c b/src/sig/mayo/pqmayo_mayo_5_opt/params.c
new file mode 100644
index 0000000000..043d4cedc1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/params.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC // the parameter-set objects exist only when the set is chosen at run time
+static const unsigned char f_tail_64[] = F_TAIL_64; // reached through MAYO_1_f_tail_arr and MAYO_2_f_tail_arr
+static const unsigned char f_tail_96[] = F_TAIL_96; // reached through MAYO_3_f_tail_arr
+static const unsigned char f_tail_128[] = F_TAIL_128; // reached through MAYO_5_f_tail_arr
+
+#define MAYO_GEN_PARAMS(nm) \
+  const mayo_params_t nm = { \
+    .m = PARAM_JOIN2(nm, m), \
+    .n = PARAM_JOIN2(nm, n), \
+    .o = PARAM_JOIN2(nm, o), \
+    .k = PARAM_JOIN2(nm, k), \
+    .q = PARAM_JOIN2(nm, q), \
+    .f_tail = PARAM_JOIN2(nm, f_tail_arr), \
+    .m_bytes = PARAM_JOIN2(nm, m_bytes), \
+    .O_bytes = PARAM_JOIN2(nm, O_bytes), \
+    .v_bytes = PARAM_JOIN2(nm, v_bytes), \
+    .r_bytes = PARAM_JOIN2(nm, r_bytes), \
+    .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \
+    .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \
+    .P3_bytes = PARAM_JOIN2(nm, P3_bytes), \
+    .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \
+    .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \
+    .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \
+    .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \
+    .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \
+    .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \
+    .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \
+    .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \
+    .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \
+    .name = #nm \
+  }
+
+MAYO_GEN_PARAMS(MAYO_1); // the macro no longer ends in ';', so this one terminates the definition instead of leaving a stray file-scope ';'
+MAYO_GEN_PARAMS(MAYO_2); // fields without an initialiser (R_bytes) are zero
+MAYO_GEN_PARAMS(MAYO_3);
+MAYO_GEN_PARAMS(MAYO_5);
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo_5_opt/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo_5_opt/simple_arithmetic.h
new file mode 100644
index 0000000000..ce7580f4c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo_5_opt/simple_arithmetic.h
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SIMPLE_ARITHMETIC_H
+#define SIMPLE_ARITHMETIC_H
+
+// GF(16) multiplication mod x^4 + x + 1; branch-free, and the upper four bits of a and b are not masked on entry
+static inline unsigned char mul_f(unsigned char a, unsigned char b) {
+    // carryless multiply
+    unsigned char p;
+    p  = (a & 1)*b; // bit i of a contributes b shifted left by i ...
+    p ^= (a & 2)*b; // ... and XOR accumulates the partial products without carries
+    p ^= (a & 4)*b;
+    p ^= (a & 8)*b;
+
+    // reduce mod x^4 + x + 1
+    unsigned char top_p = p & 0xf0; // the terms x^4 and above
+    unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f; // x^(4+j) = x^j + x^(j+1): fold the high terms back at shifts 4 and 3
+    return out;
+}
+
+static inline uint64_t mul_fx8(unsigned char a, uint64_t b) { // mul_f applied to eight elements at once, one per byte of b
+    // carryless multiply
+    uint64_t p;
+    p  = (a & 1)*b; // NOTE(review): assumes every byte of b is below 16 so the partial products stay inside their byte — confirm at call sites
+    p ^= (a & 2)*b;
+    p ^= (a & 4)*b;
+    p ^= (a & 8)*b;
+
+    // reduce mod x^4 + x + 1
+    uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0; // high nibble of every byte: the terms x^4 and above
+    uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f; // fold them back as x^j + x^(j+1), keep the low nibbles
+    return out;
+}
+
+// Addition in GF(16): the field has characteristic 2, so adding two elements is XOR-ing them
+static inline unsigned char add_f(unsigned char a, unsigned char b) {
+    return (unsigned char)(a ^ b);
+}
+
+// Subtraction in GF(16): the same XOR as addition, so simply reuse add_f
+static inline unsigned char sub_f(unsigned char a, unsigned char b) {
+    return add_f(a, b);
+}
+
+// GF(16) negation: the identity, because addition is XOR and a ^ a == 0
+static inline unsigned char neg_f(unsigned char a) {
+    return a;
+}
+
+static inline unsigned char inverse_f(unsigned char a) { // multiplicative inverse in GF(16), computed as a^14; maps 0 to 0
+    // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5,
+    // 10, 4, 3, 8}; return table[a & 15];
+
+    unsigned char a2 = mul_f(a, a); // a^2
+    unsigned char a4 = mul_f(a2, a2); // a^4
+    unsigned char a8 = mul_f(a4, a4); // a^8
+    unsigned char a6 = mul_f(a2, a4); // a^6
+    unsigned char a14 = mul_f(a8, a6); // a^14 = a^(-1), since a^15 = 1 for every nonzero a
+
+    return a14;
+}
+
+static inline unsigned char lincomb(const unsigned char *a,
+                                    const unsigned char *b, int n, int m) {
+    unsigned char acc = 0; // GF(16) dot product of a[0..n-1] with every m-th element of b
+    for (int idx = 0; idx < n; ++idx) {
+        acc = add_f(mul_f(a[idx], b[idx * m]), acc);
+    }
+    return acc;
+}
+
+static inline void mat_mul(const unsigned char *a, const unsigned char *b,
+                    unsigned char *c, int colrow_ab, int row_a, int col_b) {
+    for (int r = 0; r < row_a; ++r) { // c (row_a x col_b) = a (row_a x colrow_ab) * b (colrow_ab x col_b), all row-major
+        for (int col = 0; col < col_b; ++col) {
+            c[r * col_b + col] = lincomb(a + r * colrow_ab, b + col, colrow_ab, col_b);
+        }
+    }
+}
+
+static inline void mat_add(const unsigned char *a, const unsigned char *b,
+                    unsigned char *c, int m, int n) {
+    for (int row = 0; row < m; ++row) { // element-wise GF(16) sum of two m x n row-major matrices
+        for (int col = 0; col < n; ++col) {
+            c[row * n + col] = add_f(a[row * n + col], b[row * n + col]);
+        }
+    }
+}
+
+static inline void m_vec_copy(int m_legs, const uint64_t *in,
+                              uint64_t *out) {
+    const int words = m_legs * 2; // the vector occupies two 64-bit words per "leg"
+    for (int w = 0; w < words; ++w) {
+        out[w] = in[w];
+    }
+}
+
+static inline void m_vec_add(int m_legs, const uint64_t *in,
+                             uint64_t *acc) {
+    const int words = m_legs * 2; // the vector occupies two 64-bit words per "leg"
+    for (int w = 0; w < words; ++w) {
+        acc[w] = acc[w] ^ in[w]; // XOR is addition for every packed GF(16) element at once
+    }
+}
+
+// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1)
+// gf16 := gf2[x]/(x^4+x+1)
+
+static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) { // multiplies each of the eight nibbles of a by the GF(16) element in the low four bits of b
+    uint32_t a_msb;
+    uint32_t a32 = a;
+    uint32_t b32 = b;
+    uint32_t r32 = a32 * (b32 & 1); // bit 0 of b selects a itself (multiply by 0 or 1 instead of branching)
+
+    a_msb = a32 & 0x88888888; // MSB, 3rd bits
+    a32 ^= a_msb;   // clear MSB
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); // a * x: shift every nibble left; a cleared top bit re-enters as x^4 = x + 1 (0b0011)
+    r32 ^= (a32) * ((b32 >> 1) & 1); // bit 1 of b selects a * x
+
+    a_msb = a32 & 0x88888888; // MSB, 3rd bits
+    a32 ^= a_msb;   // clear MSB
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); // a * x^2
+    r32 ^= (a32) * ((b32 >> 2) & 1); // bit 2 of b selects a * x^2
+
+    a_msb = a32 & 0x88888888; // MSB, 3rd bits
+    a32 ^= a_msb;   // clear MSB
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); // a * x^3
+    r32 ^= (a32) * ((b32 >> 3) & 1); // bit 3 of b selects a * x^3
+
+    return r32;
+
+}
+ 
+static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int w = 0; w < 2 * m_legs; ++w) { // acc += a * in, one 64-bit word at a time
+        acc[w] = acc[w] ^ gf16v_mul_u64(in[w], a);
+    }
+}
+
+// acc += x * in, where x is the field element 0x2
+static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) {
+    // Same loop as m_vec_mul_add with the multiplier fixed to 0x2.
+    m_vec_mul_add(m_legs, in, 0x2, acc);
+}
+
+#endif
+
diff --git a/src/sig/mayo/sig_mayo.h b/src/sig/mayo/sig_mayo.h
index 07be2859ff..08717fb4d7 100644
--- a/src/sig/mayo/sig_mayo.h
+++ b/src/sig/mayo/sig_mayo.h
@@ -38,4 +38,15 @@ OQS_API OQS_STATUS OQS_SIG_mayo_3_sign(uint8_t *signature, size_t *signature_len
 OQS_API OQS_STATUS OQS_SIG_mayo_3_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key);
 #endif
 
+#if defined(OQS_ENABLE_SIG_mayo_5)
+#define OQS_SIG_mayo_5_length_public_key 5008 // same value as MAYO_5_cpk_bytes (compact public key)
+#define OQS_SIG_mayo_5_length_secret_key 40 // same value as MAYO_5_csk_bytes (compact secret key)
+#define OQS_SIG_mayo_5_length_signature 838 // same value as MAYO_5_sig_bytes
+
+OQS_SIG *OQS_SIG_mayo_5_new(void);
+OQS_API OQS_STATUS OQS_SIG_mayo_5_keypair(uint8_t *public_key, uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_mayo_5_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_mayo_5_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key);
+#endif
+
 #endif
diff --git a/src/sig/mayo/sig_mayo_5.c b/src/sig/mayo/sig_mayo_5.c
new file mode 100644
index 0000000000..ca19cfbf91
--- /dev/null
+++ b/src/sig/mayo/sig_mayo_5.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: MIT
+
+#include <stdlib.h>
+
+#include <oqs/sig_mayo.h>
+
+#if defined(OQS_ENABLE_SIG_mayo_5)
+
+OQS_SIG *OQS_SIG_mayo_5_new(void) {
+	// Heap-allocate the descriptor; NULL tells the caller that the allocation failed.
+	OQS_SIG *scheme = malloc(sizeof(*scheme));
+	if (!scheme) {
+		return NULL;
+	}
+	// Entry points implemented further down in this file.
+	scheme->verify = OQS_SIG_mayo_5_verify;
+	scheme->sign = OQS_SIG_mayo_5_sign;
+	scheme->keypair = OQS_SIG_mayo_5_keypair;
+	// Fixed key and signature lengths.
+	scheme->length_signature = OQS_SIG_mayo_5_length_signature;
+	scheme->length_secret_key = OQS_SIG_mayo_5_length_secret_key;
+	scheme->length_public_key = OQS_SIG_mayo_5_length_public_key;
+	// Security claims.
+	scheme->euf_cma = true;
+	scheme->claimed_nist_level = 5;
+	// Identification.
+	scheme->alg_version = "https://github.com/PQCMayo/MAYO-C/tree/nibbling-mayo";
+	scheme->method_name = OQS_SIG_alg_mayo_5;
+	return scheme;
+}
+
+extern int pqmayo_MAYO_5_opt_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); // "opt" backend, always linked when MAYO_5 is enabled
+extern int pqmayo_MAYO_5_opt_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int pqmayo_MAYO_5_opt_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+#if defined(OQS_ENABLE_SIG_mayo_5_avx2) // AVX2 backend, declared only when it is part of the build
+extern int pqmayo_MAYO_5_avx2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int pqmayo_MAYO_5_avx2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int pqmayo_MAYO_5_avx2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+#endif
+
+OQS_API OQS_STATUS OQS_SIG_mayo_5_keypair(uint8_t *public_key, uint8_t *secret_key) {
+#if defined(OQS_ENABLE_SIG_mayo_5_avx2)
+#if defined(OQS_DIST_BUILD)
+	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { // distributable build: choose the backend at run time
+#endif /* OQS_DIST_BUILD */
+		return (OQS_STATUS) pqmayo_MAYO_5_avx2_crypto_sign_keypair(public_key, secret_key); // unconditional when OQS_DIST_BUILD is not defined
+#if defined(OQS_DIST_BUILD)
+	} else {
+		return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_keypair(public_key, secret_key); // CPU reports no AVX2
+	}
+#endif /* OQS_DIST_BUILD */
+#else /* !OQS_ENABLE_SIG_mayo_5_avx2 */
+	return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_keypair(public_key, secret_key);
+#endif /* OQS_ENABLE_SIG_mayo_5_avx2 */
+}
+
+OQS_API OQS_STATUS OQS_SIG_mayo_5_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key) {
+#if defined(OQS_ENABLE_SIG_mayo_5_avx2)
+#if defined(OQS_DIST_BUILD)
+	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { // distributable build: choose the backend at run time
+#endif /* OQS_DIST_BUILD */
+		return (OQS_STATUS) pqmayo_MAYO_5_avx2_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); // unconditional when OQS_DIST_BUILD is not defined
+#if defined(OQS_DIST_BUILD)
+	} else {
+		return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); // CPU reports no AVX2
+	}
+#endif /* OQS_DIST_BUILD */
+#else /* !OQS_ENABLE_SIG_mayo_5_avx2 */
+	return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#endif /* OQS_ENABLE_SIG_mayo_5_avx2 */
+}
+
+OQS_API OQS_STATUS OQS_SIG_mayo_5_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key) {
+#if defined(OQS_ENABLE_SIG_mayo_5_avx2)
+#if defined(OQS_DIST_BUILD)
+	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { // distributable build: choose the backend at run time
+#endif /* OQS_DIST_BUILD */
+		return (OQS_STATUS) pqmayo_MAYO_5_avx2_crypto_sign_verify(signature, signature_len, message, message_len, public_key); // the backend takes the signature first, then the message
+#if defined(OQS_DIST_BUILD)
+	} else {
+		return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key); // CPU reports no AVX2
+	}
+#endif /* OQS_DIST_BUILD */
+#else /* !OQS_ENABLE_SIG_mayo_5_avx2 */
+	return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#endif /* OQS_ENABLE_SIG_mayo_5_avx2 */
+}
+
+#endif
diff --git a/src/sig/sig.c b/src/sig/sig.c
index 6722ca2931..bab752c607 100644
--- a/src/sig/sig.c
+++ b/src/sig/sig.c
@@ -42,7 +42,8 @@ OQS_API const char *OQS_SIG_alg_identifier(size_t i) {
 		OQS_SIG_alg_sphincs_shake_256s_simple,
 		OQS_SIG_alg_mayo_1,
 		OQS_SIG_alg_mayo_2,
-		OQS_SIG_alg_mayo_3,///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALG_IDENTIFIER_END
+		OQS_SIG_alg_mayo_3,
+		OQS_SIG_alg_mayo_5,///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALG_IDENTIFIER_END
 	};
 	if (i >= OQS_SIG_algs_length) {
 		return NULL;
@@ -256,6 +257,13 @@ OQS_API int OQS_SIG_alg_is_enabled(const char *method_name) {
 #else
 		return 0;
 #endif
+
+	} else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_5)) {
+#ifdef OQS_ENABLE_SIG_mayo_5
+		return 1;
+#else
+		return 0;
+#endif
 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ENABLED_CASE_END
 	} else {
 		return 0;
@@ -463,6 +471,13 @@ OQS_API OQS_SIG *OQS_SIG_new(const char *method_name) {
 #else
 		return NULL;
 #endif
+
+	} else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_5)) {
+#ifdef OQS_ENABLE_SIG_mayo_5
+		return OQS_SIG_mayo_5_new();
+#else
+		return NULL;
+#endif
 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_NEW_CASE_END
 		// EDIT-WHEN-ADDING-SIG
 	} else {
diff --git a/src/sig/sig.h b/src/sig/sig.h
index c545489b4e..f847104aa5 100644
--- a/src/sig/sig.h
+++ b/src/sig/sig.h
@@ -88,12 +88,14 @@ extern "C" {
 #define OQS_SIG_alg_mayo_2 "MAYO_2"
 /** Algorithm identifier for MAYO_3 */
 #define OQS_SIG_alg_mayo_3 "MAYO_3"
+/** Algorithm identifier for MAYO_5 */
+#define OQS_SIG_alg_mayo_5 "MAYO_5"
 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALG_IDENTIFIER_END
 // EDIT-WHEN-ADDING-SIG
 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALGS_LENGTH_START
 
 /** Number of algorithm identifiers above. */
-#define OQS_SIG_algs_length 28
+#define OQS_SIG_algs_length 29
 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALGS_LENGTH_END
 
 /**
diff --git a/tests/KATs/sig/kats.json b/tests/KATs/sig/kats.json
index 1da20fcfa8..9e5f229090 100644
--- a/tests/KATs/sig/kats.json
+++ b/tests/KATs/sig/kats.json
@@ -39,6 +39,10 @@
     "all": "f66b95dda153b7df00610aa018f0644146e7e564b33562c51bb088c40fb0dcb2",
     "single": "dbc49f4fdfa0de69d416051215cb53c042c4a329d325452d079f3734b7467a6b"
   },
+  "MAYO_5": {
+    "single": "f2c1c69045c7d15e714a04119965e8a7007ef54f9293158587560227c97b237d",
+    "all": "7b230c2626f57159a243d8dfc69c62cb94dd0f179dd2b4f2ef3606deb6404477"
+  },
   "ML-DSA-44": {
     "all": "183bc0c4398ade4fc17b6a7d876b82545a96331139a4f27269c95664b8c483f9",
     "single": "e6f3ec4dc0b02dd3bcbbc6b105190e1890ca0bb3f802e2b571f0d70f3993a2e1"
diff --git a/tests/kat_sig.c b/tests/kat_sig.c
index b420b3809b..876a720aaf 100644
--- a/tests/kat_sig.c
+++ b/tests/kat_sig.c
@@ -302,6 +302,16 @@ OQS_STATUS combine_message_signature(uint8_t **signed_msg, size_t *signed_msg_le
 		memcpy(*signed_msg, signature, signature_len);
 		memcpy(*signed_msg + signature_len, msg, msg_len);
 		return OQS_SUCCESS;
+	} else if (0 == strcmp(sig->method_name, "MAYO_5")) {
+		// signed_msg = signature || msg
+		*signed_msg_len = signature_len + msg_len;
+		*signed_msg = malloc(*signed_msg_len);
+		if (*signed_msg == NULL) {
+			return OQS_ERROR;
+		}
+		memcpy(*signed_msg, signature, signature_len);
+		memcpy(*signed_msg + signature_len, msg, msg_len);
+		return OQS_SUCCESS;
 		///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_COMBINE_MESSAGE_SIGNATURE_END
 	} else {
 		return OQS_ERROR;
diff --git a/tests/test_sig.c b/tests/test_sig.c
index eb3ab0af12..094cc6d1cb 100644
--- a/tests/test_sig.c
+++ b/tests/test_sig.c
@@ -224,17 +224,30 @@ int main(int argc, char **argv) {
 	OQS_STATUS rc;
 #if OQS_USE_PTHREADS
 #define MAX_LEN_SIG_NAME_ 64
-	pthread_t thread;
-	struct thread_data td;
-	td.alg_name = alg_name;
-	int trc = pthread_create(&thread, NULL, test_wrapper, &td);
-	if (trc) {
-		fprintf(stderr, "ERROR: Creating pthread\n");
-		OQS_destroy();
-		return EXIT_FAILURE;
+	// don't run MAYO_5 in threads because of large stack usage
+	char no_thread_sig_patterns[][MAX_LEN_SIG_NAME_]  = {"MAYO_5"};
+	int test_in_thread = 1;
+	for (size_t i = 0 ; i < sizeof(no_thread_sig_patterns) / MAX_LEN_SIG_NAME_; ++i) {
+		if (strstr(alg_name, no_thread_sig_patterns[i]) != NULL) {
+			test_in_thread = 0;
+			break;
+		}
+	}
+	if (test_in_thread) {
+		pthread_t thread;
+		struct thread_data td;
+		td.alg_name = alg_name;
+		int trc = pthread_create(&thread, NULL, test_wrapper, &td);
+		if (trc) {
+			fprintf(stderr, "ERROR: Creating pthread\n");
+			OQS_destroy();
+			return EXIT_FAILURE;
+		}
+		pthread_join(thread, NULL);
+		rc = td.rc;
+	} else {
+		rc = sig_test_correctness(alg_name);
 	}
-	pthread_join(thread, NULL);
-	rc = td.rc;
 #else
 	rc = sig_test_correctness(alg_name);
 #endif