Skip to content

Commit

Permalink
Add NEON implementation for armv7 and aarch64 platforms (#615)
Browse files Browse the repository at this point in the history
* Configure build system to compile arch-specific extension for Arm NEON

* Add NEON implementation for `util/simd/vector.h`

* Add `tranpose16x16` implementation with NEON

* Fix wrong macro used in dynamic dispatch with NEON enabled

* Add NEON implementation of score vectors in `score_vector_int8.h`

* Enable `benchmark_transpose` when compiled with NEON support

* Add NEON implementation in `score_vector_int16.h`

* Implement `SwipeProfile` with NEON in `swipe.h`

* Add remaining unimplemented code in `score_vector.h`

* Enable SIMD swipe and benchmarks when compiling for NEON

* Add guards to compile NEON code on Armv7 platforms

* Add NEON horizontal sums implementations using `vpadalq` cascades

* Add fallback `ScoreVector::cmp_mask` implementation for Armv7 NEON

* Add NEON support to `finger_print.h` and `hash_set.h`

* Enable NEON architecture in `banded_3frame_swipe.cpp`

* Add remaining benchmarks for which there exist a NEON implementation

* List NEON in compile-time and runtime feature lists

* Update define macros for compilation of NEON architecture library

* Fix some define guards in `util/simd` headers

* Fix define guards in `ungapped_simd` when compiling for Aarch64

* Fix `cmp_mask` implementations for NEON `ScoreVector` instantiations

* Rewrite NEON `transpose` without Aarch64-specific instructions

* Use Arm NEON include guards instead of architecture for banded swipe code

* Add `expand_from_8bit` implementation for Armv7 NEON

* Rewrite 16x16 NEON transpose without `vst1q_s8_x4` for Armv7 compatibility

* Benchmark scalar 16x16 transpose to compare to vectorized code

* Fix include guards in `swipe_wrapper.cpp` preventing the use of NEON vectors

* Remove `Deque::Iterator::operator-` when compiling for Arm for `armv7` compatibility

* Fix `benchmark_transpose` not using a valid scalar implementation

* Refactor `cmp_mask` implementation into common inline function for NEON

* Add Arm NEON implementation to `sse_dist.h`

* Add Arm NEON implementation of `BitVector::one_count` using `vcntq_u8` intrinsic

* Fix `reduce_seq_aarch64` definition in `sse_dist.h`

* Implement `ScoreVector<int16_t>(unsigned, Register)` for NEON

* Fix potential overflow in NEON code of `BitVector::one_count`

* Fix `CMakeLists.txt` and `simd.h` to only build NEON on `armv7` with build support

* Setup runtime detection of `NEON` for `armv7` in `simd.cpp`

* Setup dispatch for NEON in `dispatch.h`

* Fix scope of NEON helpers in `simd.h`

* Change NEON architecture ID in `CMakeLists.txt` to avoid conflict with AVX512

* Fix remaining use of `::SIMD` namespace in NEON code

* Fix NEON `letter_mask` not disabling sequence mask based on macros

* Revert back commented code in `deque.h`

* Fix iteration in `hauser_correction.cpp`

* Use compile-time macro to avoid `Deque::Iterator::operator-` overload

* Fix benchmarks not running properly for NEON

* Fix `armv7l` implementation of `vmaskq_s8`

* Fix `table.h` for 32-bit platforms
  • Loading branch information
althonos authored Jan 31, 2024
1 parent dd43519 commit 56f5d14
Show file tree
Hide file tree
Showing 27 changed files with 1,011 additions and 102 deletions.
42 changes: 39 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
cmake_minimum_required (VERSION 2.6)
project (DIAMOND)
include(CheckCXXCompilerFlag)
include(CheckSymbolExists)
include(CheckTypeSize)

option(BUILD_STATIC "BUILD_STATIC" OFF)
option(EXTRA "EXTRA" OFF)
option(STATIC_LIBGCC "STATIC_LIBGCC" OFF)
option(STATIC_LIBSTDC++ "STATIC_LIBSTDC++" OFF)
option(X86 "X86" ON)
option(ARM "ARM" OFF)
option(AARCH64 "AARCH64" OFF)
option(STRICT_BAND "STRICT_BAND" ON)
option(LEFTMOST_SEED_FILTER "LEFTMOST_SEED_FILTER" ON)
option(SEQ_MASK "SEQ_MASK" ON)
Expand All @@ -26,10 +30,12 @@ set(MAX_SHAPE_LEN 19)
set(BLAST_INCLUDE_DIR "" CACHE STRING "BLAST_INCLUDE_DIR")
set(BLAST_LIBRARY_DIR "" CACHE STRING "BLAST_LIBRARY_DIR")

if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*)")
set(X86 OFF)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
set(AARCH64 ON)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
set(X86 OFF)
set(ARM ON)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "PPC64*|ppc64*|powerpc64*")
set(X86 OFF)
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec")
Expand Down Expand Up @@ -114,6 +120,9 @@ function(set_cxx_standard std flag)
endif()
endfunction(set_cxx_standard)

check_type_size(ptrdiff_t SIZEOF_PTRDIFF_T)
check_type_size(int SIZEOF_INT)

check_cxx_compiler_flag("-std=gnu++14" HAS_GNUPP14)
check_cxx_compiler_flag("-std=gnu++17" HAS_GNUPP17)
if(HAS_GNUPP17 OR ${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC)
Expand Down Expand Up @@ -197,6 +206,31 @@ if(X86)
endif()
endif(X86)

# NEON is mandatory on Aarch64
if(AARCH64)
add_definitions(-DWITH_NEON)
add_library(arch_neon OBJECT ${DISPATCH_OBJECTS})
target_include_directories(arch_neon PRIVATE "${CMAKE_SOURCE_DIR}/src/lib")
target_compile_options(arch_neon PUBLIC -DDISPATCH_ARCH=ARCH_NEON -DARCH_ID=4 -D__ARM_NEON -D__aarch64__ -DEigen=Eigen_NEON)
endif(AARCH64)

# NEON is optional on Armv7, so we need to check for compiler support,
# and for the <sys/auxv.h> header used for runtime detection.
if(ARM)
check_symbol_exists(getauxval "sys/auxv.h" HAVE_GETAUXVAL)
check_cxx_compiler_flag("-mfpu=neon" HAVE_MFPU_NEON)
if(HAVE_MFPU_NEON)
add_definitions(-DWITH_NEON)
add_definitions(-DHAVE_MFPU_NEON)
add_library(arch_neon OBJECT ${DISPATCH_OBJECTS})
target_include_directories(arch_neon PRIVATE "${CMAKE_SOURCE_DIR}/src/lib")
target_compile_options(arch_neon PUBLIC -DDISPATCH_ARCH=ARCH_NEON -DARCH_ID=4 -D__ARM_NEON -DEigen=Eigen_NEON -mfpu=neon)
endif()
if(HAVE_GETAUXVAL)
add_definitions(-DHAVE_GETAUXVAL)
endif()
endif(ARM)

set(OBJECTS
src/run/main.cpp
src/basic/config.cpp
Expand Down Expand Up @@ -394,6 +428,8 @@ if(X86)
else()
add_executable(diamond $<TARGET_OBJECTS:arch_generic> $<TARGET_OBJECTS:arch_sse4_1> $<TARGET_OBJECTS:arch_avx2> ${OBJECTS} ${BLAST_OBJ} ${ZSTD_OBJ})
endif()
elseif(ARM OR AARCH64)
add_executable(diamond $<TARGET_OBJECTS:arch_generic> $<TARGET_OBJECTS:arch_neon> ${OBJECTS} ${BLAST_OBJ} ${ZSTD_OBJ})
else()
add_executable(diamond $<TARGET_OBJECTS:arch_generic> ${OBJECTS} ${BLAST_OBJ} ${ZSTD_OBJ})
endif()
Expand Down Expand Up @@ -489,4 +525,4 @@ SET(SP -DTEST_DIR=${CMAKE_SOURCE_DIR}/src/test -P ${CMAKE_SOURCE_DIR}/src/test/t
add_test(NAME blastp COMMAND ${CMAKE_COMMAND} -DNAME=blastp "-DARGS=blastp -q ${TD}/1.faa -d ${TD}/2.faa -p1" ${SP})
add_test(NAME blastp-mid-sens COMMAND ${CMAKE_COMMAND} -DNAME=blastp-mid-sens "-DARGS=blastp -q ${TD}/3.faa -d ${TD}/4.faa --mid-sensitive -p1" ${SP})
add_test(NAME blastp-f0 COMMAND ${CMAKE_COMMAND} -DNAME=blastp-f0 "-DARGS=blastp -q ${TD}/1.faa -d ${TD}/2.faa -f0 -p1" ${SP})
add_test(NAME diamond COMMAND diamond test)
add_test(NAME diamond COMMAND diamond test)
12 changes: 11 additions & 1 deletion src/basic/value.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,16 @@ static inline __m256i letter_mask(__m256i x) {
}
#endif

#ifdef __ARM_NEON
static inline int8x16_t letter_mask(int8x16_t x) {
#ifdef SEQ_MASK
return vandq_s8(x, vdupq_n_s8(LETTER_MASK));
#else
return x;
#endif
}
#endif

extern const ValueTraits amino_acid_traits;
extern const ValueTraits nucleotide_traits;
extern ValueTraits value_traits;
Expand Down Expand Up @@ -163,4 +173,4 @@ using DictId = int64_t;
using Score = int32_t;
using TaxId = int32_t;
using CentroidId = OId;
using SuperBlockId = int32_t;
using SuperBlockId = int32_t;
6 changes: 3 additions & 3 deletions src/dp/scan_diags.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ void scan_diags128(const LongScoreProfile<int8_t>& qp, Sequence s, int d_begin,
max4.store(scores + 96);
for (int i = 0; i < 128; ++i)
out[i] = ScoreTraits<Sv>::int_score(scores[i]);
#elif defined(__SSE4_1__)
#elif defined(__SSE4_1__) | defined(__ARM_NEON)
using Sv = ScoreVector<int8_t, SCHAR_MIN>;
const int qlen = (int)qp.length();

Expand Down Expand Up @@ -150,7 +150,7 @@ void scan_diags64(const LongScoreProfile<int8_t>& qp, Sequence s, int d_begin, i
max2.store(scores + 32);
for (int i = 0; i < 64; ++i)
out[i] = ScoreTraits<Sv>::int_score(scores[i]);
#elif defined(__SSE4_1__)
#elif defined(__SSE4_1__) | defined(__ARM_NEON)
using Sv = ScoreVector<int8_t, SCHAR_MIN>;
const int qlen = (int)qp.length();

Expand Down Expand Up @@ -225,7 +225,7 @@ void scan_diags(const LongScoreProfile<int8_t>& qp, Sequence s, int d_begin, int
max2.store(scores + 32);
for (int i = 0; i < 64; ++i)
out[i] = ScoreTraits<Sv>::int_score(scores[i]);
#elif defined(__SSE4_1__)
#elif defined(__SSE4_1__) | defined(__ARM_NEON)
using Sv = ScoreVector<int8_t, SCHAR_MIN>;
const int qlen = (int)qp.length();

Expand Down
16 changes: 15 additions & 1 deletion src/dp/score_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,20 @@ static inline void store_sv(const DISPATCH_ARCH::ScoreVector<_t, DELTA> &sv, _p

#endif

#ifdef __ARM_NEON
template<int DELTA>
static inline void store_sv(const DISPATCH_ARCH::ScoreVector<int8_t, DELTA> &sv, int8_t *dst)
{
vst1q_s8(dst, sv.data_);
}

template<int DELTA>
static inline void store_sv(const DISPATCH_ARCH::ScoreVector<int16_t, DELTA> &sv, int16_t *dst)
{
vst1q_s16(dst, sv.data_);
}
#endif

static inline int extract_channel(const int32_t v, const int i) {
return v;
}
Expand All @@ -194,4 +208,4 @@ static inline void saturate(Sv& v) {

static inline void saturate(int32_t& v) {
v = std::max(v, 0);
}
}
163 changes: 160 additions & 3 deletions src/dp/score_vector_int16.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Copyright (C) 2016-2021 Max Planck Society for the Advancement of Science e.V.
Benjamin Buchfink
Code developed by Benjamin Buchfink <benjamin.buchfink@tue.mpg.de>
Arm NEON port contributed by Martin Larralde <martin.larralde@embl.de>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -196,6 +197,162 @@ static inline ScoreVector<int16_t, DELTA> blend(const ScoreVector<int16_t, DELTA
return ScoreVector<int16_t, DELTA>(_mm256_blendv_epi8(v.data_, w.data_, mask.data_));
}

#elif defined(__ARM_NEON)

template<int DELTA>
struct ScoreVector<int16_t, DELTA>
{

typedef int16x8_t Register;

inline ScoreVector() :
data_(vdupq_n_s16(DELTA))
{}

explicit ScoreVector(int x)
{
data_ = vdupq_n_s16(x);
}

explicit ScoreVector(int16_t x)
{
data_ = vdupq_n_s16(x);
}

explicit ScoreVector(int16x8_t data) :
data_(data)
{ }

explicit ScoreVector(const int16_t *x):
data_(vld1q_s16(x))
{}

explicit ScoreVector(const uint16_t *x) :
data_(vreinterpretq_s16_u16(vld1q_u16(x)))
{}

#ifdef __aarch64__
ScoreVector(unsigned a, Register seq)
{
const int8x16_t* row = reinterpret_cast<const int8x16_t*>(&score_matrix.matrix8()[a << 5]);

int8x16_t high_mask = vreinterpretq_s8_s16(vshlq_n_s16(vreinterpretq_s16_s8(vandq_s8(vreinterpretq_s8_s16(seq), vdupq_n_s8('\x10'))), 3));
int8x16_t seq_low = vorrq_s8(vreinterpretq_s8_s16(seq), high_mask);
int8x16_t seq_high = vorrq_s8(vreinterpretq_s8_s16(seq), veorq_s8(high_mask, vdupq_n_s8('\x80')));

int8x16_t r1 = vld1q_s8(reinterpret_cast<const int8_t*>(row));
int8x16_t r2 = vld1q_s8(reinterpret_cast<const int8_t*>(row + 1));

int8x16_t s1 = vqtbl1q_s8(r1, vandq_u8(vreinterpretq_u8_s8(seq_low), vdupq_n_u8(0x8F)));
int8x16_t s2 = vqtbl1q_s8(r2, vandq_u8(vreinterpretq_u8_s8(seq_high), vdupq_n_u8(0x8F)));

data_ = vorrq_s16(vreinterpretq_s16_s8(s1), vreinterpretq_s16_s8(s2));
data_ = vandq_s16(data_, vdupq_n_s16(255));
data_ = vqsubq_s16(data_, vdupq_n_s16(score_matrix.bias()));
}
#endif

ScoreVector operator+(const ScoreVector&rhs) const
{
return ScoreVector(vqaddq_s16(data_, rhs.data_));
}

ScoreVector operator-(const ScoreVector&rhs) const
{
return ScoreVector(vqsubq_s16(data_, rhs.data_));
}

ScoreVector& operator+=(const ScoreVector& rhs) {
data_ = vqaddq_s16(data_, rhs.data_);
return *this;
}

ScoreVector& operator-=(const ScoreVector&rhs)
{
data_ = vqsubq_s16(data_, rhs.data_);
return *this;
}

ScoreVector& operator &=(const ScoreVector& rhs) {
data_ = vandq_s16(data_, rhs.data_);
return *this;
}

ScoreVector& operator++() {
data_ = vqaddq_s16(data_, vdupq_n_s16(1));
return *this;
}

ScoreVector operator==(const ScoreVector&v) const {
return ScoreVector(vreinterpretq_s16_u16(vceqq_s16(data_, v.data_)));
}

friend uint32_t cmp_mask(const ScoreVector&v, const ScoreVector&w) {
return vmaskq_s8(vreinterpretq_s8_u16(vceqq_s16(v.data_, w.data_)));
}

ScoreVector& max(const ScoreVector&rhs)
{
data_ = vmaxq_s16(data_, rhs.data_);
return *this;
}

friend ScoreVector max(const ScoreVector& lhs, const ScoreVector&rhs)
{
return ScoreVector(vmaxq_s16(lhs.data_, rhs.data_));
}

friend ScoreVector blend(const ScoreVector&v, const ScoreVector&w, const ScoreVector&mask) {
/* Use a signed shift right to create a mask with the sign bit */
uint16x8_t mask_ = vreinterpretq_u16_s16(vshrq_n_s16(mask.data_, 7));
return ScoreVector(vbslq_s16(mask_, w.data_, v.data_));
}

void store(int16_t *ptr) const
{
vst1q_s16(ptr, data_);
}

int16_t operator[](int i) const {
// return vgetq_lane_s16(data_, i);
int16_t tmp[8];
vst1q_s16(tmp, data_);
return tmp[i];
}

ScoreVector& set(int i, int16_t x) {
// vsetq_lane_s16(x, data_, i);
int16_t tmp[8];
vst1q_s16(tmp, data_);
tmp[i] = x;
data_ = vld1q_s16(tmp);
return *this;
}

void expand_from_8bit() {
int8x16_t mask = vdupq_n_s8(0x80);
int8x16_t sign = vreinterpretq_s8_u8(vceqq_s8(vandq_s8(vreinterpretq_s8_s16(data_), mask), mask));
#ifdef __aarch64__
data_ = vreinterpretq_s16_s8(vzip1q_s8(vreinterpretq_s8_s16(data_), sign));
#else
int8x16x2_t tmp = vzipq_s8(vreinterpretq_s8_s16(data_), sign);
data_ = vreinterpretq_s16_s8(tmp.val[0]);
#endif
}

friend std::ostream& operator<<(std::ostream& s, ScoreVector v)
{
int16_t x[8];
v.store(x);
for (unsigned i = 0; i < 8; ++i)
printf("%3i ", (int)x[i]);
return s;
}

int16x8_t data_;

};

#elif defined(__SSE2__)

template<int DELTA>
Expand Down Expand Up @@ -374,7 +531,7 @@ static inline int16_t extract(ScoreVector<int16_t, DELTA> sv) {

#endif

#ifdef __SSE2__
#if defined(__SSE2__) | defined(__ARM_NEON)

template<int DELTA>
struct ScoreTraits<ScoreVector<int16_t, DELTA>>
Expand Down Expand Up @@ -445,7 +602,7 @@ struct ScoreTraits<ScoreVector<int16_t, DELTA>>

}

#ifdef __SSE2__
#if defined(__SSE2__) | defined(__ARM_NEON)

template<int DELTA>
static inline int16_t extract_channel(const DISPATCH_ARCH::ScoreVector<int16_t, DELTA>& v, int i) {
Expand All @@ -457,4 +614,4 @@ static inline void set_channel(DISPATCH_ARCH::ScoreVector<int16_t, DELTA>& v, co
v.set(i, x);
}

#endif
#endif
Loading

0 comments on commit 56f5d14

Please # to comment.