diff --git a/CMakeLists.txt b/CMakeLists.txt index c14b98f3c..84c1d3579 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,7 +64,14 @@ set(MMSEQS_CXX_FLAGS "-fsigned-char") # SIMD instruction sets support set(MMSEQS_ARCH "") -if (HAVE_AVX2) +if (HAVE_AVX512) + if (CMAKE_COMPILER_IS_CLANG) + set(MMSEQS_ARCH "${MMSEQS_ARCH} -mavx512bw -mcx16") + else () + set(MMSEQS_ARCH "${MMSEQS_ARCH} -mavx512bw -mcx16 -Wa,-q") + endif () + set(X64 1 CACHE INTERNAL "") +elseif (HAVE_AVX2) if (CMAKE_COMPILER_IS_CLANG) set(MMSEQS_ARCH "${MMSEQS_ARCH} -mavx2 -mcx16") else () diff --git a/lib/simd/simd.h b/lib/simd/simd.h index 77eaf5101..7c9f277c3 100644 --- a/lib/simd/simd.h +++ b/lib/simd/simd.h @@ -46,7 +46,11 @@ //#define AVX512 //#endif -#if defined(AVX512) || defined(SIMDE_X86_AVX2_NATIVE) +#if defined(SIMDE_X86_AVX512BW_NATIVE) +#define AVX512BW +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) || defined(AVX512) || defined(SIMDE_X86_AVX2_NATIVE) #define AVX2 #endif @@ -694,4 +698,4 @@ inline float ScalarProd20(const float* qi, const float* tj) { // + tj[18] * qi[18] + tj[19] * qi[19]; } -#endif //SIMD_H +#endif //SIMD_H \ No newline at end of file diff --git a/src/alignment/StripedSmithWaterman.cpp b/src/alignment/StripedSmithWaterman.cpp index 29ca26def..7807dc380 100644 --- a/src/alignment/StripedSmithWaterman.cpp +++ b/src/alignment/StripedSmithWaterman.cpp @@ -22,6 +22,10 @@ /* Written by Michael Farrar, 2006 (alignment), Mengyao Zhao (SSW Library) and Martin Steinegger (change structure add aa composition, profile and AVX2 support). Please send bug reports and/or suggestions to martin.steinegger@snu.ac.kr. + + AVX512BW support + Modified Copyright (C) 2025 Intel Corporation + Contacts: Ghanshyam Chandra */ #include "Parameters.h" #include "StripedSmithWaterman.h" @@ -56,6 +60,9 @@ SmithWaterman::SmithWaterman(size_t maxSequenceLength, int aaSize, bool aaBiasCo profile = new s_profile(); // query profile profile->profile_byte = (simd_int*)mem_align(ALIGN_INT, aaSize * segSize * sizeof(simd_int)); +#ifdef AVX512BW + profile->profile_byteBW = (__m512i*)mem_align(AVX512_ALIGN_INT, aaSize * segSize * sizeof(__m512i)); +#endif profile->profile_word = (simd_int*)mem_align(ALIGN_INT, aaSize * segSize * sizeof(simd_int)); profile->profile_rev_byte = (simd_int*)mem_align(ALIGN_INT, aaSize * segSize * sizeof(simd_int)); profile->profile_rev_word = (simd_int*)mem_align(ALIGN_INT, aaSize * segSize * sizeof(simd_int)); @@ -115,6 +122,9 @@ SmithWaterman::~SmithWaterman(){ free(vHmax); free(target_profile_byte); free(profile->profile_byte); +#ifdef AVX512BW + free(profile->profile_byteBW); +#endif free(profile->profile_word); free(profile->profile_rev_byte); free(profile->profile_rev_word); @@ -157,6 +167,37 @@ SmithWaterman::~SmithWaterman(){ delete profile; } +#ifdef AVX512BW + /* Generate query profile rearrange query sequence & calculate the weight of match/mismatch. */ + template + void SmithWaterman::createQueryProfile_AVX512BW(__m512i *profile, const int8_t *query_sequence, const int8_t * composition_bias, const int8_t *mat, + const int32_t query_length, const int32_t aaSize, uint8_t bias, const int32_t offset, const int32_t entryLength) + { + const int32_t segLen = (query_length + Elements - 1) / Elements; + T* t = (T*) profile; + for (int32_t nt = 0; LIKELY(nt < aaSize); nt++) { + for (int32_t i = 0; i < segLen; i++) { + int32_t j = i; + for (size_t segNum = 0; LIKELY(segNum < Elements) ; segNum++) { + // if will be optmized out by compiler + if(type == SUBSTITUTIONMATRIX) { // substitution score for query_seq constrained by nt + // query_sequence starts from 1 to n + *t++ = ( j >= query_length) ? bias : mat[nt * aaSize + query_sequence[j + offset ]] + composition_bias[j + offset] + bias; // mat[nt][q[j]] mat eq 20*20 + } if(type == PROFILE) { + // profile starts by 0 + // *t++ = (j >= query_length) ? bias : (mat[nt * entryLength + (j + (offset - 1))] + bias); //mat eq L*20 // mat[nt][j] + *t++ = (j >= query_length) ? bias : mat[nt * entryLength + j + offset] + bias; + // // profile starts by 0 // TODO: offset? + // *t++ = (j >= query_length) ? bias : mat[nt * entryLength + j + offset] + bias; //mat eq L*20 // mat[nt][j] + // printf("(%1d, %1d) ", j , *(t-1)); + } + j += segLen; + } + } + } + } +#endif + /* Generate query profile rearrange query sequence & calculate the weight of match/mismatch. */ template @@ -214,6 +255,21 @@ void SmithWaterman::createTargetProfile(simd_int *profile, const int8_t *mat, co } } } + + +#ifdef AVX512BW + template + void SmithWaterman::updateQueryProfile_AVX512BW(__m512i *profile, const int32_t query_length, const int32_t aaSize, + uint8_t shift) { + const int32_t segLen = (query_length + Elements - 1) / Elements; + T* t = (T*) profile; + for (uint32_t i = 0; i < segLen * Elements * aaSize; i++) { + t[i] += shift; + } + } +#endif + + template void SmithWaterman::updateQueryProfile(simd_int *profile, const int32_t query_length, const int32_t aaSize, uint8_t shift) { @@ -347,6 +403,9 @@ s_align SmithWaterman::ssw_align_private ( if (db_bias > profile->bias) { uint8_t shift = abs(profile->bias - db_bias); updateQueryProfile(profile->profile_byte, profile->query_length, profile->alphabetSize, shift); + #ifdef AVX512BW + updateQueryProfile_AVX512BW(profile->profile_byteBW, profile->query_length, profile->alphabetSize, shift); + #endif } profile->bias = std::max(db_bias, profile->bias); createTargetProfile(db_profile_byte, db_mat, db_length, profile->alphabetSize - 1, profile->bias); @@ -1307,6 +1366,10 @@ void SmithWaterman::ssw_init(const Sequence* q, // create byte version of profiles createQueryProfile(profile->profile_byte, profile->query_sequence, NULL, profile->mat, q->L, alphabetSize, profile->bias, 0, q->L); + #ifdef AVX512BW + createQueryProfile_AVX512BW(profile->profile_byteBW, profile->query_sequence, NULL, + profile->mat, q->L, alphabetSize, profile->bias, 0, q->L); + #endif createConsensProfile(profile->consens_byte, profile->query_consens_sequence, q->L, 0); #ifdef GAP_POS_SCORING createGapProfile(profile->profile_gDelOpen_byte, profile->profile_gDelClose_byte, @@ -1330,7 +1393,10 @@ void SmithWaterman::ssw_init(const Sequence* q, } else { // create byte version of query profile createQueryProfile(profile->profile_byte, profile->query_sequence, profile->composition_bias, profile->mat, q->L, alphabetSize, bias, 0, 0); - // create word version of query profile + #ifdef AVX512BW + createQueryProfile_AVX512BW(profile->profile_byteBW, profile->query_sequence, profile->composition_bias, profile->mat, q->L, alphabetSize, bias, 0, 0); + #endif + // create word version of query profile createQueryProfile(profile->profile_word, profile->query_sequence, profile->composition_bias, profile->mat, q->L, alphabetSize, 0, 0, 0); // create linear version of word profile for (int32_t i = 0; i< alphabetSize; i++) { @@ -1735,63 +1801,91 @@ inline F simd_hmax(const F * in, unsigned int n) { return current; } +#ifdef AVX512BW +template +__m512i shift_right(__m512i a, __m512i carry = _mm512_setzero_si512()) { + static_assert(0 <= N && N <= 64); + if constexpr (N == 0) return a; + if constexpr (N == 64) return carry; + if constexpr (N % 4 == 0) return _mm512_alignr_epi32(carry, a, N / 4); + else { + __m512i a0 = shift_right<(N / 16 + 1) * 16>(a, carry); + __m512i a1 = shift_right<(N / 16) * 16>(a, carry); + return _mm512_alignr_epi8(a0, a1, N % 16); + } +} +template +__m512i shift_right_AVX512(__m512i a, __m512i carry = _mm512_setzero_si512()) { + return shift_right<64 - N>(carry, a); +} +#endif + int SmithWaterman::ungapped_alignment(const unsigned char *db_sequence, int32_t db_length) { -#define SWAP(tmp, arg1, arg2) tmp = arg1; arg1 = arg2; arg2 = tmp; - - int i; // position in query bands (0,..,W-1) - int j; // position in db sequence (0,..,dbseq_length-1) - int element_count = (VECSIZE_INT * 4); - const int W = (profile->query_length + (element_count - 1)) / - element_count; // width of bands in query and score matrix = hochgerundetes LQ/16 - - simd_int *p; - simd_int S; // 16 unsigned bytes holding S(b*W+i,j) (b=0,..,15) - simd_int Smax = simdi_setzero(); - simd_int Soffset; // all scores in query profile are shifted up by Soffset to obtain pos values - simd_int *s_prev, *s_curr; // pointers to Score(i-1,j-1) and Score(i,j), resp. - simd_int *qji; // query profile score in row j (for residue x_j) - simd_int *s_prev_it, *s_curr_it; - simd_int *query_profile_it = (simd_int *) profile->profile_byte; - - // Load the score offset to all 16 unsigned byte elements of Soffset - Soffset = simdi8_set(profile->bias); - s_curr = vHStore; - s_prev = vHLoad; - - memset(vHStore, 0, W * sizeof(simd_int)); - memset(vHLoad, 0, W * sizeof(simd_int)); - - for (j = 0; j < db_length; ++j) // loop over db sequence positions - { - - // Get address of query scores for row j + // AVX512BW + #ifdef AVX512BW + int element_count = AVX512_VECSIZE_INT * 4; + const int W = (profile->query_length + (element_count - 1)) / element_count; + __m512i S, Smax = _mm512_setzero_si512(); + __m512i *s_prev = (__m512i*) mem_align(AVX512_ALIGN_INT, segSize * sizeof(__m512i)); + __m512i *s_curr = (__m512i*) mem_align(AVX512_ALIGN_INT, segSize * sizeof(__m512i)); + memset(s_prev, 0, W * sizeof(__m512i)); + memset(s_curr, 0, W * sizeof(__m512i)); + __m512i *qji, *s_prev_it, *s_curr_it; + __m512i *query_profile_it = (__m512i*)profile->profile_byteBW; + __m512i Soffset = _mm512_set1_epi8(profile->bias); + #else // AVX2 or SSE2 + int element_count = VECSIZE_INT * 4; + const int W = (profile->query_length + (element_count - 1)) / element_count; + simd_int S, Smax = simdi_setzero(); + simd_int *s_prev = vHLoad, *s_curr = vHStore; + memset(vHLoad, 0, W * sizeof(simd_int)); + memset(vHStore, 0, W * sizeof(simd_int)); + simd_int *qji, *s_prev_it, *s_curr_it; + simd_int *query_profile_it = (simd_int*)profile->profile_byte; + simd_int Soffset = simdi8_set(profile->bias); + #endif + + // main Smith-Waterman loop + for (int j = 0; j < db_length; ++j) { qji = query_profile_it + db_sequence[j] * W; - // Load the next S value - S = simdi_load(s_curr + W - 1); - S = simdi8_shiftl(S, 1); + #ifdef AVX512BW + S = _mm512_load_si512(s_curr + W - 1); + S = shift_right_AVX512<1>(S); + #else + S = simdi_load(s_curr + W - 1); + S = simdi8_shiftl(S, 1); + #endif - // Swap s_prev and s_curr, smax_prev and smax_curr - SWAP(p, s_prev, s_curr); + // swap the buffers + std::swap(s_prev, s_curr); s_curr_it = s_curr; s_prev_it = s_prev; - for (i = 0; i < W; ++i) // loop over query band positions - { - // Saturated addition and subtraction to score S(i,j) - S = simdui8_adds(S, *(qji++)); // S(i,j) = S(i-1,j-1) + (q(i,x_j) + Soffset) - S = simdui8_subs(S, Soffset); // S(i,j) = max(0, S(i,j) - Soffset) - simdi_store(s_curr_it++, S); // store S to s_curr[i] - Smax = simdui8_max(Smax, S); // Smax(i,j) = max(Smax(i,j), S(i,j)) - - // Load the next S and Smax values - S = simdi_load(s_prev_it++); + for (int i = 0; i < W; ++i) { + #ifdef AVX512BW + S = _mm512_adds_epu8(S, *qji++); + S = _mm512_subs_epu8(S, Soffset); + _mm512_store_si512(s_curr_it++, S); + Smax = _mm512_max_epu8(Smax, S); + S = _mm512_load_si512(s_prev_it++); + #else + S = simdui8_adds(S, *qji++); + S = simdui8_subs(S, Soffset); + simdi_store(s_curr_it++, S); + Smax = simdui8_max(Smax, S); + S = simdi_load(s_prev_it++); + #endif } } - int score = simd_hmax((unsigned char *) &Smax, element_count); - /* return largest score */ + int score = simd_hmax((unsigned char*)&Smax, element_count); + + + #ifdef AVX512BW + free(s_curr); free(s_prev); + #endif + return score; -#undef SWAP -} +} \ No newline at end of file diff --git a/src/alignment/StripedSmithWaterman.h b/src/alignment/StripedSmithWaterman.h index d4d3aa555..fa49db38c 100644 --- a/src/alignment/StripedSmithWaterman.h +++ b/src/alignment/StripedSmithWaterman.h @@ -79,6 +79,9 @@ class SmithWaterman{ // TODO: private or public? struct s_profile{ simd_int* profile_byte; // 0: none + #ifdef AVX512BW + __m512i* profile_byteBW; // 0: none + #endif simd_int* profile_word; // 0: none simd_int* profile_rev_byte; // 0: none simd_int* profile_rev_word; // 0: none @@ -279,6 +282,10 @@ class SmithWaterman{ simd_int* vHStore; simd_int* vHLoad; +#ifdef AVX512BW + __m512i* vHStoreBW; + __m512i* vHLoadBW; +#endif simd_int* vE; simd_int* vHmax; uint8_t * maxColumn; @@ -387,6 +394,11 @@ class SmithWaterman{ size_t query_id; size_t target_id; + #ifdef AVX512BW + template + void createQueryProfile_AVX512BW(__m512i *profile, const int8_t *query_sequence, const int8_t * composition_bias, const int8_t *mat, + const int32_t query_length, const int32_t aaSize, uint8_t bias, const int32_t offset, const int32_t entryLength); + #endif template void createQueryProfile(simd_int *profile, const int8_t *query_sequence, const int8_t * composition_bias, @@ -403,6 +415,11 @@ class SmithWaterman{ void createTargetProfile(simd_int *profile, const int8_t *mat, const int target_length, const int32_t aaSize, uint8_t bias); +#ifdef AVX512BW + template + void updateQueryProfile_AVX512BW(__m512i *profile, const int32_t query_length, const int32_t aaSize, uint8_t shift); +#endif + template void updateQueryProfile(simd_int *profile, const int32_t query_length, const int32_t aaSize, uint8_t shift); @@ -410,4 +427,4 @@ class SmithWaterman{ void reverseMat(int8_t *rev_mat, const int8_t *mat, const int32_t aaSize, const int32_t target_length); }; -#endif /* SMITH_WATERMAN_SSE2_H */ +#endif /* SMITH_WATERMAN_SSE2_H */ \ No newline at end of file