From 4d671a8a84189ed9e8bcae839679f3a3bb72f7de Mon Sep 17 00:00:00 2001
From: "yintong.ustc@bytedance.com"
Date: Mon, 17 Feb 2025 09:58:55 +0000
Subject: [PATCH 1/4] [FIX] document_test: resolve std::remove compilation error

The build failed due to a missing header when using std::remove in the
C++17 test section. The compiler incorrectly resolved the call to the C
library remove() function instead of the STL algorithm.

This change:
1. Adds an explicit #include <algorithm> for STL algorithms

Fixes compilation error:
error: cannot convert 'ValueIterator' to 'const char*'
note: initializing argument 1 of 'int remove(const char*)'
---
 tests/document_test.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/document_test.cpp b/tests/document_test.cpp
index eb48bcb..7e187a5 100644
--- a/tests/document_test.cpp
+++ b/tests/document_test.cpp
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include <algorithm>
 #include "gtest/gtest.h"
 #include "sonic/dom/generic_document.h"

From c27836caaba0620a0d830de033cafe6836e25a5f Mon Sep 17 00:00:00 2001
From: "yintong.ustc@bytedance.com"
Date: Mon, 17 Feb 2025 09:59:43 +0000
Subject: [PATCH 2/4] riscv: add rvv support

---
 CMakeLists.txt                                     |   1 +
 cmake/set_arch_flags.cmake                         |   4 +
 .../internal/arch/common/riscv_common/base.h       |  70 +++
 .../arch/common/riscv_common/skip.inc.h            | 160 +++++++
 include/sonic/internal/arch/rvv-128/base.h         |  35 ++
 include/sonic/internal/arch/rvv-128/itoa.h         | 118 +++++
 include/sonic/internal/arch/rvv-128/quote.h        | 201 ++++++++
 include/sonic/internal/arch/rvv-128/simd.h         | 430 ++++++++++++++++++
 include/sonic/internal/arch/rvv-128/skip.h         |  72 +++
 include/sonic/internal/arch/rvv-128/str2int.h      |  40 ++
 include/sonic/internal/arch/rvv-128/unicode.h      | 105 +++++
 include/sonic/internal/arch/simd_dispatch.h        |   5 +
 .../sonic/internal/arch/sonic_cpu_feature.h        |   4 +
 13 files changed, 1245 insertions(+)
 create mode 100644 include/sonic/internal/arch/common/riscv_common/base.h
 create mode 100644 include/sonic/internal/arch/common/riscv_common/skip.inc.h
 create mode 100644 include/sonic/internal/arch/rvv-128/base.h
 create mode 100644 include/sonic/internal/arch/rvv-128/itoa.h
 create mode 100644 include/sonic/internal/arch/rvv-128/quote.h
 create mode 100644 include/sonic/internal/arch/rvv-128/simd.h
 create mode 100644 include/sonic/internal/arch/rvv-128/skip.h
 create mode 100644 include/sonic/internal/arch/rvv-128/str2int.h
 create mode 100644 include/sonic/internal/arch/rvv-128/unicode.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 36772ce..7589c81 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,6 +10,7 @@
 option(BUILD_UNITTEST "Build unittest." ON)
 option(BUILD_FUZZ "Build fuzz." OFF)
 option(BUILD_BENCH "Build benchmark."
OFF) option(ENABLE_SVE2_128 "Build for Arm SVE2 with 128 bit vector size" OFF) +option(ENABLE_RVV_128 "Build for RISC-V RVV with 128 bit vector size" ON) set(CMAKE_CXX_EXTENSIONS OFF) if(BUILD_UNITTEST) diff --git a/cmake/set_arch_flags.cmake b/cmake/set_arch_flags.cmake index 7975cd0..fd25591 100644 --- a/cmake/set_arch_flags.cmake +++ b/cmake/set_arch_flags.cmake @@ -8,6 +8,10 @@ function(set_arch_flags target arch) else() target_compile_options(${target} PRIVATE -march=armv8-a) endif() + elseif(arch MATCHES "riscv64") + if(ENABLE_RVV_128) + target_compile_options(${target} PRIVATE -march=rv64gcv_zvl128b -mrvv-vector-bits=zvl) + endif() else() message(FATAL_ERROR "Unsupported architecture: ${arch}") endif() diff --git a/include/sonic/internal/arch/common/riscv_common/base.h b/include/sonic/internal/arch/common/riscv_common/base.h new file mode 100644 index 0000000..3e2c3ab --- /dev/null +++ b/include/sonic/internal/arch/common/riscv_common/base.h @@ -0,0 +1,70 @@ +/* + * Copyright 2022 ByteDance Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +#include + +namespace sonic_json { +namespace internal { +namespace riscv_common { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. + +sonic_force_inline int TrailingZeroes(uint64_t input_num) { + //////// + // You might expect the next line to be equivalent to + // return (int)_tzcnt_u64(input_num); + // but the generated code differs and might be less efficient? + //////// + return __builtin_ctzll(input_num); +} + +/* result might be undefined when input_num is zero */ +sonic_force_inline uint64_t ClearLowestBit(uint64_t input_num) { + return input_num & (input_num - 1); +} + +/* result might be undefined when input_num is zero */ +sonic_force_inline int LeadingZeroes(uint64_t input_num) { + return __builtin_clzll(input_num); +} + +sonic_force_inline long long int CountOnes(uint64_t input_num) { + return __builtin_popcountll(input_num); +} + +sonic_force_inline uint64_t PrefixXor(uint64_t bitmask) { + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + return bitmask; +} + +template +sonic_force_inline void Xmemcpy(void* dst_, const void* src_, size_t chunks) { + std::memcpy(dst_, src_, chunks * ChunkSize); +} + +} // namespace riscv_common +} // namespace internal +} // namespace sonic_json \ No newline at end of file diff --git a/include/sonic/internal/arch/common/riscv_common/skip.inc.h b/include/sonic/internal/arch/common/riscv_common/skip.inc.h new file mode 100644 index 0000000..8877702 --- /dev/null +++ b/include/sonic/internal/arch/common/riscv_common/skip.inc.h @@ -0,0 +1,160 @@ +/* + * Copyright 2022 ByteDance Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef VEC_LEN +#error "Define vector length firstly!" +#endif + +// pos is the after the ending quote +sonic_force_inline int SkipString(const uint8_t *data, size_t &pos, + size_t len) { + const static int kEscaped = 2; + const static int kNormal = 1; + const static int kUnclosed = 0; + uint16_t quote_bits = 0; + uint16_t bs_bits = 0; + int ret = kNormal; + while (pos + VEC_LEN <= len) { + vuint8m1_t v = __riscv_vle8_v_u8m1(data + pos, 16); + bs_bits = to_bitmask( + __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\\', 16), 16)); + quote_bits = to_bitmask( + __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('"', 16), 16)); + if (((bs_bits - 1) & quote_bits) != 0) { + pos += TrailingZeroes(quote_bits) + 1; + return ret; + } + if (bs_bits) { + ret = kEscaped; + pos += (TrailingZeroes(bs_bits) + 2); + while (pos < len) { + if (data[pos] == '\\') { + pos += 2; + } else { + break; + } + } + } else { + pos += VEC_LEN; + } + } + while (pos < len) { + if (data[pos] == '\\') { + if (pos + 1 >= len) { + return kUnclosed; + } + ret = kEscaped; + pos += 2; + continue; + } + if (data[pos++] == '"') { + return ret; + } + }; + return kUnclosed; +} + +template +sonic_force_inline uint64_t GetStringBits(const uint8_t *data, + uint64_t &prev_instring, + uint64_t &prev_escaped) { + const T v(data); + uint64_t escaped = 0; + uint64_t bs_bits = v.eq('\\'); + if (bs_bits) { + escaped = common::GetEscaped<64>(prev_escaped, bs_bits); + } else { + escaped = prev_escaped; + prev_escaped = 0; + } + uint64_t quote_bits = v.eq('"') & ~escaped; + uint64_t in_string = PrefixXor(quote_bits) ^ prev_instring; + prev_instring = uint64_t(static_cast(in_string) >> 63); + return in_string; +} + +// GetNextToken find the next characters in tokens and update the position to +// it. 
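+// For example, GetNextToken(data, pos, len, "{[") advances pos to the first
+// '{' or '[' at or after pos and returns that character, or returns '\0' if
+// the end of the input is reached first.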
+template +sonic_force_inline uint8_t GetNextToken(const uint8_t *data, size_t &pos, + size_t len, const char (&tokens)[N]) { + while (pos + VEC_LEN <= len) { + vuint8m1_t v = __riscv_vle8_v_u8m1(data + pos, 16); + vbool8_t vor = __riscv_vmclr_m_b8(16); + for (size_t i = 0; i < N - 1; i++) { + vbool8_t cmp_res = __riscv_vmseq_vv_u8m1_b8( + v, __riscv_vmv_v_x_u8m1((uint8_t)(tokens[i]), 16), 16); + vor = __riscv_vmor_mm_b8(vor, cmp_res, 16); + } + + uint16_t next = to_bitmask(vor); + if (next) { + pos += TrailingZeroes(next); + return data[pos]; + } + pos += VEC_LEN; + } + while (pos < len) { + for (size_t i = 0; i < N - 1; i++) { + if (data[pos] == tokens[i]) { + return tokens[i]; + } + } + pos++; + } + return '\0'; +} + +template +sonic_force_inline bool skip_container(const uint8_t *data, size_t &pos, + size_t len, uint8_t left, + uint8_t right) { + uint64_t prev_instring = 0, prev_escaped = 0, instring; + int rbrace_num = 0, lbrace_num = 0, last_lbrace_num; + const uint8_t *p; + while (pos + 64 <= len) { + p = data + pos; +#define SKIP_LOOP() \ + { \ + instring = GetStringBits(p, prev_instring, prev_escaped); \ + T v(p); \ + last_lbrace_num = lbrace_num; \ + uint64_t rbrace = v.eq(right) & ~instring; \ + uint64_t lbrace = v.eq(left) & ~instring; \ + /* traverse each '}' */ \ + while (rbrace > 0) { \ + rbrace_num++; \ + lbrace_num = last_lbrace_num + CountOnes((rbrace - 1) & lbrace); \ + bool is_closed = lbrace_num < rbrace_num; \ + if (is_closed) { \ + sonic_assert(rbrace_num == lbrace_num + 1); \ + pos += TrailingZeroes(rbrace) + 1; \ + return true; \ + } \ + rbrace &= (rbrace - 1); \ + } \ + lbrace_num = last_lbrace_num + CountOnes(lbrace); \ + } + SKIP_LOOP(); + pos += 64; + } + uint8_t buf[64] = {0}; + std::memcpy(buf, data + pos, len - pos); + p = buf; + SKIP_LOOP(); +#undef SKIP_LOOP + return false; +} \ No newline at end of file diff --git a/include/sonic/internal/arch/rvv-128/base.h b/include/sonic/internal/arch/rvv-128/base.h new file mode 100644 index 0000000..8e74d1d --- /dev/null +++ b/include/sonic/internal/arch/rvv-128/base.h @@ -0,0 +1,35 @@ +/* + * Copyright 2022 ByteDance Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include + +#include + +#include "../common/riscv_common/base.h" + +namespace sonic_json { +namespace internal { +namespace rvv_128 { +using sonic_json::internal::riscv_common::ClearLowestBit; +using sonic_json::internal::riscv_common::CountOnes; +using sonic_json::internal::riscv_common::LeadingZeroes; +using sonic_json::internal::riscv_common::PrefixXor; +using sonic_json::internal::riscv_common::TrailingZeroes; +using sonic_json::internal::riscv_common::Xmemcpy; +} // namespace rvv_128 +} // namespace internal +} // namespace sonic_json \ No newline at end of file diff --git a/include/sonic/internal/arch/rvv-128/itoa.h b/include/sonic/internal/arch/rvv-128/itoa.h new file mode 100644 index 0000000..a6e7551 --- /dev/null +++ b/include/sonic/internal/arch/rvv-128/itoa.h @@ -0,0 +1,118 @@ +/* + * Copyright 2022 ByteDance Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include "simd.h" + +namespace sonic_json { +namespace internal { +namespace rvv_128 { +// Convert num {abcd} to {axxxx, abxxxx, abcxxxx, abcdxxxx} +static sonic_force_inline vuint16m1_t Utoa_4_helper(uint16_t num) { + uint16_t v = num << 2; + vuint16m1_t v00 = __riscv_vmv_v_x_u16m1(v, 4); + + vuint16m1_t kVecDiv = __riscv_vreinterpret_v_u64m1_u16m1( + __riscv_vmv_v_x_u64m1(0x80003334147b20c5, 1)); + vuint32m1_t v01 = __riscv_vlmul_trunc_v_u32m2_u32m1( + __riscv_vwmulu_vv_u32m2(v00, kVecDiv, 4)); + vuint16m1_t v02 = + __riscv_vnsrl_wx_u16m1(__riscv_vlmul_ext_v_u32m1_u32m2(v01), 16, 4); + vuint16m1_t kVecShift = __riscv_vreinterpret_v_u64m1_u16m1( + __riscv_vmv_v_x_u64m1(0x8000200008000080, 1)); + vuint32m1_t v03 = __riscv_vlmul_trunc_v_u32m2_u32m1( + __riscv_vwmulu_vv_u32m2(v02, kVecShift, 4)); + return __riscv_vreinterpret_v_u32m1_u16m1(v03); +} + +static sonic_force_inline vuint16m1_t rvv_uzp2q_u16(vuint16m1_t a, + vuint16m1_t b, size_t vl) { + vuint16m2_t ab = + __riscv_vset_v_u16m1_u16m2(__riscv_vlmul_ext_v_u16m1_u16m2(a), 1, b); + vuint32m2_t ab_ = __riscv_vreinterpret_v_u16m2_u32m2(ab); + vuint16m1_t res = __riscv_vnsrl_wx_u16m1(ab_, 16, 16); + return res; +} + +static sonic_force_inline vuint8m1_t vqmovun_s16(vint16m1_t a) { + vuint16m1_t a_non_neg = + __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(a, 0, 8)); + return __riscv_vlmul_ext_v_u8mf2_u8m1( + __riscv_vnclipu_wx_u8mf2(a_non_neg, 0, __RISCV_VXRM_RDN, 8)); +} + +// Convert num's each digit as packed 16-bit in a vector. 
+// num's digits as abcdefgh (high bits is 0 if not enough) +// The converted vector is { a, b, c, d, e, f, g, h } +sonic_force_inline vuint16m1_t UtoaNeon(uint32_t num) { + uint16_t hi = num % 10000; // {efgh} + uint16_t lo = num / 10000; // {abcd} + + // v10 = {a, ab, abc, abcd, e, ef, efg, efgh} + vuint16m1_t v10 = rvv_uzp2q_u16(Utoa_4_helper(lo), Utoa_4_helper(hi), 8); + + // v12 = {0, a0, ab0, abc0, 0, e0, ef0, efg0} + vuint16m1_t v11 = __riscv_vmul_vv_u16m1(v10, __riscv_vmv_v_x_u16m1(10, 8), 8); + + vuint16m1_t v12 = __riscv_vreinterpret_v_u64m1_u16m1( + __riscv_vsll_vx_u64m1(__riscv_vreinterpret_v_u16m1_u64m1(v11), 16, 2)); + // v13 = {a, b, c, d, e, f, g, h} + vuint16m1_t v13 = __riscv_vsub_vv_u16m1(v10, v12, 8); + return v13; +} + +static sonic_force_inline char *Utoa_8(uint32_t val, char *out) { + /* convert to digits */ + vuint16m1_t v0 = UtoaNeon(val); + vuint16m1_t v1 = __riscv_vmv_v_x_u16m1(0, 8); + + /* convert to bytes, add '0' */ + vuint8m1_t v2 = __riscv_vslideup_vx_u8m1( + vqmovun_s16(__riscv_vreinterpret_v_u16m1_i16m1(v0)), + vqmovun_s16(__riscv_vreinterpret_v_u16m1_i16m1(v1)), 8, 16); + vuint8m1_t v3 = __riscv_vadd_vv_u8m1(v2, __riscv_vmv_v_x_u8m1('0', 16), 16); + + /* store high 64 bits */ + __riscv_vse8_v_u8m1((uint8_t *)(out), v3, 16); + return out + 8; +} + +static sonic_force_inline char *Utoa_16(uint64_t val, char *out) { + /* remaining digits */ + vuint16m1_t v0 = UtoaNeon((uint32_t)(val / 100000000)); + + vuint16m1_t v1 = UtoaNeon((uint32_t)(val % 100000000)); + + /* convert to bytes, add '0' */ + vuint8m1_t v2 = __riscv_vslideup_vx_u8m1( + vqmovun_s16(__riscv_vreinterpret_v_u16m1_i16m1(v0)), + vqmovun_s16(__riscv_vreinterpret_v_u16m1_i16m1(v1)), 8, 16); + + vuint8m1_t v3 = __riscv_vadd_vv_u8m1(v2, __riscv_vmv_v_x_u8m1('0', 16), 16); + + __riscv_vse8_v_u8m1((uint8_t *)(out), v3, 16); + return out + 16; +} + +} // namespace rvv_128 +} // namespace internal +} // namespace sonic_json diff --git a/include/sonic/internal/arch/rvv-128/quote.h b/include/sonic/internal/arch/rvv-128/quote.h new file mode 100644 index 0000000..ba52c1e --- /dev/null +++ b/include/sonic/internal/arch/rvv-128/quote.h @@ -0,0 +1,201 @@ +/* + * Copyright 2022 ByteDance Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +#include "../common/quote_common.h" +#include "../common/quote_tables.h" +#include "base.h" +#include "unicode.h" + +// Not check the buffer size of dst, src must be a valid UTF-8 string with +// null-terminator. 
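+// VEC_LEN is the fixed 16-byte vector width of this 128-bit kernel. PAGE_SIZE
+// feeds the tail handling in Quote below: a vector load past the logical end
+// of src is only issued when it cannot cross into the next (possibly
+// unmapped) page; otherwise the tail is first copied into a local 64-byte
+// buffer.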
+ +#define VEC_LEN 16 +#define PAGE_SIZE 4096 + +namespace sonic_json { +namespace internal { +namespace rvv_128 { + +#define MOVE_N_CHARS(src, N) \ + { \ + (src) += (N); \ + nb -= (N); \ + dst += (N); \ + } + +static sonic_force_inline long CopyAndGetEscapMask128(const char *src, + char *dst) { + vuint8m1_t v = + __riscv_vle8_v_u8m1(reinterpret_cast(src), 16); + __riscv_vse8_v_u8m1(reinterpret_cast(dst), v, 16); + + vbool8_t m1 = __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\\', 16), 16); + vbool8_t m2 = __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('"', 16), 16); + vbool8_t m3 = + __riscv_vmsltu_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\x20', 16), 16); + + vbool8_t m4 = __riscv_vmor_mm_b8(m1, m2, 16); + vbool8_t m5 = __riscv_vmor_mm_b8(m3, m4, 16); + + return __riscv_vfirst_m_b8(m5, 16); +} + +sonic_static_inline char *Quote(const char *src, size_t nb, char *dst) { + *dst++ = '"'; + sonic_assert(nb < (1ULL << 32)); + long mm; + int cn; + + /* VEC_LEN byte loop */ + while (nb >= VEC_LEN) { + /* check for matches */ + // TODO: optimize: exploit the simd bitmask in the escape block. + if ((mm = cn = CopyAndGetEscapMask128(src, dst)) >= 0) { + /* move to next block */ + MOVE_N_CHARS(src, cn); + DoEscape(src, dst, nb); + } else { + /* move to next block */ + MOVE_N_CHARS(src, VEC_LEN); + } + } + + if (nb > 0) { + char tmp_src[64] = {127}; + const char *src_r; +#ifdef SONIC_USE_SANITIZE + if (0) { +#else + /* This code would cause address sanitizer report heap-buffer-overflow. */ + if (((size_t)(src) & (PAGE_SIZE - 1)) <= (PAGE_SIZE - 64)) { + src_r = src; +#endif + } else { + std::memcpy(tmp_src, src, nb); + src_r = tmp_src; + } + while (int(nb) > 0) { + long tmp = CopyAndGetEscapMask128(src_r, dst); + cn = mm = (tmp >= static_cast(nb) ? -1 : tmp); + if (mm >= 0) { + MOVE_N_CHARS(src_r, cn); + DoEscape(src_r, dst, nb); + } else { + dst += nb; + nb = 0; + } + } + } + + *dst++ = '"'; + return dst; +} + +sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) { +#define SONIC_REPEAT8(v) {v v v v v v v v} + + uint8_t *dst = src; + uint8_t *sdst = src; + while (1) { + find: + auto block = StringBlock::Find(src); + if (block.HasQuoteFirst()) { + int idx = block.QuoteIndex(); + src += idx; + *src++ = '\0'; + return src - sdst - 1; + } + if (block.HasUnescaped()) { + err = kParseErrorUnEscaped; + return 0; + } + if (!block.HasBackslash()) { + src += VEC_LEN; + goto find; + } + + /* find out where the backspace is */ + auto bs_dist = block.BsIndex(); + src += bs_dist; + dst = src; + cont: + uint8_t escape_char = src[1]; + if (sonic_unlikely(escape_char == 'u')) { + if (!handle_unicode_codepoint(const_cast(&src), &dst)) { + err = kParseErrorEscapedUnicode; + return 0; + } + } else { + *dst = kEscapedMap[escape_char]; + if (sonic_unlikely(*dst == 0u)) { + err = kParseErrorEscapedFormat; + return 0; + } + src += 2; + dst += 1; + } + // fast path for continous escaped chars + if (*src == '\\') { + bs_dist = 0; + goto cont; + } + + find_and_move: + // Copy the next n bytes, and find the backslash and quote in them. + vuint8m1_t v = __riscv_vle8_v_u8m1(src, 16); + block = StringBlock::Find(v); + // If the next thing is the end quote, copy and return + if (block.HasQuoteFirst()) { + // we encountered quotes first. 
Move dst to point to quotes and exit + while (1) { + SONIC_REPEAT8(if (sonic_unlikely(*src == '"')) break; + else { *dst++ = *src++; }); + } + *dst = '\0'; + src++; + return dst - sdst; + } + if (block.HasUnescaped()) { + err = kParseErrorUnEscaped; + return 0; + } + if (!block.HasBackslash()) { + /* they are the same. Since they can't co-occur, it means we + * encountered neither. */ + __riscv_vse8_v_u8m1(dst, v, 16); + src += VEC_LEN; + dst += VEC_LEN; + goto find_and_move; + } + while (1) { + SONIC_REPEAT8(if (sonic_unlikely(*src == '\\')) break; + else { *dst++ = *src++; }); + } + goto cont; + } + sonic_assert(false); +#undef SONIC_REPEAT8 +} + +} // namespace rvv_128 +} // namespace internal +} // namespace sonic_json \ No newline at end of file diff --git a/include/sonic/internal/arch/rvv-128/simd.h b/include/sonic/internal/arch/rvv-128/simd.h new file mode 100644 index 0000000..1e5bc87 --- /dev/null +++ b/include/sonic/internal/arch/rvv-128/simd.h @@ -0,0 +1,430 @@ +// Copyright 2018-2019 The simdjson authors + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file may have been modified by ByteDance authors. All ByteDance +// Modifications are Copyright 2022 ByteDance Authors. + +#pragma once + +#include +#include + +namespace sonic_json { +namespace internal { +namespace rvv_128 { +// for rvv-128 +sonic_force_inline uint16_t to_bitmask(vbool8_t v) { + return __riscv_vmv_x_s_u16m1_u16(__riscv_vreinterpret_v_b8_u16m1(v)); +} + +typedef vuint8m1_t vuint8x16_t + __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vint8m1_t vint8x16_t + __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); + +template +struct simd8; + +// +// Base class of simd8 and simd8, both of which use vuint8x16_t +// internally. 
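+// On this 128-bit target a comparison Mask is a plain uint16_t bitmask with
+// one bit per byte lane (see to_bitmask above), not a byte vector as on NEON.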
+// +template +struct base_u8 { + vuint8x16_t value; + static const int SIZE = sizeof(value); + + // Conversion from/to SIMD register + sonic_force_inline base_u8(const vuint8x16_t _value) : value(_value) {} + sonic_force_inline operator const vuint8x16_t&() const { return this->value; } + sonic_force_inline operator vuint8x16_t&() { return this->value; } + + // Bit operations + sonic_force_inline simd8 operator|(const simd8 other) const { + return __riscv_vor_vv_u8m1(*this, other, 16); + } + sonic_force_inline simd8 operator&(const simd8 other) const { + return __riscv_vand_vv_u8m1(*this, other, 16); + } + sonic_force_inline simd8 operator^(const simd8 other) const { + return __riscv_vxor_vv_u8m1(*this, other, 16); + } + sonic_force_inline simd8 bit_andnot(const simd8 other) const { + return __riscv_vand_vv_u8m1(*this, __riscv_vnot_v_u8m1(other, 16), 16); + } + sonic_force_inline simd8 operator~() const { return *this ^ 0xFFu; } + sonic_force_inline simd8& operator|=(const simd8 other) { + auto this_cast = static_cast*>(this); + *this_cast = *this_cast | other; + return *this_cast; + } + sonic_force_inline simd8& operator&=(const simd8 other) { + auto this_cast = static_cast*>(this); + *this_cast = *this_cast & other; + return *this_cast; + } + sonic_force_inline simd8& operator^=(const simd8 other) { + auto this_cast = static_cast*>(this); + *this_cast = *this_cast ^ other; + return *this_cast; + } + + friend sonic_force_inline Mask operator==(const simd8 lhs, + const simd8 rhs) { + return to_bitmask(__riscv_vmseq_vv_u8m1_b8(lhs, rhs, 16)); + } + + template + sonic_force_inline simd8 prev(const simd8 prev_chunk) const { + vuint8x16_t prev_chunk_slidedown = + __riscv_vslidedown_vx_u8m1(prev_chunk, 16 - N, 16); + return __riscv_vslideup_vx_u8m1(prev_chunk_slidedown, *this, N, 16); + } +}; + +// Unsigned bytes +template <> +struct simd8 : base_u8 { + static sonic_force_inline vuint8x16_t splat(uint8_t _value) { + return __riscv_vmv_v_x_u8m1(_value, 16); + } + static sonic_force_inline vuint8x16_t zero() { + return __riscv_vmv_v_x_u8m1(0, 16); + } + static sonic_force_inline vuint8x16_t load(const uint8_t* values) { + return __riscv_vle8_v_u8m1(values, 16); + } + + sonic_force_inline simd8(const vuint8x16_t _value) + : base_u8(_value) {} + // Zero constructor + sonic_force_inline simd8() : simd8(zero()) {} + // Array constructor + sonic_force_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} + // Splat constructor + sonic_force_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Member-by-member initialization + sonic_force_inline simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, + uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, + uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + : simd8(vuint8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15}) {} + + // Repeat 16 values as many times as necessary (usually for lookup tables) + sonic_force_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, + uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } + + // Store to array + sonic_force_inline void store(uint8_t dst[16]) const { + return __riscv_vse8_v_u8m1(dst, *this, 16); + } + + // Saturated math + sonic_force_inline simd8 saturating_add( + const simd8 other) const { + 
return __riscv_vsaddu_vv_u8m1(*this, other, 16); + } + sonic_force_inline simd8 saturating_sub( + const simd8 other) const { + return __riscv_vssubu_vv_u8m1(*this, other, 16); + } + + // Addition/subtraction are the same for signed and unsigned + sonic_force_inline simd8 operator+( + const simd8 other) const { + return __riscv_vadd_vv_u8m1(*this, other, 16); + } + sonic_force_inline simd8 operator-( + const simd8 other) const { + return __riscv_vsub_vv_u8m1(*this, other, 16); + } + sonic_force_inline simd8& operator+=(const simd8 other) { + *this = *this + other; + return *this; + } + sonic_force_inline simd8& operator-=(const simd8 other) { + *this = *this - other; + return *this; + } + + // Order-specific operations + sonic_force_inline uint8_t max_val() const { + return __riscv_vmv_x_s_u8m1_u8( + __riscv_vredmaxu_vs_u8m1_u8m1(*this, __riscv_vmv_v_x_u8m1(0, 16), 16)); + } + sonic_force_inline uint8_t min_val() const { + return __riscv_vmv_x_s_u8m1_u8(__riscv_vredminu_vs_u8m1_u8m1( + *this, __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), 16)); + } + sonic_force_inline simd8 max_val(const simd8 other) const { + return __riscv_vmaxu_vv_u8m1(*this, other, 16); + } + sonic_force_inline simd8 min_val(const simd8 other) const { + return __riscv_vminu_vv_u8m1(*this, other, 16); + } + sonic_force_inline uint16_t operator<=(const simd8 other) const { + return to_bitmask(__riscv_vmsleu_vv_u8m1_b8(*this, other, 16)); + } + sonic_force_inline uint16_t operator>=(const simd8 other) const { + return to_bitmask(__riscv_vmsgeu_vv_u8m1_b8(*this, other, 16)); + } + sonic_force_inline uint16_t operator<(const simd8 other) const { + return to_bitmask(__riscv_vmsltu_vv_u8m1_b8(*this, other, 16)); + } + sonic_force_inline uint16_t operator>(const simd8 other) const { + return to_bitmask(__riscv_vmsgtu_vv_u8m1_b8(*this, other, 16)); + } + // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true + // = nonzero. For ARM, returns all 1's. + sonic_force_inline simd8 gt_bits(const simd8 other) const { + return simd8(*this > other); + } + // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true + // = nonzero. For ARM, returns all 1's. 
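+  // NOTE: on RVV the comparison below yields a uint16_t bitmask and the
+  // simd8 constructor splats its low byte, rather than producing the
+  // per-lane 0xFF/0x00 bytes the NEON version returns.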
+ sonic_force_inline simd8 lt_bits(const simd8 other) const { + return simd8(*this < other); + } + + // Bit-specific operations + sonic_force_inline uint16_t any_bits_set(simd8 bits) const { + return to_bitmask(__riscv_vmsgtu_vx_u8m1_b8( + __riscv_vand_vv_u8m1(*this, bits, 16), 0, 16)); + } + sonic_force_inline bool any_bits_set_anywhere() const { + return this->max_val() != 0; + } + sonic_force_inline bool any_bits_set_anywhere(simd8 bits) const { + return (*this & bits).any_bits_set_anywhere(); + } + template + sonic_force_inline simd8 shr() const { + const int b_half = N >> 1; + vuint8m1_t srl1 = __riscv_vsrl_vx_u8m1(*this, b_half, 16); + return __riscv_vsrl_vx_u8m1(srl1, b_half + (N & 0x1), 16); + + // return vshrq_n_u8(*this, N); + } + template + sonic_force_inline simd8 shl() const { + return __riscv_vsll_vx_u8m1(*this, N, 16); + } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + sonic_force_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + + template + sonic_force_inline simd8 lookup_16(L replace0, L replace1, L replace2, + L replace3, L replace4, L replace5, + L replace6, L replace7, L replace8, + L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } + + template + sonic_force_inline simd8 apply_lookup_16_to( + const simd8 original) { + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8(simd8(original), 16, 16); + return __riscv_vmerge_vxm_u8m1( + __riscv_vrgather_vv_u8m1(*this, simd8(original), 16), 0, mask, + 16); + // return vqtbl1q_u8(*this, simd8(original)); + } +}; + +// Signed bytes +template <> +struct simd8 { + vint8x16_t value; + + static sonic_force_inline simd8 splat(int8_t _value) { + return __riscv_vmv_v_x_i8m1(_value, 16); + } + static sonic_force_inline simd8 zero() { + return __riscv_vmv_v_x_i8m1(0, 16); + } + static sonic_force_inline simd8 load(const int8_t values[16]) { + return __riscv_vle8_v_i8m1(values, 16); + } + + // Conversion from/to SIMD register + sonic_force_inline simd8(const vint8x16_t _value) : value{_value} {} + sonic_force_inline operator const vint8x16_t&() const { return this->value; } + sonic_force_inline operator vint8x16_t&() { return this->value; } + + // Zero constructor + sonic_force_inline simd8() : simd8(zero()) {} + // Splat constructor + sonic_force_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + sonic_force_inline simd8(const int8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + sonic_force_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : simd8(vint8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + sonic_force_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, + int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } + + // Store to 
array + sonic_force_inline void store(int8_t dst[16]) const { + return __riscv_vse8_v_i8m1(dst, *this, 16); + } + + sonic_force_inline explicit operator simd8() const { + return __riscv_vreinterpret_v_i8m1_u8m1(this->value); + } + + // Math + sonic_force_inline simd8 operator+(const simd8 other) const { + return __riscv_vadd_vv_i8m1(*this, other, 16); + } + sonic_force_inline simd8 operator-(const simd8 other) const { + return __riscv_vsub_vv_i8m1(*this, other, 16); + } + sonic_force_inline simd8& operator+=(const simd8 other) { + *this = *this + other; + return *this; + } + sonic_force_inline simd8& operator-=(const simd8 other) { + *this = *this - other; + return *this; + } + + // Order-sensitive comparisons + sonic_force_inline simd8 max_val(const simd8 other) const { + return __riscv_vmax_vv_i8m1(*this, other, 16); + } + sonic_force_inline simd8 min_val(const simd8 other) const { + return __riscv_vmin_vv_i8m1(*this, other, 16); + } + sonic_force_inline uint16_t operator>(const simd8 other) const { + return to_bitmask(__riscv_vmsgt_vv_i8m1_b8(*this, other, 16)); + } + sonic_force_inline uint16_t operator<(const simd8 other) const { + return to_bitmask(__riscv_vmslt_vv_i8m1_b8(*this, other, 16)); + } + sonic_force_inline uint16_t operator==(const simd8 other) const { + return to_bitmask(__riscv_vmseq_vv_i8m1_b8(*this, other, 16)); + } + + template + sonic_force_inline simd8 prev(const simd8 prev_chunk) const { + vint8m1_t prev_chunk_slidedown = + __riscv_vslidedown_vx_i8m1(prev_chunk, 16 - N, 16); + return __riscv_vslideup_vx_i8m1(prev_chunk_slidedown, *this, N, 16); + } + + // Perform a lookup assuming no value is larger than 16 + template + sonic_force_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + template + sonic_force_inline simd8 lookup_16(L replace0, L replace1, L replace2, + L replace3, L replace4, L replace5, + L replace6, L replace7, L replace8, + L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } + + template + sonic_force_inline simd8 apply_lookup_16_to(const simd8 original) { + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8(simd8(original), 16, 16); + return __riscv_vmerge_vxm_i8m1( + __riscv_vrgather_vv_i8m1(*this, simd8(original), 16), 0, mask, + 16); + } +}; + +sonic_force_inline uint64_t merge_bitmask(uint16_t mask1, uint16_t mask2, + uint16_t mask3, uint16_t mask4) { + return (uint64_t)mask1 | ((uint64_t)mask2 << 16) | ((uint64_t)mask3 << 32) | + ((uint64_t)mask4 << 48); +} + +template +struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, + "ARM kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8& other) = + delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + sonic_force_inline simd8x64(const simd8 chunk0, const simd8 chunk1, + const simd8 chunk2, const simd8 chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + sonic_force_inline simd8x64(const T ptr[64]) + : chunks{simd8::load(ptr), simd8::load(ptr + 16), + simd8::load(ptr + 32), simd8::load(ptr + 48)} {} + + sonic_force_inline void store(T ptr[64]) const { + 
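+    // Write the four 16-byte chunks to consecutive 16-byte slots (64 bytes).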
this->chunks[0].store(ptr + sizeof(simd8) * 0); + this->chunks[1].store(ptr + sizeof(simd8) * 1); + this->chunks[2].store(ptr + sizeof(simd8) * 2); + this->chunks[3].store(ptr + sizeof(simd8) * 3); + } + + sonic_force_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | + (this->chunks[2] | this->chunks[3]); + } + + sonic_force_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return merge_bitmask(this->chunks[0] == mask, this->chunks[1] == mask, + this->chunks[2] == mask, this->chunks[3] == mask); + } + + sonic_force_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return merge_bitmask(this->chunks[0] <= mask, this->chunks[1] <= mask, + this->chunks[2] <= mask, this->chunks[3] <= mask); + } +}; // struct simd8x64 + +} // namespace rvv_128 +} // namespace internal +} // namespace sonic_json diff --git a/include/sonic/internal/arch/rvv-128/skip.h b/include/sonic/internal/arch/rvv-128/skip.h new file mode 100644 index 0000000..343e328 --- /dev/null +++ b/include/sonic/internal/arch/rvv-128/skip.h @@ -0,0 +1,72 @@ +/* + * Copyright 2022 ByteDance Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#define VEC_LEN 16 + +#include +#include + +#include "base.h" +#include "simd.h" + +namespace sonic_json { +namespace internal { +namespace rvv_128 { + +using sonic_json::internal::common::EqBytes4; +using sonic_json::internal::common::SkipLiteral; + +#include "../common/riscv_common/skip.inc.h" + +sonic_force_inline bool SkipContainer(const uint8_t *data, size_t &pos, + size_t len, uint8_t left, uint8_t right) { + return skip_container>(data, pos, len, left, right); +} + +sonic_force_inline uint8_t skip_space(const uint8_t *data, size_t &pos, + size_t &, uint64_t &) { + // fast path for single space + if (!IsSpace(data[pos++])) return data[pos - 1]; + if (!IsSpace(data[pos++])) return data[pos - 1]; + + // current pos is out of block + while (1) { + uint16_t nonspace = GetNonSpaceBits(data + pos); + if (nonspace) { + int tmp = __builtin_ctz(nonspace); + pos += tmp; + return data[pos++]; + } else { + pos += 16; + } + } + sonic_assert(false && "!should not happen"); +} + +sonic_force_inline uint8_t skip_space_safe(const uint8_t *data, size_t &pos, + size_t len, size_t &, uint64_t &) { + while (pos < len && IsSpace(data[pos++])); + // if not found, still return the space chars + return data[pos - 1]; +} + +} // namespace rvv_128 +} // namespace internal +} // namespace sonic_json + +#undef VEC_LEN diff --git a/include/sonic/internal/arch/rvv-128/str2int.h b/include/sonic/internal/arch/rvv-128/str2int.h new file mode 100644 index 0000000..b680d93 --- /dev/null +++ b/include/sonic/internal/arch/rvv-128/str2int.h @@ -0,0 +1,40 @@ +/* + * Copyright 2022 ByteDance Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "simd.h" + +namespace sonic_json { +namespace internal { +namespace rvv_128 { + +sonic_force_inline uint64_t simd_str2int(const char* c, int& man_nd) { + uint64_t sum = 0; + int i = 0; + while (c[i] >= '0' && c[i] <= '9' && i < man_nd) { + sum = sum * 10 + (c[i] - '0'); + i++; + } + man_nd = i; + return sum; +} + +} // namespace rvv_128 +} // namespace internal +} // namespace sonic_json diff --git a/include/sonic/internal/arch/rvv-128/unicode.h b/include/sonic/internal/arch/rvv-128/unicode.h new file mode 100644 index 0000000..29b6815 --- /dev/null +++ b/include/sonic/internal/arch/rvv-128/unicode.h @@ -0,0 +1,105 @@ +/* + * Copyright 2022 ByteDance Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "../common/unicode_common.h" +#include "base.h" +#include "simd.h" + +namespace sonic_json { +namespace internal { +namespace rvv_128 { + +using sonic_json::internal::common::handle_unicode_codepoint; + +struct StringBlock { + public: + sonic_force_inline static StringBlock Find(const uint8_t *src); + sonic_force_inline static StringBlock Find(vuint8m1_t &v); + sonic_force_inline bool HasQuoteFirst() const { + // return (((bs_bits - 1) & quote_bits) != 0) && !HasUnescaped(); + return (((bs_bits)-1) & (quote_bits)) != 0 && !HasUnescaped(); + } + sonic_force_inline bool HasBackslash() const { + // return ((quote_bits - 1) & bs_bits) != 0; + return (((quote_bits)-1) & (bs_bits)) != 0; + } + sonic_force_inline bool HasUnescaped() const { + // return ((quote_bits - 1) & unescaped_bits) != 0; + return (((quote_bits)-1) & (unescaped_bits)) != 0; + } + sonic_force_inline int QuoteIndex() const { + // return TrailingZeroes(quote_bits); + return TrailingZeroes(quote_bits); + } + sonic_force_inline int BsIndex() const { + // return TrailingZeroes(bs_bits); + return TrailingZeroes(bs_bits); + } + sonic_force_inline int UnescapedIndex() const { + // return TrailingZeroes(unescaped_bits); + return TrailingZeroes(unescaped_bits); + } + + uint16_t bs_bits; + uint16_t quote_bits; + uint16_t unescaped_bits; +}; + +sonic_force_inline StringBlock StringBlock::Find(const uint8_t *src) { + vuint8m1_t v = + __riscv_vle8_v_u8m1(reinterpret_cast(src), 16); + vuint16m1_t m1 = __riscv_vreinterpret_v_b8_u16m1( + __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\\', 16), 16)); + vuint16m1_t m2 = __riscv_vreinterpret_v_b8_u16m1( + __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('"', 16), 16)); + vuint16m1_t m3 = __riscv_vreinterpret_v_b8_u16m1( + __riscv_vmsleu_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\x1f', 
16), 16));
+  return {__riscv_vmv_x_s_u16m1_u16(m1), __riscv_vmv_x_s_u16m1_u16(m2),
+          __riscv_vmv_x_s_u16m1_u16(m3)};
+}
+
+sonic_force_inline StringBlock StringBlock::Find(vuint8m1_t &v) {
+  vuint16m1_t m1 = __riscv_vreinterpret_v_b8_u16m1(
+      __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\\', 16), 16));
+  vuint16m1_t m2 = __riscv_vreinterpret_v_b8_u16m1(
+      __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('"', 16), 16));
+  vuint16m1_t m3 = __riscv_vreinterpret_v_b8_u16m1(
+      __riscv_vmsleu_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\x1f', 16), 16));
+  return {__riscv_vmv_x_s_u16m1_u16(m1), __riscv_vmv_x_s_u16m1_u16(m2),
+          __riscv_vmv_x_s_u16m1_u16(m3)};
+}
+
+sonic_force_inline uint16_t GetNonSpaceBits(const uint8_t *data) {
+  vuint8m1_t v =
+      __riscv_vle8_v_u8m1(reinterpret_cast(data), 16);
+  vbool8_t m1 = __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1(' ', 16), 16);
+  vbool8_t m2 = __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\t', 16), 16);
+  vbool8_t m3 = __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\n', 16), 16);
+  vbool8_t m4 = __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\r', 16), 16);
+  vbool8_t m5 = __riscv_vmor_mm_b8(m1, m2, 16);
+  vbool8_t m6 = __riscv_vmor_mm_b8(m3, m4, 16);
+  vbool8_t m7 = __riscv_vmor_mm_b8(m5, m6, 16);
+  vbool8_t m8 = __riscv_vmnot_m_b8(m7, 16);
+  return __riscv_vmv_x_s_u16m1_u16(__riscv_vreinterpret_v_b8_u16m1(m8));
+}
+
+}  // namespace rvv_128
+}  // namespace internal
+}  // namespace sonic_json

diff --git a/include/sonic/internal/arch/simd_dispatch.h b/include/sonic/internal/arch/simd_dispatch.h
index 0c474ce..7fcc377 100644
--- a/include/sonic/internal/arch/simd_dispatch.h
+++ b/include/sonic/internal/arch/simd_dispatch.h
@@ -41,6 +41,11 @@
 #define INCLUDE_ARCH_FILE(file) SONIC_STRINGIFY(neon/file)
 #endif
+#if defined(SONIC_HAVE_RVV_128)
+#define SONIC_USING_ARCH_FUNC(func) using rvv_128::func
+#define INCLUDE_ARCH_FILE(file) SONIC_STRINGIFY(rvv-128/file)
+#endif
+
 #elif defined(SONIC_DYNAMIC_DISPATCH)

 // TODO: support SVE2 runtime dispatch

diff --git a/include/sonic/internal/arch/sonic_cpu_feature.h b/include/sonic/internal/arch/sonic_cpu_feature.h
index 21d2707..575b18a 100644
--- a/include/sonic/internal/arch/sonic_cpu_feature.h
+++ b/include/sonic/internal/arch/sonic_cpu_feature.h
@@ -44,4 +44,8 @@
 #if defined(__ARM_FEATURE_SVE2) && (__ARM_FEATURE_SVE_BITS == 128)
 #define SONIC_HAVE_SVE2_128
 #endif
+#if defined(__riscv_vector) && defined(__riscv_v_fixed_vlen) && \
+    __riscv_v_fixed_vlen == 128
+#define SONIC_HAVE_RVV_128
+#endif
 #endif
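
The SONIC_HAVE_RVV_128 guard above mirrors the SVE2 check: it is defined only
when the compiler advertises a fixed 128-bit vector length
(__riscv_v_fixed_vlen == 128), which the -march=rv64gcv_zvl128b
-mrvv-vector-bits=zvl flags added in set_arch_flags.cmake guarantee. The idiom
every kernel in this series builds on is turning a vbool8_t comparison result
into a 16-bit integer bitmask by reinterpreting the mask register. A minimal
self-contained sketch of that idiom under the same toolchain assumptions
(eq_bitmask and first_quote are illustrative names, not part of the patch):

#include <riscv_vector.h>
#include <stdint.h>

// One bit per byte lane of p[0..15] that equals c.
static inline uint16_t eq_bitmask(const uint8_t *p, uint8_t c) {
  vuint8m1_t v = __riscv_vle8_v_u8m1(p, 16);
  vbool8_t m = __riscv_vmseq_vx_u8m1_b8(v, c, 16);
  // With VLEN == 128 and SEW == 8 the mask register holds exactly 16 bits, so
  // reinterpreting it as a u16 vector and reading lane 0 yields the bitmask.
  return __riscv_vmv_x_s_u16m1_u16(__riscv_vreinterpret_v_b8_u16m1(m));
}

// Index of the first '"' in a 16-byte block, or -1 if absent: the same
// mask-consumption pattern SkipString and GetNextToken use via TrailingZeroes.
static inline int first_quote(const uint8_t *p) {
  uint16_t mask = eq_bitmask(p, '"');
  return mask ? __builtin_ctz(mask) : -1;
}

Where only a first-match index is needed, quote.h's CopyAndGetEscapMask128
obtains it directly from the mask register with __riscv_vfirst_m_b8 instead of
materializing a bitmask.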
include/sonic/internal/arch/common/riscv_common/skip.inc.h create mode 100644 include/sonic/internal/arch/rvv-128/base.h create mode 100644 include/sonic/internal/arch/rvv-128/itoa.h create mode 100644 include/sonic/internal/arch/rvv-128/quote.h create mode 100644 include/sonic/internal/arch/rvv-128/simd.h create mode 100644 include/sonic/internal/arch/rvv-128/skip.h create mode 100644 include/sonic/internal/arch/rvv-128/str2int.h create mode 100644 include/sonic/internal/arch/rvv-128/unicode.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 36772ce..c0a0d57 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,7 @@ option(BUILD_UNITTEST "Build unittest." ON) option(BUILD_FUZZ "Build fuzz." OFF) option(BUILD_BENCH "Build benchmark." OFF) option(ENABLE_SVE2_128 "Build for Arm SVE2 with 128 bit vector size" OFF) +option(ENABLE_RVV_128 "Build for RISC-V RVV with 128 bit vector size" OFF) set(CMAKE_CXX_EXTENSIONS OFF) if(BUILD_UNITTEST) diff --git a/cmake/set_arch_flags.cmake b/cmake/set_arch_flags.cmake index 7975cd0..fd25591 100644 --- a/cmake/set_arch_flags.cmake +++ b/cmake/set_arch_flags.cmake @@ -8,6 +8,10 @@ function(set_arch_flags target arch) else() target_compile_options(${target} PRIVATE -march=armv8-a) endif() + elseif(arch MATCHES "riscv64") + if(ENABLE_RVV_128) + target_compile_options(${target} PRIVATE -march=rv64gcv_zvl128b -mrvv-vector-bits=zvl) + endif() else() message(FATAL_ERROR "Unsupported architecture: ${arch}") endif() diff --git a/include/sonic/internal/arch/common/riscv_common/base.h b/include/sonic/internal/arch/common/riscv_common/base.h new file mode 100644 index 0000000..3e2c3ab --- /dev/null +++ b/include/sonic/internal/arch/common/riscv_common/base.h @@ -0,0 +1,70 @@ +/* + * Copyright 2022 ByteDance Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +#include + +namespace sonic_json { +namespace internal { +namespace riscv_common { + +// We sometimes call trailing_zero on inputs that are zero, +// but the algorithms do not end up using the returned value. +// Sadly, sanitizers are not smart enough to figure it out. + +sonic_force_inline int TrailingZeroes(uint64_t input_num) { + //////// + // You might expect the next line to be equivalent to + // return (int)_tzcnt_u64(input_num); + // but the generated code differs and might be less efficient? 
+ //////// + return __builtin_ctzll(input_num); +} + +/* result might be undefined when input_num is zero */ +sonic_force_inline uint64_t ClearLowestBit(uint64_t input_num) { + return input_num & (input_num - 1); +} + +/* result might be undefined when input_num is zero */ +sonic_force_inline int LeadingZeroes(uint64_t input_num) { + return __builtin_clzll(input_num); +} + +sonic_force_inline long long int CountOnes(uint64_t input_num) { + return __builtin_popcountll(input_num); +} + +sonic_force_inline uint64_t PrefixXor(uint64_t bitmask) { + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + return bitmask; +} + +template +sonic_force_inline void Xmemcpy(void* dst_, const void* src_, size_t chunks) { + std::memcpy(dst_, src_, chunks * ChunkSize); +} + +} // namespace riscv_common +} // namespace internal +} // namespace sonic_json \ No newline at end of file diff --git a/include/sonic/internal/arch/common/riscv_common/skip.inc.h b/include/sonic/internal/arch/common/riscv_common/skip.inc.h new file mode 100644 index 0000000..8877702 --- /dev/null +++ b/include/sonic/internal/arch/common/riscv_common/skip.inc.h @@ -0,0 +1,160 @@ +/* + * Copyright 2022 ByteDance Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef VEC_LEN +#error "Define vector length firstly!" 
+#endif + +// pos is the after the ending quote +sonic_force_inline int SkipString(const uint8_t *data, size_t &pos, + size_t len) { + const static int kEscaped = 2; + const static int kNormal = 1; + const static int kUnclosed = 0; + uint16_t quote_bits = 0; + uint16_t bs_bits = 0; + int ret = kNormal; + while (pos + VEC_LEN <= len) { + vuint8m1_t v = __riscv_vle8_v_u8m1(data + pos, 16); + bs_bits = to_bitmask( + __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\\', 16), 16)); + quote_bits = to_bitmask( + __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('"', 16), 16)); + if (((bs_bits - 1) & quote_bits) != 0) { + pos += TrailingZeroes(quote_bits) + 1; + return ret; + } + if (bs_bits) { + ret = kEscaped; + pos += (TrailingZeroes(bs_bits) + 2); + while (pos < len) { + if (data[pos] == '\\') { + pos += 2; + } else { + break; + } + } + } else { + pos += VEC_LEN; + } + } + while (pos < len) { + if (data[pos] == '\\') { + if (pos + 1 >= len) { + return kUnclosed; + } + ret = kEscaped; + pos += 2; + continue; + } + if (data[pos++] == '"') { + return ret; + } + }; + return kUnclosed; +} + +template +sonic_force_inline uint64_t GetStringBits(const uint8_t *data, + uint64_t &prev_instring, + uint64_t &prev_escaped) { + const T v(data); + uint64_t escaped = 0; + uint64_t bs_bits = v.eq('\\'); + if (bs_bits) { + escaped = common::GetEscaped<64>(prev_escaped, bs_bits); + } else { + escaped = prev_escaped; + prev_escaped = 0; + } + uint64_t quote_bits = v.eq('"') & ~escaped; + uint64_t in_string = PrefixXor(quote_bits) ^ prev_instring; + prev_instring = uint64_t(static_cast(in_string) >> 63); + return in_string; +} + +// GetNextToken find the next characters in tokens and update the position to +// it. +template +sonic_force_inline uint8_t GetNextToken(const uint8_t *data, size_t &pos, + size_t len, const char (&tokens)[N]) { + while (pos + VEC_LEN <= len) { + vuint8m1_t v = __riscv_vle8_v_u8m1(data + pos, 16); + vbool8_t vor = __riscv_vmclr_m_b8(16); + for (size_t i = 0; i < N - 1; i++) { + vbool8_t cmp_res = __riscv_vmseq_vv_u8m1_b8( + v, __riscv_vmv_v_x_u8m1((uint8_t)(tokens[i]), 16), 16); + vor = __riscv_vmor_mm_b8(vor, cmp_res, 16); + } + + uint16_t next = to_bitmask(vor); + if (next) { + pos += TrailingZeroes(next); + return data[pos]; + } + pos += VEC_LEN; + } + while (pos < len) { + for (size_t i = 0; i < N - 1; i++) { + if (data[pos] == tokens[i]) { + return tokens[i]; + } + } + pos++; + } + return '\0'; +} + +template +sonic_force_inline bool skip_container(const uint8_t *data, size_t &pos, + size_t len, uint8_t left, + uint8_t right) { + uint64_t prev_instring = 0, prev_escaped = 0, instring; + int rbrace_num = 0, lbrace_num = 0, last_lbrace_num; + const uint8_t *p; + while (pos + 64 <= len) { + p = data + pos; +#define SKIP_LOOP() \ + { \ + instring = GetStringBits(p, prev_instring, prev_escaped); \ + T v(p); \ + last_lbrace_num = lbrace_num; \ + uint64_t rbrace = v.eq(right) & ~instring; \ + uint64_t lbrace = v.eq(left) & ~instring; \ + /* traverse each '}' */ \ + while (rbrace > 0) { \ + rbrace_num++; \ + lbrace_num = last_lbrace_num + CountOnes((rbrace - 1) & lbrace); \ + bool is_closed = lbrace_num < rbrace_num; \ + if (is_closed) { \ + sonic_assert(rbrace_num == lbrace_num + 1); \ + pos += TrailingZeroes(rbrace) + 1; \ + return true; \ + } \ + rbrace &= (rbrace - 1); \ + } \ + lbrace_num = last_lbrace_num + CountOnes(lbrace); \ + } + SKIP_LOOP(); + pos += 64; + } + uint8_t buf[64] = {0}; + std::memcpy(buf, data + pos, len - pos); + p = buf; + SKIP_LOOP(); +#undef SKIP_LOOP + return 
false; +} \ No newline at end of file diff --git a/include/sonic/internal/arch/rvv-128/base.h b/include/sonic/internal/arch/rvv-128/base.h new file mode 100644 index 0000000..8e74d1d --- /dev/null +++ b/include/sonic/internal/arch/rvv-128/base.h @@ -0,0 +1,35 @@ +/* + * Copyright 2022 ByteDance Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +#include + +#include "../common/riscv_common/base.h" + +namespace sonic_json { +namespace internal { +namespace rvv_128 { +using sonic_json::internal::riscv_common::ClearLowestBit; +using sonic_json::internal::riscv_common::CountOnes; +using sonic_json::internal::riscv_common::LeadingZeroes; +using sonic_json::internal::riscv_common::PrefixXor; +using sonic_json::internal::riscv_common::TrailingZeroes; +using sonic_json::internal::riscv_common::Xmemcpy; +} // namespace rvv_128 +} // namespace internal +} // namespace sonic_json \ No newline at end of file diff --git a/include/sonic/internal/arch/rvv-128/itoa.h b/include/sonic/internal/arch/rvv-128/itoa.h new file mode 100644 index 0000000..a6e7551 --- /dev/null +++ b/include/sonic/internal/arch/rvv-128/itoa.h @@ -0,0 +1,118 @@ +/* + * Copyright 2022 ByteDance Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include + +#include "simd.h" + +namespace sonic_json { +namespace internal { +namespace rvv_128 { +// Convert num {abcd} to {axxxx, abxxxx, abcxxxx, abcdxxxx} +static sonic_force_inline vuint16m1_t Utoa_4_helper(uint16_t num) { + uint16_t v = num << 2; + vuint16m1_t v00 = __riscv_vmv_v_x_u16m1(v, 4); + + vuint16m1_t kVecDiv = __riscv_vreinterpret_v_u64m1_u16m1( + __riscv_vmv_v_x_u64m1(0x80003334147b20c5, 1)); + vuint32m1_t v01 = __riscv_vlmul_trunc_v_u32m2_u32m1( + __riscv_vwmulu_vv_u32m2(v00, kVecDiv, 4)); + vuint16m1_t v02 = + __riscv_vnsrl_wx_u16m1(__riscv_vlmul_ext_v_u32m1_u32m2(v01), 16, 4); + vuint16m1_t kVecShift = __riscv_vreinterpret_v_u64m1_u16m1( + __riscv_vmv_v_x_u64m1(0x8000200008000080, 1)); + vuint32m1_t v03 = __riscv_vlmul_trunc_v_u32m2_u32m1( + __riscv_vwmulu_vv_u32m2(v02, kVecShift, 4)); + return __riscv_vreinterpret_v_u32m1_u16m1(v03); +} + +static sonic_force_inline vuint16m1_t rvv_uzp2q_u16(vuint16m1_t a, + vuint16m1_t b, size_t vl) { + vuint16m2_t ab = + __riscv_vset_v_u16m1_u16m2(__riscv_vlmul_ext_v_u16m1_u16m2(a), 1, b); + vuint32m2_t ab_ = __riscv_vreinterpret_v_u16m2_u32m2(ab); + vuint16m1_t res = __riscv_vnsrl_wx_u16m1(ab_, 16, 16); + return res; +} + +static sonic_force_inline vuint8m1_t vqmovun_s16(vint16m1_t a) { + vuint16m1_t a_non_neg = + __riscv_vreinterpret_v_i16m1_u16m1(__riscv_vmax_vx_i16m1(a, 0, 8)); + return __riscv_vlmul_ext_v_u8mf2_u8m1( + __riscv_vnclipu_wx_u8mf2(a_non_neg, 0, __RISCV_VXRM_RDN, 8)); +} + +// Convert num's each digit as packed 16-bit in a vector. +// num's digits as abcdefgh (high bits is 0 if not enough) +// The converted vector is { a, b, c, d, e, f, g, h } +sonic_force_inline vuint16m1_t UtoaNeon(uint32_t num) { + uint16_t hi = num % 10000; // {efgh} + uint16_t lo = num / 10000; // {abcd} + + // v10 = {a, ab, abc, abcd, e, ef, efg, efgh} + vuint16m1_t v10 = rvv_uzp2q_u16(Utoa_4_helper(lo), Utoa_4_helper(hi), 8); + + // v12 = {0, a0, ab0, abc0, 0, e0, ef0, efg0} + vuint16m1_t v11 = __riscv_vmul_vv_u16m1(v10, __riscv_vmv_v_x_u16m1(10, 8), 8); + + vuint16m1_t v12 = __riscv_vreinterpret_v_u64m1_u16m1( + __riscv_vsll_vx_u64m1(__riscv_vreinterpret_v_u16m1_u64m1(v11), 16, 2)); + // v13 = {a, b, c, d, e, f, g, h} + vuint16m1_t v13 = __riscv_vsub_vv_u16m1(v10, v12, 8); + return v13; +} + +static sonic_force_inline char *Utoa_8(uint32_t val, char *out) { + /* convert to digits */ + vuint16m1_t v0 = UtoaNeon(val); + vuint16m1_t v1 = __riscv_vmv_v_x_u16m1(0, 8); + + /* convert to bytes, add '0' */ + vuint8m1_t v2 = __riscv_vslideup_vx_u8m1( + vqmovun_s16(__riscv_vreinterpret_v_u16m1_i16m1(v0)), + vqmovun_s16(__riscv_vreinterpret_v_u16m1_i16m1(v1)), 8, 16); + vuint8m1_t v3 = __riscv_vadd_vv_u8m1(v2, __riscv_vmv_v_x_u8m1('0', 16), 16); + + /* store high 64 bits */ + __riscv_vse8_v_u8m1((uint8_t *)(out), v3, 16); + return out + 8; +} + +static sonic_force_inline char *Utoa_16(uint64_t val, char *out) { + /* remaining digits */ + vuint16m1_t v0 = UtoaNeon((uint32_t)(val / 100000000)); + + vuint16m1_t v1 = UtoaNeon((uint32_t)(val % 100000000)); + + /* convert to bytes, add '0' */ + vuint8m1_t v2 = __riscv_vslideup_vx_u8m1( + vqmovun_s16(__riscv_vreinterpret_v_u16m1_i16m1(v0)), + vqmovun_s16(__riscv_vreinterpret_v_u16m1_i16m1(v1)), 8, 16); + + vuint8m1_t v3 = __riscv_vadd_vv_u8m1(v2, __riscv_vmv_v_x_u8m1('0', 16), 16); + + __riscv_vse8_v_u8m1((uint8_t *)(out), v3, 16); + return out + 16; +} + +} // namespace rvv_128 +} // namespace internal +} // namespace sonic_json diff --git 
a/include/sonic/internal/arch/rvv-128/quote.h b/include/sonic/internal/arch/rvv-128/quote.h
new file mode 100644
index 0000000..ba52c1e
--- /dev/null
+++ b/include/sonic/internal/arch/rvv-128/quote.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright 2022 ByteDance Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+#include
+
+#include "../common/quote_common.h"
+#include "../common/quote_tables.h"
+#include "base.h"
+#include "unicode.h"
+
+// Does not check the buffer size of dst; src must be a valid UTF-8 string
+// with a null-terminator.
+
+#define VEC_LEN 16
+#define PAGE_SIZE 4096
+
+namespace sonic_json {
+namespace internal {
+namespace rvv_128 {
+
+#define MOVE_N_CHARS(src, N) \
+  {                          \
+    (src) += (N);            \
+    nb -= (N);               \
+    dst += (N);              \
+  }
+
+static sonic_force_inline long CopyAndGetEscapeMask128(const char *src,
+                                                       char *dst) {
+  vuint8m1_t v =
+      __riscv_vle8_v_u8m1(reinterpret_cast<const uint8_t *>(src), 16);
+  __riscv_vse8_v_u8m1(reinterpret_cast<uint8_t *>(dst), v, 16);
+
+  vbool8_t m1 = __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\\', 16), 16);
+  vbool8_t m2 = __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('"', 16), 16);
+  vbool8_t m3 =
+      __riscv_vmsltu_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\x20', 16), 16);
+
+  vbool8_t m4 = __riscv_vmor_mm_b8(m1, m2, 16);
+  vbool8_t m5 = __riscv_vmor_mm_b8(m3, m4, 16);
+
+  return __riscv_vfirst_m_b8(m5, 16);
+}
+
+sonic_static_inline char *Quote(const char *src, size_t nb, char *dst) {
+  *dst++ = '"';
+  sonic_assert(nb < (1ULL << 32));
+  long mm;
+  int cn;
+
+  /* VEC_LEN byte loop */
+  while (nb >= VEC_LEN) {
+    /* check for matches */
+    // TODO: optimize: exploit the simd bitmask in the escape block.
+    if ((mm = cn = CopyAndGetEscapeMask128(src, dst)) >= 0) {
+      /* move to next block */
+      MOVE_N_CHARS(src, cn);
+      DoEscape(src, dst, nb);
+    } else {
+      /* move to next block */
+      MOVE_N_CHARS(src, VEC_LEN);
+    }
+  }
+
+  if (nb > 0) {
+    char tmp_src[64] = {127};
+    const char *src_r;
+#ifdef SONIC_USE_SANITIZE
+    if (0) {
+#else
+    /* This code would cause address sanitizer to report a
+       heap-buffer-overflow. */
+    if (((size_t)(src) & (PAGE_SIZE - 1)) <= (PAGE_SIZE - 64)) {
+      src_r = src;
+#endif
+    } else {
+      std::memcpy(tmp_src, src, nb);
+      src_r = tmp_src;
+    }
+    while (int(nb) > 0) {
+      long tmp = CopyAndGetEscapeMask128(src_r, dst);
+      cn = mm = (tmp >= static_cast<long>(nb) ?
-1 : tmp);
+      if (mm >= 0) {
+        MOVE_N_CHARS(src_r, cn);
+        DoEscape(src_r, dst, nb);
+      } else {
+        dst += nb;
+        nb = 0;
+      }
+    }
+  }
+
+  *dst++ = '"';
+  return dst;
+}
+
+sonic_force_inline size_t parseStringInplace(uint8_t *&src, SonicError &err) {
+#define SONIC_REPEAT8(v) {v v v v v v v v}
+
+  uint8_t *dst = src;
+  uint8_t *sdst = src;
+  while (1) {
+  find:
+    auto block = StringBlock::Find(src);
+    if (block.HasQuoteFirst()) {
+      int idx = block.QuoteIndex();
+      src += idx;
+      *src++ = '\0';
+      return src - sdst - 1;
+    }
+    if (block.HasUnescaped()) {
+      err = kParseErrorUnEscaped;
+      return 0;
+    }
+    if (!block.HasBackslash()) {
+      src += VEC_LEN;
+      goto find;
+    }
+
+    /* find out where the backslash is */
+    auto bs_dist = block.BsIndex();
+    src += bs_dist;
+    dst = src;
+  cont:
+    uint8_t escape_char = src[1];
+    if (sonic_unlikely(escape_char == 'u')) {
+      if (!handle_unicode_codepoint(const_cast<const uint8_t **>(&src),
+                                    &dst)) {
+        err = kParseErrorEscapedUnicode;
+        return 0;
+      }
+    } else {
+      *dst = kEscapedMap[escape_char];
+      if (sonic_unlikely(*dst == 0u)) {
+        err = kParseErrorEscapedFormat;
+        return 0;
+      }
+      src += 2;
+      dst += 1;
+    }
+    // fast path for consecutive escaped chars
+    if (*src == '\\') {
+      bs_dist = 0;
+      goto cont;
+    }
+
+  find_and_move:
+    // Copy the next n bytes, and find the backslash and quote in them.
+    vuint8m1_t v = __riscv_vle8_v_u8m1(src, 16);
+    block = StringBlock::Find(v);
+    // If the next thing is the end quote, copy and return
+    if (block.HasQuoteFirst()) {
+      // the quote comes first: copy bytes up to it and exit
+      while (1) {
+        SONIC_REPEAT8(if (sonic_unlikely(*src == '"')) break;
+                      else { *dst++ = *src++; });
+      }
+      *dst = '\0';
+      src++;
+      return dst - sdst;
+    }
+    if (block.HasUnescaped()) {
+      err = kParseErrorUnEscaped;
+      return 0;
+    }
+    if (!block.HasBackslash()) {
+      /* neither a quote nor a backslash was found in this block, so
+       * copy the whole block through and keep scanning. */
+      __riscv_vse8_v_u8m1(dst, v, 16);
+      src += VEC_LEN;
+      dst += VEC_LEN;
+      goto find_and_move;
+    }
+    while (1) {
+      SONIC_REPEAT8(if (sonic_unlikely(*src == '\\')) break;
+                    else { *dst++ = *src++; });
+    }
+    goto cont;
+  }
+  sonic_assert(false);
+#undef SONIC_REPEAT8
+}
+
+} // namespace rvv_128
+} // namespace internal
+} // namespace sonic_json
\ No newline at end of file
diff --git a/include/sonic/internal/arch/rvv-128/simd.h b/include/sonic/internal/arch/rvv-128/simd.h
new file mode 100644
index 0000000..1e5bc87
--- /dev/null
+++ b/include/sonic/internal/arch/rvv-128/simd.h
@@ -0,0 +1,430 @@
+// Copyright 2018-2019 The simdjson authors
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file may have been modified by ByteDance authors. All ByteDance
+// Modifications are Copyright 2022 ByteDance Authors.
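+
+// Porting note (added commentary, not from simdjson): with VLEN pinned to
+// 128 bits via -mrvv-vector-bits=zvl, every vector in this file holds
+// exactly 16 byte lanes. An RVV mask register keeps the comparison result
+// for lane i in bit i, so reinterpreting a vbool8_t as a u16 scalar (see
+// to_bitmask below) directly yields a 16-bit comparison bitmask.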
+ +#pragma once + +#include +#include + +namespace sonic_json { +namespace internal { +namespace rvv_128 { +// for rvv-128 +sonic_force_inline uint16_t to_bitmask(vbool8_t v) { + return __riscv_vmv_x_s_u16m1_u16(__riscv_vreinterpret_v_b8_u16m1(v)); +} + +typedef vuint8m1_t vuint8x16_t + __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vint8m1_t vint8x16_t + __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); + +template +struct simd8; + +// +// Base class of simd8 and simd8, both of which use vuint8x16_t +// internally. +// +template +struct base_u8 { + vuint8x16_t value; + static const int SIZE = sizeof(value); + + // Conversion from/to SIMD register + sonic_force_inline base_u8(const vuint8x16_t _value) : value(_value) {} + sonic_force_inline operator const vuint8x16_t&() const { return this->value; } + sonic_force_inline operator vuint8x16_t&() { return this->value; } + + // Bit operations + sonic_force_inline simd8 operator|(const simd8 other) const { + return __riscv_vor_vv_u8m1(*this, other, 16); + } + sonic_force_inline simd8 operator&(const simd8 other) const { + return __riscv_vand_vv_u8m1(*this, other, 16); + } + sonic_force_inline simd8 operator^(const simd8 other) const { + return __riscv_vxor_vv_u8m1(*this, other, 16); + } + sonic_force_inline simd8 bit_andnot(const simd8 other) const { + return __riscv_vand_vv_u8m1(*this, __riscv_vnot_v_u8m1(other, 16), 16); + } + sonic_force_inline simd8 operator~() const { return *this ^ 0xFFu; } + sonic_force_inline simd8& operator|=(const simd8 other) { + auto this_cast = static_cast*>(this); + *this_cast = *this_cast | other; + return *this_cast; + } + sonic_force_inline simd8& operator&=(const simd8 other) { + auto this_cast = static_cast*>(this); + *this_cast = *this_cast & other; + return *this_cast; + } + sonic_force_inline simd8& operator^=(const simd8 other) { + auto this_cast = static_cast*>(this); + *this_cast = *this_cast ^ other; + return *this_cast; + } + + friend sonic_force_inline Mask operator==(const simd8 lhs, + const simd8 rhs) { + return to_bitmask(__riscv_vmseq_vv_u8m1_b8(lhs, rhs, 16)); + } + + template + sonic_force_inline simd8 prev(const simd8 prev_chunk) const { + vuint8x16_t prev_chunk_slidedown = + __riscv_vslidedown_vx_u8m1(prev_chunk, 16 - N, 16); + return __riscv_vslideup_vx_u8m1(prev_chunk_slidedown, *this, N, 16); + } +}; + +// Unsigned bytes +template <> +struct simd8 : base_u8 { + static sonic_force_inline vuint8x16_t splat(uint8_t _value) { + return __riscv_vmv_v_x_u8m1(_value, 16); + } + static sonic_force_inline vuint8x16_t zero() { + return __riscv_vmv_v_x_u8m1(0, 16); + } + static sonic_force_inline vuint8x16_t load(const uint8_t* values) { + return __riscv_vle8_v_u8m1(values, 16); + } + + sonic_force_inline simd8(const vuint8x16_t _value) + : base_u8(_value) {} + // Zero constructor + sonic_force_inline simd8() : simd8(zero()) {} + // Array constructor + sonic_force_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} + // Splat constructor + sonic_force_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Member-by-member initialization + sonic_force_inline simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, + uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7, + uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, + uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + : simd8(vuint8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15}) {} + + // Repeat 16 values as many times as necessary (usually for lookup tables) + 
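// (With VLEN fixed at 128 bits this is exactly one repetition of the 16
+  // values, i.e. the member-by-member constructor above.)
+ 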
sonic_force_inline static simd8 repeat_16( + uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, + uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } + + // Store to array + sonic_force_inline void store(uint8_t dst[16]) const { + return __riscv_vse8_v_u8m1(dst, *this, 16); + } + + // Saturated math + sonic_force_inline simd8 saturating_add( + const simd8 other) const { + return __riscv_vsaddu_vv_u8m1(*this, other, 16); + } + sonic_force_inline simd8 saturating_sub( + const simd8 other) const { + return __riscv_vssubu_vv_u8m1(*this, other, 16); + } + + // Addition/subtraction are the same for signed and unsigned + sonic_force_inline simd8 operator+( + const simd8 other) const { + return __riscv_vadd_vv_u8m1(*this, other, 16); + } + sonic_force_inline simd8 operator-( + const simd8 other) const { + return __riscv_vsub_vv_u8m1(*this, other, 16); + } + sonic_force_inline simd8& operator+=(const simd8 other) { + *this = *this + other; + return *this; + } + sonic_force_inline simd8& operator-=(const simd8 other) { + *this = *this - other; + return *this; + } + + // Order-specific operations + sonic_force_inline uint8_t max_val() const { + return __riscv_vmv_x_s_u8m1_u8( + __riscv_vredmaxu_vs_u8m1_u8m1(*this, __riscv_vmv_v_x_u8m1(0, 16), 16)); + } + sonic_force_inline uint8_t min_val() const { + return __riscv_vmv_x_s_u8m1_u8(__riscv_vredminu_vs_u8m1_u8m1( + *this, __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), 16)); + } + sonic_force_inline simd8 max_val(const simd8 other) const { + return __riscv_vmaxu_vv_u8m1(*this, other, 16); + } + sonic_force_inline simd8 min_val(const simd8 other) const { + return __riscv_vminu_vv_u8m1(*this, other, 16); + } + sonic_force_inline uint16_t operator<=(const simd8 other) const { + return to_bitmask(__riscv_vmsleu_vv_u8m1_b8(*this, other, 16)); + } + sonic_force_inline uint16_t operator>=(const simd8 other) const { + return to_bitmask(__riscv_vmsgeu_vv_u8m1_b8(*this, other, 16)); + } + sonic_force_inline uint16_t operator<(const simd8 other) const { + return to_bitmask(__riscv_vmsltu_vv_u8m1_b8(*this, other, 16)); + } + sonic_force_inline uint16_t operator>(const simd8 other) const { + return to_bitmask(__riscv_vmsgtu_vv_u8m1_b8(*this, other, 16)); + } + // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true + // = nonzero. For ARM, returns all 1's. + sonic_force_inline simd8 gt_bits(const simd8 other) const { + return simd8(*this > other); + } + // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true + // = nonzero. For ARM, returns all 1's. 
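+  // (Port note: in this RVV kernel, operator> / operator< first produce a
+  // 16-bit bitmask with bit i set when lane i compares true; e.g. matches
+  // in lanes 0 and 15 give 0x8001.)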
+ sonic_force_inline simd8 lt_bits(const simd8 other) const { + return simd8(*this < other); + } + + // Bit-specific operations + sonic_force_inline uint16_t any_bits_set(simd8 bits) const { + return to_bitmask(__riscv_vmsgtu_vx_u8m1_b8( + __riscv_vand_vv_u8m1(*this, bits, 16), 0, 16)); + } + sonic_force_inline bool any_bits_set_anywhere() const { + return this->max_val() != 0; + } + sonic_force_inline bool any_bits_set_anywhere(simd8 bits) const { + return (*this & bits).any_bits_set_anywhere(); + } + template + sonic_force_inline simd8 shr() const { + const int b_half = N >> 1; + vuint8m1_t srl1 = __riscv_vsrl_vx_u8m1(*this, b_half, 16); + return __riscv_vsrl_vx_u8m1(srl1, b_half + (N & 0x1), 16); + + // return vshrq_n_u8(*this, N); + } + template + sonic_force_inline simd8 shl() const { + return __riscv_vsll_vx_u8m1(*this, N, 16); + } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + sonic_force_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + + template + sonic_force_inline simd8 lookup_16(L replace0, L replace1, L replace2, + L replace3, L replace4, L replace5, + L replace6, L replace7, L replace8, + L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } + + template + sonic_force_inline simd8 apply_lookup_16_to( + const simd8 original) { + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8(simd8(original), 16, 16); + return __riscv_vmerge_vxm_u8m1( + __riscv_vrgather_vv_u8m1(*this, simd8(original), 16), 0, mask, + 16); + // return vqtbl1q_u8(*this, simd8(original)); + } +}; + +// Signed bytes +template <> +struct simd8 { + vint8x16_t value; + + static sonic_force_inline simd8 splat(int8_t _value) { + return __riscv_vmv_v_x_i8m1(_value, 16); + } + static sonic_force_inline simd8 zero() { + return __riscv_vmv_v_x_i8m1(0, 16); + } + static sonic_force_inline simd8 load(const int8_t values[16]) { + return __riscv_vle8_v_i8m1(values, 16); + } + + // Conversion from/to SIMD register + sonic_force_inline simd8(const vint8x16_t _value) : value{_value} {} + sonic_force_inline operator const vint8x16_t&() const { return this->value; } + sonic_force_inline operator vint8x16_t&() { return this->value; } + + // Zero constructor + sonic_force_inline simd8() : simd8(zero()) {} + // Splat constructor + sonic_force_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + sonic_force_inline simd8(const int8_t* values) : simd8(load(values)) {} + // Member-by-member initialization + sonic_force_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : simd8(vint8x16_t{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + sonic_force_inline static simd8 repeat_16( + int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, + int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } + + // Store to 
array + sonic_force_inline void store(int8_t dst[16]) const { + return __riscv_vse8_v_i8m1(dst, *this, 16); + } + + sonic_force_inline explicit operator simd8() const { + return __riscv_vreinterpret_v_i8m1_u8m1(this->value); + } + + // Math + sonic_force_inline simd8 operator+(const simd8 other) const { + return __riscv_vadd_vv_i8m1(*this, other, 16); + } + sonic_force_inline simd8 operator-(const simd8 other) const { + return __riscv_vsub_vv_i8m1(*this, other, 16); + } + sonic_force_inline simd8& operator+=(const simd8 other) { + *this = *this + other; + return *this; + } + sonic_force_inline simd8& operator-=(const simd8 other) { + *this = *this - other; + return *this; + } + + // Order-sensitive comparisons + sonic_force_inline simd8 max_val(const simd8 other) const { + return __riscv_vmax_vv_i8m1(*this, other, 16); + } + sonic_force_inline simd8 min_val(const simd8 other) const { + return __riscv_vmin_vv_i8m1(*this, other, 16); + } + sonic_force_inline uint16_t operator>(const simd8 other) const { + return to_bitmask(__riscv_vmsgt_vv_i8m1_b8(*this, other, 16)); + } + sonic_force_inline uint16_t operator<(const simd8 other) const { + return to_bitmask(__riscv_vmslt_vv_i8m1_b8(*this, other, 16)); + } + sonic_force_inline uint16_t operator==(const simd8 other) const { + return to_bitmask(__riscv_vmseq_vv_i8m1_b8(*this, other, 16)); + } + + template + sonic_force_inline simd8 prev(const simd8 prev_chunk) const { + vint8m1_t prev_chunk_slidedown = + __riscv_vslidedown_vx_i8m1(prev_chunk, 16 - N, 16); + return __riscv_vslideup_vx_i8m1(prev_chunk_slidedown, *this, N, 16); + } + + // Perform a lookup assuming no value is larger than 16 + template + sonic_force_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + template + sonic_force_inline simd8 lookup_16(L replace0, L replace1, L replace2, + L replace3, L replace4, L replace5, + L replace6, L replace7, L replace8, + L replace9, L replace10, L replace11, + L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } + + template + sonic_force_inline simd8 apply_lookup_16_to(const simd8 original) { + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8(simd8(original), 16, 16); + return __riscv_vmerge_vxm_i8m1( + __riscv_vrgather_vv_i8m1(*this, simd8(original), 16), 0, mask, + 16); + } +}; + +sonic_force_inline uint64_t merge_bitmask(uint16_t mask1, uint16_t mask2, + uint16_t mask3, uint16_t mask4) { + return (uint64_t)mask1 | ((uint64_t)mask2 << 16) | ((uint64_t)mask3 << 32) | + ((uint64_t)mask4 << 48); +} + +template +struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 4, + "ARM kernel should use four registers per 64-byte block."); + const simd8 chunks[NUM_CHUNKS]; + + simd8x64(const simd8x64& o) = delete; // no copy allowed + simd8x64& operator=(const simd8& other) = + delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed + + sonic_force_inline simd8x64(const simd8 chunk0, const simd8 chunk1, + const simd8 chunk2, const simd8 chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + sonic_force_inline simd8x64(const T ptr[64]) + : chunks{simd8::load(ptr), simd8::load(ptr + 16), + simd8::load(ptr + 32), simd8::load(ptr + 48)} {} + + sonic_force_inline void store(T ptr[64]) const { + 
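// Write the four 16-byte chunks contiguously into the 64-byte buffer.
+    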
this->chunks[0].store(ptr + sizeof(simd8<T>) * 0);
+    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1);
+    this->chunks[2].store(ptr + sizeof(simd8<T>) * 2);
+    this->chunks[3].store(ptr + sizeof(simd8<T>) * 3);
+  }
+
+  sonic_force_inline simd8<T> reduce_or() const {
+    return (this->chunks[0] | this->chunks[1]) |
+           (this->chunks[2] | this->chunks[3]);
+  }
+
+  sonic_force_inline uint64_t eq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return merge_bitmask(this->chunks[0] == mask, this->chunks[1] == mask,
+                         this->chunks[2] == mask, this->chunks[3] == mask);
+  }
+
+  sonic_force_inline uint64_t lteq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return merge_bitmask(this->chunks[0] <= mask, this->chunks[1] <= mask,
+                         this->chunks[2] <= mask, this->chunks[3] <= mask);
+  }
+};  // struct simd8x64
+
+} // namespace rvv_128
+} // namespace internal
+} // namespace sonic_json
diff --git a/include/sonic/internal/arch/rvv-128/skip.h b/include/sonic/internal/arch/rvv-128/skip.h
new file mode 100644
index 0000000..343e328
--- /dev/null
+++ b/include/sonic/internal/arch/rvv-128/skip.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2022 ByteDance Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#define VEC_LEN 16
+
+#include
+#include
+
+#include "base.h"
+#include "simd.h"
+
+namespace sonic_json {
+namespace internal {
+namespace rvv_128 {
+
+using sonic_json::internal::common::EqBytes4;
+using sonic_json::internal::common::SkipLiteral;
+
+#include "../common/riscv_common/skip.inc.h"
+
+sonic_force_inline bool SkipContainer(const uint8_t *data, size_t &pos,
+                                      size_t len, uint8_t left, uint8_t right) {
+  return skip_container>(data, pos, len, left, right);
+}
+
+sonic_force_inline uint8_t skip_space(const uint8_t *data, size_t &pos,
+                                      size_t &, uint64_t &) {
+  // fast path for single space
+  if (!IsSpace(data[pos++])) return data[pos - 1];
+  if (!IsSpace(data[pos++])) return data[pos - 1];
+
+  // pos is now past the fast-path bytes: scan one 16-byte block at a time
+  while (1) {
+    uint16_t nonspace = GetNonSpaceBits(data + pos);
+    if (nonspace) {
+      int tmp = __builtin_ctz(nonspace);
+      pos += tmp;
+      return data[pos++];
+    } else {
+      pos += 16;
+    }
+  }
+  sonic_assert(false && "should not happen");
+}
+
+sonic_force_inline uint8_t skip_space_safe(const uint8_t *data, size_t &pos,
+                                           size_t len, size_t &, uint64_t &) {
+  while (pos < len && IsSpace(data[pos++]));
+  // if no non-space char is found, this returns the last space char
+  return data[pos - 1];
+}
+
+} // namespace rvv_128
+} // namespace internal
+} // namespace sonic_json
+
+#undef VEC_LEN
diff --git a/include/sonic/internal/arch/rvv-128/str2int.h b/include/sonic/internal/arch/rvv-128/str2int.h
new file mode 100644
index 0000000..b680d93
--- /dev/null
+++ b/include/sonic/internal/arch/rvv-128/str2int.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2022 ByteDance Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+#include "simd.h"
+
+namespace sonic_json {
+namespace internal {
+namespace rvv_128 {
+
+sonic_force_inline uint64_t simd_str2int(const char* c, int& man_nd) {
+  uint64_t sum = 0;
+  int i = 0;
+  while (c[i] >= '0' && c[i] <= '9' && i < man_nd) {
+    sum = sum * 10 + (c[i] - '0');
+    i++;
+  }
+  man_nd = i;
+  return sum;
+}
+
+} // namespace rvv_128
+} // namespace internal
+} // namespace sonic_json
diff --git a/include/sonic/internal/arch/rvv-128/unicode.h b/include/sonic/internal/arch/rvv-128/unicode.h
new file mode 100644
index 0000000..29b6815
--- /dev/null
+++ b/include/sonic/internal/arch/rvv-128/unicode.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2022 ByteDance Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+#include "../common/unicode_common.h"
+#include "base.h"
+#include "simd.h"
+
+namespace sonic_json {
+namespace internal {
+namespace rvv_128 {
+
+using sonic_json::internal::common::handle_unicode_codepoint;
+
+struct StringBlock {
+ public:
+  sonic_force_inline static StringBlock Find(const uint8_t *src);
+  sonic_force_inline static StringBlock Find(vuint8m1_t &v);
+  sonic_force_inline bool HasQuoteFirst() const {
+    return ((bs_bits - 1) & quote_bits) != 0 && !HasUnescaped();
+  }
+  sonic_force_inline bool HasBackslash() const {
+    return ((quote_bits - 1) & bs_bits) != 0;
+  }
+  sonic_force_inline bool HasUnescaped() const {
+    return ((quote_bits - 1) & unescaped_bits) != 0;
+  }
+  sonic_force_inline int QuoteIndex() const {
+    return TrailingZeroes(quote_bits);
+  }
+  sonic_force_inline int BsIndex() const {
+    return TrailingZeroes(bs_bits);
+  }
+  sonic_force_inline int UnescapedIndex() const {
+    return TrailingZeroes(unescaped_bits);
+  }
+
+  uint16_t bs_bits;
+  uint16_t quote_bits;
+  uint16_t unescaped_bits;
+};
+
+sonic_force_inline StringBlock StringBlock::Find(const uint8_t *src) {
+  vuint8m1_t v = __riscv_vle8_v_u8m1(src, 16);
+  vuint16m1_t m1 = __riscv_vreinterpret_v_b8_u16m1(
+      __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\\', 16), 16));
+  vuint16m1_t m2 = __riscv_vreinterpret_v_b8_u16m1(
+      __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('"', 16), 16));
+  vuint16m1_t m3 = __riscv_vreinterpret_v_b8_u16m1(
+      __riscv_vmsleu_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\x1f',
16), 16)); + return {__riscv_vmv_x_s_u16m1_u16(m1), __riscv_vmv_x_s_u16m1_u16(m2), + __riscv_vmv_x_s_u16m1_u16(m3)}; +} + +sonic_force_inline StringBlock StringBlock::Find(vuint8m1_t &v) { + vuint16m1_t m1 = __riscv_vreinterpret_v_b8_u16m1( + __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\\', 16), 16)); + vuint16m1_t m2 = __riscv_vreinterpret_v_b8_u16m1( + __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('"', 16), 16)); + vuint16m1_t m3 = __riscv_vreinterpret_v_b8_u16m1( + __riscv_vmsleu_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\x1f', 16), 16)); + return {__riscv_vmv_x_s_u16m1_u16(m1), __riscv_vmv_x_s_u16m1_u16(m2), + __riscv_vmv_x_s_u16m1_u16(m3)}; +} + +sonic_force_inline uint16_t GetNonSpaceBits(const uint8_t *data) { + vuint8m1_t v = + __riscv_vle8_v_u8m1(reinterpret_cast(data), 16); + vbool8_t m1 = __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1(' ', 16), 16); + vbool8_t m2 = __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\t', 16), 16); + vbool8_t m3 = __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\n', 16), 16); + vbool8_t m4 = __riscv_vmseq_vv_u8m1_b8(v, __riscv_vmv_v_x_u8m1('\r', 16), 16); + vbool8_t m5 = __riscv_vmor_mm_b8(m1, m2, 16); + vbool8_t m6 = __riscv_vmor_mm_b8(m3, m4, 16); + vbool8_t m7 = __riscv_vmor_mm_b8(m5, m6, 16); + vbool8_t m8 = __riscv_vmnot_m_b8(m7, 16); + return __riscv_vmv_x_s_u16m1_u16(__riscv_vreinterpret_v_b8_u16m1(m8)); +} + +} // namespace rvv_128 +} // namespace internal +} // namespace sonic_json diff --git a/include/sonic/internal/arch/simd_dispatch.h b/include/sonic/internal/arch/simd_dispatch.h index 0c474ce..7fcc377 100644 --- a/include/sonic/internal/arch/simd_dispatch.h +++ b/include/sonic/internal/arch/simd_dispatch.h @@ -41,6 +41,11 @@ #define INCLUDE_ARCH_FILE(file) SONIC_STRINGIFY(neon/file) #endif +#if defined(SONIC_HAVE_RVV_128) +#define SONIC_USING_ARCH_FUNC(func) using rvv_128::func +#define INCLUDE_ARCH_FILE(file) SONIC_STRINGIFY(rvv-128/file) +#endif + #elif defined(SONIC_DYNAMIC_DISPATCH) // TODO: support SVE2 runtime dispatch diff --git a/include/sonic/internal/arch/sonic_cpu_feature.h b/include/sonic/internal/arch/sonic_cpu_feature.h index 21d2707..575b18a 100644 --- a/include/sonic/internal/arch/sonic_cpu_feature.h +++ b/include/sonic/internal/arch/sonic_cpu_feature.h @@ -44,4 +44,8 @@ #if defined(__ARM_FEATURE_SVE2) && (__ARM_FEATURE_SVE_BITS == 128) #define SONIC_HAVE_SVE2_128 #endif +#if defined(__riscv_vector) && defined(__riscv_v_fixed_vlen) && \ + __riscv_v_fixed_vlen == 128 +#define SONIC_HAVE_RVV_128 +#endif #endif From d7fbe1e41ab5a8e48f0c9be0abb5f3fdd5a6c026 Mon Sep 17 00:00:00 2001 From: "yintong.ustc@bytedance.com" Date: Wed, 26 Feb 2025 08:20:02 +0000 Subject: [PATCH 4/4] riscv: add simd str2int support --- include/sonic/internal/arch/rvv-128/str2int.h | 136 ++++++++++++++++-- 1 file changed, 128 insertions(+), 8 deletions(-) diff --git a/include/sonic/internal/arch/rvv-128/str2int.h b/include/sonic/internal/arch/rvv-128/str2int.h index b680d93..7caa6f5 100644 --- a/include/sonic/internal/arch/rvv-128/str2int.h +++ b/include/sonic/internal/arch/rvv-128/str2int.h @@ -24,17 +24,137 @@ namespace sonic_json { namespace internal { namespace rvv_128 { +const int16_t d8d[] = {1000, 100, 10, 1, 1000, 100, 10, 1}; + +sonic_force_inline uint64_t digit_cnt(vint8m1_t in) { + vbool8_t m1 = __riscv_vmsgt_vx_i8m1_b8(in, '9', 16); + vbool8_t m2 = __riscv_vmslt_vx_i8m1_b8(in, '0', 16); + vbool8_t m3 = __riscv_vmor_mm_b8(m1, m2, 16); + return __riscv_vfirst_m_b8(m3, 16) == -1 ? 
16 : __riscv_vfirst_m_b8(m3, 16); +} + +sonic_force_inline uint64_t simd_str2int_rvv_8(vint16m1_t in) { + vint16m1_t lu1 = __riscv_vle16_v_i16m1(d8d, 8); + vint32m2_t mul = __riscv_vwmul_vv_i32m2(in, lu1, 8); + vint32m1_t a = __riscv_vlmul_trunc_v_i32m2_i32m1(mul); + vint32m1_t b = + __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vslidedown_vx_i32m2(mul, 4, 8)); + vint32m1_t c = __riscv_vmacc_vx_i32m1(b, 10000, a, 4); + vint64m1_t zero = __riscv_vmv_v_x_i64m1(0, 2); + vint64m1_t reds = __riscv_vwredsum_vs_i32m1_i64m1(c, zero, 4); + return __riscv_vmv_x_s_i64m1_i64(reds); +} + +sonic_force_inline uint64_t simd_str2int_rvv_l8(vint16m1_t in, uint32_t len) { + vint16m1_t _d = + __riscv_vslideup_vx_i16m1(__riscv_vmv_v_x_i16m1(0, 8), in, 8 - len, 8); + + return simd_str2int_rvv_8(_d); +} + sonic_force_inline uint64_t simd_str2int(const char* c, int& man_nd) { - uint64_t sum = 0; - int i = 0; - while (c[i] >= '0' && c[i] <= '9' && i < man_nd) { - sum = sum * 10 + (c[i] - '0'); - i++; + vint8m1_t in = + __riscv_vle8_v_i8m1(reinterpret_cast(&c[0]), 16); + int len = (int)digit_cnt(in); + uint64_t ret = 1; + man_nd = man_nd < len ? man_nd : len; + in = __riscv_vsub_vx_i8m1(in, '0', 16); + switch (man_nd) { + vint8m1_t hi; + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + ret = simd_str2int_rvv_l8( + __riscv_vwcvt_x_x_v_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(in), 16), + man_nd); + + // ret = simd_str2int_sve_l8(svunpklo_s16(in), man_nd); + break; + case 8: + ret = simd_str2int_rvv_8( + __riscv_vwcvt_x_x_v_i16m1(__riscv_vlmul_trunc_v_i8m1_i8mf2(in), 16)); + break; + case 9: + hi = __riscv_vslidedown_vx_i8m1(in, 8, 16); + ret = simd_str2int_rvv_8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(in), 16)) * + 10ull + + simd_str2int_rvv_l8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(hi), 16), + 1); + break; + case 10: + hi = __riscv_vslidedown_vx_i8m1(in, 8, 16); + ret = simd_str2int_rvv_8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(in), 16)) * + 100ull + + simd_str2int_rvv_l8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(hi), 16), + 2); + break; + case 11: + hi = __riscv_vslidedown_vx_i8m1(in, 8, 16); + ret = simd_str2int_rvv_8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(in), 16)) * + 1000ull + + simd_str2int_rvv_l8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(hi), 16), + 3); + break; + case 12: + hi = __riscv_vslidedown_vx_i8m1(in, 8, 16); + ret = simd_str2int_rvv_8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(in), 16)) * + 10000ull + + simd_str2int_rvv_l8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(hi), 16), + 4); + break; + case 13: + hi = __riscv_vslidedown_vx_i8m1(in, 8, 16); + ret = simd_str2int_rvv_8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(in), 16)) * + 100000ull + + simd_str2int_rvv_l8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(hi), 16), + 5); + break; + case 14: + hi = __riscv_vslidedown_vx_i8m1(in, 8, 16); + ret = simd_str2int_rvv_8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(in), 16)) * + 1000000ull + + simd_str2int_rvv_l8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(hi), 16), + 6); + break; + case 15: + hi = __riscv_vslidedown_vx_i8m1(in, 8, 16); + ret = simd_str2int_rvv_8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(in), 16)) * + 10000000ull + + simd_str2int_rvv_l8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(hi), 16), + 7); + break; 
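+      // All 16 digits present: value = (first 8 digits) * 10^8 + (last 8
+      // digits); both halves are full, so no zero-padding slide is needed.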
+ case 16: + hi = __riscv_vslidedown_vx_i8m1(in, 8, 16); + ret = simd_str2int_rvv_8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(in), 16)) * + 100000000ull + + simd_str2int_rvv_8(__riscv_vwcvt_x_x_v_i16m1( + __riscv_vlmul_trunc_v_i8m1_i8mf2(hi), 16)); + break; + default: + ret = 0; + break; } - man_nd = i; - return sum; + return ret; } - } // namespace rvv_128 } // namespace internal } // namespace sonic_json
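+
+// Usage sketch (added illustration, not part of the patch): man_nd passes in
+// the maximum number of digits to read and receives the count actually
+// consumed, e.g.
+//   int nd = 16;
+//   uint64_t v = simd_str2int("12345678987654321", nd);
+//   // v == 1234567898765432, nd == 16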