diff --git a/Cargo.toml b/Cargo.toml
index 7a3bf3a..ede9561 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,7 +20,7 @@ Supports multithreading."""
 keywords = ["matrix", "sgemm", "dgemm"]
 categories = ["science"]
 
-exclude = ["docs/*"]
+exclude = ["examples/*", "ci/*", ".github/*", "benches/*"]
 
 build = "build.rs"
 
diff --git a/docs/0.2.0-avx-64.log b/docs/0.2.0-avx-64.log
deleted file mode 100644
index dba19b8..0000000
--- a/docs/0.2.0-avx-64.log
+++ /dev/null
@@ -1,2 +0,0 @@
-test mat_mul_f32::m064 ... bench: 13,857 ns/iter (+/- 500)
-test mat_mul_f64::m064 ... bench: 27,453 ns/iter (+/- 926)
diff --git a/docs/0.2.1-avx-64.log b/docs/0.2.1-avx-64.log
deleted file mode 100644
index f6697da..0000000
--- a/docs/0.2.1-avx-64.log
+++ /dev/null
@@ -1,2 +0,0 @@
-test mat_mul_f32::m064 ... bench: 13,197 ns/iter (+/- 106)
-test mat_mul_f64::m064 ... bench: 27,156 ns/iter (+/- 908)
diff --git a/docs/after-always-masked b/docs/after-always-masked
deleted file mode 100644
index 22e2e51..0000000
--- a/docs/after-always-masked
+++ /dev/null
@@ -1,7 +0,0 @@
-
-test mat_mul_f32::mix10000 ... bench: 16,291,680 ns/iter (+/- 1,350,524)
-test mat_mul_f64::mix10000 ... bench: 27,606,065 ns/iter (+/- 875,748)
-test mat_mul_f32::m127 ... bench: 208,380 ns/iter (+/- 3,311)
-test mat_mul_f64::m127 ... bench: 343,568 ns/iter (+/- 17,215)
-test nonative_mat_mul_f32::m127 ... bench: 291,936 ns/iter (+/- 10,408)
-test nonative_mat_mul_f64::m127 ... bench: 478,248 ns/iter (+/- 4,558)
diff --git a/docs/before-always-masked b/docs/before-always-masked
deleted file mode 100644
index 7ebabb8..0000000
--- a/docs/before-always-masked
+++ /dev/null
@@ -1,7 +0,0 @@
-
-test mat_mul_f32::mix10000 ... bench: 23,430,259 ns/iter (+/- 700,947)
-test mat_mul_f64::mix10000 ... bench: 36,255,323 ns/iter (+/- 1,344,432)
-test mat_mul_f32::m127 ... bench: 290,740 ns/iter (+/- 6,776)
-test mat_mul_f64::m127 ... bench: 433,429 ns/iter (+/- 17,110)
-test nonative_mat_mul_f32::m127 ... bench: 375,725 ns/iter (+/- 7,761)
-test nonative_mat_mul_f64::m127 ... bench: 503,924 ns/iter (+/- 16,038)
diff --git a/docs/dgemm4x4_rust.log b/docs/dgemm4x4_rust.log
deleted file mode 100644
index 129bc7c..0000000
--- a/docs/dgemm4x4_rust.log
+++ /dev/null
@@ -1,5 +0,0 @@
-name 4x4-dgemm.log ns/iter 8x4-dgemm.log ns/iter diff ns/iter diff %
-mat_mul_f64::m016 1,254 1,103 -151 -12.04%
-mat_mul_f64::m064 47,573 38,887 -8,686 -18.26%
-mat_mul_f64::m127 344,298 288,225 -56,073 -16.29%
-nonative_mat_mul_f64::m127 473,111 549,677 76,566 16.18%
diff --git a/docs/onealloc_after.log b/docs/onealloc_after.log
deleted file mode 100644
index f198943..0000000
--- a/docs/onealloc_after.log
+++ /dev/null
@@ -1,22 +0,0 @@
-
-test mat_mul_f32::m004 ... bench: 142 ns/iter (+/- 3)
-test mat_mul_f32::m005 ... bench: 200 ns/iter (+/- 2)
-test mat_mul_f32::m006 ... bench: 215 ns/iter (+/- 5)
-test mat_mul_f32::m007 ... bench: 242 ns/iter (+/- 10)
-test mat_mul_f32::m008 ... bench: 251 ns/iter (+/- 15)
-test mat_mul_f32::m009 ... bench: 457 ns/iter (+/- 8)
-test mat_mul_f32::m012 ... bench: 606 ns/iter (+/- 7)
-test mat_mul_f32::m016 ... bench: 910 ns/iter (+/- 20)
-test mat_mul_f32::m032 ... bench: 4,595 ns/iter (+/- 280)
-test mat_mul_f32::m064 ... bench: 28,104 ns/iter (+/- 530)
-test mat_mul_f32::m127 ... bench: 189,393 ns/iter (+/- 4,303)
-test mat_mul_f32::mix16x4 ... bench: 1,717 ns/iter (+/- 64)
-test mat_mul_f32::mix32x2 ... bench: 1,462 ns/iter (+/- 29)
-test mat_mul_f64::m004 ... bench: 145 ns/iter (+/- 17)
-test mat_mul_f64::m007 ... bench: 257 ns/iter (+/- 6)
-test mat_mul_f64::m008 ... bench: 276 ns/iter (+/- 11)
-test mat_mul_f64::m012 ... bench: 678 ns/iter (+/- 22)
-test mat_mul_f64::m016 ... bench: 1,065 ns/iter (+/- 19)
-test mat_mul_f64::m032 ... bench: 6,024 ns/iter (+/- 1,709)
-test mat_mul_f64::m064 ... bench: 39,642 ns/iter (+/- 6,456)
-test mat_mul_f64::m127 ... bench: 278,104 ns/iter (+/- 8,016)
diff --git a/docs/onealloc_before.log b/docs/onealloc_before.log
deleted file mode 100644
index bae8574..0000000
--- a/docs/onealloc_before.log
+++ /dev/null
@@ -1,22 +0,0 @@
-
-test mat_mul_f32::m004 ... bench: 172 ns/iter (+/- 25)
-test mat_mul_f32::m005 ... bench: 230 ns/iter (+/- 9)
-test mat_mul_f32::m006 ... bench: 250 ns/iter (+/- 7)
-test mat_mul_f32::m007 ... bench: 274 ns/iter (+/- 12)
-test mat_mul_f32::m008 ... bench: 289 ns/iter (+/- 18)
-test mat_mul_f32::m009 ... bench: 499 ns/iter (+/- 14)
-test mat_mul_f32::m012 ... bench: 670 ns/iter (+/- 50)
-test mat_mul_f32::m016 ... bench: 956 ns/iter (+/- 47)
-test mat_mul_f32::m032 ... bench: 4,645 ns/iter (+/- 158)
-test mat_mul_f32::m064 ... bench: 27,912 ns/iter (+/- 582)
-test mat_mul_f32::m127 ... bench: 189,245 ns/iter (+/- 4,271)
-test mat_mul_f32::mix16x4 ... bench: 1,767 ns/iter (+/- 55)
-test mat_mul_f32::mix32x2 ... bench: 1,523 ns/iter (+/- 220)
-test mat_mul_f64::m004 ... bench: 178 ns/iter (+/- 5)
-test mat_mul_f64::m007 ... bench: 286 ns/iter (+/- 3)
-test mat_mul_f64::m008 ... bench: 319 ns/iter (+/- 22)
-test mat_mul_f64::m012 ... bench: 745 ns/iter (+/- 44)
-test mat_mul_f64::m016 ... bench: 1,127 ns/iter (+/- 245)
-test mat_mul_f64::m032 ... bench: 6,109 ns/iter (+/- 641)
-test mat_mul_f64::m064 ... bench: 39,756 ns/iter (+/- 1,640)
-test mat_mul_f64::m127 ... bench: 281,060 ns/iter (+/- 3,167)
diff --git a/docs/sgemm4x4.log b/docs/sgemm4x4.log
deleted file mode 100644
index 56e0609..0000000
--- a/docs/sgemm4x4.log
+++ /dev/null
@@ -1,12 +0,0 @@
-
-test mat_mul_f32::m004 ... bench: 153 ns/iter (+/- 2)
-test mat_mul_f32::m007 ... bench: 280 ns/iter (+/- 7)
-test mat_mul_f32::m008 ... bench: 305 ns/iter (+/- 4)
-test mat_mul_f32::m012 ... bench: 598 ns/iter (+/- 18)
-test mat_mul_f32::m016 ... bench: 1,044 ns/iter (+/- 131)
-test mat_mul_f32::m032 ... bench: 5,037 ns/iter (+/- 76)
-test mat_mul_f32::m064 ... bench: 30,305 ns/iter (+/- 1,550)
-
-test mat_mul_f32::m127 ... bench: 208,380 ns/iter (+/- 3,311)
-test nonative_mat_mul_f32::m127 ... bench: 291,936 ns/iter (+/- 10,408)
-test mat_mul_f32::mix128x10000x128 ... bench: 16,291,680 ns/iter (+/- 1,350,524)
diff --git a/docs/sgemm4x8.log b/docs/sgemm4x8.log
deleted file mode 100644
index ff0fc59..0000000
--- a/docs/sgemm4x8.log
+++ /dev/null
@@ -1,11 +0,0 @@
-
-test mat_mul_f32::m004 ... bench: 172 ns/iter (+/- 2)
-test mat_mul_f32::m007 ... bench: 271 ns/iter (+/- 5)
-test mat_mul_f32::m008 ... bench: 286 ns/iter (+/- 15)
-test mat_mul_f32::m012 ... bench: 650 ns/iter (+/- 12)
-test mat_mul_f32::m016 ... bench: 945 ns/iter (+/- 59)
-test mat_mul_f32::m032 ... bench: 4,638 ns/iter (+/- 104)
-test mat_mul_f32::m064 ... bench: 27,748 ns/iter (+/- 587)
-test mat_mul_f32::m127 ... bench: 188,977 ns/iter (+/- 8,601)
-test mat_mul_f32::mix128x10000x128 ... bench: 14,293,288 ns/iter (+/- 2,022,603)
-test nonative_mat_mul_f32::m127 ... bench: 272,487 ns/iter (+/- 5,289)
diff --git a/spare_kernels/aarch64_neon_4x4.rs b/spare_kernels/aarch64_neon_4x4.rs
deleted file mode 100644
index 319b134..0000000
--- a/spare_kernels/aarch64_neon_4x4.rs
+++ /dev/null
@@ -1,150 +0,0 @@
-#[cfg(target_arch="aarch64")]
-struct KernelArmNeon;
-
-#[cfg(target_arch="aarch64")]
-impl GemmKernel for KernelArmNeon {
-    type Elem = T;
-
-    type MRTy = U4;
-    type NRTy = U4;
-
-    #[inline(always)]
-    fn align_to() -> usize { 16 }
-
-    #[inline(always)]
-    fn always_masked() -> bool { false }
-
-    #[inline(always)]
-    fn nc() -> usize { archparam::S_NC }
-    #[inline(always)]
-    fn kc() -> usize { archparam::S_KC }
-    #[inline(always)]
-    fn mc() -> usize { archparam::S_MC }
-
-    #[inline(always)]
-    unsafe fn kernel(
-        k: usize,
-        alpha: T,
-        a: *const T,
-        b: *const T,
-        beta: T,
-        c: *mut T, rsc: isize, csc: isize) {
-        kernel_target_arm_neon(k, alpha, a, b, beta, c, rsc, csc)
-    }
-}
-
-// 4x4 neon kernel unrolled developed for apple silicon M1
-#[cfg(target_arch="aarch64")]
-#[target_feature(enable="neon")]
-unsafe fn kernel_target_arm_neon(k: usize, alpha: T, a: *const T, b: *const T,
-                                 beta: T, c: *mut T, rsc: isize, csc: isize)
-{
-    use core::arch::aarch64::*;
-    const MR: usize = KernelArmNeon::MR;
-    const NR: usize = KernelArmNeon::NR;
-
-    let (mut a, mut b, rsc, csc) = if rsc == 1 { (b, a, csc, rsc) } else { (a, b, rsc, csc) };
-
-    let mut ab = [vmovq_n_f32(0.); MR];
-    let mut ab2 = [vmovq_n_f32(0.); MR];
-    let mut ab3 = [vmovq_n_f32(0.); MR];
-    let mut ab4 = [vmovq_n_f32(0.); MR];
-    let use_fma = true;
-
-    // Compute
-    // ab_ij = a_i * b_j for all i, j
-    macro_rules! ab_ij_equals_ai_bj {
-        ($dest:ident, $av:expr, $bv:expr) => {
-            if use_fma {
-                $dest[0] = vfmaq_laneq_f32($dest[0], $bv, $av, 0);
-                $dest[1] = vfmaq_laneq_f32($dest[1], $bv, $av, 1);
-                $dest[2] = vfmaq_laneq_f32($dest[2], $bv, $av, 2);
-                $dest[3] = vfmaq_laneq_f32($dest[3], $bv, $av, 3);
-            } else {
-                $dest[0] = vaddq_f32($dest[0], vmulq_laneq_f32($bv, $av, 0));
-                $dest[1] = vaddq_f32($dest[1], vmulq_laneq_f32($bv, $av, 1));
-                $dest[2] = vaddq_f32($dest[2], vmulq_laneq_f32($bv, $av, 2));
-                $dest[3] = vaddq_f32($dest[3], vmulq_laneq_f32($bv, $av, 3));
-            }
-        }
-    }
-
-    const UNROLL_BY: usize = 4;
-
-    for _ in 0..k / UNROLL_BY {
-        let av = vld1q_f32(a);
-        let bv = vld1q_f32(b);
-        // eprintln!("a: {av:?}");
-        // eprintln!("b: {bv:?}");
-
-        // FMLA instruction
-        // Cortex 7A: FMA has 7 cycles latency or 3 cycles when the dependency is on the accumulator
-        // M1: Latency 3, throughput 0.25
-        ab_ij_equals_ai_bj!(ab, av, bv);
-
-        let av = vld1q_f32(a.add(4));
-        let bv = vld1q_f32(b.add(4));
-
-        ab_ij_equals_ai_bj!(ab2, av, bv);
-
-        if UNROLL_BY > 2 {
-
-            let av = vld1q_f32(a.add(8));
-            let bv = vld1q_f32(b.add(8));
-
-            ab_ij_equals_ai_bj!(ab3, av, bv);
-
-            let av = vld1q_f32(a.add(12));
-            let bv = vld1q_f32(b.add(12));
-
-            ab_ij_equals_ai_bj!(ab4, av, bv);
-
-        }
-
-        a = a.offset(UNROLL_BY as isize * MR as isize);
-        b = b.offset(UNROLL_BY as isize * NR as isize);
-    }
-
-    for _ in 0..k % UNROLL_BY {
-        let av = vld1q_f32(a);
-        let bv = vld1q_f32(b);
-
-        ab_ij_equals_ai_bj!(ab, av, bv);
-
-        a = a.offset(MR as isize);
-        b = b.offset(NR as isize);
-    }
-
-    macro_rules! c {
-        ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize));
-    }
-
-    macro_rules! extract {
-        ($v:expr, $imm:expr) => (
-            f32::from_bits(vgetq_lane_u32(core::mem::transmute::<_, uint32x4_t>($v), $imm))
-        )
-    }
-
-    // Combine accumulators and multiply by alpha
-    loop4!(i, ab[i] = vaddq_f32(vaddq_f32(ab[i], ab2[i]), vaddq_f32(ab3[i], ab4[i])));
-    loop4!(i, ab[i] = vmulq_n_f32(ab[i], alpha));
-
-    if beta == 0. {
-        // set C = α A B
-        if csc == 1 {
-            loop4!(i, vst1q_f32(c![i, 0], ab[i]));
-        } else {
-            loop4!(i, vst1q_lane_f32(c![i, 0], ab[i], 0));
-            loop4!(i, vst1q_lane_f32(c![i, 1], ab[i], 1));
-            loop4!(i, vst1q_lane_f32(c![i, 2], ab[i], 2));
-            loop4!(i, vst1q_lane_f32(c![i, 3], ab[i], 3));
-        }
-    } else {
-        // set C = α A B + beta C
-        loop4!(i, *c![i, 0] = *c![i, 0] * beta + extract!(ab[i], 0));
-        loop4!(i, *c![i, 1] = *c![i, 1] * beta + extract!(ab[i], 1));
-        loop4!(i, *c![i, 2] = *c![i, 2] * beta + extract!(ab[i], 2));
-        loop4!(i, *c![i, 3] = *c![i, 3] * beta + extract!(ab[i], 3));
-    }
-}
-
diff --git a/spare_kernels/x86_sse_sgemm.rs b/spare_kernels/x86_sse_sgemm.rs
deleted file mode 100644
index 720c93c..0000000
--- a/spare_kernels/x86_sse_sgemm.rs
+++ /dev/null
@@ -1,84 +0,0 @@
-
-// 4x4 sse sgemm
-macro_rules! mm_transpose4 {
-    ($c0:expr, $c1:expr, $c2:expr, $c3:expr) => {{
-        // This is _MM_TRANSPOSE4_PS except we take variables, not references
-        let tmp0 = _mm_unpacklo_ps($c0, $c1);
-        let tmp2 = _mm_unpacklo_ps($c2, $c3);
-        let tmp1 = _mm_unpackhi_ps($c0, $c1);
-        let tmp3 = _mm_unpackhi_ps($c2, $c3);
-
-        $c0 = _mm_movelh_ps(tmp0, tmp2);
-        $c1 = _mm_movehl_ps(tmp2, tmp0);
-        $c2 = _mm_movelh_ps(tmp1, tmp3);
-        $c3 = _mm_movehl_ps(tmp3, tmp1);
-    }}
-}
-
-#[inline(always)]
-#[cfg(any(target_arch="x86", target_arch="x86_64"))]
-unsafe fn kernel_x86_sse(k: usize, alpha: T, a: *const T, b: *const T,
-                         beta: T, c: *mut T, rsc: isize, csc: isize)
-{
-    let mut ab = [_mm_setzero_ps(); MR];
-
-    let mut bv;
-    let (mut a, mut b) = (a, b);
-
-    // Compute A B
-    for _ in 0..k {
-        bv = _mm_load_ps(b as _); // aligned due to GemmKernel::align_to
-
-        loop_m!(i, {
-            // Compute ab_i += [ai b_j+0, ai b_j+1, ai b_j+2, ai b_j+3]
-            let aiv = _mm_set1_ps(at(a, i));
-            ab[i] = _mm_add_ps(ab[i], _mm_mul_ps(aiv, bv));
-        });
-
-        a = a.add(MR);
-        b = b.add(NR);
-    }
-
-    // Compute α (A B)
-    let alphav = _mm_set1_ps(alpha);
-    loop_m!(i, ab[i] = _mm_mul_ps(alphav, ab[i]));
-
-    macro_rules! c {
-        ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize));
-    }
-
-    // C ← α A B + β C
-    let mut c = [_mm_setzero_ps(); MR];
-    let betav = _mm_set1_ps(beta);
-    if beta != 0. {
-        // Read C
-        if csc == 1 {
-            loop_m!(i, c[i] = _mm_loadu_ps(c![i, 0]));
-        } else if rsc == 1 {
-            loop_m!(i, c[i] = _mm_loadu_ps(c![0, i]));
-            mm_transpose4!(c[0], c[1], c[2], c[3]);
-        } else {
-            loop_m!(i, c[i] = _mm_set_ps(*c![i, 3], *c![i, 2], *c![i, 1], *c![i, 0]));
-        }
-        // Compute β C
-        loop_m!(i, c[i] = _mm_mul_ps(c[i], betav));
-    }
-
-    // Compute (α A B) + (β C)
-    loop_m!(i, c[i] = _mm_add_ps(c[i], ab[i]));
-
-    // Store C back to memory
-    if csc == 1 {
-        loop_m!(i, _mm_storeu_ps(c![i, 0], c[i]));
-    } else if rsc == 1 {
-        mm_transpose4!(c[0], c[1], c[2], c[3]);
-        loop_m!(i, _mm_storeu_ps(c![0, i], c[i]));
-    } else {
-        // extract the nth value of a vector using _mm_cvtss_f32 (extract lowest)
-        // in combination with shuffle (move nth value to first position)
-        loop_m!(i, *c![i, 0] = _mm_cvtss_f32(c[i]));
-        loop_m!(i, *c![i, 1] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 1)));
-        loop_m!(i, *c![i, 2] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 2)));
-        loop_m!(i, *c![i, 3] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 3)));
-    }
-}