// This software is licensed under a dual license model: // // GNU Affero General Public License v3 (AGPLv3): You may use, modify, and // distribute this software under the terms of the AGPLv3. // // Elastic License v2 (ELv2): You may also use, modify, and distribute this // software under the Elastic License v2, which has specific restrictions. // // We welcome any commercial collaboration or support. For inquiries // regarding the licenses, please contact us at: // vectorchord-inquiry@tensorchord.ai // // Copyright (c) 2025 TensorChord Inc. #![allow(unsafe_code)] use criterion::{Criterion, criterion_group, criterion_main}; fn floating_f32_reduce_sum_of_xy(c: &mut Criterion) { use rand::Rng; let mut rng = rand::rng(); let x = (0..4095) .map(|_| rng.random_range(-1.0..=1.0f32)) .collect::>(); let y = (0..4095) .map(|_| rng.random_range(-1.0..=1.0f32)) .collect::>(); #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v4") { c.bench_function("floating_f32::reduce_sum_of_xy::v4", |b| { b.iter(|| unsafe { simd::floating_f32::reduce_sum_of_xy::reduce_sum_of_xy_v4(&x, &y) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v3") { c.bench_function("floating_f32::reduce_sum_of_xy::v3", |b| { b.iter(|| unsafe { simd::floating_f32::reduce_sum_of_xy::reduce_sum_of_xy_v3(&x, &y) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v2") && simd::is_feature_detected!("fma") { c.bench_function("floating_f32::reduce_sum_of_xy::v2_fma", |b| { b.iter(|| unsafe { simd::floating_f32::reduce_sum_of_xy::reduce_sum_of_xy_v2_fma(&x, &y) }) }); } #[cfg(all(target_arch = "aarch64", target_endian = "little"))] if simd::is_cpu_detected!("a3.256") { c.bench_function("floating_f32::reduce_sum_of_xy::a3_256", |b| { b.iter(|| unsafe { simd::floating_f32::reduce_sum_of_xy::reduce_sum_of_xy_a3_256(&x, &y) }) }); } #[cfg(target_arch = "aarch64")] if simd::is_cpu_detected!("a2") { c.bench_function("floating_f32::reduce_sum_of_xy::a2", |b| { b.iter(|| unsafe { simd::floating_f32::reduce_sum_of_xy::reduce_sum_of_xy_a2(&x, &y) }) }); } } fn floating_f32_reduce_sum_of_d2(c: &mut Criterion) { use rand::Rng; let mut rng = rand::rng(); let x = (0..4095) .map(|_| rng.random_range(-1.0..=1.0f32)) .collect::>(); let y = (0..4095) .map(|_| rng.random_range(-1.0..=1.0f32)) .collect::>(); #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v4") { c.bench_function("floating_f32::reduce_sum_of_d2::v4", |b| { b.iter(|| unsafe { simd::floating_f32::reduce_sum_of_d2::reduce_sum_of_d2_v4(&x, &y) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v3") { c.bench_function("floating_f32::reduce_sum_of_d2::v3", |b| { b.iter(|| unsafe { simd::floating_f32::reduce_sum_of_d2::reduce_sum_of_d2_v3(&x, &y) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v2") && simd::is_feature_detected!("fma") { c.bench_function("floating_f32::reduce_sum_of_d2::v2_fma", |b| { b.iter(|| unsafe { simd::floating_f32::reduce_sum_of_d2::reduce_sum_of_d2_v2_fma(&x, &y) }) }); } #[cfg(all(target_arch = "aarch64", target_endian = "little"))] if simd::is_cpu_detected!("a3.256") { c.bench_function("floating_f32::reduce_sum_of_d2::a3_256", |b| { b.iter(|| unsafe { simd::floating_f32::reduce_sum_of_d2::reduce_sum_of_d2_a3_256(&x, &y) }) }); } #[cfg(target_arch = "aarch64")] if simd::is_cpu_detected!("a2") { c.bench_function("floating_f32::reduce_sum_of_d2::a2", |b| { b.iter(|| unsafe { simd::floating_f32::reduce_sum_of_d2::reduce_sum_of_d2_a2(&x, &y) }) }); } } fn floating_f16_reduce_sum_of_xy(c: &mut Criterion) { use rand::Rng; use simd::{F16, f16}; let mut rng = rand::rng(); let x = (0..4095) .map(|_| f16::_from_f32(rng.random_range(-1.0..=1.0))) .collect::>(); let y = (0..4095) .map(|_| f16::_from_f32(rng.random_range(-1.0..=1.0))) .collect::>(); #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v4") && simd::is_feature_detected!("avx512fp16") { c.bench_function("floating_f16::reduce_sum_of_xy::v4_avx512fp16", |b| { b.iter(|| unsafe { simd::floating_f16::reduce_sum_of_xy::reduce_sum_of_xy_v4_avx512fp16(&x, &y) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v4") { c.bench_function("floating_f16::reduce_sum_of_xy::v4", |b| { b.iter(|| unsafe { simd::floating_f16::reduce_sum_of_xy::reduce_sum_of_xy_v4(&x, &y) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v3") { c.bench_function("floating_f16::reduce_sum_of_xy::v3", |b| { b.iter(|| unsafe { simd::floating_f16::reduce_sum_of_xy::reduce_sum_of_xy_v3(&x, &y) }) }); } #[cfg(target_arch = "aarch64")] if simd::is_cpu_detected!("a3.512") { c.bench_function("floating_f16::reduce_sum_of_xy::a3_512", |b| { b.iter(|| unsafe { simd::floating_f16::reduce_sum_of_xy::reduce_sum_of_xy_a3_512(&x, &y) }) }); } #[cfg(target_arch = "aarch64")] if simd::is_cpu_detected!("a2") && simd::is_feature_detected!("fp16") { c.bench_function("floating_f16::reduce_sum_of_xy::a2_fp16", |b| { b.iter(|| unsafe { simd::floating_f16::reduce_sum_of_xy::reduce_sum_of_xy_a2_fp16(&x, &y) }) }); } } fn floating_f16_reduce_sum_of_d2(c: &mut Criterion) { use rand::Rng; use simd::{F16, f16}; let mut rng = rand::rng(); let x = (0..4095) .map(|_| f16::_from_f32(rng.random_range(-1.0..=1.0))) .collect::>(); let y = (0..4095) .map(|_| f16::_from_f32(rng.random_range(-1.0..=1.0))) .collect::>(); #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v4") && simd::is_feature_detected!("avx512fp16") { c.bench_function("floating_f16::reduce_sum_of_d2::v4_avx512fp16", |b| { b.iter(|| unsafe { simd::floating_f16::reduce_sum_of_d2::reduce_sum_of_d2_v4_avx512fp16(&x, &y) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v4") { c.bench_function("floating_f16::reduce_sum_of_d2::v4", |b| { b.iter(|| unsafe { simd::floating_f16::reduce_sum_of_d2::reduce_sum_of_d2_v4(&x, &y) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v3") { c.bench_function("floating_f16::reduce_sum_of_d2::v3", |b| { b.iter(|| unsafe { simd::floating_f16::reduce_sum_of_d2::reduce_sum_of_d2_v3(&x, &y) }) }); } #[cfg(target_arch = "aarch64")] if simd::is_cpu_detected!("a3.512") { c.bench_function("floating_f16::reduce_sum_of_d2::a3_512", |b| { b.iter(|| unsafe { simd::floating_f16::reduce_sum_of_d2::reduce_sum_of_d2_a3_512(&x, &y) }) }); } #[cfg(target_arch = "aarch64")] if simd::is_cpu_detected!("a2") && simd::is_feature_detected!("fp16") { c.bench_function("floating_f16::reduce_sum_of_d2::a2_fp16", |b| { b.iter(|| unsafe { simd::floating_f16::reduce_sum_of_d2::reduce_sum_of_d2_a2_fp16(&x, &y) }) }); } } fn byte_reduce_sum_of_xy(c: &mut Criterion) { use rand::Rng; let mut rng = rand::rng(); let x = (0..4095).map(|_| rng.random::()).collect::>(); let y = (0..4095).map(|_| rng.random::()).collect::>(); #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v4") && simd::is_feature_detected!("avx512vnni") { c.bench_function("byte::reduce_sum_of_xy::v4_avx512vnni", |b| { b.iter(|| unsafe { simd::byte::reduce_sum_of_xy::reduce_sum_of_xy_v4_avx512vnni(&x, &y) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v4") { c.bench_function("byte::reduce_sum_of_xy::v4", |b| { b.iter(|| unsafe { simd::byte::reduce_sum_of_xy::reduce_sum_of_xy_v4(&x, &y) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v3") { c.bench_function("byte::reduce_sum_of_xy::v3", |b| { b.iter(|| unsafe { simd::byte::reduce_sum_of_xy::reduce_sum_of_xy_v3(&x, &y) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v2") { c.bench_function("byte::reduce_sum_of_xy::v2", |b| { b.iter(|| unsafe { simd::byte::reduce_sum_of_xy::reduce_sum_of_xy_v2(&x, &y) }) }); } #[cfg(target_arch = "aarch64")] if simd::is_cpu_detected!("a2") && simd::is_feature_detected!("dotprod") { c.bench_function("byte::reduce_sum_of_xy::a2_prod", |b| { b.iter(|| unsafe { simd::byte::reduce_sum_of_xy::reduce_sum_of_xy_a2_dotprod(&x, &y) }) }); } #[cfg(target_arch = "aarch64")] if simd::is_cpu_detected!("a2") { c.bench_function("byte::reduce_sum_of_xy::a2", |b| { b.iter(|| unsafe { simd::byte::reduce_sum_of_xy::reduce_sum_of_xy_a2(&x, &y) }) }); } } fn byte_reduce_sum_of_x(c: &mut Criterion) { use rand::Rng; let mut rng = rand::rng(); let this = (0..4095).map(|_| rng.random::()).collect::>(); #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v4") { c.bench_function("byte::reduce_sum_of_x::v4", |b| { b.iter(|| unsafe { simd::byte::reduce_sum_of_x::reduce_sum_of_x_v4(&this) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v3") { c.bench_function("byte::reduce_sum_of_x::v3", |b| { b.iter(|| unsafe { simd::byte::reduce_sum_of_x::reduce_sum_of_x_v3(&this) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v2") { c.bench_function("byte::reduce_sum_of_x::v2", |b| { b.iter(|| unsafe { simd::byte::reduce_sum_of_x::reduce_sum_of_x_v2(&this) }) }); } #[cfg(target_arch = "aarch64")] if simd::is_cpu_detected!("a2") { c.bench_function("byte::reduce_sum_of_x::a2", |b| { b.iter(|| unsafe { simd::byte::reduce_sum_of_x::reduce_sum_of_x_a2(&this) }) }); } } fn halfbyte_reduce_sum_of_xy(c: &mut Criterion) { use rand::Rng; let mut rng = rand::rng(); let x = (0..2047).map(|_| rng.random::()).collect::>(); let y = (0..2047).map(|_| rng.random::()).collect::>(); #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v4") && simd::is_feature_detected!("avx512vnni") { c.bench_function("halfbyte::reduce_sum_of_xy::v4_avx512vnni", |b| { b.iter(|| unsafe { simd::halfbyte::reduce_sum_of_xy::reduce_sum_of_xy_v4_avx512vnni(&x, &y) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v4") { c.bench_function("halfbyte::reduce_sum_of_xy::v4", |b| { b.iter(|| unsafe { simd::halfbyte::reduce_sum_of_xy::reduce_sum_of_xy_v4(&x, &y) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v3") { c.bench_function("halfbyte::reduce_sum_of_xy::v3", |b| { b.iter(|| unsafe { simd::halfbyte::reduce_sum_of_xy::reduce_sum_of_xy_v3(&x, &y) }) }); } #[cfg(target_arch = "x86_64")] if simd::is_cpu_detected!("v2") { c.bench_function("halfbyte::reduce_sum_of_xy::v2", |b| { b.iter(|| unsafe { simd::halfbyte::reduce_sum_of_xy::reduce_sum_of_xy_v2(&x, &y) }) }); } #[cfg(target_arch = "aarch64")] if simd::is_cpu_detected!("a2") && simd::is_feature_detected!("dotprod") { c.bench_function("halfbyte::reduce_sum_of_xy::a2_prod", |b| { b.iter(|| unsafe { simd::halfbyte::reduce_sum_of_xy::reduce_sum_of_xy_a2_dotprod(&x, &y) }) }); } #[cfg(target_arch = "aarch64")] if simd::is_cpu_detected!("a2") { c.bench_function("halfbyte::reduce_sum_of_xy::a2", |b| { b.iter(|| unsafe { simd::halfbyte::reduce_sum_of_xy::reduce_sum_of_xy_a2(&x, &y) }) }); } } criterion_group!( benches, floating_f32_reduce_sum_of_xy, floating_f32_reduce_sum_of_d2, floating_f16_reduce_sum_of_xy, floating_f16_reduce_sum_of_d2, byte_reduce_sum_of_xy, byte_reduce_sum_of_x, halfbyte_reduce_sum_of_xy, ); criterion_main!(benches);