// This software is licensed under a dual license model: // // GNU Affero General Public License v3 (AGPLv3): You may use, modify, and // distribute this software under the terms of the AGPLv3. // // Elastic License v2 (ELv2): You may also use, modify, and distribute this // software under the Elastic License v2, which has specific restrictions. // // We welcome any commercial collaboration or support. For inquiries // regarding the licenses, please contact us at: // vectorchord-inquiry@tensorchord.ai // // Copyright (c) 2025 TensorChord Inc. #if defined(__clang__) #if !(__clang_major__ >= 16) #error "Clang version must be at least 16." #endif #else #error "This file requires Clang." #endif #include #include #include typedef _Float16 f16; typedef float f32; __attribute__((target("arch=x86-64-v4,avx512fp16"))) float fp16_reduce_sum_of_xy_v4fp16(f16 *restrict a, f16 *restrict b, size_t n) { __m512h xy = _mm512_setzero_ph(); while (n >= 32) { __m512h x = _mm512_loadu_ph(a); __m512h y = _mm512_loadu_ph(b); a += 32; b += 32; n -= 32; xy = _mm512_fmadd_ph(x, y, xy); } if (n > 0) { unsigned int mask = _bzhi_u32(0xffffffff, n); __m512h x = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, a)); __m512h y = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, b)); xy = _mm512_fmadd_ph(x, y, xy); } return _mm512_reduce_add_ph(xy); } __attribute__((target("arch=x86-64-v4,avx512fp16"))) float fp16_reduce_sum_of_d2_v4fp16(f16 *restrict a, f16 *restrict b, size_t n) { __m512h d2 = _mm512_setzero_ph(); while (n >= 32) { __m512h x = _mm512_loadu_ph(a); __m512h y = _mm512_loadu_ph(b); a += 32; b += 32; n -= 32; __m512h d = _mm512_sub_ph(x, y); d2 = _mm512_fmadd_ph(d, d, d2); } if (n > 0) { unsigned int mask = _bzhi_u32(0xffffffff, n); __m512h x = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, a)); __m512h y = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(mask, b)); __m512h d = _mm512_sub_ph(x, y); d2 = _mm512_fmadd_ph(d, d, d2); } return _mm512_reduce_add_ph(d2); }