76 lines
1.7 KiB
C
76 lines
1.7 KiB
C
#include <immintrin.h>
#include <stdint.h>
|
|
|
|
/**
 * Copy n bytes from src to dest using unaligned 128-bit loads and stores.
 *
 * The copy proceeds strictly front-to-back, and each 16-byte chunk is fully
 * loaded before it is stored.
 *
 * @param dest  destination buffer, at least n bytes
 * @param src   source buffer, at least n bytes
 * @param n     number of bytes to copy
 * @return dest
 */
void *memcpy_simd(void *dest, const void *src, size_t n) {
    const size_t lane_bytes = sizeof(__m128i);
    unsigned char *out = dest;
    const unsigned char *in = src;
    size_t remaining = n;

    // Bulk phase: one unaligned 16-byte load/store pair per iteration.
    while (remaining >= lane_bytes) {
        const __m128i block = _mm_loadu_si128((const __m128i *)in);
        _mm_storeu_si128((__m128i *)out, block);
        in += lane_bytes;
        out += lane_bytes;
        remaining -= lane_bytes;
    }

    // Tail phase: fewer than 16 bytes left, copy them one at a time.
    while (remaining > 0) {
        *out++ = *in++;
        remaining--;
    }

    return dest;
}
|
|
|
|
/**
 * Fill the first n bytes of dest with the byte value of c.
 *
 * Full 16-byte chunks are written with unaligned 128-bit stores; any
 * remaining bytes are written individually.
 *
 * @param dest  buffer to fill, at least n bytes
 * @param c     fill value; it is narrowed to a single byte
 * @param n     number of bytes to write
 * @return dest
 */
void *memset_simd(void *dest, int c, size_t n) {
    const size_t lane_bytes = sizeof(__m128i);
    const __m128i pattern = _mm_set1_epi8((char)c);  // c replicated into all 16 lanes
    const unsigned char fill = (unsigned char)c;
    unsigned char *out = dest;
    size_t remaining = n;

    // Bulk phase: 16 bytes per store.
    while (remaining >= lane_bytes) {
        _mm_storeu_si128((__m128i *)out, pattern);
        out += lane_bytes;
        remaining -= lane_bytes;
    }

    // Tail phase: fewer than 16 bytes left.
    while (remaining > 0) {
        *out++ = fill;
        remaining--;
    }

    return dest;
}
|
|
|
|
/**
 * Compare the first n bytes of s1 and s2.
 *
 * Bytes are compared as unsigned char. Full 16-byte chunks are tested for
 * equality with SSE2; the exact mismatch position is then found bytewise.
 *
 * @param s1  first buffer, at least n bytes
 * @param s2  second buffer, at least n bytes
 * @param n   number of bytes to compare
 * @return 0 if all n bytes are equal, otherwise the difference
 *         (s1 byte minus s2 byte) at the first position where they differ
 */
int memcmp_simd(const void *s1, const void *s2, size_t n) {
    const size_t lane_bytes = sizeof(__m128i);
    const int all_lanes_equal = 0xFFFF;  // one mask bit per byte lane
    const unsigned char *lhs = s1;
    const unsigned char *rhs = s2;
    size_t pos = 0;

    // Skip over leading 16-byte chunks that are entirely equal.
    while (n - pos >= lane_bytes) {
        const __m128i left = _mm_loadu_si128((const __m128i *)(lhs + pos));
        const __m128i right = _mm_loadu_si128((const __m128i *)(rhs + pos));
        const int equal_bits = _mm_movemask_epi8(_mm_cmpeq_epi8(left, right));

        if (equal_bits != all_lanes_equal) {
            // This chunk holds a mismatch; the byte loop below locates it.
            break;
        }
        pos += lane_bytes;
    }

    // Bytewise scan: finds the mismatch in a differing chunk, or handles
    // the final partial chunk.
    while (pos < n) {
        if (lhs[pos] != rhs[pos]) {
            return lhs[pos] - rhs[pos];
        }
        pos++;
    }

    return 0;
}
|
|
|
|
/**
 * Copy n bytes from src to dest; the two regions may overlap.
 *
 * When dest lies below src the copy runs front-to-back via memcpy_simd,
 * otherwise it runs back-to-front, so source bytes are always read before
 * an overlapping store can overwrite them.
 *
 * @param dest  destination buffer, at least n bytes
 * @param src   source buffer, at least n bytes
 * @param n     number of bytes to move
 * @return dest
 */
void *memmove_simd(void *dest, const void *src, size_t n) {
    unsigned char *d = dest;
    const unsigned char *s = src;

    // Nothing to do; also avoids a pointless self-copy.
    if (n == 0 || d == s) {
        return dest;
    }

    // Compare addresses as integers: using `<` directly on pointers into
    // unrelated objects is undefined behaviour (C11 6.5.8).
    if ((uintptr_t)d < (uintptr_t)s) {
        // Relies on memcpy_simd copying front-to-back and loading each
        // 16-byte chunk before storing it.
        return memcpy_simd(dest, src, n);
    }

    // dest is above src: walk backwards from the end of the region.
    size_t i = n;
    while (i >= 16) {
        i -= 16;
        __m128i chunk = _mm_loadu_si128((const __m128i *)(s + i));
        _mm_storeu_si128((__m128i *)(d + i), chunk);
    }

    // Fewer than 16 bytes remain at the front; copy them high to low.
    while (i > 0) {
        i--;
        d[i] = s[i];
    }

    return dest;
}
|