LOAD(N, xmm0, xmm2, xmm4, xmm6);
XLPS128M(h, xmm0, xmm2, xmm4, xmm6);
- LOAD(m, xmm1, xmm3, xmm5, xmm7);
+ ULOAD(m, xmm1, xmm3, xmm5, xmm7);
XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
for (i = 0; i < 11; i++)
X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
X128M(h, xmm0, xmm2, xmm4, xmm6);
- X128M(m, xmm0, xmm2, xmm4, xmm6);
-
- UNLOAD(h, xmm0, xmm2, xmm4, xmm6);
+ ULOAD(m, xmm1, xmm3, xmm5, xmm7);
+ X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
- /* Restore the Floating-point status on the CPU */
- _mm_empty();
+ STORE(h, xmm0, xmm2, xmm4, xmm6);
#else
union uint512_u Ki, data;
unsigned int i;
#include <mmintrin.h>
#include <emmintrin.h>
+#ifdef __SSE3__
+# include <pmmintrin.h>
+#endif
#define LO(v) ((unsigned char) (v))
#define HI(v) ((unsigned char) (((unsigned int) (v)) >> 8))
# define _mm_cvtm64_si64(v) (long long) v
#endif
+#ifdef __SSE3__
+/*
+ * Intel Intrinsics Guide on _mm_lddqu_si128:
+ * "This intrinsic may perform better than _mm_loadu_si128 when
+ * the data crosses a cache line boundary."
+ */
+# define UMEM_READ_I128 _mm_lddqu_si128
+#else /* SSE2 */
+# define UMEM_READ_I128 _mm_loadu_si128
+#endif
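/*
 * Illustrative sketch, not part of the patch: the case the quoted Intel
 * note refers to is a 16-byte read that straddles a 64-byte cache line.
 * The function name and the offset 56 below are made up for the example;
 * both _mm_lddqu_si128 and _mm_loadu_si128 handle the access correctly,
 * lddqu merely may do it faster.
 */
static inline __m128i umem_read_line_crossing(const unsigned char *buf)
{
    /* If buf is 64-byte aligned, bytes buf[56]..buf[71] span two cache
     * lines; UMEM_READ_I128 (lddqu or loadu) reads them without faulting. */
    return UMEM_READ_I128((const __m128i *) (buf + 56));
}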
+
+/* load 512 bits from unaligned memory */
+#define ULOAD(P, xmm0, xmm1, xmm2, xmm3) { \
+ const __m128i *__m128p = (const __m128i *) P; \
+ xmm0 = UMEM_READ_I128(&__m128p[0]); \
+ xmm1 = UMEM_READ_I128(&__m128p[1]); \
+ xmm2 = UMEM_READ_I128(&__m128p[2]); \
+ xmm3 = UMEM_READ_I128(&__m128p[3]); \
+}
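/*
 * Usage sketch, not part of the patch: ULOAD pulls one 512-bit message
 * block from an arbitrarily aligned byte buffer into four XMM registers.
 * The function and variable names here are hypothetical.
 */
static inline void uload_usage_example(const unsigned char *m)
{
    __m128i r0, r1, r2, r3;

    /* Safe even when m is not 16-byte aligned. */
    ULOAD(m, r0, r1, r2, r3);

    (void) r0; (void) r1; (void) r2; (void) r3;
}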
+
+#ifdef UNALIGNED_SIMD_ACCESS
+
+# define MEM_WRITE_I128 _mm_storeu_si128
+# define MEM_READ_I128 UMEM_READ_I128
+# define LOAD ULOAD
+
+#else /* !UNALIGNED_SIMD_ACCESS */
+
+# define MEM_WRITE_I128 _mm_store_si128
+# define MEM_READ_I128 _mm_load_si128
#define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \
- const __m128i *__m128p = (const __m128i *) &P[0]; \
- xmm0 = _mm_load_si128(&__m128p[0]); \
- xmm1 = _mm_load_si128(&__m128p[1]); \
- xmm2 = _mm_load_si128(&__m128p[2]); \
- xmm3 = _mm_load_si128(&__m128p[3]); \
+ const __m128i *__m128p = (const __m128i *) P; \
+ xmm0 = MEM_READ_I128(&__m128p[0]); \
+ xmm1 = MEM_READ_I128(&__m128p[1]); \
+ xmm2 = MEM_READ_I128(&__m128p[2]); \
+ xmm3 = MEM_READ_I128(&__m128p[3]); \
}
+#endif /* !UNALIGNED_SIMD_ACCESS */
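/*
 * Alignment contract sketch, not part of the patch: without
 * UNALIGNED_SIMD_ACCESS, LOAD/STORE/X128M resolve to _mm_load_si128 and
 * _mm_store_si128 and therefore require 16-byte aligned operands, which
 * is why only the caller-supplied message is read through ULOAD.  The
 * declaration below is a hypothetical illustration of internal state
 * that satisfies the aligned path (GCC/Clang attribute syntax).
 */
static union uint512_u aligned_state_example __attribute__((aligned(16)));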
-#define UNLOAD(P, xmm0, xmm1, xmm2, xmm3) { \
+#define STORE(P, xmm0, xmm1, xmm2, xmm3) { \
__m128i *__m128p = (__m128i *) &P[0]; \
- _mm_store_si128(&__m128p[0], xmm0); \
- _mm_store_si128(&__m128p[1], xmm1); \
- _mm_store_si128(&__m128p[2], xmm2); \
- _mm_store_si128(&__m128p[3], xmm3); \
+ MEM_WRITE_I128(&__m128p[0], xmm0); \
+ MEM_WRITE_I128(&__m128p[1], xmm1); \
+ MEM_WRITE_I128(&__m128p[2], xmm2); \
+ MEM_WRITE_I128(&__m128p[3], xmm3); \
}
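/*
 * Round-trip sketch, not part of the patch: read the chaining value,
 * transform it in registers, and write it back with STORE.  The function
 * name is hypothetical; h is assumed to meet the alignment requirement
 * of the selected MEM_READ_I128 / MEM_WRITE_I128 pair.
 */
static inline void load_store_roundtrip(union uint512_u *h)
{
    __m128i r0, r1, r2, r3;

    LOAD(h, r0, r1, r2, r3);
    /* ... per-block transformation of r0..r3 would go here ... */
    STORE(h, r0, r1, r2, r3);
}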
#define X128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
    xmm0 = _mm_xor_si128(xmm0, xmm4); \
    xmm1 = _mm_xor_si128(xmm1, xmm5); \
    xmm2 = _mm_xor_si128(xmm2, xmm6); \
    xmm3 = _mm_xor_si128(xmm3, xmm7); \
}
#define X128M(P, xmm0, xmm1, xmm2, xmm3) { \
const __m128i *__m128p = (const __m128i *) &P[0]; \
- xmm0 = _mm_xor_si128(xmm0, _mm_load_si128(&__m128p[0])); \
- xmm1 = _mm_xor_si128(xmm1, _mm_load_si128(&__m128p[1])); \
- xmm2 = _mm_xor_si128(xmm2, _mm_load_si128(&__m128p[2])); \
- xmm3 = _mm_xor_si128(xmm3, _mm_load_si128(&__m128p[3])); \
+ xmm0 = _mm_xor_si128(xmm0, MEM_READ_I128(&__m128p[0])); \
+ xmm1 = _mm_xor_si128(xmm1, MEM_READ_I128(&__m128p[1])); \
+ xmm2 = _mm_xor_si128(xmm2, MEM_READ_I128(&__m128p[2])); \
+ xmm3 = _mm_xor_si128(xmm3, MEM_READ_I128(&__m128p[3])); \
}
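/*
 * Semantics sketch, not part of the patch: X128M XORs the 512-bit value
 * held in four XMM registers with 512 bits read from memory through
 * MEM_READ_I128.  The byte-wise plain C equivalent below is a
 * hypothetical illustration of the same operation on 64-byte buffers.
 */
static inline void xor512_scalar_example(unsigned char *acc,
                                         const unsigned char *src)
{
    unsigned int i;

    for (i = 0; i < 64; i++)
        acc[i] ^= src[i];
}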
#define _mm_xor_64(mm0, mm1) _mm_xor_si64(mm0, _mm_cvtsi64_m64(mm1))