+#ifdef __SSE3__
+/*
+ * SSE3 lddqu — per the Intel Intrinsics Guide: "This intrinsic may
+ * perform better than _mm_loadu_si128 when the data crosses a cache
+ * line boundary."  Functionally identical to _mm_loadu_si128.
+ */
+# define UMEM_READ_I128 _mm_lddqu_si128
+#else /* SSE2 */
+/* SSE2 baseline: plain unaligned 128-bit load. */
+# define UMEM_READ_I128 _mm_loadu_si128
+#endif
+
+/* load 512bit from unaligned memory */
+#define ULOAD(P, xmm0, xmm1, xmm2, xmm3) { \
+ const __m128i *__m128p = (const __m128i *) P; \
+ xmm0 = UMEM_READ_I128(&__m128p[0]); \
+ xmm1 = UMEM_READ_I128(&__m128p[1]); \
+ xmm2 = UMEM_READ_I128(&__m128p[2]); \
+ xmm3 = UMEM_READ_I128(&__m128p[3]); \
+}
+
+#ifdef UNALIGNED_SIMD_ACCESS
+
+/*
+ * Caller cannot guarantee 16-byte alignment: route all 128-bit
+ * loads/stores through the unaligned intrinsics selected above.
+ */
+# define MEM_WRITE_I128 _mm_storeu_si128
+# define MEM_READ_I128 UMEM_READ_I128
+# define LOAD ULOAD
+
+#else /* !UNALIGNED_SIMD_ACCESS */
+
+/*
+ * Aligned path: _mm_store_si128/_mm_load_si128 require the address to
+ * be 16-byte aligned — behavior is undefined (typically #GP fault) on
+ * a misaligned pointer.
+ */
+# define MEM_WRITE_I128 _mm_store_si128
+# define MEM_READ_I128 _mm_load_si128