/*
 * Implementation of core functions for GOST R 34.11-2012 using SSE2.
 *
 * Copyright (c) 2013 Cryptocom LTD.
 * This file is distributed under the same license as OpenSSL.
 *
 * Author: Alexey Degtyarev <alexey@renatasystems.org>
 */

#ifndef __GOST3411_HAS_SSE2__
# error "GOST R 34.11-2012: SSE2 not enabled"
#endif

#include <mmintrin.h>
#include <emmintrin.h>
#ifdef __SSE3__
# include <pmmintrin.h>
#endif

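/*
 * LO/HI split a 16-bit lane pulled out of an XMM register into its two
 * bytes; each byte indexes one of the precomputed Ax lookup tables
 * (defined elsewhere in this module).
 */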
#define LO(v) ((unsigned char) (v))
#define HI(v) ((unsigned char) (((unsigned int) (v)) >> 8))

#ifndef __x86_64__
# define EXTRACT EXTRACT32
#else
# define EXTRACT EXTRACT64
#endif

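/*
 * Fallback conversion shims: an __m64 and a 64-bit integer share the same
 * representation, so on compilers lacking these intrinsics a plain cast
 * is sufficient.
 */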
#ifndef __ICC
# define _mm_cvtsi64_m64(v) (__m64) v
# define _mm_cvtm64_si64(v) (long long) v
#endif

37 * "This intrinsic may perform better than _mm_loadu_si128 when
38 * the data crosses a cache line boundary."
40 # define UMEM_READ_I128 _mm_lddqu_si128
42 # define UMEM_READ_I128 _mm_loadu_si128
/* Load 512 bits from unaligned memory. */
#define ULOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) P; \
    xmm0 = UMEM_READ_I128(&__m128p[0]); \
    xmm1 = UMEM_READ_I128(&__m128p[1]); \
    xmm2 = UMEM_READ_I128(&__m128p[2]); \
    xmm3 = UMEM_READ_I128(&__m128p[3]); \
}

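/*
 * A 512-bit block occupies four consecutive __m128i words, so every
 * load/store helper here moves four XMM registers at once.
 */
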
#ifdef UNALIGNED_SIMD_ACCESS

# define MEM_WRITE_I128 _mm_storeu_si128
# define MEM_READ_I128  UMEM_READ_I128
# define LOAD           ULOAD

#else /* !UNALIGNED_SIMD_ACCESS */

# define MEM_WRITE_I128 _mm_store_si128
# define MEM_READ_I128  _mm_load_si128
#define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) P; \
    xmm0 = MEM_READ_I128(&__m128p[0]); \
    xmm1 = MEM_READ_I128(&__m128p[1]); \
    xmm2 = MEM_READ_I128(&__m128p[2]); \
    xmm3 = MEM_READ_I128(&__m128p[3]); \
}
#endif /* !UNALIGNED_SIMD_ACCESS */

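/*
 * Note: in the aligned configuration MEM_READ_I128/MEM_WRITE_I128 compile
 * to MOVDQA, which faults on pointers that are not 16-byte aligned.
 */
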
#define STORE(P, xmm0, xmm1, xmm2, xmm3) { \
    __m128i *__m128p = (__m128i *) &P[0]; \
    MEM_WRITE_I128(&__m128p[0], xmm0); \
    MEM_WRITE_I128(&__m128p[1], xmm1); \
    MEM_WRITE_I128(&__m128p[2], xmm2); \
    MEM_WRITE_I128(&__m128p[3], xmm3); \
}

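/*
 * Minimal usage sketch (hypothetical caller: `hash` is assumed to be
 * 16-byte aligned, `msg` may be arbitrarily aligned):
 *
 *     __m128i xmm0, xmm1, xmm2, xmm3;
 *     ULOAD(msg, xmm0, xmm1, xmm2, xmm3);     load one 512-bit block
 *     X128M(hash, xmm0, xmm1, xmm2, xmm3);    XOR in the current state
 *     STORE(hash, xmm0, xmm1, xmm2, xmm3);    write the result back
 */
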
#define X128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
    xmm0 = _mm_xor_si128(xmm0, xmm4); \
    xmm1 = _mm_xor_si128(xmm1, xmm5); \
    xmm2 = _mm_xor_si128(xmm2, xmm6); \
    xmm3 = _mm_xor_si128(xmm3, xmm7); \
}

#define X128M(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) &P[0]; \
    xmm0 = _mm_xor_si128(xmm0, MEM_READ_I128(&__m128p[0])); \
    xmm1 = _mm_xor_si128(xmm1, MEM_READ_I128(&__m128p[1])); \
    xmm2 = _mm_xor_si128(xmm2, MEM_READ_I128(&__m128p[2])); \
    xmm3 = _mm_xor_si128(xmm3, MEM_READ_I128(&__m128p[3])); \
}

#define _mm_xor_64(mm0, mm1) _mm_xor_si64(mm0, _mm_cvtsi64_m64(mm1))

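/*
 * XOR a 64-bit table entry into an MMX accumulator; used only by the
 * 32-bit EXTRACT32 path below, where no 64-bit GPRs are available.
 */
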
#define EXTRACT32(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    register unsigned short ax; \
    __m64 mm0, mm1; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 0); \
    mm0 = _mm_cvtsi64_m64(Ax[0][LO(ax)]); \
    mm1 = _mm_cvtsi64_m64(Ax[0][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[1][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[1][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[2][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[2][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[3][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[3][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[4][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[4][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[5][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[5][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[6][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[6][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[7][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[7][HI(ax)]); \
    \
    xmm4 = _mm_set_epi64(mm1, mm0); \
}

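/*
 * EXTRACT32 accumulates the two 32-bit halves of the result in MMX
 * registers, which leaves the x87 tag word dirty; the caller is
 * responsible for issuing _mm_empty() (EMMS) before any floating-point
 * code runs.
 */
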
#define EXTRACT64(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    __m128i tmm4; \
    register unsigned short ax; \
    register unsigned long long r0, r1; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 0); \
    r0 = Ax[0][LO(ax)]; \
    r1 = Ax[0][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 4); \
    r0 ^= Ax[1][LO(ax)]; \
    r1 ^= Ax[1][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 0); \
    r0 ^= Ax[2][LO(ax)]; \
    r1 ^= Ax[2][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 4); \
    r0 ^= Ax[3][LO(ax)]; \
    r1 ^= Ax[3][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 0); \
    r0 ^= Ax[4][LO(ax)]; \
    r1 ^= Ax[4][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 4); \
    r0 ^= Ax[5][LO(ax)]; \
    r1 ^= Ax[5][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 0); \
    r0 ^= Ax[6][LO(ax)]; \
    r1 ^= Ax[6][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 4); \
    r0 ^= Ax[7][LO(ax)]; \
    r1 ^= Ax[7][HI(ax)]; \
    \
    xmm4 = _mm_cvtsi64_si128((long long) r0); \
    tmm4 = _mm_cvtsi64_si128((long long) r1); \
    xmm4 = _mm_unpacklo_epi64(xmm4, tmm4); \
}

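/*
 * Both EXTRACT variants compute one 64-bit output word of the combined
 * LPS transformation (S-box substitution, byte permutation and the linear
 * map L) as the XOR of eight entries of the precomputed tables Ax[8][256],
 * one entry per input byte of the selected row.
 */
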
#define XLPS128M(P, xmm0, xmm1, xmm2, xmm3) { \
    __m128i tmm0, tmm1, tmm2, tmm3; \
    X128M(P, xmm0, xmm1, xmm2, xmm3); \
    \
    EXTRACT(0, xmm0, xmm1, xmm2, xmm3, tmm0); \
    EXTRACT(1, xmm0, xmm1, xmm2, xmm3, tmm1); \
    EXTRACT(2, xmm0, xmm1, xmm2, xmm3, tmm2); \
    EXTRACT(3, xmm0, xmm1, xmm2, xmm3, tmm3); \
    \
    xmm0 = tmm0; \
    xmm1 = tmm1; \
    xmm2 = tmm2; \
    xmm3 = tmm3; \
}

#define XLPS128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
    __m128i tmm0, tmm1, tmm2, tmm3; \
    X128R(xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3); \
    \
    EXTRACT(0, xmm4, xmm5, xmm6, xmm7, tmm0); \
    EXTRACT(1, xmm4, xmm5, xmm6, xmm7, tmm1); \
    EXTRACT(2, xmm4, xmm5, xmm6, xmm7, tmm2); \
    EXTRACT(3, xmm4, xmm5, xmm6, xmm7, tmm3); \
    \
    xmm4 = tmm0; \
    xmm5 = tmm1; \
    xmm6 = tmm2; \
    xmm7 = tmm3; \
}

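/*
 * XLPS128M takes its XOR operand from memory (used for the round constants
 * C[i]), while XLPS128R works register-to-register, overwriting xmm4..xmm7
 * with the transformed value and leaving xmm0..xmm3 intact for the next
 * round.
 */
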
#define ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7) { \
    XLPS128M((&C[i]), xmm0, xmm2, xmm4, xmm6); \
    XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); \
}

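/*
 * Sketch of how a compression-function round loop would drive these macros
 * (illustration only, assuming the layout used by the accompanying core:
 * C[] holds the twelve 512-bit round constants of GOST R 34.11-2012, one
 * XMM quadruple carries the key schedule and the other the state):
 *
 *     for (i = 0; i < 11; i++)
 *         ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
 *
 *     XLPS128M((&C[11]), xmm0, xmm2, xmm4, xmm6);
 *     X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
 */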