/*
 * Implementation of core functions for GOST R 34.11-2012 using SSE2.
 *
 * Copyright (c) 2013 Cryptocom LTD.
 * This file is distributed under the same license as OpenSSL.
 *
 * Author: Alexey Degtyarev <alexey@renatasystems.org>
 */

#ifndef __GOST3411_HAS_SSE2__
# error "GOST R 34.11-2012: SSE2 not enabled"
#endif

#include <mmintrin.h>
#include <emmintrin.h>
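
/* Low and high byte of a 16-bit value. */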
#define LO(v) ((unsigned char) (v))
#define HI(v) ((unsigned char) (((unsigned int) (v)) >> 8))
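
/*
 * Choice of row extractor: EXTRACT32 accumulates in MMX registers and
 * works on 32-bit builds; EXTRACT64 accumulates in 64-bit
 * general-purpose registers and is the efficient choice on x86-64.
 */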
#ifndef __x86_64__
# define EXTRACT EXTRACT32
#else
# define EXTRACT EXTRACT64
#endif
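
/*
 * Plain-cast stand-ins for the MMX <-> 64-bit integer conversion
 * intrinsics on targets where the compiler does not provide them.
 */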
#ifndef __x86_64__
# define _mm_cvtsi64_m64(v) (__m64) v
# define _mm_cvtm64_si64(v) (long long) v
#endif
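
/*
 * Load a 512-bit value from 16-byte-aligned memory at P into four XMM
 * registers (_mm_load_si128 requires the alignment).
 */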
#define LOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) &P[0]; \
    xmm0 = _mm_load_si128(&__m128p[0]); \
    xmm1 = _mm_load_si128(&__m128p[1]); \
    xmm2 = _mm_load_si128(&__m128p[2]); \
    xmm3 = _mm_load_si128(&__m128p[3]); \
}
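
/* Store the 512-bit value in four XMM registers back to 16-byte-aligned P. */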
#define UNLOAD(P, xmm0, xmm1, xmm2, xmm3) { \
    __m128i *__m128p = (__m128i *) &P[0]; \
    _mm_store_si128(&__m128p[0], xmm0); \
    _mm_store_si128(&__m128p[1], xmm1); \
    _mm_store_si128(&__m128p[2], xmm2); \
    _mm_store_si128(&__m128p[3], xmm3); \
}
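
/* 512-bit XOR of two register quadruples: xmm0..xmm3 ^= xmm4..xmm7. */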
#define X128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
    xmm0 = _mm_xor_si128(xmm0, xmm4); \
    xmm1 = _mm_xor_si128(xmm1, xmm5); \
    xmm2 = _mm_xor_si128(xmm2, xmm6); \
    xmm3 = _mm_xor_si128(xmm3, xmm7); \
}
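
/* 512-bit XOR of the register quadruple with the 64-byte block at P. */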
#define X128M(P, xmm0, xmm1, xmm2, xmm3) { \
    const __m128i *__m128p = (const __m128i *) &P[0]; \
    xmm0 = _mm_xor_si128(xmm0, _mm_load_si128(&__m128p[0])); \
    xmm1 = _mm_xor_si128(xmm1, _mm_load_si128(&__m128p[1])); \
    xmm2 = _mm_xor_si128(xmm2, _mm_load_si128(&__m128p[2])); \
    xmm3 = _mm_xor_si128(xmm3, _mm_load_si128(&__m128p[3])); \
}
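
/* XOR an MMX register with a 64-bit table entry. */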
#define _mm_xor_64(mm0, mm1) _mm_xor_si64(mm0, _mm_cvtsi64_m64(mm1))
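
/*
 * One column of the combined S/P/L (substitution, permutation, linear)
 * step: `row' selects a pair of bytes from each XMM register via its
 * 16-bit lanes, each byte indexes the precomputed lookup table
 * Ax[8][256] (defined elsewhere in this module), and the eight 64-bit
 * entries per half are XORed together. This variant accumulates in MMX
 * registers, so it also works on 32-bit builds.
 */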
#define EXTRACT32(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    register unsigned short ax; \
    __m64 mm0, mm1; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 0); \
    mm0 = _mm_cvtsi64_m64(Ax[0][LO(ax)]); \
    mm1 = _mm_cvtsi64_m64(Ax[0][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[1][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[1][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[2][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[2][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[3][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[3][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[4][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[4][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[5][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[5][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 0); \
    mm0 = _mm_xor_64(mm0, Ax[6][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[6][HI(ax)]); \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 4); \
    mm0 = _mm_xor_64(mm0, Ax[7][LO(ax)]); \
    mm1 = _mm_xor_64(mm1, Ax[7][HI(ax)]); \
    \
    xmm4 = _mm_set_epi64(mm1, mm0); \
}
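
/*
 * Byte-indexed variant of the extractor. _mm_extract_epi8 is an SSE4.1
 * intrinsic, not SSE2, so this version is not usable in a plain SSE2
 * build; it appears to be kept for reference.
 */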
#define __EXTRACT64(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    __m128i tmm4; \
    register unsigned long long r0, r1; \
    r0 = Ax[0][_mm_extract_epi8(xmm0, row + 0)]; \
    r0 ^= Ax[1][_mm_extract_epi8(xmm0, row + 8)]; \
    r0 ^= Ax[2][_mm_extract_epi8(xmm1, row + 0)]; \
    r0 ^= Ax[3][_mm_extract_epi8(xmm1, row + 8)]; \
    r0 ^= Ax[4][_mm_extract_epi8(xmm2, row + 0)]; \
    r0 ^= Ax[5][_mm_extract_epi8(xmm2, row + 8)]; \
    r0 ^= Ax[6][_mm_extract_epi8(xmm3, row + 0)]; \
    r0 ^= Ax[7][_mm_extract_epi8(xmm3, row + 8)]; \
    \
    r1 = Ax[0][_mm_extract_epi8(xmm0, row + 1)]; \
    r1 ^= Ax[1][_mm_extract_epi8(xmm0, row + 9)]; \
    r1 ^= Ax[2][_mm_extract_epi8(xmm1, row + 1)]; \
    r1 ^= Ax[3][_mm_extract_epi8(xmm1, row + 9)]; \
    r1 ^= Ax[4][_mm_extract_epi8(xmm2, row + 1)]; \
    r1 ^= Ax[5][_mm_extract_epi8(xmm2, row + 9)]; \
    r1 ^= Ax[6][_mm_extract_epi8(xmm3, row + 1)]; \
    r1 ^= Ax[7][_mm_extract_epi8(xmm3, row + 9)]; \
    xmm4 = _mm_cvtsi64_si128((long long) r0); \
    tmm4 = _mm_cvtsi64_si128((long long) r1); \
    xmm4 = _mm_unpacklo_epi64(xmm4, tmm4); \
}
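
/*
 * Same table lookup as EXTRACT32, but accumulated in 64-bit
 * general-purpose registers; this is the efficient choice on x86-64.
 */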
#define EXTRACT64(row, xmm0, xmm1, xmm2, xmm3, xmm4) { \
    __m128i tmm4; \
    register unsigned short ax; \
    register unsigned long long r0, r1; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 0); \
    r0 = Ax[0][LO(ax)]; \
    r1 = Ax[0][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm0, row + 4); \
    r0 ^= Ax[1][LO(ax)]; \
    r1 ^= Ax[1][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 0); \
    r0 ^= Ax[2][LO(ax)]; \
    r1 ^= Ax[2][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm1, row + 4); \
    r0 ^= Ax[3][LO(ax)]; \
    r1 ^= Ax[3][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 0); \
    r0 ^= Ax[4][LO(ax)]; \
    r1 ^= Ax[4][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm2, row + 4); \
    r0 ^= Ax[5][LO(ax)]; \
    r1 ^= Ax[5][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 0); \
    r0 ^= Ax[6][LO(ax)]; \
    r1 ^= Ax[6][HI(ax)]; \
    \
    ax = (unsigned short) _mm_extract_epi16(xmm3, row + 4); \
    r0 ^= Ax[7][LO(ax)]; \
    r1 ^= Ax[7][HI(ax)]; \
    \
    xmm4 = _mm_cvtsi64_si128((long long) r0); \
    tmm4 = _mm_cvtsi64_si128((long long) r1); \
    xmm4 = _mm_unpacklo_epi64(xmm4, tmm4); \
}
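
/*
 * XLPS against a 64-byte block in memory: XOR the state with the block
 * at P, then rebuild the state through four column extractions.
 */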
#define XLPS128M(P, xmm0, xmm1, xmm2, xmm3) { \
    __m128i tmm0, tmm1, tmm2, tmm3; \
    X128M(P, xmm0, xmm1, xmm2, xmm3); \
    \
    EXTRACT(0, xmm0, xmm1, xmm2, xmm3, tmm0); \
    EXTRACT(1, xmm0, xmm1, xmm2, xmm3, tmm1); \
    EXTRACT(2, xmm0, xmm1, xmm2, xmm3, tmm2); \
    EXTRACT(3, xmm0, xmm1, xmm2, xmm3, tmm3); \
    \
    xmm0 = tmm0; \
    xmm1 = tmm1; \
    xmm2 = tmm2; \
    xmm3 = tmm3; \
}
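
/*
 * Register-to-register XLPS: xmm4..xmm7 ^= xmm0..xmm3, then the table
 * pass over the result.
 */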
#define XLPS128R(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7) { \
    __m128i tmm0, tmm1, tmm2, tmm3; \
    X128R(xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3); \
    \
    EXTRACT(0, xmm4, xmm5, xmm6, xmm7, tmm0); \
    EXTRACT(1, xmm4, xmm5, xmm6, xmm7, tmm1); \
    EXTRACT(2, xmm4, xmm5, xmm6, xmm7, tmm2); \
    EXTRACT(3, xmm4, xmm5, xmm6, xmm7, tmm3); \
    \
    xmm4 = tmm0; \
    xmm5 = tmm1; \
    xmm6 = tmm2; \
    xmm7 = tmm3; \
}
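
/*
 * One round of the compression function: advance the round key held in
 * the first quadruple with the round constant C[i] (the constant table
 * is defined elsewhere in this module), then apply XLPS with that key
 * to the state in the second quadruple.
 */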
#define ROUND128(i, xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7) { \
    XLPS128M((&C[i]), xmm0, xmm2, xmm4, xmm6); \
    XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7); \
}
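
/*
 * A minimal usage sketch, assuming 16-byte-aligned inputs h, N, m and
 * the Ax[8][256] and C[12] tables from this module, of how these
 * macros combine into the GOST R 34.11-2012 compression primitive:
 *
 *     __m128i k0, k1, k2, k3;     // round-key quadruple
 *     __m128i s0, s1, s2, s3;     // state quadruple
 *     unsigned int i;
 *
 *     LOAD(N, k0, k1, k2, k3);
 *     XLPS128M(h, k0, k1, k2, k3);                // K = LPS(h ^ N)
 *     LOAD(m, s0, s1, s2, s3);
 *     XLPS128R(k0, k1, k2, k3, s0, s1, s2, s3);   // S = LPS(m ^ K)
 *     for (i = 0; i < 11; i++)
 *         ROUND128(i, k0, k1, k2, k3, s0, s1, s2, s3);
 *     XLPS128M((&C[11]), k0, k1, k2, k3);         // final round key
 *     X128R(k0, k1, k2, k3, s0, s1, s2, s3);      // k-quadruple = E(K, m)
 *
 * XORing that result with h and m (Miyaguchi-Preneel) yields the new h.
 */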