*/
#include "gosthash2012.h"
+#ifdef __x86_64__
+# ifdef _MSC_VER
+# include <intrin.h>
+# else
+# include <x86intrin.h>
+# endif
+#endif
#if defined(_WIN32) || defined(_WINDOWS)
# define INLINE __inline
void init_gost2012_hash_ctx(gost2012_hash_ctx * CTX,
const unsigned int digest_size)
{
- unsigned int i;
-
memset(CTX, 0, sizeof(gost2012_hash_ctx));
CTX->digest_size = digest_size;
+ /*
+ * IV for 512-bit hash should be 0^512
+ * IV for 256-bit hash should be (00000001)^64
+ *
+ * It's already zeroed when CTX is cleared above, so we only
+ * need to set it to 0x01-s for 256-bit hash.
+ */
if (digest_size == 256)
memset(&CTX->h, 0x01, sizeof(uint512_u));
- else
- memset(&CTX->h, 0x00, sizeof(uint512_u));
}
static INLINE void pad(gost2012_hash_ctx * CTX)
{
- unsigned char buf[64];
-
- if (CTX->bufsize > 63)
- return;
-
- memset(&buf, 0x00, sizeof buf);
- memcpy(&buf, CTX->buffer, CTX->bufsize);
+ memset(&(CTX->buffer.B[CTX->bufsize]), 0, sizeof(CTX->buffer) - CTX->bufsize);
+ CTX->buffer.B[CTX->bufsize] = 1;
- buf[CTX->bufsize] = 0x01;
- memcpy(CTX->buffer, &buf, sizeof buf);
}
-static INLINE void add512(const union uint512_u *x,
- const union uint512_u *y, union uint512_u *r)
+static INLINE void add512(union uint512_u * RESTRICT x,
+ const union uint512_u * RESTRICT y)
{
#ifndef __GOST3411_BIG_ENDIAN__
- unsigned int CF, OF;
+ unsigned int CF = 0;
unsigned int i;
- CF = 0;
+# ifdef HAVE_ADDCARRY_U64
+ for (i = 0; i < 8; i++)
+ CF = _addcarry_u64(CF, x->QWORD[i] , y->QWORD[i], &(x->QWORD[i]));
+# else
for (i = 0; i < 8; i++) {
- r->QWORD[i] = x->QWORD[i] + y->QWORD[i];
- if (r->QWORD[i] < y->QWORD[i] || r->QWORD[i] < x->QWORD[i])
- OF = 1;
- else
- OF = 0;
-
- r->QWORD[i] += CF;
- CF = OF;
+ const unsigned long long left = x->QWORD[i];
+ unsigned long long sum;
+
+ sum = left + y->QWORD[i] + CF;
+ /*
+ * (sum == left): is noop, because it's possible only
+ * when `left' is added with `0 + 0' or with `ULLONG_MAX + 1',
+ * in that case `CF' (carry) retain previous value, which is correct,
+ * because when `left + 0 + 0' there was no overflow (thus no carry),
+ * and when `left + ULLONG_MAX + 1' value is wrapped back to
+ * itself with overflow, thus creating carry.
+ *
+ * (sum != left):
+ * if `sum' is not wrapped (sum > left) there should not be carry,
+ * if `sum' is wrapped (sum < left) there should be carry.
+ */
+ if (sum != left)
+ CF = (sum < left);
+ x->QWORD[i] = sum;
}
-#else
- const unsigned char *xp, *yp;
- unsigned char *rp;
+# endif /* !__x86_64__ */
+#else /* __GOST3411_BIG_ENDIAN__ */
+ const unsigned char *yp;
+ unsigned char *xp;
unsigned int i;
int buf;
- xp = (const unsigned char *)&x[0];
+ xp = (unsigned char *)&x[0];
yp = (const unsigned char *)&y[0];
- rp = (unsigned char *)&r[0];
buf = 0;
for (i = 0; i < 64; i++) {
buf = xp[i] + yp[i] + (buf >> 8);
- rp[i] = (unsigned char)buf & 0xFF;
+ xp[i] = (unsigned char)buf & 0xFF;
}
-#endif
+#endif /* __GOST3411_BIG_ENDIAN__ */
}
-static void g(union uint512_u *h, const union uint512_u *N,
- const unsigned char *m)
+static void g(union uint512_u *h, const union uint512_u * RESTRICT N,
+ const union uint512_u * RESTRICT m)
{
#ifdef __GOST3411_HAS_SSE2__
__m128i xmm0, xmm2, xmm4, xmm6; /* XMMR0-quadruple */
LOAD(N, xmm0, xmm2, xmm4, xmm6);
XLPS128M(h, xmm0, xmm2, xmm4, xmm6);
- LOAD(m, xmm1, xmm3, xmm5, xmm7);
+ ULOAD(m, xmm1, xmm3, xmm5, xmm7);
XLPS128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
for (i = 0; i < 11; i++)
X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
X128M(h, xmm0, xmm2, xmm4, xmm6);
- X128M(m, xmm0, xmm2, xmm4, xmm6);
-
- UNLOAD(h, xmm0, xmm2, xmm4, xmm6);
+ ULOAD(m, xmm1, xmm3, xmm5, xmm7);
+ X128R(xmm0, xmm2, xmm4, xmm6, xmm1, xmm3, xmm5, xmm7);
- /* Restore the Floating-point status on the CPU */
- _mm_empty();
+ STORE(h, xmm0, xmm2, xmm4, xmm6);
#else
union uint512_u Ki, data;
unsigned int i;
/* E() done */
X((&data), h, (&data));
- X((&data), ((const union uint512_u *)&m[0]), h);
+ X((&data), m, h);
#endif
}
-static INLINE void stage2(gost2012_hash_ctx * CTX, const unsigned char *data)
+static INLINE void stage2(gost2012_hash_ctx * CTX, const union uint512_u *data)
{
g(&(CTX->h), &(CTX->N), data);
- add512(&(CTX->N), &buffer512, &(CTX->N));
- add512(&(CTX->Sigma), (const union uint512_u *)data, &(CTX->Sigma));
+ add512(&(CTX->N), &buffer512);
+ add512(&(CTX->Sigma), data);
}
static INLINE void stage3(gost2012_hash_ctx * CTX)
{
- ALIGN(16) union uint512_u buf;
-
- memset(&buf, 0x00, sizeof buf);
- memcpy(&buf, &(CTX->buffer), CTX->bufsize);
- memcpy(&(CTX->buffer), &buf, sizeof(uint512_u));
+ pad(CTX);
+ g(&(CTX->h), &(CTX->N), &(CTX->buffer));
+ add512(&(CTX->Sigma), &CTX->buffer);
- memset(&buf, 0x00, sizeof buf);
+ memset(&(CTX->buffer.B[0]), 0, sizeof(uint512_u));
#ifndef __GOST3411_BIG_ENDIAN__
- buf.QWORD[0] = CTX->bufsize << 3;
+ CTX->buffer.QWORD[0] = CTX->bufsize << 3;
#else
- buf.QWORD[0] = BSWAP64(CTX->bufsize << 3);
+ CTX->buffer.QWORD[0] = BSWAP64(CTX->bufsize << 3);
#endif
+ add512(&(CTX->N), &(CTX->buffer));
- pad(CTX);
-
- g(&(CTX->h), &(CTX->N), (const unsigned char *)&(CTX->buffer));
-
- add512(&(CTX->N), &buf, &(CTX->N));
- add512(&(CTX->Sigma), (const union uint512_u *)&CTX->buffer[0],
- &(CTX->Sigma));
-
- g(&(CTX->h), &buffer0, (const unsigned char *)&(CTX->N));
-
- g(&(CTX->h), &buffer0, (const unsigned char *)&(CTX->Sigma));
- memcpy(&(CTX->hash), &(CTX->h), sizeof(uint512_u));
+ g(&(CTX->h), &buffer0, &(CTX->N));
+ g(&(CTX->h), &buffer0, &(CTX->Sigma));
}
/*
void gost2012_hash_block(gost2012_hash_ctx * CTX,
const unsigned char *data, size_t len)
{
- size_t chunksize;
-
- while (len > 63 && CTX->bufsize == 0) {
- stage2(CTX, data);
-
- data += 64;
- len -= 64;
+ register size_t chunksize;
+ register size_t bufsize = CTX->bufsize;
+
+ if (bufsize == 0) {
+ while (len >= 64) {
+ memcpy(&CTX->buffer.B[0], data, 64);
+ stage2(CTX, &(CTX->buffer));
+ data += 64;
+ len -= 64;
+ }
}
while (len) {
- chunksize = 64 - CTX->bufsize;
+ chunksize = 64 - bufsize;
if (chunksize > len)
chunksize = len;
- memcpy(&CTX->buffer[CTX->bufsize], data, chunksize);
+ memcpy(&CTX->buffer.B[bufsize], data, chunksize);
- CTX->bufsize += chunksize;
+ bufsize += chunksize;
len -= chunksize;
data += chunksize;
- if (CTX->bufsize == 64) {
- stage2(CTX, CTX->buffer);
-
- CTX->bufsize = 0;
+ if (bufsize == 64) {
+ stage2(CTX, &(CTX->buffer) );
+ bufsize = 0;
}
}
+ CTX->bufsize = bufsize;
}
/*
CTX->bufsize = 0;
if (CTX->digest_size == 256)
- memcpy(digest, &(CTX->hash.QWORD[4]), 32);
+ memcpy(digest, &(CTX->h.QWORD[4]), 32);
else
- memcpy(digest, &(CTX->hash.QWORD[0]), 64);
+ memcpy(digest, &(CTX->h.QWORD[0]), 64);
}