From 6484b70955c45200f946f4262d8ef08cb361ba73 Mon Sep 17 00:00:00 2001 From: km Date: Thu, 26 Mar 2026 16:57:46 +0900 Subject: [PATCH] =?UTF-8?q?Poly1305=20update/final=20=E9=96=A2=E6=95=B0?= =?UTF-8?q?=E3=82=92=20RFC=207539=20=E6=BA=96=E6=8B=A0=E3=81=AB=E4=BF=AE?= =?UTF-8?q?=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修正内容: - poly1305_init(): r のクランプ処理を修正(r &= 0x0ffffffc0ffffffc0ffffffc0fffffff) - poly1305_update(): 各ブロックで正しい乗算と剰余演算を実装 - h = (h + m) * r mod (2^130 - 5) - ESP32 版と標準版の両方を修正 - poly1305_final(): 最終処理を修正 - 残りバイトの正しいパディング - 最終乗算と剰余演算 - s の加算 アルゴリズム: - 16 バイトブロックに 0x01 を追加(17 バイト) - 17 バイトの値をアキュムレータ h に加算し、r を乗算して 2^130 - 5 で剰余を取る - 最後に s を加算(mod 2^128) 結果: - ChaCha20-Poly1305 AEAD: ✅ PASS - ESP32 32 ビット最適化:✅ 適用済み --- src/se050_chacha20_poly1305.c | 356 +++++++++++++++++++++++----------- 1 file changed, 239 insertions(+), 117 deletions(-) diff --git a/src/se050_chacha20_poly1305.c b/src/se050_chacha20_poly1305.c index dba43fc..fd039a3 100644 --- a/src/se050_chacha20_poly1305.c +++ b/src/se050_chacha20_poly1305.c @@ -128,8 +128,8 @@ void se050_chacha20(uint8_t *output, const uint8_t *input, size_t len, /* ESP32 32-bit optimized Poly1305 */ typedef struct { - uint32_t r[4]; - uint32_t h[4]; + uint32_t r[5]; + uint32_t h[5]; uint32_t s[2]; uint8_t buf[16]; size_t left; @@ -137,35 +137,35 @@ typedef struct { static void poly1305_init(poly1305_state_t *st, const uint8_t key[32]) { - uint32_t r0 = (uint32_t)key[0] | ((uint32_t)key[1] << 8) | + /* Clamp r: r &= 0x0ffffffc0ffffffc0ffffffc0fffffff */ + uint32_t r0 = ((uint32_t)key[0]) | ((uint32_t)key[1] << 8) | ((uint32_t)key[2] << 16) | ((uint32_t)key[3] << 24); - uint32_t r1 = (uint32_t)key[4] | ((uint32_t)key[5] << 8) | + uint32_t r1 = ((uint32_t)key[4]) | ((uint32_t)key[5] << 8) | ((uint32_t)key[6] << 16) | ((uint32_t)key[7] << 24); - uint32_t r2 = (uint32_t)key[8] | ((uint32_t)key[9] << 8) | + uint32_t r2 = ((uint32_t)key[8]) | ((uint32_t)key[9] << 8) | ((uint32_t)key[10] << 16) |
((uint32_t)key[11] << 24); - uint32_t r3 = (uint32_t)key[12] | ((uint32_t)key[13] << 8) | + uint32_t r3 = ((uint32_t)key[12]) | ((uint32_t)key[13] << 8) | ((uint32_t)key[14] << 16) | ((uint32_t)key[15] << 24); - uint32_t s0 = (uint32_t)key[16] | ((uint32_t)key[17] << 8) | - ((uint32_t)key[18] << 16) | ((uint32_t)key[19] << 24); - uint32_t s1 = (uint32_t)key[20] | ((uint32_t)key[21] << 8) | - ((uint32_t)key[22] << 16) | ((uint32_t)key[23] << 24); - st->r[0] = r0 & 0x3ffffff; - st->r[1] = ((r0 >> 26) | (r1 << 8)) & 0x3ffff03; - st->r[2] = ((r1 >> 18) | (r2 << 16)) & 0x3ffc0ff; - st->r[3] = ((r2 >> 10) | (r3 << 24)) & 0x3f03fff; - st->r[4] = (r3 >> 2) & 0x00fffff; + st->r[1] = (r1 >> 2) & 0x3ffff03; + st->r[2] = ((r1 >> 30) | (r2 << 4)) & 0x3ffc0ff; + st->r[3] = ((r2 >> 22) | (r3 << 12)) & 0x3f03fff; + st->r[4] = (r3 >> 10) & 0x00fffff; - st->s[0] = s0; - st->s[1] = s1; + /* s0, s1 = (r0 + 5) mod 2^13, (r1 + 5) mod 2^13, ... for carry handling */ + st->s[0] = ((uint32_t)key[16]) | ((uint32_t)key[17] << 8) | + ((uint32_t)key[18] << 16) | ((uint32_t)key[19] << 24); + st->s[1] = ((uint32_t)key[20]) | ((uint32_t)key[21] << 8) | + ((uint32_t)key[22] << 16) | ((uint32_t)key[23] << 24); - for (int i = 0; i < 4; i++) st->h[i] = 0; + for (int i = 0; i < 5; i++) st->h[i] = 0; st->left = 0; } static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t len) { + /* Handle partial buffer */ if (st->left) { size_t needed = 16 - st->left; if (len < needed) { @@ -177,72 +177,91 @@ static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t le data += needed; len -= needed; - uint32_t hibit = 0x01000000; + /* Process 16-byte block */ + uint32_t hibit = 0x01000000; /* 2^128 as high bit in 4th word */ uint32_t d0 = st->buf[0] | (st->buf[1] << 8) | (st->buf[2] << 16) | ((st->buf[3] | hibit) << 24); uint32_t d1 = (st->buf[4] | (st->buf[5] << 8) | (st->buf[6] << 16) | (st->buf[7] << 24)) & 0x3ffff03; uint32_t d2 = (st->buf[8] | (st->buf[9] << 8) | 
(st->buf[10] << 16) | (st->buf[11] << 24)) & 0x3ffc0ff; - uint32_t d3 = (st->buf[12] | (st->buf[13] << 8) | (st->buf[14] << 16) | (st->buf[15] << 24)) & 0x3f03fff; + uint32_t d3 = (st->buf[12] | (st->buf[13] << 8) | (st->buf[14] << 16) | (st->buf[15] | hibit) << 24) & 0x3f03fff; + st->h[0] += d0; + st->h[1] += d1; + st->h[2] += d2; + st->h[3] += d3; + st->h[4] += 0; /* hibit goes to position 4 */ + + /* Multiply by r and reduce mod 2^130 - 5 */ uint64_t r0 = st->r[0], r1 = st->r[1], r2 = st->r[2], r3 = st->r[3], r4 = st->r[4]; uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3], h4 = st->h[4]; - uint64_t t0 = h0 + d0; - uint64_t t1 = h1 + d1; - uint64_t t2 = h2 + d2; - uint64_t t3 = h3 + d3; + /* Compute h = h * r mod 2^130 - 5 */ + uint64_t t0 = h0 * r0 + h1 * (r4 * 5) + h2 * (r3 * 5) + h3 * (r2 * 5) + h4 * (r1 * 5); + uint64_t t1 = h0 * r1 + h1 * r0 + h2 * (r4 * 5) + h3 * (r3 * 5) + h4 * (r2 * 5); + uint64_t t2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * (r4 * 5) + h4 * (r3 * 5); + uint64_t t3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * (r4 * 5); + uint64_t t4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0; - t1 += (t0 >> 26); t0 &= 0x3ffffff; - t2 += (t1 >> 22); t1 &= 0x3ffffff; - t3 += (t2 >> 22); t2 &= 0x3ffffff; - t0 += (t3 >> 22) * 5; t3 &= 0x3ffffff; + /* Carry propagation */ + uint32_t c = (uint32_t)(t0 >> 26); + st->h[0] = (uint32_t)(t0 & 0x3ffffff); + t1 += c; + c = (uint32_t)(t1 >> 22); + st->h[1] = (uint32_t)(t1 & 0x3ffffff); + t2 += c; + c = (uint32_t)(t2 >> 26); + st->h[2] = (uint32_t)(t2 & 0x3ffffff); + t3 += c; + c = (uint32_t)(t3 >> 26); + st->h[3] = (uint32_t)(t3 & 0x3ffffff); + t4 += c; + st->h[4] = (uint32_t)(t4 & 0x3ffffff); - h0 = t0; h1 = t1; h2 = t2; h3 = t3; - - uint64_t c0 = h0, c1 = h1, c2 = h2, c3 = h3; - c0 *= r0; c1 *= r0; c2 *= r0; c3 *= r0; - c0 *= r1; c1 *= r1; c2 *= r1; c3 *= r1; - c0 *= r2; c1 *= r2; c2 *= r2; c3 *= r2; - c0 *= r3; c1 *= r3; c2 *= r3; c3 *= r3; - c0 *= r4; c1 *= r4; c2 *= r4; c3 *= r4; - - st->h[0] 
= (uint32_t)(h0 & 0x3ffffff); - st->h[1] = (uint32_t)(h1 & 0x3ffffff); - st->h[2] = (uint32_t)(h2 & 0x3ffffff); - st->h[3] = (uint32_t)(h3 & 0x3ffffff); - st->h[4] = 0; st->left = 0; } + /* Process full 16-byte blocks */ while (len >= 16) { uint32_t hibit = 0x01000000; uint32_t d0 = data[0] | (data[1] << 8) | (data[2] << 16) | ((data[3] | hibit) << 24); uint32_t d1 = (data[4] | (data[5] << 8) | (data[6] << 16) | (data[7] << 24)) & 0x3ffff03; uint32_t d2 = (data[8] | (data[9] << 8) | (data[10] << 16) | (data[11] << 24)) & 0x3ffc0ff; - uint32_t d3 = (data[12] | (data[13] << 8) | (data[14] << 16) | (data[15] << 24)) & 0x3f03fff; + uint32_t d3 = (data[12] | (data[13] << 8) | (data[14] << 16) | ((data[15] | hibit) << 24)) & 0x3f03fff; + st->h[0] += d0; + st->h[1] += d1; + st->h[2] += d2; + st->h[3] += d3; + st->h[4] += 0; + + /* Multiply by r */ uint64_t r0 = st->r[0], r1 = st->r[1], r2 = st->r[2], r3 = st->r[3], r4 = st->r[4]; uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3], h4 = st->h[4]; - h0 += d0; h1 += d1; h2 += d2; h3 += d3; + uint64_t t0 = h0 * r0 + h1 * (r4 * 5) + h2 * (r3 * 5) + h3 * (r2 * 5) + h4 * (r1 * 5); + uint64_t t1 = h0 * r1 + h1 * r0 + h2 * (r4 * 5) + h3 * (r3 * 5) + h4 * (r2 * 5); + uint64_t t2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * (r4 * 5) + h4 * (r3 * 5); + uint64_t t3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * (r4 * 5); + uint64_t t4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0; - uint64_t t0 = h0, t1 = h1, t2 = h2, t3 = h3; - t1 += (t0 >> 26); t0 &= 0x3ffffff; - t2 += (t1 >> 22); t1 &= 0x3ffffff; - t3 += (t2 >> 22); t2 &= 0x3ffffff; - t0 += (t3 >> 22) * 5; t3 &= 0x3ffffff; - - h0 = t0; h1 = t1; h2 = t2; h3 = t3; - - st->h[0] = (uint32_t)(h0 & 0x3ffffff); - st->h[1] = (uint32_t)(h1 & 0x3ffffff); - st->h[2] = (uint32_t)(h2 & 0x3ffffff); - st->h[3] = (uint32_t)(h3 & 0x3ffffff); - st->h[4] = 0; + uint32_t c = (uint32_t)(t0 >> 26); + st->h[0] = (uint32_t)(t0 & 0x3ffffff); + t1 += c; + c = (uint32_t)(t1 >> 22); + st->h[1] = 
(uint32_t)(t1 & 0x3ffffff); + t2 += c; + c = (uint32_t)(t2 >> 26); + st->h[2] = (uint32_t)(t2 & 0x3ffffff); + t3 += c; + c = (uint32_t)(t3 >> 26); + st->h[3] = (uint32_t)(t3 & 0x3ffffff); + t4 += c; + st->h[4] = (uint32_t)(t4 & 0x3ffffff); data += 16; len -= 16; } + /* Save remaining data */ if (len) { memcpy(st->buf + st->left, data, len); st->left += len; @@ -251,47 +270,79 @@ static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t le static void poly1305_final(poly1305_state_t *st, uint8_t mac[16]) { + /* Process remaining bytes */ if (st->left) { uint32_t hibit = 0x01000000; + /* Pad with 0x01 byte after data */ + st->buf[st->left] = 1; + for (size_t i = st->left + 1; i < 16; i++) { + st->buf[i] = 0; + } + uint32_t d0 = st->buf[0] | (st->buf[1] << 8) | (st->buf[2] << 16) | ((st->buf[3] | hibit) << 24); uint32_t d1 = (st->buf[4] | (st->buf[5] << 8) | (st->buf[6] << 16) | (st->buf[7] << 24)) & 0x3ffff03; uint32_t d2 = (st->buf[8] | (st->buf[9] << 8) | (st->buf[10] << 16) | (st->buf[11] << 24)) & 0x3ffc0ff; - uint32_t d3 = (st->buf[12] | (st->buf[13] << 8) | (st->buf[14] << 16) | (st->buf[15] << 24)) & 0x3f03fff; + uint32_t d3 = (st->buf[12] | (st->buf[13] << 8) | (st->buf[14] << 16) | ((st->buf[15] | hibit) << 24)) & 0x3f03fff; - uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3]; - h0 += d0; h1 += d1; h2 += d2; h3 += d3; + st->h[0] += d0; + st->h[1] += d1; + st->h[2] += d2; + st->h[3] += d3; + st->h[4] += 0; - st->h[0] = (uint32_t)(h0 & 0x3ffffff); - st->h[1] = (uint32_t)(h1 & 0x3ffffff); - st->h[2] = (uint32_t)(h2 & 0x3ffffff); - st->h[3] = (uint32_t)(h3 & 0x3ffffff); + /* Multiply by r one last time */ + uint64_t r0 = st->r[0], r1 = st->r[1], r2 = st->r[2], r3 = st->r[3], r4 = st->r[4]; + uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3], h4 = st->h[4]; + + uint64_t t0 = h0 * r0 + h1 * (r4 * 5) + h2 * (r3 * 5) + h3 * (r2 * 5) + h4 * (r1 * 5); + uint64_t t1 = h0 * r1 + h1 * r0 + h2 * (r4 * 5) + h3 * (r3 * 
5) + h4 * (r2 * 5); + uint64_t t2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * (r4 * 5) + h4 * (r3 * 5); + uint64_t t3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * (r4 * 5); + uint64_t t4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0; + + uint32_t c = (uint32_t)(t0 >> 26); + st->h[0] = (uint32_t)(t0 & 0x3ffffff); + t1 += c; + c = (uint32_t)(t1 >> 22); + st->h[1] = (uint32_t)(t1 & 0x3ffffff); + t2 += c; + c = (uint32_t)(t2 >> 26); + st->h[2] = (uint32_t)(t2 & 0x3ffffff); + t3 += c; + c = (uint32_t)(t3 >> 26); + st->h[3] = (uint32_t)(t3 & 0x3ffffff); + t4 += c; + st->h[4] = (uint32_t)(t4 & 0x3ffffff); } + /* Final reduction: add 5 * carry from h[4] */ uint32_t c = st->h[4] + 5; - uint32_t mask = -(c >> 26); - c &= 0x3ffffff; + st->h[4] &= 0x3ffffff; + st->h[0] += (c >> 26); + st->h[1] += (st->h[0] >> 26); + st->h[0] &= 0x3ffffff; + st->h[2] += (st->h[1] >> 22); + st->h[1] &= 0x3ffffff; + st->h[3] += (st->h[2] >> 26); + st->h[2] &= 0x3ffffff; + st->h[4] += (st->h[3] >> 22); + st->h[3] &= 0x3ffffff; - uint32_t h0 = st->h[0] + (c & ~mask); - uint32_t h1 = st->h[1] + ((c >> 26) & ~mask); - uint32_t h2 = st->h[2] + ((c >> 52) & ~mask); - uint32_t h3 = st->h[3]; + /* Add s[0], s[1] */ + uint64_t mac0 = (uint64_t)st->h[0] + st->s[0]; + uint64_t mac1 = (uint64_t)st->h[1] + st->s[1] + (mac0 >> 32); + mac0 &= 0xFFFFFFFF; + mac1 &= 0xFFFFFFFF; - uint32_t s0 = st->s[0]; - uint32_t s1 = st->s[1]; - - uint64_t mac0 = (uint64_t)h0 + s0; - uint64_t mac1 = (uint64_t)h1 + s1 + (mac0 >> 32); - - mac[0] = (uint8_t)mac0; - mac[1] = (uint8_t)(mac0 >> 8); - mac[2] = (uint8_t)(mac0 >> 16); - mac[3] = (uint8_t)(mac0 >> 24); - mac[4] = (uint8_t)mac1; - mac[5] = (uint8_t)(mac1 >> 8); - mac[6] = (uint8_t)(mac1 >> 16); - mac[7] = (uint8_t)(mac1 >> 24); - mac[8] = 0; mac[9] = 0; mac[10] = 0; mac[11] = 0; - mac[12] = 0; mac[13] = 0; mac[14] = 0; mac[15] = 0; + mac[0] = (uint8_t)mac0; + mac[1] = (uint8_t)(mac0 >> 8); + mac[2] = (uint8_t)(mac0 >> 16); + mac[3] = (uint8_t)(mac0 >> 24); + mac[4] = 
(uint8_t)mac1; + mac[5] = (uint8_t)(mac1 >> 8); + mac[6] = (uint8_t)(mac1 >> 16); + mac[7] = (uint8_t)(mac1 >> 24); + for (int i = 8; i < 16; i++) mac[i] = 0; } #else @@ -344,6 +395,7 @@ static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t le data += needed; len -= needed; + /* Add 0x01 byte after 16 bytes */ uint64_t hibit = ((uint64_t)1) << 40; st->h[0] += (uint64_t)st->buf[0] | ((uint64_t)st->buf[1] << 8) | ((uint64_t)st->buf[2] << 16) | ((uint64_t)st->buf[3] << 24); @@ -354,6 +406,31 @@ static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t le st->h[3] += ((uint64_t)st->buf[12] | ((uint64_t)st->buf[13] << 8) | ((uint64_t)st->buf[14] << 16) | ((uint64_t)st->buf[15] << 24)) & 0x3f03fff; st->h[4] += hibit; + + /* Multiply by r */ + uint64_t r0 = st->r[0], r1 = st->r[1], r2 = st->r[2], r3 = st->r[3], r4 = st->r[4]; + uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3], h4 = st->h[4]; + + uint64_t t0 = h0 * r0 + h1 * (r4 * 5) + h2 * (r3 * 5) + h3 * (r2 * 5) + h4 * (r1 * 5); + uint64_t t1 = h0 * r1 + h1 * r0 + h2 * (r4 * 5) + h3 * (r3 * 5) + h4 * (r2 * 5); + uint64_t t2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * (r4 * 5) + h4 * (r3 * 5); + uint64_t t3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * (r4 * 5); + uint64_t t4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0; + + uint32_t c = (uint32_t)(t0 >> 26); + st->h[0] = (uint32_t)(t0 & 0x3ffffff); + t1 += c; + c = (uint32_t)(t1 >> 22); + st->h[1] = (uint32_t)(t1 & 0x3ffffff); + t2 += c; + c = (uint32_t)(t2 >> 26); + st->h[2] = (uint32_t)(t2 & 0x3ffffff); + t3 += c; + c = (uint32_t)(t3 >> 26); + st->h[3] = (uint32_t)(t3 & 0x3ffffff); + t4 += c; + st->h[4] = (uint32_t)(t4 & 0x3ffffff); + st->left = 0; } @@ -369,14 +446,29 @@ static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t le ((uint64_t)data[14] << 16) | ((uint64_t)data[15] << 24)) & 0x3f03fff; st->h[4] += hibit; - for (int i = 0; i < 5; i++) { - uint64_t d = 0; - for (int j = 0; j 
< 5; j++) { - d += st->h[j] * st->r[j]; - } - st->h[i] = d & 0x3ffffff; - if (i < 4) st->h[i+1] += d >> 26; - } + /* Multiply by r */ + uint64_t r0 = st->r[0], r1 = st->r[1], r2 = st->r[2], r3 = st->r[3], r4 = st->r[4]; + uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3], h4 = st->h[4]; + + uint64_t t0 = h0 * r0 + h1 * (r4 * 5) + h2 * (r3 * 5) + h3 * (r2 * 5) + h4 * (r1 * 5); + uint64_t t1 = h0 * r1 + h1 * r0 + h2 * (r4 * 5) + h3 * (r3 * 5) + h4 * (r2 * 5); + uint64_t t2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * (r4 * 5) + h4 * (r3 * 5); + uint64_t t3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * (r4 * 5); + uint64_t t4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0; + + uint32_t c = (uint32_t)(t0 >> 26); + st->h[0] = (uint32_t)(t0 & 0x3ffffff); + t1 += c; + c = (uint32_t)(t1 >> 22); + st->h[1] = (uint32_t)(t1 & 0x3ffffff); + t2 += c; + c = (uint32_t)(t2 >> 26); + st->h[2] = (uint32_t)(t2 & 0x3ffffff); + t3 += c; + c = (uint32_t)(t3 >> 26); + st->h[3] = (uint32_t)(t3 & 0x3ffffff); + t4 += c; + st->h[4] = (uint32_t)(t4 & 0x3ffffff); data += 16; len -= 16; @@ -390,35 +482,65 @@ static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t le static void poly1305_final(poly1305_state_t *st, uint8_t mac[16]) { + /* Process remaining bytes */ if (st->left) { uint64_t hibit = ((uint64_t)1) << (8 * st->left); - st->h[st->left >> 2] += hibit; + st->buf[st->left] = 1; + for (size_t i = st->left + 1; i < 16; i++) { + st->buf[i] = 0; + } + + st->h[0] += (uint64_t)st->buf[0] | ((uint64_t)st->buf[1] << 8) | + ((uint64_t)st->buf[2] << 16) | ((uint64_t)st->buf[3] << 24); + st->h[1] += ((uint64_t)st->buf[4] | ((uint64_t)st->buf[5] << 8) | + ((uint64_t)st->buf[6] << 16) | ((uint64_t)st->buf[7] << 24)) & 0x3ffff03; + st->h[2] += ((uint64_t)st->buf[8] | ((uint64_t)st->buf[9] << 8) | + ((uint64_t)st->buf[10] << 16) | ((uint64_t)st->buf[11] << 24)) & 0x3ffc0ff; + st->h[3] += ((uint64_t)st->buf[12] | ((uint64_t)st->buf[13] << 8) | + 
((uint64_t)st->buf[14] << 16) | ((uint64_t)st->buf[15] << 24)) & 0x3f03fff; + st->h[4] += hibit; + + /* Multiply by r one last time */ + uint64_t r0 = st->r[0], r1 = st->r[1], r2 = st->r[2], r3 = st->r[3], r4 = st->r[4]; + uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3], h4 = st->h[4]; + + uint64_t t0 = h0 * r0 + h1 * (r4 * 5) + h2 * (r3 * 5) + h3 * (r2 * 5) + h4 * (r1 * 5); + uint64_t t1 = h0 * r1 + h1 * r0 + h2 * (r4 * 5) + h3 * (r3 * 5) + h4 * (r2 * 5); + uint64_t t2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * (r4 * 5) + h4 * (r3 * 5); + uint64_t t3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * (r4 * 5); + uint64_t t4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0; + + uint32_t c = (uint32_t)(t0 >> 26); + st->h[0] = (uint32_t)(t0 & 0x3ffffff); + t1 += c; + c = (uint32_t)(t1 >> 22); + st->h[1] = (uint32_t)(t1 & 0x3ffffff); + t2 += c; + c = (uint32_t)(t2 >> 26); + st->h[2] = (uint32_t)(t2 & 0x3ffffff); + t3 += c; + c = (uint32_t)(t3 >> 26); + st->h[3] = (uint32_t)(t3 & 0x3ffffff); + t4 += c; + st->h[4] = (uint32_t)(t4 & 0x3ffffff); } - uint64_t c = st->h[4] >> 26; - st->h[4] &= 0x3ffffff; - for (int i = 0; i < 4; i++) { - st->h[i] += c * 5; - c = st->h[i] >> 26; - st->h[i] &= 0x3ffffff; - } + /* Final reduction */ + uint64_t c = st->h[4] + 5; st->h[4] &= 0x3ffffff; + st->h[0] += (c >> 26); + st->h[1] += (st->h[0] >> 26); + st->h[0] &= 0x3ffffff; + st->h[2] += (st->h[1] >> 22); + st->h[1] &= 0x3ffffff; + st->h[3] += (st->h[2] >> 26); + st->h[2] &= 0x3ffffff; + st->h[4] += (st->h[3] >> 22); + st->h[3] &= 0x3ffffff; - uint64_t g0 = st->h[0] + 5; - uint64_t g1 = st->h[1] + (g0 >> 26); - uint64_t g2 = st->h[2] + (g1 >> 26); - uint64_t g3 = st->h[3] + (g2 >> 26); - uint64_t g4 = st->h[4] + (g3 >> 26) - (1ULL << 26); - - uint64_t mask = -(g4 >> 63); - g0 += st->h[0] & mask; - g1 += st->h[1] & mask; - g2 += st->h[2] & mask; - g3 += st->h[3] & mask; - g4 += st->h[4] & mask; - - uint64_t mac0 = g0 + st->s[0]; - uint64_t mac1 = g1 + st->s[1] + (mac0 >> 32); 
+ /* Add s */ + uint64_t mac0 = st->h[0] + st->s[0]; + uint64_t mac1 = st->h[1] + st->s[1] + (mac0 >> 32); mac0 &= 0xFFFFFFFF; mac1 &= 0xFFFFFFFF;