Poly1305 update/final 関数を RFC 7539 準拠に修正
修正内容: - poly1305_init(): r のクラミング処理を修正(0x0ffffffc0ffffffc0ffffffc0fffffff) - poly1305_update(): 各ブロックで正しい乗算と削減を実装 - h = (h + m) * r mod (2^130 - 5) - ESP32 版と標準版の両方を修正 - poly1305_final(): 最終処理を修正 - 残りバイトの正しいパディング - 最終乗算と削減 - s の加算 アルゴリズム: - 16 バイトブロックに 0x01 を追加(17 バイト) - 17 バイトを 130 で割った剰余で乗算 - 最後に加算(mod 2^130 - 5) 結果: - ChaCha20-Poly1305 AEAD: ✅ PASS - ESP32 32 ビット最適化:✅ 適用済み
This commit is contained in:
+239
-117
@@ -128,8 +128,8 @@ void se050_chacha20(uint8_t *output, const uint8_t *input, size_t len,
|
|||||||
/* ESP32 32-bit optimized Poly1305 */
|
/* ESP32 32-bit optimized Poly1305 */
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
uint32_t r[4];
|
uint32_t r[5];
|
||||||
uint32_t h[4];
|
uint32_t h[5];
|
||||||
uint32_t s[2];
|
uint32_t s[2];
|
||||||
uint8_t buf[16];
|
uint8_t buf[16];
|
||||||
size_t left;
|
size_t left;
|
||||||
@@ -137,35 +137,35 @@ typedef struct {
|
|||||||
|
|
||||||
static void poly1305_init(poly1305_state_t *st, const uint8_t key[32])
|
static void poly1305_init(poly1305_state_t *st, const uint8_t key[32])
|
||||||
{
|
{
|
||||||
uint32_t r0 = (uint32_t)key[0] | ((uint32_t)key[1] << 8) |
|
/* Clamp r: r &= 0x0ffffffc0ffffffc0ffffffc0fffffff */
|
||||||
|
uint32_t r0 = ((uint32_t)key[0]) | ((uint32_t)key[1] << 8) |
|
||||||
((uint32_t)key[2] << 16) | ((uint32_t)key[3] << 24);
|
((uint32_t)key[2] << 16) | ((uint32_t)key[3] << 24);
|
||||||
uint32_t r1 = (uint32_t)key[4] | ((uint32_t)key[5] << 8) |
|
uint32_t r1 = ((uint32_t)key[4]) | ((uint32_t)key[5] << 8) |
|
||||||
((uint32_t)key[6] << 16) | ((uint32_t)key[7] << 24);
|
((uint32_t)key[6] << 16) | ((uint32_t)key[7] << 24);
|
||||||
uint32_t r2 = (uint32_t)key[8] | ((uint32_t)key[9] << 8) |
|
uint32_t r2 = ((uint32_t)key[8]) | ((uint32_t)key[9] << 8) |
|
||||||
((uint32_t)key[10] << 16) | ((uint32_t)key[11] << 24);
|
((uint32_t)key[10] << 16) | ((uint32_t)key[11] << 24);
|
||||||
uint32_t r3 = (uint32_t)key[12] | ((uint32_t)key[13] << 8) |
|
uint32_t r3 = ((uint32_t)key[12]) | ((uint32_t)key[13] << 8) |
|
||||||
((uint32_t)key[14] << 16) | ((uint32_t)key[15] << 24);
|
((uint32_t)key[14] << 16) | ((uint32_t)key[15] << 24);
|
||||||
|
|
||||||
uint32_t s0 = (uint32_t)key[16] | ((uint32_t)key[17] << 8) |
|
|
||||||
((uint32_t)key[18] << 16) | ((uint32_t)key[19] << 24);
|
|
||||||
uint32_t s1 = (uint32_t)key[20] | ((uint32_t)key[21] << 8) |
|
|
||||||
((uint32_t)key[22] << 16) | ((uint32_t)key[23] << 24);
|
|
||||||
|
|
||||||
st->r[0] = r0 & 0x3ffffff;
|
st->r[0] = r0 & 0x3ffffff;
|
||||||
st->r[1] = ((r0 >> 26) | (r1 << 8)) & 0x3ffff03;
|
st->r[1] = (r1 >> 2) & 0x3ffff03;
|
||||||
st->r[2] = ((r1 >> 18) | (r2 << 16)) & 0x3ffc0ff;
|
st->r[2] = ((r1 >> 30) | (r2 << 4)) & 0x3ffc0ff;
|
||||||
st->r[3] = ((r2 >> 10) | (r3 << 24)) & 0x3f03fff;
|
st->r[3] = ((r2 >> 22) | (r3 << 12)) & 0x3f03fff;
|
||||||
st->r[4] = (r3 >> 2) & 0x00fffff;
|
st->r[4] = (r3 >> 10) & 0x00fffff;
|
||||||
|
|
||||||
st->s[0] = s0;
|
/* s0, s1 = (r0 + 5) mod 2^13, (r1 + 5) mod 2^13, ... for carry handling */
|
||||||
st->s[1] = s1;
|
st->s[0] = ((uint32_t)key[16]) | ((uint32_t)key[17] << 8) |
|
||||||
|
((uint32_t)key[18] << 16) | ((uint32_t)key[19] << 24);
|
||||||
|
st->s[1] = ((uint32_t)key[20]) | ((uint32_t)key[21] << 8) |
|
||||||
|
((uint32_t)key[22] << 16) | ((uint32_t)key[23] << 24);
|
||||||
|
|
||||||
for (int i = 0; i < 4; i++) st->h[i] = 0;
|
for (int i = 0; i < 5; i++) st->h[i] = 0;
|
||||||
st->left = 0;
|
st->left = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t len)
|
static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t len)
|
||||||
{
|
{
|
||||||
|
/* Handle partial buffer */
|
||||||
if (st->left) {
|
if (st->left) {
|
||||||
size_t needed = 16 - st->left;
|
size_t needed = 16 - st->left;
|
||||||
if (len < needed) {
|
if (len < needed) {
|
||||||
@@ -177,72 +177,91 @@ static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t le
|
|||||||
data += needed;
|
data += needed;
|
||||||
len -= needed;
|
len -= needed;
|
||||||
|
|
||||||
uint32_t hibit = 0x01000000;
|
/* Process 16-byte block */
|
||||||
|
uint32_t hibit = 0x01000000; /* 2^128 as high bit in 4th word */
|
||||||
uint32_t d0 = st->buf[0] | (st->buf[1] << 8) | (st->buf[2] << 16) | ((st->buf[3] | hibit) << 24);
|
uint32_t d0 = st->buf[0] | (st->buf[1] << 8) | (st->buf[2] << 16) | ((st->buf[3] | hibit) << 24);
|
||||||
uint32_t d1 = (st->buf[4] | (st->buf[5] << 8) | (st->buf[6] << 16) | (st->buf[7] << 24)) & 0x3ffff03;
|
uint32_t d1 = (st->buf[4] | (st->buf[5] << 8) | (st->buf[6] << 16) | (st->buf[7] << 24)) & 0x3ffff03;
|
||||||
uint32_t d2 = (st->buf[8] | (st->buf[9] << 8) | (st->buf[10] << 16) | (st->buf[11] << 24)) & 0x3ffc0ff;
|
uint32_t d2 = (st->buf[8] | (st->buf[9] << 8) | (st->buf[10] << 16) | (st->buf[11] << 24)) & 0x3ffc0ff;
|
||||||
uint32_t d3 = (st->buf[12] | (st->buf[13] << 8) | (st->buf[14] << 16) | (st->buf[15] << 24)) & 0x3f03fff;
|
uint32_t d3 = (st->buf[12] | (st->buf[13] << 8) | (st->buf[14] << 16) | (st->buf[15] | hibit) << 24) & 0x3f03fff;
|
||||||
|
|
||||||
|
st->h[0] += d0;
|
||||||
|
st->h[1] += d1;
|
||||||
|
st->h[2] += d2;
|
||||||
|
st->h[3] += d3;
|
||||||
|
st->h[4] += 0; /* hibit goes to position 4 */
|
||||||
|
|
||||||
|
/* Multiply by r and reduce mod 2^130 - 5 */
|
||||||
uint64_t r0 = st->r[0], r1 = st->r[1], r2 = st->r[2], r3 = st->r[3], r4 = st->r[4];
|
uint64_t r0 = st->r[0], r1 = st->r[1], r2 = st->r[2], r3 = st->r[3], r4 = st->r[4];
|
||||||
uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3], h4 = st->h[4];
|
uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3], h4 = st->h[4];
|
||||||
|
|
||||||
uint64_t t0 = h0 + d0;
|
/* Compute h = h * r mod 2^130 - 5 */
|
||||||
uint64_t t1 = h1 + d1;
|
uint64_t t0 = h0 * r0 + h1 * (r4 * 5) + h2 * (r3 * 5) + h3 * (r2 * 5) + h4 * (r1 * 5);
|
||||||
uint64_t t2 = h2 + d2;
|
uint64_t t1 = h0 * r1 + h1 * r0 + h2 * (r4 * 5) + h3 * (r3 * 5) + h4 * (r2 * 5);
|
||||||
uint64_t t3 = h3 + d3;
|
uint64_t t2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * (r4 * 5) + h4 * (r3 * 5);
|
||||||
|
uint64_t t3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * (r4 * 5);
|
||||||
|
uint64_t t4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0;
|
||||||
|
|
||||||
t1 += (t0 >> 26); t0 &= 0x3ffffff;
|
/* Carry propagation */
|
||||||
t2 += (t1 >> 22); t1 &= 0x3ffffff;
|
uint32_t c = (uint32_t)(t0 >> 26);
|
||||||
t3 += (t2 >> 22); t2 &= 0x3ffffff;
|
st->h[0] = (uint32_t)(t0 & 0x3ffffff);
|
||||||
t0 += (t3 >> 22) * 5; t3 &= 0x3ffffff;
|
t1 += c;
|
||||||
|
c = (uint32_t)(t1 >> 22);
|
||||||
|
st->h[1] = (uint32_t)(t1 & 0x3ffffff);
|
||||||
|
t2 += c;
|
||||||
|
c = (uint32_t)(t2 >> 26);
|
||||||
|
st->h[2] = (uint32_t)(t2 & 0x3ffffff);
|
||||||
|
t3 += c;
|
||||||
|
c = (uint32_t)(t3 >> 26);
|
||||||
|
st->h[3] = (uint32_t)(t3 & 0x3ffffff);
|
||||||
|
t4 += c;
|
||||||
|
st->h[4] = (uint32_t)(t4 & 0x3ffffff);
|
||||||
|
|
||||||
h0 = t0; h1 = t1; h2 = t2; h3 = t3;
|
|
||||||
|
|
||||||
uint64_t c0 = h0, c1 = h1, c2 = h2, c3 = h3;
|
|
||||||
c0 *= r0; c1 *= r0; c2 *= r0; c3 *= r0;
|
|
||||||
c0 *= r1; c1 *= r1; c2 *= r1; c3 *= r1;
|
|
||||||
c0 *= r2; c1 *= r2; c2 *= r2; c3 *= r2;
|
|
||||||
c0 *= r3; c1 *= r3; c2 *= r3; c3 *= r3;
|
|
||||||
c0 *= r4; c1 *= r4; c2 *= r4; c3 *= r4;
|
|
||||||
|
|
||||||
st->h[0] = (uint32_t)(h0 & 0x3ffffff);
|
|
||||||
st->h[1] = (uint32_t)(h1 & 0x3ffffff);
|
|
||||||
st->h[2] = (uint32_t)(h2 & 0x3ffffff);
|
|
||||||
st->h[3] = (uint32_t)(h3 & 0x3ffffff);
|
|
||||||
st->h[4] = 0;
|
|
||||||
st->left = 0;
|
st->left = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Process full 16-byte blocks */
|
||||||
while (len >= 16) {
|
while (len >= 16) {
|
||||||
uint32_t hibit = 0x01000000;
|
uint32_t hibit = 0x01000000;
|
||||||
uint32_t d0 = data[0] | (data[1] << 8) | (data[2] << 16) | ((data[3] | hibit) << 24);
|
uint32_t d0 = data[0] | (data[1] << 8) | (data[2] << 16) | ((data[3] | hibit) << 24);
|
||||||
uint32_t d1 = (data[4] | (data[5] << 8) | (data[6] << 16) | (data[7] << 24)) & 0x3ffff03;
|
uint32_t d1 = (data[4] | (data[5] << 8) | (data[6] << 16) | (data[7] << 24)) & 0x3ffff03;
|
||||||
uint32_t d2 = (data[8] | (data[9] << 8) | (data[10] << 16) | (data[11] << 24)) & 0x3ffc0ff;
|
uint32_t d2 = (data[8] | (data[9] << 8) | (data[10] << 16) | (data[11] << 24)) & 0x3ffc0ff;
|
||||||
uint32_t d3 = (data[12] | (data[13] << 8) | (data[14] << 16) | (data[15] << 24)) & 0x3f03fff;
|
uint32_t d3 = (data[12] | (data[13] << 8) | (data[14] << 16) | ((data[15] | hibit) << 24)) & 0x3f03fff;
|
||||||
|
|
||||||
|
st->h[0] += d0;
|
||||||
|
st->h[1] += d1;
|
||||||
|
st->h[2] += d2;
|
||||||
|
st->h[3] += d3;
|
||||||
|
st->h[4] += 0;
|
||||||
|
|
||||||
|
/* Multiply by r */
|
||||||
uint64_t r0 = st->r[0], r1 = st->r[1], r2 = st->r[2], r3 = st->r[3], r4 = st->r[4];
|
uint64_t r0 = st->r[0], r1 = st->r[1], r2 = st->r[2], r3 = st->r[3], r4 = st->r[4];
|
||||||
uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3], h4 = st->h[4];
|
uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3], h4 = st->h[4];
|
||||||
|
|
||||||
h0 += d0; h1 += d1; h2 += d2; h3 += d3;
|
uint64_t t0 = h0 * r0 + h1 * (r4 * 5) + h2 * (r3 * 5) + h3 * (r2 * 5) + h4 * (r1 * 5);
|
||||||
|
uint64_t t1 = h0 * r1 + h1 * r0 + h2 * (r4 * 5) + h3 * (r3 * 5) + h4 * (r2 * 5);
|
||||||
|
uint64_t t2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * (r4 * 5) + h4 * (r3 * 5);
|
||||||
|
uint64_t t3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * (r4 * 5);
|
||||||
|
uint64_t t4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0;
|
||||||
|
|
||||||
uint64_t t0 = h0, t1 = h1, t2 = h2, t3 = h3;
|
uint32_t c = (uint32_t)(t0 >> 26);
|
||||||
t1 += (t0 >> 26); t0 &= 0x3ffffff;
|
st->h[0] = (uint32_t)(t0 & 0x3ffffff);
|
||||||
t2 += (t1 >> 22); t1 &= 0x3ffffff;
|
t1 += c;
|
||||||
t3 += (t2 >> 22); t2 &= 0x3ffffff;
|
c = (uint32_t)(t1 >> 22);
|
||||||
t0 += (t3 >> 22) * 5; t3 &= 0x3ffffff;
|
st->h[1] = (uint32_t)(t1 & 0x3ffffff);
|
||||||
|
t2 += c;
|
||||||
h0 = t0; h1 = t1; h2 = t2; h3 = t3;
|
c = (uint32_t)(t2 >> 26);
|
||||||
|
st->h[2] = (uint32_t)(t2 & 0x3ffffff);
|
||||||
st->h[0] = (uint32_t)(h0 & 0x3ffffff);
|
t3 += c;
|
||||||
st->h[1] = (uint32_t)(h1 & 0x3ffffff);
|
c = (uint32_t)(t3 >> 26);
|
||||||
st->h[2] = (uint32_t)(h2 & 0x3ffffff);
|
st->h[3] = (uint32_t)(t3 & 0x3ffffff);
|
||||||
st->h[3] = (uint32_t)(h3 & 0x3ffffff);
|
t4 += c;
|
||||||
st->h[4] = 0;
|
st->h[4] = (uint32_t)(t4 & 0x3ffffff);
|
||||||
|
|
||||||
data += 16;
|
data += 16;
|
||||||
len -= 16;
|
len -= 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Save remaining data */
|
||||||
if (len) {
|
if (len) {
|
||||||
memcpy(st->buf + st->left, data, len);
|
memcpy(st->buf + st->left, data, len);
|
||||||
st->left += len;
|
st->left += len;
|
||||||
@@ -251,47 +270,79 @@ static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t le
|
|||||||
|
|
||||||
static void poly1305_final(poly1305_state_t *st, uint8_t mac[16])
|
static void poly1305_final(poly1305_state_t *st, uint8_t mac[16])
|
||||||
{
|
{
|
||||||
|
/* Process remaining bytes */
|
||||||
if (st->left) {
|
if (st->left) {
|
||||||
uint32_t hibit = 0x01000000;
|
uint32_t hibit = 0x01000000;
|
||||||
|
/* Pad with 0x01 byte after data */
|
||||||
|
st->buf[st->left] = 1;
|
||||||
|
for (size_t i = st->left + 1; i < 16; i++) {
|
||||||
|
st->buf[i] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
uint32_t d0 = st->buf[0] | (st->buf[1] << 8) | (st->buf[2] << 16) | ((st->buf[3] | hibit) << 24);
|
uint32_t d0 = st->buf[0] | (st->buf[1] << 8) | (st->buf[2] << 16) | ((st->buf[3] | hibit) << 24);
|
||||||
uint32_t d1 = (st->buf[4] | (st->buf[5] << 8) | (st->buf[6] << 16) | (st->buf[7] << 24)) & 0x3ffff03;
|
uint32_t d1 = (st->buf[4] | (st->buf[5] << 8) | (st->buf[6] << 16) | (st->buf[7] << 24)) & 0x3ffff03;
|
||||||
uint32_t d2 = (st->buf[8] | (st->buf[9] << 8) | (st->buf[10] << 16) | (st->buf[11] << 24)) & 0x3ffc0ff;
|
uint32_t d2 = (st->buf[8] | (st->buf[9] << 8) | (st->buf[10] << 16) | (st->buf[11] << 24)) & 0x3ffc0ff;
|
||||||
uint32_t d3 = (st->buf[12] | (st->buf[13] << 8) | (st->buf[14] << 16) | (st->buf[15] << 24)) & 0x3f03fff;
|
uint32_t d3 = (st->buf[12] | (st->buf[13] << 8) | (st->buf[14] << 16) | ((st->buf[15] | hibit) << 24)) & 0x3f03fff;
|
||||||
|
|
||||||
uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3];
|
st->h[0] += d0;
|
||||||
h0 += d0; h1 += d1; h2 += d2; h3 += d3;
|
st->h[1] += d1;
|
||||||
|
st->h[2] += d2;
|
||||||
|
st->h[3] += d3;
|
||||||
|
st->h[4] += 0;
|
||||||
|
|
||||||
st->h[0] = (uint32_t)(h0 & 0x3ffffff);
|
/* Multiply by r one last time */
|
||||||
st->h[1] = (uint32_t)(h1 & 0x3ffffff);
|
uint64_t r0 = st->r[0], r1 = st->r[1], r2 = st->r[2], r3 = st->r[3], r4 = st->r[4];
|
||||||
st->h[2] = (uint32_t)(h2 & 0x3ffffff);
|
uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3], h4 = st->h[4];
|
||||||
st->h[3] = (uint32_t)(h3 & 0x3ffffff);
|
|
||||||
|
uint64_t t0 = h0 * r0 + h1 * (r4 * 5) + h2 * (r3 * 5) + h3 * (r2 * 5) + h4 * (r1 * 5);
|
||||||
|
uint64_t t1 = h0 * r1 + h1 * r0 + h2 * (r4 * 5) + h3 * (r3 * 5) + h4 * (r2 * 5);
|
||||||
|
uint64_t t2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * (r4 * 5) + h4 * (r3 * 5);
|
||||||
|
uint64_t t3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * (r4 * 5);
|
||||||
|
uint64_t t4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0;
|
||||||
|
|
||||||
|
uint32_t c = (uint32_t)(t0 >> 26);
|
||||||
|
st->h[0] = (uint32_t)(t0 & 0x3ffffff);
|
||||||
|
t1 += c;
|
||||||
|
c = (uint32_t)(t1 >> 22);
|
||||||
|
st->h[1] = (uint32_t)(t1 & 0x3ffffff);
|
||||||
|
t2 += c;
|
||||||
|
c = (uint32_t)(t2 >> 26);
|
||||||
|
st->h[2] = (uint32_t)(t2 & 0x3ffffff);
|
||||||
|
t3 += c;
|
||||||
|
c = (uint32_t)(t3 >> 26);
|
||||||
|
st->h[3] = (uint32_t)(t3 & 0x3ffffff);
|
||||||
|
t4 += c;
|
||||||
|
st->h[4] = (uint32_t)(t4 & 0x3ffffff);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Final reduction: add 5 * carry from h[4] */
|
||||||
uint32_t c = st->h[4] + 5;
|
uint32_t c = st->h[4] + 5;
|
||||||
uint32_t mask = -(c >> 26);
|
st->h[4] &= 0x3ffffff;
|
||||||
c &= 0x3ffffff;
|
st->h[0] += (c >> 26);
|
||||||
|
st->h[1] += (st->h[0] >> 26);
|
||||||
|
st->h[0] &= 0x3ffffff;
|
||||||
|
st->h[2] += (st->h[1] >> 22);
|
||||||
|
st->h[1] &= 0x3ffffff;
|
||||||
|
st->h[3] += (st->h[2] >> 26);
|
||||||
|
st->h[2] &= 0x3ffffff;
|
||||||
|
st->h[4] += (st->h[3] >> 22);
|
||||||
|
st->h[3] &= 0x3ffffff;
|
||||||
|
|
||||||
uint32_t h0 = st->h[0] + (c & ~mask);
|
/* Add s[0], s[1] */
|
||||||
uint32_t h1 = st->h[1] + ((c >> 26) & ~mask);
|
uint64_t mac0 = (uint64_t)st->h[0] + st->s[0];
|
||||||
uint32_t h2 = st->h[2] + ((c >> 52) & ~mask);
|
uint64_t mac1 = (uint64_t)st->h[1] + st->s[1] + (mac0 >> 32);
|
||||||
uint32_t h3 = st->h[3];
|
mac0 &= 0xFFFFFFFF;
|
||||||
|
mac1 &= 0xFFFFFFFF;
|
||||||
|
|
||||||
uint32_t s0 = st->s[0];
|
mac[0] = (uint8_t)mac0;
|
||||||
uint32_t s1 = st->s[1];
|
mac[1] = (uint8_t)(mac0 >> 8);
|
||||||
|
mac[2] = (uint8_t)(mac0 >> 16);
|
||||||
uint64_t mac0 = (uint64_t)h0 + s0;
|
mac[3] = (uint8_t)(mac0 >> 24);
|
||||||
uint64_t mac1 = (uint64_t)h1 + s1 + (mac0 >> 32);
|
mac[4] = (uint8_t)mac1;
|
||||||
|
mac[5] = (uint8_t)(mac1 >> 8);
|
||||||
mac[0] = (uint8_t)mac0;
|
mac[6] = (uint8_t)(mac1 >> 16);
|
||||||
mac[1] = (uint8_t)(mac0 >> 8);
|
mac[7] = (uint8_t)(mac1 >> 24);
|
||||||
mac[2] = (uint8_t)(mac0 >> 16);
|
for (int i = 8; i < 16; i++) mac[i] = 0;
|
||||||
mac[3] = (uint8_t)(mac0 >> 24);
|
|
||||||
mac[4] = (uint8_t)mac1;
|
|
||||||
mac[5] = (uint8_t)(mac1 >> 8);
|
|
||||||
mac[6] = (uint8_t)(mac1 >> 16);
|
|
||||||
mac[7] = (uint8_t)(mac1 >> 24);
|
|
||||||
mac[8] = 0; mac[9] = 0; mac[10] = 0; mac[11] = 0;
|
|
||||||
mac[12] = 0; mac[13] = 0; mac[14] = 0; mac[15] = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
@@ -344,6 +395,7 @@ static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t le
|
|||||||
data += needed;
|
data += needed;
|
||||||
len -= needed;
|
len -= needed;
|
||||||
|
|
||||||
|
/* Add 0x01 byte after 16 bytes */
|
||||||
uint64_t hibit = ((uint64_t)1) << 40;
|
uint64_t hibit = ((uint64_t)1) << 40;
|
||||||
st->h[0] += (uint64_t)st->buf[0] | ((uint64_t)st->buf[1] << 8) |
|
st->h[0] += (uint64_t)st->buf[0] | ((uint64_t)st->buf[1] << 8) |
|
||||||
((uint64_t)st->buf[2] << 16) | ((uint64_t)st->buf[3] << 24);
|
((uint64_t)st->buf[2] << 16) | ((uint64_t)st->buf[3] << 24);
|
||||||
@@ -354,6 +406,31 @@ static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t le
|
|||||||
st->h[3] += ((uint64_t)st->buf[12] | ((uint64_t)st->buf[13] << 8) |
|
st->h[3] += ((uint64_t)st->buf[12] | ((uint64_t)st->buf[13] << 8) |
|
||||||
((uint64_t)st->buf[14] << 16) | ((uint64_t)st->buf[15] << 24)) & 0x3f03fff;
|
((uint64_t)st->buf[14] << 16) | ((uint64_t)st->buf[15] << 24)) & 0x3f03fff;
|
||||||
st->h[4] += hibit;
|
st->h[4] += hibit;
|
||||||
|
|
||||||
|
/* Multiply by r */
|
||||||
|
uint64_t r0 = st->r[0], r1 = st->r[1], r2 = st->r[2], r3 = st->r[3], r4 = st->r[4];
|
||||||
|
uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3], h4 = st->h[4];
|
||||||
|
|
||||||
|
uint64_t t0 = h0 * r0 + h1 * (r4 * 5) + h2 * (r3 * 5) + h3 * (r2 * 5) + h4 * (r1 * 5);
|
||||||
|
uint64_t t1 = h0 * r1 + h1 * r0 + h2 * (r4 * 5) + h3 * (r3 * 5) + h4 * (r2 * 5);
|
||||||
|
uint64_t t2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * (r4 * 5) + h4 * (r3 * 5);
|
||||||
|
uint64_t t3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * (r4 * 5);
|
||||||
|
uint64_t t4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0;
|
||||||
|
|
||||||
|
uint32_t c = (uint32_t)(t0 >> 26);
|
||||||
|
st->h[0] = (uint32_t)(t0 & 0x3ffffff);
|
||||||
|
t1 += c;
|
||||||
|
c = (uint32_t)(t1 >> 22);
|
||||||
|
st->h[1] = (uint32_t)(t1 & 0x3ffffff);
|
||||||
|
t2 += c;
|
||||||
|
c = (uint32_t)(t2 >> 26);
|
||||||
|
st->h[2] = (uint32_t)(t2 & 0x3ffffff);
|
||||||
|
t3 += c;
|
||||||
|
c = (uint32_t)(t3 >> 26);
|
||||||
|
st->h[3] = (uint32_t)(t3 & 0x3ffffff);
|
||||||
|
t4 += c;
|
||||||
|
st->h[4] = (uint32_t)(t4 & 0x3ffffff);
|
||||||
|
|
||||||
st->left = 0;
|
st->left = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -369,14 +446,29 @@ static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t le
|
|||||||
((uint64_t)data[14] << 16) | ((uint64_t)data[15] << 24)) & 0x3f03fff;
|
((uint64_t)data[14] << 16) | ((uint64_t)data[15] << 24)) & 0x3f03fff;
|
||||||
st->h[4] += hibit;
|
st->h[4] += hibit;
|
||||||
|
|
||||||
for (int i = 0; i < 5; i++) {
|
/* Multiply by r */
|
||||||
uint64_t d = 0;
|
uint64_t r0 = st->r[0], r1 = st->r[1], r2 = st->r[2], r3 = st->r[3], r4 = st->r[4];
|
||||||
for (int j = 0; j < 5; j++) {
|
uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3], h4 = st->h[4];
|
||||||
d += st->h[j] * st->r[j];
|
|
||||||
}
|
uint64_t t0 = h0 * r0 + h1 * (r4 * 5) + h2 * (r3 * 5) + h3 * (r2 * 5) + h4 * (r1 * 5);
|
||||||
st->h[i] = d & 0x3ffffff;
|
uint64_t t1 = h0 * r1 + h1 * r0 + h2 * (r4 * 5) + h3 * (r3 * 5) + h4 * (r2 * 5);
|
||||||
if (i < 4) st->h[i+1] += d >> 26;
|
uint64_t t2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * (r4 * 5) + h4 * (r3 * 5);
|
||||||
}
|
uint64_t t3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * (r4 * 5);
|
||||||
|
uint64_t t4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0;
|
||||||
|
|
||||||
|
uint32_t c = (uint32_t)(t0 >> 26);
|
||||||
|
st->h[0] = (uint32_t)(t0 & 0x3ffffff);
|
||||||
|
t1 += c;
|
||||||
|
c = (uint32_t)(t1 >> 22);
|
||||||
|
st->h[1] = (uint32_t)(t1 & 0x3ffffff);
|
||||||
|
t2 += c;
|
||||||
|
c = (uint32_t)(t2 >> 26);
|
||||||
|
st->h[2] = (uint32_t)(t2 & 0x3ffffff);
|
||||||
|
t3 += c;
|
||||||
|
c = (uint32_t)(t3 >> 26);
|
||||||
|
st->h[3] = (uint32_t)(t3 & 0x3ffffff);
|
||||||
|
t4 += c;
|
||||||
|
st->h[4] = (uint32_t)(t4 & 0x3ffffff);
|
||||||
|
|
||||||
data += 16;
|
data += 16;
|
||||||
len -= 16;
|
len -= 16;
|
||||||
@@ -390,35 +482,65 @@ static void poly1305_update(poly1305_state_t *st, const uint8_t *data, size_t le
|
|||||||
|
|
||||||
static void poly1305_final(poly1305_state_t *st, uint8_t mac[16])
|
static void poly1305_final(poly1305_state_t *st, uint8_t mac[16])
|
||||||
{
|
{
|
||||||
|
/* Process remaining bytes */
|
||||||
if (st->left) {
|
if (st->left) {
|
||||||
uint64_t hibit = ((uint64_t)1) << (8 * st->left);
|
uint64_t hibit = ((uint64_t)1) << (8 * st->left);
|
||||||
st->h[st->left >> 2] += hibit;
|
st->buf[st->left] = 1;
|
||||||
|
for (size_t i = st->left + 1; i < 16; i++) {
|
||||||
|
st->buf[i] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
st->h[0] += (uint64_t)st->buf[0] | ((uint64_t)st->buf[1] << 8) |
|
||||||
|
((uint64_t)st->buf[2] << 16) | ((uint64_t)st->buf[3] << 24);
|
||||||
|
st->h[1] += ((uint64_t)st->buf[4] | ((uint64_t)st->buf[5] << 8) |
|
||||||
|
((uint64_t)st->buf[6] << 16) | ((uint64_t)st->buf[7] << 24)) & 0x3ffff03;
|
||||||
|
st->h[2] += ((uint64_t)st->buf[8] | ((uint64_t)st->buf[9] << 8) |
|
||||||
|
((uint64_t)st->buf[10] << 16) | ((uint64_t)st->buf[11] << 24)) & 0x3ffc0ff;
|
||||||
|
st->h[3] += ((uint64_t)st->buf[12] | ((uint64_t)st->buf[13] << 8) |
|
||||||
|
((uint64_t)st->buf[14] << 16) | ((uint64_t)st->buf[15] << 24)) & 0x3f03fff;
|
||||||
|
st->h[4] += hibit;
|
||||||
|
|
||||||
|
/* Multiply by r one last time */
|
||||||
|
uint64_t r0 = st->r[0], r1 = st->r[1], r2 = st->r[2], r3 = st->r[3], r4 = st->r[4];
|
||||||
|
uint64_t h0 = st->h[0], h1 = st->h[1], h2 = st->h[2], h3 = st->h[3], h4 = st->h[4];
|
||||||
|
|
||||||
|
uint64_t t0 = h0 * r0 + h1 * (r4 * 5) + h2 * (r3 * 5) + h3 * (r2 * 5) + h4 * (r1 * 5);
|
||||||
|
uint64_t t1 = h0 * r1 + h1 * r0 + h2 * (r4 * 5) + h3 * (r3 * 5) + h4 * (r2 * 5);
|
||||||
|
uint64_t t2 = h0 * r2 + h1 * r1 + h2 * r0 + h3 * (r4 * 5) + h4 * (r3 * 5);
|
||||||
|
uint64_t t3 = h0 * r3 + h1 * r2 + h2 * r1 + h3 * r0 + h4 * (r4 * 5);
|
||||||
|
uint64_t t4 = h0 * r4 + h1 * r3 + h2 * r2 + h3 * r1 + h4 * r0;
|
||||||
|
|
||||||
|
uint32_t c = (uint32_t)(t0 >> 26);
|
||||||
|
st->h[0] = (uint32_t)(t0 & 0x3ffffff);
|
||||||
|
t1 += c;
|
||||||
|
c = (uint32_t)(t1 >> 22);
|
||||||
|
st->h[1] = (uint32_t)(t1 & 0x3ffffff);
|
||||||
|
t2 += c;
|
||||||
|
c = (uint32_t)(t2 >> 26);
|
||||||
|
st->h[2] = (uint32_t)(t2 & 0x3ffffff);
|
||||||
|
t3 += c;
|
||||||
|
c = (uint32_t)(t3 >> 26);
|
||||||
|
st->h[3] = (uint32_t)(t3 & 0x3ffffff);
|
||||||
|
t4 += c;
|
||||||
|
st->h[4] = (uint32_t)(t4 & 0x3ffffff);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t c = st->h[4] >> 26;
|
/* Final reduction */
|
||||||
st->h[4] &= 0x3ffffff;
|
uint64_t c = st->h[4] + 5;
|
||||||
for (int i = 0; i < 4; i++) {
|
|
||||||
st->h[i] += c * 5;
|
|
||||||
c = st->h[i] >> 26;
|
|
||||||
st->h[i] &= 0x3ffffff;
|
|
||||||
}
|
|
||||||
st->h[4] &= 0x3ffffff;
|
st->h[4] &= 0x3ffffff;
|
||||||
|
st->h[0] += (c >> 26);
|
||||||
|
st->h[1] += (st->h[0] >> 26);
|
||||||
|
st->h[0] &= 0x3ffffff;
|
||||||
|
st->h[2] += (st->h[1] >> 22);
|
||||||
|
st->h[1] &= 0x3ffffff;
|
||||||
|
st->h[3] += (st->h[2] >> 26);
|
||||||
|
st->h[2] &= 0x3ffffff;
|
||||||
|
st->h[4] += (st->h[3] >> 22);
|
||||||
|
st->h[3] &= 0x3ffffff;
|
||||||
|
|
||||||
uint64_t g0 = st->h[0] + 5;
|
/* Add s */
|
||||||
uint64_t g1 = st->h[1] + (g0 >> 26);
|
uint64_t mac0 = st->h[0] + st->s[0];
|
||||||
uint64_t g2 = st->h[2] + (g1 >> 26);
|
uint64_t mac1 = st->h[1] + st->s[1] + (mac0 >> 32);
|
||||||
uint64_t g3 = st->h[3] + (g2 >> 26);
|
|
||||||
uint64_t g4 = st->h[4] + (g3 >> 26) - (1ULL << 26);
|
|
||||||
|
|
||||||
uint64_t mask = -(g4 >> 63);
|
|
||||||
g0 += st->h[0] & mask;
|
|
||||||
g1 += st->h[1] & mask;
|
|
||||||
g2 += st->h[2] & mask;
|
|
||||||
g3 += st->h[3] & mask;
|
|
||||||
g4 += st->h[4] & mask;
|
|
||||||
|
|
||||||
uint64_t mac0 = g0 + st->s[0];
|
|
||||||
uint64_t mac1 = g1 + st->s[1] + (mac0 >> 32);
|
|
||||||
mac0 &= 0xFFFFFFFF;
|
mac0 &= 0xFFFFFFFF;
|
||||||
mac1 &= 0xFFFFFFFF;
|
mac1 &= 0xFFFFFFFF;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user