Files
se050-wireguard/src/se050_x25519_sw.c
T
km a8d28882c7 Add ESP32 support with 128-bit arithmetic emulation
- Detect ESP32 platform using ESP_PLATFORM and __XTENSA__ macros
- Implement 128-bit multiplication and addition using 64-bit arithmetic
- Wrap fe_mul(), fe_sq(), and fe_mul_small() with ESP32-specific code paths
- Standard platforms use native unsigned __int128 (faster)
- ESP32 uses 128-bit emulation (compatible with 32-bit architecture)
2026-03-28 07:40:38 +09:00

684 lines
22 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* @file se050_x25519_sw.c
* @brief Software X25519 ECDH Implementation (Clean-room RFC7748)
* Based on RFC 7748 reference implementation with 5×51-bit limbs
* License: MIT (Clean-room implementation)
*/
#include "se050_x25519_sw.h"
#include "se050_crypto_utils.h"
#include <string.h>
/* =========================================================================
* Platform detection
* ========================================================================= */
/*
 * SE050_X25519_ESP32 selects the 128-bit arithmetic backend:
 *   0 - native `unsigned __int128`
 *   1 - emulation built from 64-bit operations
 * Emulation is forced on ESP-IDF / Xtensa builds. It is also selected for any
 * other toolchain that does not advertise a 128-bit integer type through
 * __SIZEOF_INT128__, so such targets use the emulated path instead of failing
 * to compile the native one.
 */
#if defined(ESP_PLATFORM) || defined(__XTENSA__) || !defined(__SIZEOF_INT128__)
#define SE050_X25519_ESP32 1
#else
#define SE050_X25519_ESP32 0
#endif
/* =========================================================================
* Field GF(2^255-19)
*
* We represent field elements as arrays of 5 uint64_t limbs in radix 2^51.
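* A fully carried limb is below 2^51. "Loose" results (for example the output
* of fe_add or fe_sub) may be a few bits wider until the next carry pass.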
*
* value = limb[0] + limb[1] * 2^51 + limb[2] * 2^102 + limb[3] * 2^153 + limb[4] * 2^204
*
* p = 2^255 - 19, so 2^255 ≡ 19 (mod p)
* ========================================================================= */
/* Number of limbs in one field element (5 * 51 = 255 bits). */
#define NLIMBS 5
/* Field element: NLIMBS limbs, least significant limb first. */
typedef uint64_t fe[NLIMBS];
/* 2^51: the weight of each limb relative to the one below it. */
#define L51 ((uint64_t)0x0008000000000000ULL)
/* 2^51 - 1: selects the low 51 bits of a limb. */
#define MASK51 ((uint64_t)0x0007FFFFFFFFFFFFULL)
/* 128-bit helpers */
#if !SE050_X25519_ESP32
/* Low 64 bits of a native 128-bit value. */
static inline uint64_t u128_lo(unsigned __int128 x)
{
    return (uint64_t)(x & 0xFFFFFFFFFFFFFFFFULL);
}

/* High 64 bits of a native 128-bit value. */
static inline uint64_t u128_hi(unsigned __int128 x)
{
    const unsigned __int128 upper = x >> 64;
    return (uint64_t)upper;
}
#else
/*
 * 128-bit emulation for targets without `unsigned __int128`.
 *
 * A u128 is the pair (lo, hi) representing lo + hi * 2^64. The field order
 * (lo first) is relied on by callers that build values with a positional
 * compound literal such as (u128){carry, 0}.
 */
typedef struct { uint64_t lo, hi; } u128;

/*
 * u128_mul: full 64 x 64 -> 128 bit product.
 *
 * The operands are split into 32-bit halves, giving four partial products
 *   p0 = a_lo*b_lo (weight 2^0), p1 = a_lo*b_hi and p2 = a_hi*b_lo (2^32),
 *   p3 = a_hi*b_hi (2^64).
 * The two middle products are never added to each other as whole 64-bit
 * values, because that sum can exceed 64 bits and lose a carry. Instead
 * their low halves are summed together with the high half of p0; that sum is
 * below 3 * 2^32, so it cannot overflow. Its own high half then carries
 * into the upper word.
 */
static inline u128 u128_mul(uint64_t a, uint64_t b)
{
    const uint64_t half = 0xFFFFFFFFULL;
    const uint64_t a_lo = a & half, a_hi = a >> 32;
    const uint64_t b_lo = b & half, b_hi = b >> 32;
    const uint64_t p0 = a_lo * b_lo;
    const uint64_t p1 = a_lo * b_hi;
    const uint64_t p2 = a_hi * b_lo;
    const uint64_t p3 = a_hi * b_hi;
    /* Bits 32..63 of the result plus their carry (< 3 * 2^32). */
    const uint64_t cross = (p0 >> 32) + (p1 & half) + (p2 & half);
    u128 r;

    r.lo = (p0 & half) | (cross << 32);
    r.hi = p3 + (p1 >> 32) + (p2 >> 32) + (cross >> 32);
    return r;
}

/* u128_lo: low 64 bits. */
static inline uint64_t u128_lo(u128 x) { return x.lo; }

/* u128_hi: high 64 bits. */
static inline uint64_t u128_hi(u128 x) { return x.hi; }

/* u128_add: a + b modulo 2^128; the low-word carry is detected by wrap. */
static inline u128 u128_add(u128 a, u128 b)
{
    u128 r;

    r.lo = a.lo + b.lo;
    r.hi = a.hi + b.hi + (uint64_t)(r.lo < a.lo);
    return r;
}
#endif
/* --- Basic operations --- */
/* fe_zero: set every limb of f to 0. */
static void fe_zero(fe f)
{
    for (int limb = 0; limb < NLIMBS; limb++)
        f[limb] = 0;
}
/* fe_one: set f to the field element 1 (limb 0 is 1, the rest are 0). */
static void fe_one(fe f)
{
    for (int limb = NLIMBS - 1; limb > 0; limb--)
        f[limb] = 0;
    f[0] = 1;
}
/* fe_copy: copy all limbs of in to out (out == in is harmless). */
static void fe_copy(fe out, const fe in)
{
    for (int limb = 0; limb < NLIMBS; limb++)
        out[limb] = in[limb];
}
/*
 * fe_add: out = a + b, limb by limb, with no carry propagation.
 * The result is "loose": each limb is the plain sum of the input limbs.
 */
static void fe_add(fe out, const fe a, const fe b)
{
    for (int limb = 0; limb < NLIMBS; limb++)
        out[limb] = a[limb] + b[limb];
}
/*
 * fe_sub: out = a - b, limb by limb, with no carry propagation.
 *
 * A limb-wise encoding of 2p is added before subtracting: 2*(2^51 - 19) on
 * limb 0 and 2*(2^51 - 1) on the others. This leaves the value unchanged
 * mod p and keeps each limb from wrapping below zero as long as b's limbs do
 * not exceed the bias.
 */
static void fe_sub(fe out, const fe a, const fe b)
{
    const uint64_t bias_low = 2 * (L51 - 19);
    const uint64_t bias_rest = 2 * (L51 - 1);

    out[0] = (a[0] + bias_low) - b[0];
    for (int limb = 1; limb < NLIMBS; limb++)
        out[limb] = (a[limb] + bias_rest) - b[limb];
}
/*
 * fe_reduce: one carry pass over f, in place.
 *
 * Bits 51 and up of each limb move into the next limb. The carry out of the
 * top limb has weight 2^255 and re-enters limb 0 multiplied by 19. A final
 * step carries limb 0 into limb 1; limb 1 is not masked again afterwards.
 */
static void fe_reduce(fe f)
{
    uint64_t carry;

    for (int limb = 0; limb < NLIMBS - 1; limb++) {
        carry = f[limb] >> 51;
        f[limb] &= MASK51;
        f[limb + 1] += carry;
    }

    carry = f[NLIMBS - 1] >> 51;
    f[NLIMBS - 1] &= MASK51;
    f[0] += 19 * carry;

    carry = f[0] >> 51;
    f[0] &= MASK51;
    f[1] += carry;
}
/* --- Multiplication --- */
#if !SE050_X25519_ESP32
/*
 * fe_mul: out = a * b mod p, using native unsigned __int128 accumulators.
 *
 * Schoolbook 5x5 limb product. A partial product a[i]*b[j] with i + j >= 5
 * has weight 2^255 or more, so it is accumulated into limb i + j - 5 with a
 * factor of 19 (2^255 ≡ 19 mod p). That factor is pre-applied to b[1..4].
 *
 * out may alias a and/or b: every limb of both inputs is consumed into
 * t0..t4 before the first store to out.
 *
 * On return out[0], out[2], out[3] and out[4] are below 2^51; out[1] receives
 * the last carry without being masked again.
 *
 * NOTE(review): the five-term sums must fit in 128 bits, every carry in 64
 * bits and 19 * b[j] in 64 bits. This presumably relies on input limbs
 * staying within a few bits of 2^51 — confirm against callers.
 */
static void fe_mul(fe out, const fe a, const fe b)
{
unsigned __int128 t0, t1, t2, t3, t4;
uint64_t c;
/* Limbs of b pre-scaled by 19 for the wrapped-around partial products. */
uint64_t b1_19 = 19 * b[1], b2_19 = 19 * b[2], b3_19 = 19 * b[3], b4_19 = 19 * b[4];
/* t0: products with i + j = 0, plus the i + j = 5 products times 19. */
t0 = (unsigned __int128)a[0] * b[0];
t0 += (unsigned __int128)a[1] * b4_19;
t0 += (unsigned __int128)a[2] * b3_19;
t0 += (unsigned __int128)a[3] * b2_19;
t0 += (unsigned __int128)a[4] * b1_19;
/* t1: i + j = 1, plus i + j = 6 times 19. */
t1 = (unsigned __int128)a[0] * b[1];
t1 += (unsigned __int128)a[1] * b[0];
t1 += (unsigned __int128)a[2] * b4_19;
t1 += (unsigned __int128)a[3] * b3_19;
t1 += (unsigned __int128)a[4] * b2_19;
/* t2: i + j = 2, plus i + j = 7 times 19. */
t2 = (unsigned __int128)a[0] * b[2];
t2 += (unsigned __int128)a[1] * b[1];
t2 += (unsigned __int128)a[2] * b[0];
t2 += (unsigned __int128)a[3] * b4_19;
t2 += (unsigned __int128)a[4] * b3_19;
/* t3: i + j = 3, plus i + j = 8 times 19. */
t3 = (unsigned __int128)a[0] * b[3];
t3 += (unsigned __int128)a[1] * b[2];
t3 += (unsigned __int128)a[2] * b[1];
t3 += (unsigned __int128)a[3] * b[0];
t3 += (unsigned __int128)a[4] * b4_19;
/* t4: i + j = 4; nothing wraps onto this limb. */
t4 = (unsigned __int128)a[0] * b[4];
t4 += (unsigned __int128)a[1] * b[3];
t4 += (unsigned __int128)a[2] * b[2];
t4 += (unsigned __int128)a[3] * b[1];
t4 += (unsigned __int128)a[4] * b[0];
/* Carry chain: keep the low 51 bits of each accumulator as the output limb
 * and add the remainder (the accumulator shifted right by 51, truncated to
 * 64 bits) to the next accumulator. */
out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 += c;
out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 += c;
out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 += c;
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 += c;
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
/* The carry out of limb 4 has weight 2^255: fold it into limb 0 times 19. */
out[0] += 19 * c;
/* One further carry step from limb 0 into limb 1. */
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}
#else
/*
 * fe_mul (emulated 128-bit path): out = a * b mod p.
 *
 * Same schedule as the native variant, with u128_mul / u128_add standing in
 * for the __int128 multiply and add. A partial product a[i]*b[j] with
 * i + j >= 5 is accumulated into limb i + j - 5 with a factor of 19
 * (2^255 ≡ 19 mod p); that factor is pre-applied to b[1..4].
 *
 * out may alias a and/or b: every limb of both inputs is consumed into
 * t0..t4 before the first store to out.
 *
 * NOTE(review): the five-term sums must fit in 128 bits, every carry in 64
 * bits and 19 * b[j] in 64 bits. This presumably relies on input limbs
 * staying within a few bits of 2^51 — confirm against callers.
 */
static void fe_mul(fe out, const fe a, const fe b)
{
u128 t0, t1, t2, t3, t4;
uint64_t c;
/* Limbs of b pre-scaled by 19 for the wrapped-around partial products. */
uint64_t b1_19 = 19 * b[1], b2_19 = 19 * b[2], b3_19 = 19 * b[3], b4_19 = 19 * b[4];
/* t0: products with i + j = 0, plus the i + j = 5 products times 19. */
t0 = u128_mul(a[0], b[0]);
t0 = u128_add(t0, u128_mul(a[1], b4_19));
t0 = u128_add(t0, u128_mul(a[2], b3_19));
t0 = u128_add(t0, u128_mul(a[3], b2_19));
t0 = u128_add(t0, u128_mul(a[4], b1_19));
/* t1: i + j = 1, plus i + j = 6 times 19. */
t1 = u128_mul(a[0], b[1]);
t1 = u128_add(t1, u128_mul(a[1], b[0]));
t1 = u128_add(t1, u128_mul(a[2], b4_19));
t1 = u128_add(t1, u128_mul(a[3], b3_19));
t1 = u128_add(t1, u128_mul(a[4], b2_19));
/* t2: i + j = 2, plus i + j = 7 times 19. */
t2 = u128_mul(a[0], b[2]);
t2 = u128_add(t2, u128_mul(a[1], b[1]));
t2 = u128_add(t2, u128_mul(a[2], b[0]));
t2 = u128_add(t2, u128_mul(a[3], b4_19));
t2 = u128_add(t2, u128_mul(a[4], b3_19));
/* t3: i + j = 3, plus i + j = 8 times 19. */
t3 = u128_mul(a[0], b[3]);
t3 = u128_add(t3, u128_mul(a[1], b[2]));
t3 = u128_add(t3, u128_mul(a[2], b[1]));
t3 = u128_add(t3, u128_mul(a[3], b[0]));
t3 = u128_add(t3, u128_mul(a[4], b4_19));
/* t4: i + j = 4; nothing wraps onto this limb. */
t4 = u128_mul(a[0], b[4]);
t4 = u128_add(t4, u128_mul(a[1], b[3]));
t4 = u128_add(t4, u128_mul(a[2], b[2]));
t4 = u128_add(t4, u128_mul(a[3], b[1]));
t4 = u128_add(t4, u128_mul(a[4], b[0]));
/* Carry chain: keep the low 51 bits of each accumulator as the output limb
 * and add the remainder (shifted right by 51, truncated to 64 bits) to the
 * next accumulator. (u128){c, 0} widens the carry with a zero high word. */
out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
/* The carry out of limb 4 has weight 2^255: fold it into limb 0 times 19. */
out[0] += 19 * c;
/* One further carry step from limb 0 into limb 1. */
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}
#endif
/* fe_sq: out = a^2 mod p (optimized) */
#if !SE050_X25519_ESP32
/*
 * fe_sq (native __int128 path): out = a^2 mod p.
 *
 * Uses the symmetry of squaring: each cross product a[i]*a[j] (i != j) is
 * computed once with one operand doubled, and terms with i + j >= 5 carry the
 * factor 19 (2^255 ≡ 19 mod p):
 *   t0 = a0^2    + 19*(2*a1*a4 + 2*a2*a3)
 *   t1 = 2*a0*a1 + 19*(2*a2*a4 + a3^2)
 *   t2 = 2*a0*a2 + a1^2 + 19*(2*a3*a4)
 *   t3 = 2*a0*a3 + 2*a1*a2 + 19*a4^2
 *   t4 = 2*a0*a4 + 2*a1*a3 + a2^2
 *
 * out may alias a: all limbs of a are consumed before the first store to out.
 *
 * NOTE(review): the doubled and 19-scaled limbs must fit in 64 bits and the
 * sums in 128 bits; presumably guaranteed by callers keeping limbs within a
 * few bits of 2^51 — confirm.
 */
static void fe_sq(fe out, const fe a)
{
unsigned __int128 t0, t1, t2, t3, t4;
uint64_t c;
/* Doubled limbs for the cross terms. */
uint64_t d1 = 2 * a[1], d2 = 2 * a[2], d3 = 2 * a[3];
/* Limbs (plain or doubled) pre-scaled by 19 for the wrapped-around terms. */
uint64_t a4_19 = 19 * a[4], d1_19 = 19 * d1, d2_19 = 19 * d2, a3_19 = 19 * a[3];
t0 = (unsigned __int128)a[0] * a[0];
t0 += (unsigned __int128)d1_19 * a[4];
t0 += (unsigned __int128)d2_19 * a[3];
t1 = (unsigned __int128)a[0] * d1;
t1 += (unsigned __int128)d2_19 * a[4];
t1 += (unsigned __int128)a3_19 * a[3];
t2 = (unsigned __int128)a[0] * d2;
t2 += (unsigned __int128)a[1] * a[1];
t2 += (unsigned __int128)d3 * a4_19;
t3 = (unsigned __int128)a[0] * d3;
t3 += (unsigned __int128)d1 * a[2];
t3 += (unsigned __int128)a[4] * a4_19;
t4 = (unsigned __int128)a[0] * (2 * a[4]);
t4 += (unsigned __int128)d1 * a[3];
t4 += (unsigned __int128)a[2] * a[2];
/* Carry chain: low 51 bits stay in the limb, the rest moves up one limb. */
out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 += c;
out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 += c;
out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 += c;
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 += c;
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
/* The carry out of limb 4 has weight 2^255: fold it into limb 0 times 19. */
out[0] += 19 * c;
/* One further carry step; out[1] is not masked again after this. */
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}
#else
/*
 * fe_sq (emulated 128-bit path): out = a^2 mod p.
 *
 * Same schedule as the native variant, expressed with u128_mul / u128_add.
 * Cross products are computed once with one operand doubled, and terms with
 * i + j >= 5 carry the factor 19 (2^255 ≡ 19 mod p):
 *   t0 = a0^2    + 19*(2*a1*a4 + 2*a2*a3)
 *   t1 = 2*a0*a1 + 19*(2*a2*a4 + a3^2)
 *   t2 = 2*a0*a2 + a1^2 + 19*(2*a3*a4)
 *   t3 = 2*a0*a3 + 2*a1*a2 + 19*a4^2
 *   t4 = 2*a0*a4 + 2*a1*a3 + a2^2
 *
 * out may alias a: all limbs of a are consumed before the first store to out.
 *
 * NOTE(review): the doubled and 19-scaled limbs must fit in 64 bits and the
 * sums in 128 bits; presumably guaranteed by callers keeping limbs within a
 * few bits of 2^51 — confirm.
 */
static void fe_sq(fe out, const fe a)
{
u128 t0, t1, t2, t3, t4;
uint64_t c;
/* Doubled limbs for the cross terms. */
uint64_t d1 = 2 * a[1], d2 = 2 * a[2], d3 = 2 * a[3];
/* Limbs (plain or doubled) pre-scaled by 19 for the wrapped-around terms. */
uint64_t a4_19 = 19 * a[4], d1_19 = 19 * d1, d2_19 = 19 * d2, a3_19 = 19 * a[3];
t0 = u128_mul(a[0], a[0]);
t0 = u128_add(t0, u128_mul(d1_19, a[4]));
t0 = u128_add(t0, u128_mul(d2_19, a[3]));
t1 = u128_mul(a[0], d1);
t1 = u128_add(t1, u128_mul(d2_19, a[4]));
t1 = u128_add(t1, u128_mul(a3_19, a[3]));
t2 = u128_mul(a[0], d2);
t2 = u128_add(t2, u128_mul(a[1], a[1]));
t2 = u128_add(t2, u128_mul(d3, a4_19));
t3 = u128_mul(a[0], d3);
t3 = u128_add(t3, u128_mul(d1, a[2]));
t3 = u128_add(t3, u128_mul(a[4], a4_19));
t4 = u128_mul(a[0], 2 * a[4]);
t4 = u128_add(t4, u128_mul(d1, a[3]));
t4 = u128_add(t4, u128_mul(a[2], a[2]));
/* Carry chain: low 51 bits stay in the limb, the rest moves up one limb.
 * (u128){c, 0} widens the carry with a zero high word. */
out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
/* The carry out of limb 4 has weight 2^255: fold it into limb 0 times 19. */
out[0] += 19 * c;
/* One further carry step; out[1] is not masked again after this. */
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}
#endif
/* fe_mul_small: out = f * n (n < 2^22) */
#if !SE050_X25519_ESP32
/*
 * fe_mul_small (native __int128 path): out = f * n mod p for a small scalar n.
 *
 * Each limb is multiplied by n and the incoming carry is added in 128 bits.
 * The low 51 bits become the output limb and the rest is carried upward. The
 * carry out of the top limb re-enters limb 0 multiplied by 19, followed by
 * one more carry step into limb 1.
 */
static void fe_mul_small(fe out, const fe f, uint64_t n)
{
    uint64_t carry = 0;

    for (int limb = 0; limb < NLIMBS; limb++) {
        const unsigned __int128 wide = (unsigned __int128)f[limb] * n + carry;

        out[limb] = u128_lo(wide) & MASK51;
        carry = (u128_lo(wide) >> 51) | (u128_hi(wide) << 13);
    }

    out[0] += 19 * carry;
    carry = out[0] >> 51;
    out[0] &= MASK51;
    out[1] += carry;
}
#else
/*
 * fe_mul_small (emulated 128-bit path): out = f * n mod p for a small scalar n.
 *
 * Each limb is multiplied by n and the incoming carry is added as a u128.
 * The low 51 bits become the output limb and the rest is carried upward. The
 * carry out of the top limb re-enters limb 0 multiplied by 19, followed by
 * one more carry step into limb 1.
 */
static void fe_mul_small(fe out, const fe f, uint64_t n)
{
    uint64_t carry = 0;

    for (int limb = 0; limb < NLIMBS; limb++) {
        const u128 wide = u128_add(u128_mul(f[limb], n), (u128){carry, 0});

        out[limb] = u128_lo(wide) & MASK51;
        carry = (u128_lo(wide) >> 51) | (u128_hi(wide) << 13);
    }

    out[0] += 19 * carry;
    carry = out[0] >> 51;
    out[0] &= MASK51;
    out[1] += carry;
}
#endif
/* --- Inversion --- */
/* fe_sq_times: out = in^(2^count) for count >= 1, by repeated squaring. */
static void fe_sq_times(fe out, const fe in, int count)
{
    fe_sq(out, in);
    for (int k = 1; k < count; k++)
        fe_sq(out, out);
}

/*
 * fe_invert: out = a^(p-2) = a^(2^255 - 21), which is a^(-1) for a != 0.
 *
 * Fixed addition chain: builds a^(2^k - 1) for k = 5, 10, 20, 40, 50, 100,
 * 200, 250, then squares five more times and multiplies by a^11. The
 * sequence of operations does not depend on the value of a.
 */
static void fe_invert(fe out, const fe a)
{
    fe z2, z9, z11, e10, e50, run, acc;

    fe_sq(z2, a);                  /* a^2 */
    fe_sq_times(acc, z2, 2);       /* a^8 */
    fe_mul(z9, acc, a);            /* a^9 */
    fe_mul(z11, z2, z9);           /* a^11 */
    fe_sq(acc, z11);               /* a^22 */
    fe_mul(run, z9, acc);          /* a^31 = a^(2^5-1) */

    fe_sq_times(acc, run, 5);
    fe_mul(e10, acc, run);         /* a^(2^10-1) */

    fe_sq_times(acc, e10, 10);
    fe_mul(run, acc, e10);         /* a^(2^20-1) */

    fe_sq_times(acc, run, 20);
    fe_mul(run, acc, run);         /* a^(2^40-1) */

    fe_sq_times(acc, run, 10);
    fe_mul(e50, acc, e10);         /* a^(2^50-1) */

    fe_sq_times(acc, e50, 50);
    fe_mul(run, acc, e50);         /* a^(2^100-1) */

    fe_sq_times(acc, run, 100);
    fe_mul(run, acc, run);         /* a^(2^200-1) */

    fe_sq_times(acc, run, 50);
    fe_mul(run, acc, e50);         /* a^(2^250-1) */

    fe_sq_times(acc, run, 5);      /* a^(2^255-2^5) */
    fe_mul(out, acc, z11);         /* a^(2^255-21) = a^(p-2) */
}
/* --- Byte conversion --- */
/* fe_load_le64: read eight bytes at p as one little-endian 64-bit word. */
static uint64_t fe_load_le64(const uint8_t *p)
{
    uint64_t word = 0;

    for (int k = 7; k >= 0; k--)
        word = (word << 8) | p[k];
    return word;
}

/*
 * fe_from_bytes: decode a 32-byte little-endian string into five 51-bit limbs.
 *
 * Limb k starts at bit 51*k of the input: byte 0 bit 0, byte 6 bit 3,
 * byte 12 bit 6, byte 19 bit 1 and byte 25 bit 4. Each limb is taken from an
 * 8-byte window, shifted down to its first bit and masked to 51 bits. Masking
 * the last limb discards bit 255 of the input (RFC 7748 §5). The input is
 * not modified.
 */
static void fe_from_bytes(fe out, const uint8_t in[32])
{
    out[0] =  fe_load_le64(in)             & MASK51;
    out[1] = (fe_load_le64(in +  6) >>  3) & MASK51;
    out[2] = (fe_load_le64(in + 12) >>  6) & MASK51;
    out[3] = (fe_load_le64(in + 19) >>  1) & MASK51;
    /* The window starts one byte early so it stays inside the 32-byte input. */
    out[4] = (fe_load_le64(in + 24) >> 12) & MASK51;
}
/* fe_to_bytes: field element → 32-byte little-endian */
static void fe_to_bytes(uint8_t out[32], const fe in)
{
fe f;
uint64_t c, t;
fe_copy(f, in);
fe_reduce(f);
fe_reduce(f);
/* Conditional subtract p = 2^255 - 19 */
t = f[0] + 19;
c = t >> 51; t &= MASK51; uint64_t g0 = t;
t = f[1] + c; c = t >> 51; t &= MASK51; uint64_t g1 = t;
t = f[2] + c; c = t >> 51; t &= MASK51; uint64_t g2 = t;
t = f[3] + c; c = t >> 51; t &= MASK51; uint64_t g3 = t;
t = f[4] + c; uint64_t g4 = t & MASK51;
uint64_t mask = -((t >> 51) & 1);
f[0] = (f[0] & ~mask) | (g0 & mask);
f[1] = (f[1] & ~mask) | (g1 & mask);
f[2] = (f[2] & ~mask) | (g2 & mask);
f[3] = (f[3] & ~mask) | (g3 & mask);
f[4] = (f[4] & ~mask) | (g4 & mask);
/* Unpack to bytes */
out[ 0] = (uint8_t)(f[0]);
out[ 1] = (uint8_t)(f[0] >> 8);
out[ 2] = (uint8_t)(f[0] >> 16);
out[ 3] = (uint8_t)(f[0] >> 24);
out[ 4] = (uint8_t)(f[0] >> 32);
out[ 5] = (uint8_t)(f[0] >> 40);
out[ 6] = (uint8_t)((f[0] >> 48) | (f[1] << 3));
out[ 7] = (uint8_t)(f[1] >> 5);
out[ 8] = (uint8_t)(f[1] >> 13);
out[ 9] = (uint8_t)(f[1] >> 21);
out[10] = (uint8_t)(f[1] >> 29);
out[11] = (uint8_t)(f[1] >> 37);
out[12] = (uint8_t)((f[1] >> 45) | (f[2] << 6));
out[13] = (uint8_t)(f[2] >> 2);
out[14] = (uint8_t)(f[2] >> 10);
out[15] = (uint8_t)(f[2] >> 18);
out[16] = (uint8_t)(f[2] >> 26);
out[17] = (uint8_t)(f[2] >> 34);
out[18] = (uint8_t)(f[2] >> 42);
out[19] = (uint8_t)((f[2] >> 50) | (f[3] << 1));
out[20] = (uint8_t)(f[3] >> 7);
out[21] = (uint8_t)(f[3] >> 15);
out[22] = (uint8_t)(f[3] >> 23);
out[23] = (uint8_t)(f[3] >> 31);
out[24] = (uint8_t)(f[3] >> 39);
out[25] = (uint8_t)((f[3] >> 47) | (f[4] << 4));
out[26] = (uint8_t)(f[4] >> 4);
out[27] = (uint8_t)(f[4] >> 12);
out[28] = (uint8_t)(f[4] >> 20);
out[29] = (uint8_t)(f[4] >> 28);
out[30] = (uint8_t)(f[4] >> 36);
out[31] = (uint8_t)(f[4] >> 44);
}
/* --- Montgomery ladder --- */
/* a24 = 121665, the curve25519 ladder constant from RFC 7748 §5. ladder_step
 * uses it as the small multiplier in Z2 = E * (AA + a24 * E). */
#define A24 121665ULL
/*
 * fe_cswap: exchange a and b when bit 0 of swap is 1, otherwise leave both
 * unchanged. The decision is applied through an all-ones/all-zeros mask, so
 * the same instructions run in either case.
 */
static void fe_cswap(fe a, fe b, uint64_t swap)
{
    const uint64_t select = (uint64_t)0 - (swap & 1);
    int limb;

    for (limb = 0; limb < NLIMBS; limb++) {
        const uint64_t diff = (a[limb] ^ b[limb]) & select;

        a[limb] ^= diff;
        b[limb] ^= diff;
    }
}
/*
 * ladder_step: one combined double-and-add step of the Montgomery ladder.
 *
 * Inputs:  (X2_in : Z2_in) and (X3_in : Z3_in), plus x1.
 * Outputs: (X2 : Z2) = double of the first point,
 *          (X3 : Z3) = differential sum of the two points with x1 as the
 *                      known difference.
 * Formulas follow RFC 7748 §5 (A, AA, B, BB, E, C, D, DA, CB). All four
 * *_in values are read before any output is written; x1 is read after X3
 * has been written.
 */
static void ladder_step(
    fe X2, fe Z2, fe X3, fe Z3,
    const fe X2_in, const fe Z2_in,
    const fe X3_in, const fe Z3_in,
    const fe x1)
{
    fe sum2, diff2, sum2_sq, diff2_sq, gap;
    fe sum3, diff3, cross_da, cross_cb;
    fe scratch, scaled_gap;

    /* Point 2: A = X2 + Z2, B = X2 - Z2, AA = A^2, BB = B^2, E = AA - BB. */
    fe_add(sum2, X2_in, Z2_in);
    fe_sub(diff2, X2_in, Z2_in);
    fe_sq(sum2_sq, sum2);
    fe_sq(diff2_sq, diff2);
    fe_sub(gap, sum2_sq, diff2_sq);

    /* Point 3: C = X3 + Z3, D = X3 - Z3, DA = D*A, CB = C*B. */
    fe_add(sum3, X3_in, Z3_in);
    fe_sub(diff3, X3_in, Z3_in);
    fe_mul(cross_da, diff3, sum2);
    fe_mul(cross_cb, sum3, diff2);

    /* X3 = (DA + CB)^2 */
    fe_add(scratch, cross_da, cross_cb);
    fe_sq(X3, scratch);

    /* Z3 = x1 * (DA - CB)^2 */
    fe_sub(scratch, cross_da, cross_cb);
    fe_sq(scratch, scratch);
    fe_mul(Z3, scratch, x1);

    /* X2 = AA * BB */
    fe_mul(X2, sum2_sq, diff2_sq);

    /* Z2 = E * (AA + a24 * E) */
    fe_mul_small(scaled_gap, gap, A24);
    fe_add(scratch, sum2_sq, scaled_gap);
    fe_mul(Z2, gap, scratch);
}
/* --- Public API --- */
/* Little-endian encoding of the base point u = 9: the first byte is 9 and the
 * remaining 31 bytes are zero-initialised. x25519_sw() uses it when its point
 * argument is NULL. */
const uint8_t X25519_BASE_POINT[32] = { 9 };
/*
 * x25519_wipe: overwrite len bytes at p with zeros through a volatile
 * pointer, so the stores are not removed as dead writes before return.
 */
static void x25519_wipe(void *p, size_t len)
{
    volatile uint8_t *bytes = (volatile uint8_t *)p;

    while (len--)
        *bytes++ = 0;
}

/**
 * x25519_sw: X25519 scalar multiplication, out = scalar * point.
 *
 * @param out     32-byte buffer receiving the little-endian u-coordinate.
 * @param scalar  32-byte scalar; it is clamped on an internal copy (low three
 *                bits cleared, bit 255 cleared, bit 254 set), so the caller's
 *                buffer is not modified.
 * @param point   32-byte little-endian u-coordinate, or NULL to use
 *                X25519_BASE_POINT.
 * @return 0 on success; -1 if out or scalar is NULL, or if the result is the
 *         all-zero string. In the all-zero case out has still been written.
 *
 * The clamped scalar copy and the ladder state held in this function's frame
 * are wiped before returning.
 */
int x25519_sw(uint8_t out[32], const uint8_t scalar[32], const uint8_t point[32])
{
    uint8_t e[32];
    fe x1, X2, Z2, X3, Z3;
    fe nX2, nZ2, nX3, nZ3;
    fe Z2_inv;
    uint64_t prev_bit, swap;
    uint8_t acc = 0;
    int i;

    if (out == NULL || scalar == NULL)
        return -1;

    /* Step 1: clamp scalar */
    memcpy(e, scalar, 32);
    e[ 0] &= 248;
    e[31] &= 127;
    e[31] |= 64;

    /* Step 2: decode u-coordinate */
    fe_from_bytes(x1, point != NULL ? point : X25519_BASE_POINT);

    /* Step 3: initialise projective points: (X2:Z2) = (1:0), (X3:Z3) = (x1:1) */
    fe_one (X2); fe_zero(Z2);
    fe_copy(X3, x1); fe_one(Z3);

    /* Step 4: Montgomery ladder over scalar bits 254..0. The swap is applied
     * lazily: the pairs are exchanged only when the current bit differs from
     * the previous one. */
    prev_bit = 0;
    for (i = 254; i >= 0; i--) {
        uint64_t bit = (e[i / 8] >> (i % 8)) & 1;

        swap = bit ^ prev_bit;
        prev_bit = bit;
        fe_cswap(X2, X3, swap);
        fe_cswap(Z2, Z3, swap);
        ladder_step(nX2, nZ2, nX3, nZ3, X2, Z2, X3, Z3, x1);
        fe_copy(X2, nX2); fe_copy(Z2, nZ2);
        fe_copy(X3, nX3); fe_copy(Z3, nZ3);
    }
    fe_cswap(X2, X3, prev_bit);
    fe_cswap(Z2, Z3, prev_bit);

    /* Step 5: convert from projective to affine: x = X2 / Z2 */
    fe_invert(Z2_inv, Z2);
    fe_mul(X2, X2, Z2_inv);

    /* Step 6: encode result */
    fe_to_bytes(out, X2);

    /* Wipe the secret scalar copy and the scalar-dependent ladder state. */
    x25519_wipe(e, sizeof e);
    x25519_wipe(X2, sizeof X2);
    x25519_wipe(Z2, sizeof Z2);
    x25519_wipe(X3, sizeof X3);
    x25519_wipe(Z3, sizeof Z3);
    x25519_wipe(nX2, sizeof nX2);
    x25519_wipe(nZ2, sizeof nZ2);
    x25519_wipe(nX3, sizeof nX3);
    x25519_wipe(nZ3, sizeof nZ3);
    x25519_wipe(Z2_inv, sizeof Z2_inv);

    /* Step 7: reject all-zero output (OR over all bytes, no early exit) */
    for (i = 0; i < 32; i++)
        acc |= out[i];
    if (acc == 0)
        return -1;
    return 0;
}
/*
 * se050_x25519_sw_clamp: clamp a 32-byte X25519 scalar in place.
 * Clears the three lowest bits of byte 0, clears the top bit of byte 31 and
 * sets the second-highest bit of byte 31. Bytes 1..30 are untouched.
 */
void se050_x25519_sw_clamp(uint8_t *scalar)
{
    const uint8_t first = scalar[0];
    const uint8_t last = scalar[31];

    scalar[0] = (uint8_t)(first & 0xF8);
    scalar[31] = (uint8_t)((last & 0x7F) | 0x40);
}
/*
 * se050_x25519_sw_zeroize: clear len bytes starting at key by delegating to
 * memzero_explicit(). key is passed through without a NULL check.
 * NOTE(review): memzero_explicit is declared outside this file; presumably it
 * is a wipe the compiler may not optimise away — confirm.
 */
void se050_x25519_sw_zeroize(uint8_t *key, size_t len)
{
memzero_explicit(key, len);
}
/**
 * se050_x25519_sw_generate_keypair: create a fresh X25519 key pair.
 *
 * Fills keypair->private_key with 32 bytes from rng_func, clamps it in
 * place, and derives keypair->public_key as private_key * base point.
 *
 * @param keypair   Destination key pair; must not be NULL.
 * @param rng_func  Random source, called as rng_func(buf, 32, rng_ctx); a
 *                  non-zero return is treated as failure. Must not be NULL.
 * @param rng_ctx   Opaque context forwarded to rng_func.
 * @return 0 on success, -1 on a NULL argument, RNG failure, or failure of the
 *         scalar multiplication.
 */
int se050_x25519_sw_generate_keypair(se050_x25519_sw_keypair_t *keypair,
                                     x25519_rng_func rng_func,
                                     void *rng_ctx)
{
    if (!keypair || !rng_func) return -1;
    if (rng_func(keypair->private_key, 32, rng_ctx) != 0) return -1;
    se050_x25519_sw_clamp(keypair->private_key);
    /* The point must be the 32-byte encoding of u = 9. A string literal such
     * as "basepoint" is only 10 bytes long and is not the base point. */
    if (x25519_sw(keypair->public_key, keypair->private_key, X25519_BASE_POINT) != 0)
        return -1;
    return 0;
}
/**
 * se050_x25519_sw_compute_shared_secret: X25519 Diffie-Hellman.
 *
 * Computes shared_secret = clamp(private_key) * peer_public. The caller's
 * private key is not modified; the clamped copy is wiped before returning.
 *
 * @param shared_secret  32-byte output buffer.
 * @param private_key    32-byte private scalar.
 * @param peer_public    32-byte peer public key (u-coordinate).
 * @return 0 on success; -1 on a NULL argument or when x25519_sw() reports
 *         failure, which includes an all-zero result. The optional all-zero
 *         check is described in RFC 7748 §6.1. Callers must not use
 *         shared_secret when -1 is returned.
 */
int se050_x25519_sw_compute_shared_secret(uint8_t *shared_secret,
                                          const uint8_t *private_key,
                                          const uint8_t *peer_public)
{
    uint8_t clamped[32];
    int status;

    if (!shared_secret || !private_key || !peer_public) return -1;

    memcpy(clamped, private_key, 32);
    se050_x25519_sw_clamp(clamped);
    status = x25519_sw(shared_secret, clamped, peer_public);
    se050_x25519_sw_zeroize(clamped, 32);
    return status;
}
int se050_x25519_sw_derive_public_key(uint8_t *public_key,
const uint8_t *private_key)
{
if (!public_key || !private_key) return -1;
uint8_t clamped[32];
memcpy(clamped, private_key, 32);
se050_x25519_sw_clamp(clamped);
x25519_sw(public_key, clamped, (const uint8_t*)"basepoint");
se050_x25519_sw_zeroize(clamped, 32);
return 0;
}
#ifdef X25519_SW_TEST
#include <stdio.h>
/* RFC 7748 §5.2, first X25519 test vector:
 * SK_1 = input scalar, PK_1 = input u-coordinate, SS_1 = expected output
 * u-coordinate. */
static const uint8_t RFC7748_SK_1[32] = {
0xa5,0x46,0xe3,0x6b,0xf0,0x52,0x7c,0x9d,0x3b,0x16,0x15,0x4b,
0x82,0x46,0x5e,0xdd,0x62,0x14,0x4c,0x0a,0xc1,0xfc,0x5a,0x18,
0x50,0x6a,0x22,0x44,0xba,0x44,0x9a,0xc4 };
static const uint8_t RFC7748_PK_1[32] = {
0xe6,0xdb,0x68,0x67,0x58,0x30,0x30,0xdb,0x35,0x94,0xc1,0xa4,
0x24,0xb1,0x5f,0x7c,0x72,0x66,0x24,0xec,0x26,0xb3,0x35,0x3b,
0x10,0xa9,0x03,0xa6,0xd0,0xab,0x1c,0x4c };
static const uint8_t RFC7748_SS_1[32] = {
0xc3,0xda,0x55,0x37,0x9d,0xe9,0xc6,0x90,0x8e,0x94,0xea,0x4d,
0xf2,0x8d,0x08,0x4f,0x32,0xec,0xcf,0x03,0x49,0x1c,0x71,0xf7,
0x54,0xb4,0x07,0x55,0x77,0xa2,0x85,0x52 };
/* print_hex: write "<label>: " followed by len bytes of buf as two-digit
 * lowercase hex and a newline to stdout. */
static void print_hex(const char *label, const uint8_t *buf, size_t len)
{
    size_t pos = 0;

    printf("%s: ", label);
    while (pos < len) {
        printf("%02x", buf[pos]);
        pos++;
    }
    printf("\n");
}
/*
 * Self-test entry point (built only with X25519_SW_TEST): runs x25519_sw on
 * the RFC 7748 test vector, prints inputs, computed and expected outputs, and
 * returns 0 when they match, 1 otherwise.
 */
int main(void)
{
    uint8_t computed[32];
    int matches;

    printf("X25519 Software Implementation Test\n");
    printf("====================================\n\n");
    printf("RFC 7748 Test Vector 1:\n");
    print_hex("Scalar", RFC7748_SK_1, 32);
    print_hex("Point", RFC7748_PK_1, 32);

    x25519_sw(computed, RFC7748_SK_1, RFC7748_PK_1);
    print_hex("Computed SS", computed, 32);
    print_hex("Expected SS", RFC7748_SS_1, 32);

    matches = (memcmp(computed, RFC7748_SS_1, 32) == 0);
    if (!matches) {
        printf("[FAIL] RFC 7748 Test Vector 1\n");
        return 1;
    }
    printf("[PASS] RFC 7748 Test Vector 1\n");
    return 0;
}
#endif