/*
 * Platform support note:
 *  - Standard platforms use native unsigned __int128 for the 128-bit
 *    intermediate products (faster).
 *  - ESP32 (detected via ESP_PLATFORM / __XTENSA__) uses a 128-bit
 *    emulation built from 64-bit arithmetic (compatible with the 32-bit
 *    Xtensa architecture); fe_mul(), fe_sq() and fe_mul_small() have
 *    ESP32-specific code paths.
 */
/**
 * @file se050_x25519_sw.c
 * @brief Software X25519 ECDH Implementation (Clean-room RFC7748)
 *
 * Based on the RFC 7748 reference implementation with 5x51-bit limbs.
 * License: MIT (Clean-room implementation)
 */

#include "se050_x25519_sw.h"
#include "se050_crypto_utils.h"
#include <string.h>

/* =========================================================================
 * Platform detection
 *
 * ESP32 (Xtensa) toolchains lack native unsigned __int128, so on those
 * targets the field arithmetic below uses a 64-bit emulation of the
 * 128-bit intermediates instead.
 * ========================================================================= */

#if defined(ESP_PLATFORM) || defined(__XTENSA__)
#define SE050_X25519_ESP32 1
#else
#define SE050_X25519_ESP32 0
#endif
/* =========================================================================
 * Field GF(2^255-19)
 *
 * Field elements are arrays of 5 uint64_t limbs in radix 2^51.
 * Each limb holds at most 51 bits in "loose" form (add/sub leave a few
 * bits of slack above bit 51 that the carry chain later absorbs).
 *
 * value = limb[0] + limb[1] * 2^51 + limb[2] * 2^102 + limb[3] * 2^153 + limb[4] * 2^204
 *
 * p = 2^255 - 19, so 2^255 == 19 (mod p): a carry out of limb 4 folds
 * back into limb 0 multiplied by 19.
 * ========================================================================= */

#define NLIMBS 5
typedef uint64_t fe[NLIMBS]; /* field element */

#define L51 ((uint64_t)1 << 51)
#define MASK51 (L51 - 1)

/* 128-bit helpers */
/*
 * Portable 64x64 -> 128-bit multiply used by the ESP32 emulation path.
 * Compiled unconditionally (static inline costs nothing on targets that
 * never call it) so it can be unit-tested on hosts with native __int128.
 *
 * BUG FIX: the previous emulation computed "mid = p1 + p2" directly.  That
 * sum of two 64-bit cross products can exceed 2^64, silently dropping a
 * carry worth 2^32 from the high word (e.g. a = b = 2^64-1), which would
 * corrupt every ESP32 field multiplication.  The middle column is now
 * folded together with the high half of p0: each of the three addends is
 * at most 2^32-1, so "mid" can never wrap.
 */
static inline void se050_mul64_wide(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
    uint64_t a_lo = a & 0xFFFFFFFFULL, a_hi = a >> 32;
    uint64_t b_lo = b & 0xFFFFFFFFULL, b_hi = b >> 32;
    uint64_t p0 = a_lo * b_lo;   /* contributes to bits   0..63  */
    uint64_t p1 = a_lo * b_hi;   /* contributes to bits  32..95  */
    uint64_t p2 = a_hi * b_lo;   /* contributes to bits  32..95  */
    uint64_t p3 = a_hi * b_hi;   /* contributes to bits  64..127 */

    /* mid <= 3 * (2^32 - 1) < 2^34: no 64-bit overflow possible. */
    uint64_t mid = (p0 >> 32) + (p1 & 0xFFFFFFFFULL) + (p2 & 0xFFFFFFFFULL);

    *lo = (mid << 32) | (p0 & 0xFFFFFFFFULL);
    *hi = p3 + (p1 >> 32) + (p2 >> 32) + (mid >> 32);
}

#if !SE050_X25519_ESP32
/* Native 128-bit integers: just extract the two 64-bit halves. */
static inline uint64_t u128_lo(unsigned __int128 x) { return (uint64_t)x; }
static inline uint64_t u128_hi(unsigned __int128 x) { return (uint64_t)(x >> 64); }
#else
/* ESP32: 128-bit value emulated as a lo/hi pair of 64-bit words. */
typedef struct { uint64_t lo, hi; } u128;

/* u128_mul: full 64x64 -> 128-bit product. */
static inline u128 u128_mul(uint64_t a, uint64_t b) {
    u128 r;
    se050_mul64_wide(a, b, &r.lo, &r.hi);
    return r;
}

static inline uint64_t u128_lo(u128 x) { return x.lo; }
static inline uint64_t u128_hi(u128 x) { return x.hi; }

/* u128_add: 128-bit add; (r.lo < a.lo) detects the carry into the high word. */
static inline u128 u128_add(u128 a, u128 b) {
    u128 r;
    r.lo = a.lo + b.lo;
    r.hi = a.hi + b.hi + (r.lo < a.lo);
    return r;
}
#endif
/* --- Basic operations --- */
|
||
|
||
static void fe_zero(fe f) { f[0] = f[1] = f[2] = f[3] = f[4] = 0; }
|
||
static void fe_one(fe f) { f[0] = 1; f[1] = f[2] = f[3] = f[4] = 0; }
|
||
|
||
static void fe_copy(fe out, const fe in)
|
||
{
|
||
out[0] = in[0]; out[1] = in[1]; out[2] = in[2]; out[3] = in[3]; out[4] = in[4];
|
||
}
|
||
|
||
/* fe_add: out = a + b (loose, ≤ 2·2^51) */
|
||
static void fe_add(fe out, const fe a, const fe b)
|
||
{
|
||
out[0] = a[0] + b[0];
|
||
out[1] = a[1] + b[1];
|
||
out[2] = a[2] + b[2];
|
||
out[3] = a[3] + b[3];
|
||
out[4] = a[4] + b[4];
|
||
}
|
||
|
||
/* fe_sub: out = a - b (loose, uses bias to avoid underflow) */
|
||
static void fe_sub(fe out, const fe a, const fe b)
|
||
{
|
||
out[0] = a[0] + 2*(L51 - 19) - b[0];
|
||
out[1] = a[1] + 2*(L51 - 1) - b[1];
|
||
out[2] = a[2] + 2*(L51 - 1) - b[2];
|
||
out[3] = a[3] + 2*(L51 - 1) - b[3];
|
||
out[4] = a[4] + 2*(L51 - 1) - b[4];
|
||
}
|
||
|
||
/* fe_reduce: propagate carries, keep limbs < 2^51 */
|
||
static void fe_reduce(fe f)
|
||
{
|
||
uint64_t c;
|
||
c = f[0] >> 51; f[0] &= MASK51; f[1] += c;
|
||
c = f[1] >> 51; f[1] &= MASK51; f[2] += c;
|
||
c = f[2] >> 51; f[2] &= MASK51; f[3] += c;
|
||
c = f[3] >> 51; f[3] &= MASK51; f[4] += c;
|
||
c = f[4] >> 51; f[4] &= MASK51; f[0] += 19 * c;
|
||
c = f[0] >> 51; f[0] &= MASK51; f[1] += c;
|
||
}
|
||
|
||
/* --- Multiplication --- */
|
||
|
||
#if !SE050_X25519_ESP32
|
||
/* fe_mul: out = a * b mod p (128-bit accumulators) */
|
||
static void fe_mul(fe out, const fe a, const fe b)
|
||
{
|
||
unsigned __int128 t0, t1, t2, t3, t4;
|
||
uint64_t c;
|
||
uint64_t b1_19 = 19 * b[1], b2_19 = 19 * b[2], b3_19 = 19 * b[3], b4_19 = 19 * b[4];
|
||
|
||
t0 = (unsigned __int128)a[0] * b[0];
|
||
t0 += (unsigned __int128)a[1] * b4_19;
|
||
t0 += (unsigned __int128)a[2] * b3_19;
|
||
t0 += (unsigned __int128)a[3] * b2_19;
|
||
t0 += (unsigned __int128)a[4] * b1_19;
|
||
|
||
t1 = (unsigned __int128)a[0] * b[1];
|
||
t1 += (unsigned __int128)a[1] * b[0];
|
||
t1 += (unsigned __int128)a[2] * b4_19;
|
||
t1 += (unsigned __int128)a[3] * b3_19;
|
||
t1 += (unsigned __int128)a[4] * b2_19;
|
||
|
||
t2 = (unsigned __int128)a[0] * b[2];
|
||
t2 += (unsigned __int128)a[1] * b[1];
|
||
t2 += (unsigned __int128)a[2] * b[0];
|
||
t2 += (unsigned __int128)a[3] * b4_19;
|
||
t2 += (unsigned __int128)a[4] * b3_19;
|
||
|
||
t3 = (unsigned __int128)a[0] * b[3];
|
||
t3 += (unsigned __int128)a[1] * b[2];
|
||
t3 += (unsigned __int128)a[2] * b[1];
|
||
t3 += (unsigned __int128)a[3] * b[0];
|
||
t3 += (unsigned __int128)a[4] * b4_19;
|
||
|
||
t4 = (unsigned __int128)a[0] * b[4];
|
||
t4 += (unsigned __int128)a[1] * b[3];
|
||
t4 += (unsigned __int128)a[2] * b[2];
|
||
t4 += (unsigned __int128)a[3] * b[1];
|
||
t4 += (unsigned __int128)a[4] * b[0];
|
||
|
||
out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 += c;
|
||
out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 += c;
|
||
out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 += c;
|
||
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 += c;
|
||
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
|
||
out[0] += 19 * c;
|
||
|
||
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
|
||
}
|
||
#else
|
||
/* ESP32: fe_mul with 128-bit emulation */
|
||
static void fe_mul(fe out, const fe a, const fe b)
|
||
{
|
||
u128 t0, t1, t2, t3, t4;
|
||
uint64_t c;
|
||
uint64_t b1_19 = 19 * b[1], b2_19 = 19 * b[2], b3_19 = 19 * b[3], b4_19 = 19 * b[4];
|
||
|
||
t0 = u128_mul(a[0], b[0]);
|
||
t0 = u128_add(t0, u128_mul(a[1], b4_19));
|
||
t0 = u128_add(t0, u128_mul(a[2], b3_19));
|
||
t0 = u128_add(t0, u128_mul(a[3], b2_19));
|
||
t0 = u128_add(t0, u128_mul(a[4], b1_19));
|
||
|
||
t1 = u128_mul(a[0], b[1]);
|
||
t1 = u128_add(t1, u128_mul(a[1], b[0]));
|
||
t1 = u128_add(t1, u128_mul(a[2], b4_19));
|
||
t1 = u128_add(t1, u128_mul(a[3], b3_19));
|
||
t1 = u128_add(t1, u128_mul(a[4], b2_19));
|
||
|
||
t2 = u128_mul(a[0], b[2]);
|
||
t2 = u128_add(t2, u128_mul(a[1], b[1]));
|
||
t2 = u128_add(t2, u128_mul(a[2], b[0]));
|
||
t2 = u128_add(t2, u128_mul(a[3], b4_19));
|
||
t2 = u128_add(t2, u128_mul(a[4], b3_19));
|
||
|
||
t3 = u128_mul(a[0], b[3]);
|
||
t3 = u128_add(t3, u128_mul(a[1], b[2]));
|
||
t3 = u128_add(t3, u128_mul(a[2], b[1]));
|
||
t3 = u128_add(t3, u128_mul(a[3], b[0]));
|
||
t3 = u128_add(t3, u128_mul(a[4], b4_19));
|
||
|
||
t4 = u128_mul(a[0], b[4]);
|
||
t4 = u128_add(t4, u128_mul(a[1], b[3]));
|
||
t4 = u128_add(t4, u128_mul(a[2], b[2]));
|
||
t4 = u128_add(t4, u128_mul(a[3], b[1]));
|
||
t4 = u128_add(t4, u128_mul(a[4], b[0]));
|
||
|
||
out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
|
||
out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
|
||
out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
|
||
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
|
||
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
|
||
out[0] += 19 * c;
|
||
|
||
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
|
||
}
|
||
#endif
|
||
|
||
/* fe_sq: out = a^2 mod p (optimized) */
|
||
#if !SE050_X25519_ESP32
|
||
static void fe_sq(fe out, const fe a)
|
||
{
|
||
unsigned __int128 t0, t1, t2, t3, t4;
|
||
uint64_t c;
|
||
uint64_t d1 = 2 * a[1], d2 = 2 * a[2], d3 = 2 * a[3];
|
||
uint64_t a4_19 = 19 * a[4], d1_19 = 19 * d1, d2_19 = 19 * d2, a3_19 = 19 * a[3];
|
||
|
||
t0 = (unsigned __int128)a[0] * a[0];
|
||
t0 += (unsigned __int128)d1_19 * a[4];
|
||
t0 += (unsigned __int128)d2_19 * a[3];
|
||
|
||
t1 = (unsigned __int128)a[0] * d1;
|
||
t1 += (unsigned __int128)d2_19 * a[4];
|
||
t1 += (unsigned __int128)a3_19 * a[3];
|
||
|
||
t2 = (unsigned __int128)a[0] * d2;
|
||
t2 += (unsigned __int128)a[1] * a[1];
|
||
t2 += (unsigned __int128)d3 * a4_19;
|
||
|
||
t3 = (unsigned __int128)a[0] * d3;
|
||
t3 += (unsigned __int128)d1 * a[2];
|
||
t3 += (unsigned __int128)a[4] * a4_19;
|
||
|
||
t4 = (unsigned __int128)a[0] * (2 * a[4]);
|
||
t4 += (unsigned __int128)d1 * a[3];
|
||
t4 += (unsigned __int128)a[2] * a[2];
|
||
|
||
out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 += c;
|
||
out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 += c;
|
||
out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 += c;
|
||
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 += c;
|
||
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
|
||
out[0] += 19 * c;
|
||
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
|
||
}
|
||
#else
|
||
static void fe_sq(fe out, const fe a)
|
||
{
|
||
u128 t0, t1, t2, t3, t4;
|
||
uint64_t c;
|
||
uint64_t d1 = 2 * a[1], d2 = 2 * a[2], d3 = 2 * a[3];
|
||
uint64_t a4_19 = 19 * a[4], d1_19 = 19 * d1, d2_19 = 19 * d2, a3_19 = 19 * a[3];
|
||
|
||
t0 = u128_mul(a[0], a[0]);
|
||
t0 = u128_add(t0, u128_mul(d1_19, a[4]));
|
||
t0 = u128_add(t0, u128_mul(d2_19, a[3]));
|
||
|
||
t1 = u128_mul(a[0], d1);
|
||
t1 = u128_add(t1, u128_mul(d2_19, a[4]));
|
||
t1 = u128_add(t1, u128_mul(a3_19, a[3]));
|
||
|
||
t2 = u128_mul(a[0], d2);
|
||
t2 = u128_add(t2, u128_mul(a[1], a[1]));
|
||
t2 = u128_add(t2, u128_mul(d3, a4_19));
|
||
|
||
t3 = u128_mul(a[0], d3);
|
||
t3 = u128_add(t3, u128_mul(d1, a[2]));
|
||
t3 = u128_add(t3, u128_mul(a[4], a4_19));
|
||
|
||
t4 = u128_mul(a[0], 2 * a[4]);
|
||
t4 = u128_add(t4, u128_mul(d1, a[3]));
|
||
t4 = u128_add(t4, u128_mul(a[2], a[2]));
|
||
|
||
out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
|
||
out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
|
||
out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
|
||
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
|
||
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
|
||
out[0] += 19 * c;
|
||
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
|
||
}
|
||
#endif
|
||
|
||
/* fe_mul_small: out = f * n (n < 2^22) */
|
||
#if !SE050_X25519_ESP32
|
||
static void fe_mul_small(fe out, const fe f, uint64_t n)
|
||
{
|
||
unsigned __int128 t0, t1, t2, t3, t4;
|
||
uint64_t c;
|
||
t0 = (unsigned __int128)f[0] * n;
|
||
t1 = (unsigned __int128)f[1] * n;
|
||
t2 = (unsigned __int128)f[2] * n;
|
||
t3 = (unsigned __int128)f[3] * n;
|
||
t4 = (unsigned __int128)f[4] * n;
|
||
out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 += c;
|
||
out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 += c;
|
||
out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 += c;
|
||
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 += c;
|
||
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
|
||
out[0] += 19 * c;
|
||
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
|
||
}
|
||
#else
|
||
static void fe_mul_small(fe out, const fe f, uint64_t n)
|
||
{
|
||
u128 t0, t1, t2, t3, t4;
|
||
uint64_t c;
|
||
t0 = u128_mul(f[0], n);
|
||
t1 = u128_mul(f[1], n);
|
||
t2 = u128_mul(f[2], n);
|
||
t3 = u128_mul(f[3], n);
|
||
t4 = u128_mul(f[4], n);
|
||
out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
|
||
out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
|
||
out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
|
||
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
|
||
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
|
||
out[0] += 19 * c;
|
||
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
|
||
}
|
||
#endif
|
||
|
||
/* --- Inversion --- */
|
||
|
||
/* fe_invert: out = a^(-1) = a^(p-2) using addition chain */
|
||
static void fe_invert(fe out, const fe a)
|
||
{
|
||
fe t0, t1, t2, t3;
|
||
int i;
|
||
|
||
fe_sq(t0, a); /* t0 = a^2 */
|
||
fe_sq(t1, t0); /* t1 = a^4 */
|
||
fe_sq(t1, t1); /* t1 = a^8 */
|
||
fe_mul(t1, t1, a); /* t1 = a^9 */
|
||
fe_mul(t0, t0, t1); /* t0 = a^11 */
|
||
fe_sq(t2, t0); /* t2 = a^22 */
|
||
fe_mul(t1, t1, t2); /* t1 = a^31 */
|
||
|
||
fe_sq(t2, t1);
|
||
for (i = 1; i < 5; i++) fe_sq(t2, t2);
|
||
fe_mul(t1, t2, t1); /* t1 = a^(2^10-1) */
|
||
|
||
fe_sq(t2, t1);
|
||
for (i = 1; i < 10; i++) fe_sq(t2, t2);
|
||
fe_mul(t2, t2, t1); /* t2 = a^(2^20-1) */
|
||
|
||
fe_sq(t3, t2);
|
||
for (i = 1; i < 20; i++) fe_sq(t3, t3);
|
||
fe_mul(t2, t3, t2); /* t2 = a^(2^40-1) */
|
||
|
||
fe_sq(t2, t2);
|
||
for (i = 1; i < 10; i++) fe_sq(t2, t2);
|
||
fe_mul(t1, t2, t1); /* t1 = a^(2^50-1) */
|
||
|
||
fe_sq(t2, t1);
|
||
for (i = 1; i < 50; i++) fe_sq(t2, t2);
|
||
fe_mul(t2, t2, t1); /* t2 = a^(2^100-1) */
|
||
|
||
fe_sq(t3, t2);
|
||
for (i = 1; i < 100; i++) fe_sq(t3, t3);
|
||
fe_mul(t2, t3, t2); /* t2 = a^(2^200-1) */
|
||
|
||
fe_sq(t2, t2);
|
||
for (i = 1; i < 50; i++) fe_sq(t2, t2);
|
||
fe_mul(t1, t2, t1); /* t1 = a^(2^250-1) */
|
||
|
||
fe_sq(t1, t1);
|
||
fe_sq(t1, t1);
|
||
fe_sq(t1, t1);
|
||
fe_sq(t1, t1);
|
||
fe_sq(t1, t1); /* t1 = a^(2^255-2^5) */
|
||
fe_mul(out, t1, t0); /* out = a^(2^255-21) = a^(p-2) */
|
||
}
|
||
|
||
/* --- Byte conversion --- */
|
||
|
||
/* fe_from_bytes: 32-byte little-endian → field element */
|
||
static void fe_from_bytes(fe out, const uint8_t in[32])
|
||
{
|
||
uint8_t buf[32];
|
||
memcpy(buf, in, 32);
|
||
buf[31] &= 0x7f; /* clear top bit per RFC 7748 §5 */
|
||
|
||
out[0] = ((uint64_t)buf[ 0])
|
||
| ((uint64_t)buf[ 1] << 8)
|
||
| ((uint64_t)buf[ 2] << 16)
|
||
| ((uint64_t)buf[ 3] << 24)
|
||
| ((uint64_t)buf[ 4] << 32)
|
||
| ((uint64_t)buf[ 5] << 40)
|
||
| ((uint64_t)(buf[6] & 0x07) << 48);
|
||
|
||
out[1] = ((uint64_t)buf[ 6] >> 3)
|
||
| ((uint64_t)buf[ 7] << 5)
|
||
| ((uint64_t)buf[ 8] << 13)
|
||
| ((uint64_t)buf[ 9] << 21)
|
||
| ((uint64_t)buf[10] << 29)
|
||
| ((uint64_t)buf[11] << 37)
|
||
| ((uint64_t)(buf[12] & 0x3f) << 45);
|
||
|
||
out[2] = ((uint64_t)buf[12] >> 6)
|
||
| ((uint64_t)buf[13] << 2)
|
||
| ((uint64_t)buf[14] << 10)
|
||
| ((uint64_t)buf[15] << 18)
|
||
| ((uint64_t)buf[16] << 26)
|
||
| ((uint64_t)buf[17] << 34)
|
||
| ((uint64_t)buf[18] << 42)
|
||
| ((uint64_t)(buf[19] & 0x01) << 50);
|
||
|
||
out[3] = ((uint64_t)buf[19] >> 1)
|
||
| ((uint64_t)buf[20] << 7)
|
||
| ((uint64_t)buf[21] << 15)
|
||
| ((uint64_t)buf[22] << 23)
|
||
| ((uint64_t)buf[23] << 31)
|
||
| ((uint64_t)buf[24] << 39)
|
||
| ((uint64_t)(buf[25] & 0x0f) << 47);
|
||
|
||
out[4] = ((uint64_t)buf[25] >> 4)
|
||
| ((uint64_t)buf[26] << 4)
|
||
| ((uint64_t)buf[27] << 12)
|
||
| ((uint64_t)buf[28] << 20)
|
||
| ((uint64_t)buf[29] << 28)
|
||
| ((uint64_t)buf[30] << 36)
|
||
| ((uint64_t)(buf[31] & 0x7f) << 44);
|
||
}
|
||
|
||
/* fe_to_bytes: field element → 32-byte little-endian */
|
||
static void fe_to_bytes(uint8_t out[32], const fe in)
|
||
{
|
||
fe f;
|
||
uint64_t c, t;
|
||
|
||
fe_copy(f, in);
|
||
fe_reduce(f);
|
||
fe_reduce(f);
|
||
|
||
/* Conditional subtract p = 2^255 - 19 */
|
||
t = f[0] + 19;
|
||
c = t >> 51; t &= MASK51; uint64_t g0 = t;
|
||
t = f[1] + c; c = t >> 51; t &= MASK51; uint64_t g1 = t;
|
||
t = f[2] + c; c = t >> 51; t &= MASK51; uint64_t g2 = t;
|
||
t = f[3] + c; c = t >> 51; t &= MASK51; uint64_t g3 = t;
|
||
t = f[4] + c; uint64_t g4 = t & MASK51;
|
||
uint64_t mask = -((t >> 51) & 1);
|
||
f[0] = (f[0] & ~mask) | (g0 & mask);
|
||
f[1] = (f[1] & ~mask) | (g1 & mask);
|
||
f[2] = (f[2] & ~mask) | (g2 & mask);
|
||
f[3] = (f[3] & ~mask) | (g3 & mask);
|
||
f[4] = (f[4] & ~mask) | (g4 & mask);
|
||
|
||
/* Unpack to bytes */
|
||
out[ 0] = (uint8_t)(f[0]);
|
||
out[ 1] = (uint8_t)(f[0] >> 8);
|
||
out[ 2] = (uint8_t)(f[0] >> 16);
|
||
out[ 3] = (uint8_t)(f[0] >> 24);
|
||
out[ 4] = (uint8_t)(f[0] >> 32);
|
||
out[ 5] = (uint8_t)(f[0] >> 40);
|
||
out[ 6] = (uint8_t)((f[0] >> 48) | (f[1] << 3));
|
||
out[ 7] = (uint8_t)(f[1] >> 5);
|
||
out[ 8] = (uint8_t)(f[1] >> 13);
|
||
out[ 9] = (uint8_t)(f[1] >> 21);
|
||
out[10] = (uint8_t)(f[1] >> 29);
|
||
out[11] = (uint8_t)(f[1] >> 37);
|
||
out[12] = (uint8_t)((f[1] >> 45) | (f[2] << 6));
|
||
out[13] = (uint8_t)(f[2] >> 2);
|
||
out[14] = (uint8_t)(f[2] >> 10);
|
||
out[15] = (uint8_t)(f[2] >> 18);
|
||
out[16] = (uint8_t)(f[2] >> 26);
|
||
out[17] = (uint8_t)(f[2] >> 34);
|
||
out[18] = (uint8_t)(f[2] >> 42);
|
||
out[19] = (uint8_t)((f[2] >> 50) | (f[3] << 1));
|
||
out[20] = (uint8_t)(f[3] >> 7);
|
||
out[21] = (uint8_t)(f[3] >> 15);
|
||
out[22] = (uint8_t)(f[3] >> 23);
|
||
out[23] = (uint8_t)(f[3] >> 31);
|
||
out[24] = (uint8_t)(f[3] >> 39);
|
||
out[25] = (uint8_t)((f[3] >> 47) | (f[4] << 4));
|
||
out[26] = (uint8_t)(f[4] >> 4);
|
||
out[27] = (uint8_t)(f[4] >> 12);
|
||
out[28] = (uint8_t)(f[4] >> 20);
|
||
out[29] = (uint8_t)(f[4] >> 28);
|
||
out[30] = (uint8_t)(f[4] >> 36);
|
||
out[31] = (uint8_t)(f[4] >> 44);
|
||
}
|
||
|
||
/* --- Montgomery ladder --- */
|
||
|
||
#define A24 121665ULL
|
||
|
||
/* fe_cswap: conditional swap */
|
||
static void fe_cswap(fe a, fe b, uint64_t swap)
|
||
{
|
||
uint64_t mask = -(swap & 1);
|
||
for (int i = 0; i < NLIMBS; i++) {
|
||
uint64_t t = mask & (a[i] ^ b[i]);
|
||
a[i] ^= t;
|
||
b[i] ^= t;
|
||
}
|
||
}
|
||
|
||
/* ladder_step: one Montgomery ladder step */
|
||
static void ladder_step(
|
||
fe X2, fe Z2, fe X3, fe Z3,
|
||
const fe X2_in, const fe Z2_in,
|
||
const fe X3_in, const fe Z3_in,
|
||
const fe x1)
|
||
{
|
||
fe A, AA, B, BB, E, C, D, DA, CB, tmp, a24_E;
|
||
|
||
fe_add(A, X2_in, Z2_in);
|
||
fe_sq (AA, A);
|
||
fe_sub(B, X2_in, Z2_in);
|
||
fe_sq (BB, B);
|
||
fe_sub(E, AA, BB);
|
||
fe_add(C, X3_in, Z3_in);
|
||
fe_sub(D, X3_in, Z3_in);
|
||
fe_mul(DA, D, A);
|
||
fe_mul(CB, C, B);
|
||
|
||
fe_add(tmp, DA, CB);
|
||
fe_sq (X3, tmp);
|
||
fe_sub(tmp, DA, CB);
|
||
fe_sq (tmp, tmp);
|
||
fe_mul(Z3, tmp, x1);
|
||
fe_mul(X2, AA, BB);
|
||
|
||
fe_mul_small(a24_E, E, A24);
|
||
fe_add(tmp, AA, a24_E);
|
||
fe_mul(Z2, E, tmp);
|
||
}
|
||
|
||
/* --- Public API --- */
|
||
|
||
const uint8_t X25519_BASE_POINT[32] = { 9 };
|
||
|
||
int x25519_sw(uint8_t out[32], const uint8_t scalar[32], const uint8_t point[32])
|
||
{
|
||
uint8_t e[32];
|
||
fe x1, X2, Z2, X3, Z3;
|
||
uint64_t prev_bit, swap;
|
||
int i;
|
||
|
||
/* Step 1: clamp scalar */
|
||
memcpy(e, scalar, 32);
|
||
e[ 0] &= 248;
|
||
e[31] &= 127;
|
||
e[31] |= 64;
|
||
|
||
/* Step 2: decode u-coordinate */
|
||
if (point == NULL)
|
||
fe_from_bytes(x1, X25519_BASE_POINT);
|
||
else
|
||
fe_from_bytes(x1, point);
|
||
|
||
/* Step 3: initialise projective points */
|
||
fe_one (X2); fe_zero(Z2);
|
||
fe_copy(X3, x1); fe_one(Z3);
|
||
|
||
/* Step 4: Montgomery ladder */
|
||
prev_bit = 0;
|
||
for (i = 254; i >= 0; i--) {
|
||
uint64_t bit = (e[i / 8] >> (i % 8)) & 1;
|
||
swap = bit ^ prev_bit;
|
||
prev_bit = bit;
|
||
|
||
fe_cswap(X2, X3, swap);
|
||
fe_cswap(Z2, Z3, swap);
|
||
|
||
fe nX2, nZ2, nX3, nZ3;
|
||
ladder_step(nX2, nZ2, nX3, nZ3, X2, Z2, X3, Z3, x1);
|
||
fe_copy(X2, nX2); fe_copy(Z2, nZ2);
|
||
fe_copy(X3, nX3); fe_copy(Z3, nZ3);
|
||
}
|
||
fe_cswap(X2, X3, prev_bit);
|
||
fe_cswap(Z2, Z3, prev_bit);
|
||
|
||
/* Step 5: convert from projective to affine */
|
||
fe Z2_inv;
|
||
fe_invert(Z2_inv, Z2);
|
||
fe_mul(X2, X2, Z2_inv);
|
||
|
||
/* Step 6: encode result */
|
||
fe_to_bytes(out, X2);
|
||
|
||
/* Step 7: reject all-zero output */
|
||
uint8_t acc = 0;
|
||
for (i = 0; i < 32; i++) acc |= out[i];
|
||
if (acc == 0) return -1;
|
||
|
||
return 0;
|
||
}
|
||
|
||
/* se050_x25519_sw_clamp - clamp a 32-byte X25519 scalar in place per
 * RFC 7748 section 5: clear the 3 low bits, clear bit 255, set bit 254. */
void se050_x25519_sw_clamp(uint8_t *scalar)
{
    scalar[0] &= 248;
    scalar[31] &= 127;
    scalar[31] |= 64;
}
/* se050_x25519_sw_zeroize - wipe key material; delegates to
 * memzero_explicit() (se050_crypto_utils.h) so the clear cannot be
 * optimized away by the compiler. */
void se050_x25519_sw_zeroize(uint8_t *key, size_t len)
{
    memzero_explicit(key, len);
}
int se050_x25519_sw_generate_keypair(se050_x25519_sw_keypair_t *keypair,
|
||
x25519_rng_func rng_func,
|
||
void *rng_ctx)
|
||
{
|
||
if (!keypair || !rng_func) return -1;
|
||
if (rng_func(keypair->private_key, 32, rng_ctx) != 0) return -1;
|
||
se050_x25519_sw_clamp(keypair->private_key);
|
||
x25519_sw(keypair->public_key, keypair->private_key, (const uint8_t*)"basepoint");
|
||
return 0;
|
||
}
|
||
|
||
/**
 * se050_x25519_sw_compute_shared_secret - X25519 ECDH.
 * @shared_secret: 32-byte output
 * @private_key:   our 32-byte private key (clamped on a local copy; the
 *                 caller's buffer is not modified)
 * @peer_public:   peer's 32-byte public u-coordinate
 *
 * Returns 0 on success, -1 on bad arguments or if the shared secret is
 * all zeros (low-order peer point, rejected per RFC 7748 section 6.1).
 *
 * FIX: the return value of x25519_sw() was previously discarded, so a
 * low-order peer point silently yielded an all-zero "secret"; it is now
 * propagated to the caller.
 */
int se050_x25519_sw_compute_shared_secret(uint8_t *shared_secret,
                                          const uint8_t *private_key,
                                          const uint8_t *peer_public)
{
    if (!shared_secret || !private_key || !peer_public) return -1;
    uint8_t clamped[32];
    memcpy(clamped, private_key, 32);
    se050_x25519_sw_clamp(clamped);
    int ret = x25519_sw(shared_secret, clamped, peer_public);
    se050_x25519_sw_zeroize(clamped, 32);   /* wipe local key copy */
    return ret;
}
/**
 * se050_x25519_sw_derive_public_key - derive the public key from a private
 * key: public = clamp(private) * base point.
 * @public_key:  32-byte output
 * @private_key: 32-byte private key (clamped on a local copy)
 *
 * Returns 0 on success, -1 on bad arguments or scalar multiplication
 * failure.
 *
 * BUG FIX: previously the 10-byte string literal "basepoint" was passed as
 * the 32-byte point argument (out-of-bounds read, wrong curve point).
 * x25519_sw() treats a NULL point as the canonical base point.
 */
int se050_x25519_sw_derive_public_key(uint8_t *public_key,
                                      const uint8_t *private_key)
{
    if (!public_key || !private_key) return -1;
    uint8_t clamped[32];
    memcpy(clamped, private_key, 32);
    se050_x25519_sw_clamp(clamped);
    int ret = x25519_sw(public_key, clamped, NULL);
    se050_x25519_sw_zeroize(clamped, 32);   /* wipe local key copy */
    return ret;
}
#ifdef X25519_SW_TEST
/* Standalone self-test: build with -DX25519_SW_TEST to get a main() that
 * checks RFC 7748 section 6.1 test vector 1. */
#include <stdio.h>

/* RFC 7748 section 6.1, Test Vector 1 */
static const uint8_t RFC7748_SK_1[32] = {
    0xa5,0x46,0xe3,0x6b,0xf0,0x52,0x7c,0x9d,0x3b,0x16,0x15,0x4b,
    0x82,0x46,0x5e,0xdd,0x62,0x14,0x4c,0x0a,0xc1,0xfc,0x5a,0x18,
    0x50,0x6a,0x22,0x44,0xba,0x44,0x9a,0xc4 };
static const uint8_t RFC7748_PK_1[32] = {
    0xe6,0xdb,0x68,0x67,0x58,0x30,0x30,0xdb,0x35,0x94,0xc1,0xa4,
    0x24,0xb1,0x5f,0x7c,0x72,0x66,0x24,0xec,0x26,0xb3,0x35,0x3b,
    0x10,0xa9,0x03,0xa6,0xd0,0xab,0x1c,0x4c };
static const uint8_t RFC7748_SS_1[32] = {
    0xc3,0xda,0x55,0x37,0x9d,0xe9,0xc6,0x90,0x8e,0x94,0xea,0x4d,
    0xf2,0x8d,0x08,0x4f,0x32,0xec,0xcf,0x03,0x49,0x1c,0x71,0xf7,
    0x54,0xb4,0x07,0x55,0x77,0xa2,0x85,0x52 };

/* print_hex: dump a labelled buffer as lowercase hex. */
static void print_hex(const char *label, const uint8_t *buf, size_t len)
{
    printf("%s: ", label);
    for (size_t i = 0; i < len; i++) printf("%02x", buf[i]);
    printf("\n");
}

int main(void)
{
    uint8_t shared_secret[32];
    printf("X25519 Software Implementation Test\n");
    printf("====================================\n\n");
    printf("RFC 7748 Test Vector 1:\n");
    print_hex("Scalar", RFC7748_SK_1, 32);
    print_hex("Point", RFC7748_PK_1, 32);
    x25519_sw(shared_secret, RFC7748_SK_1, RFC7748_PK_1);
    print_hex("Computed SS", shared_secret, 32);
    print_hex("Expected SS", RFC7748_SS_1, 32);
    if (memcmp(shared_secret, RFC7748_SS_1, 32) == 0) {
        printf("[PASS] RFC 7748 Test Vector 1\n");
        return 0;
    } else {
        printf("[FAIL] RFC 7748 Test Vector 1\n");
        return 1;
    }
}
#endif