/**
 * @file se050_x25519_sw.c
 * @brief Software X25519 ECDH implementation (clean-room RFC 7748).
 *
 * Based on the RFC 7748 reference algorithm with 5x51-bit limbs.
 * License: MIT (clean-room implementation)
 */

#include "se050_x25519_sw.h"
#include "se050_crypto_utils.h"
#include <string.h>  /* memcpy, memcmp */

/* =========================================================================
 * Platform detection
 * ========================================================================= */
#if defined(ESP_PLATFORM) || defined(__XTENSA__)
#define SE050_X25519_ESP32 1
#else
#define SE050_X25519_ESP32 0
#endif

/* =========================================================================
 * Field GF(2^255-19)
 *
 * Field elements are arrays of 5 uint64_t limbs in radix 2^51.
 * Each limb holds at most 51 bits in "loose" form.
 *
 *   value = limb[0] + limb[1]*2^51 + limb[2]*2^102
 *         + limb[3]*2^153 + limb[4]*2^204
 *
 * p = 2^255 - 19, so 2^255 == 19 (mod p); that identity is what folds the
 * high partial products back into the low limbs (the "* 19" terms below).
 * ========================================================================= */
#define NLIMBS 5
typedef uint64_t fe[NLIMBS]; /* field element */

#define L51    ((uint64_t)1 << 51)
#define MASK51 (L51 - 1)

/* --- 128-bit helpers --- */
#if !SE050_X25519_ESP32

static inline uint64_t u128_lo(unsigned __int128 x) { return (uint64_t)x; }
static inline uint64_t u128_hi(unsigned __int128 x) { return (uint64_t)(x >> 64); }

#else /* ESP32: 128-bit emulation using 64-bit arithmetic */

typedef struct { uint64_t lo, hi; } u128;

/* Full 64x64 -> 128-bit multiply via four 32x32 partial products. */
static inline u128 u128_mul(uint64_t a, uint64_t b)
{
    u128 r;
    uint64_t a_lo = a & 0xFFFFFFFFULL, a_hi = a >> 32;
    uint64_t b_lo = b & 0xFFFFFFFFULL, b_hi = b >> 32;
    uint64_t p0 = a_lo * b_lo;
    uint64_t p1 = a_lo * b_hi;
    uint64_t p2 = a_hi * b_lo;
    uint64_t p3 = a_hi * b_hi;
    uint64_t mid = p1 + p2;
    /* BUGFIX: p1 + p2 itself can wrap 64 bits; the lost bit is bit 96 of
     * the product and must be added back into hi as 2^32. (Not reachable
     * with this file's <= 2^57 limb operands, but required for a correct
     * general-purpose 64x64->128 multiply.) */
    uint64_t mid_carry = (mid < p1) ? ((uint64_t)1 << 32) : 0;
    r.lo = p0 + (mid << 32);
    r.hi = p3 + (mid >> 32) + mid_carry + (r.lo < p0);
    return r;
}

static inline uint64_t u128_lo(u128 x) { return x.lo; }
static inline uint64_t u128_hi(u128 x) { return x.hi; }

static inline u128 u128_add(u128 a, u128 b)
{
    u128 r;
    r.lo = a.lo + b.lo;
    r.hi = a.hi + b.hi + (r.lo < a.lo); /* carry out of low word */
    return r;
}

#endif

/* --- Basic operations --- */

static void fe_zero(fe f) { f[0] = f[1] = f[2] = f[3] = f[4] = 0; }
static void fe_one(fe f)  { f[0] = 1; f[1] = f[2] = f[3] = f[4] = 0; }

static void fe_copy(fe out, const fe in)
{
    out[0] = in[0]; out[1] = in[1]; out[2] = in[2];
    out[3] = in[3]; out[4] = in[4];
}

/* fe_add: out = a + b (loose result, limbs <= 2*2^51; no carry here) */
static void fe_add(fe out, const fe a, const fe b)
{
    out[0] = a[0] + b[0];
    out[1] = a[1] + b[1];
    out[2] = a[2] + b[2];
    out[3] = a[3] + b[3];
    out[4] = a[4] + b[4];
}

/* fe_sub: out = a - b (loose). Adds 2p limb-wise as a bias so the
 * unsigned subtraction cannot underflow: 2p = 2*(2^51-19) + sum of
 * 2*(2^51-1)*2^(51*i) for i = 1..4. */
static void fe_sub(fe out, const fe a, const fe b)
{
    out[0] = a[0] + 2*(L51 - 19) - b[0];
    out[1] = a[1] + 2*(L51 - 1)  - b[1];
    out[2] = a[2] + 2*(L51 - 1)  - b[2];
    out[3] = a[3] + 2*(L51 - 1)  - b[3];
    out[4] = a[4] + 2*(L51 - 1)  - b[4];
}

/* fe_reduce: propagate carries, keep limbs < 2^51 (top carry wraps as *19) */
static void fe_reduce(fe f)
{
    uint64_t c;
    c = f[0] >> 51; f[0] &= MASK51; f[1] += c;
    c = f[1] >> 51; f[1] &= MASK51; f[2] += c;
    c = f[2] >> 51; f[2] &= MASK51; f[3] += c;
    c = f[3] >> 51; f[3] &= MASK51; f[4] += c;
    c = f[4] >> 51; f[4] &= MASK51; f[0] += 19 * c;
    c = f[0] >> 51; f[0] &= MASK51; f[1] += c;
}

/* --- Multiplication --- */
#if !SE050_X25519_ESP32

/* fe_mul: out = a * b mod p (128-bit accumulators).
 * Schoolbook product; partial products that land at weight >= 2^255 are
 * pre-multiplied by 19 (the b*_19 values) and folded into the low limbs. */
static void fe_mul(fe out, const fe a, const fe b)
{
    unsigned __int128 t0, t1, t2, t3, t4;
    uint64_t c;
    uint64_t b1_19 = 19 * b[1], b2_19 = 19 * b[2],
             b3_19 = 19 * b[3], b4_19 = 19 * b[4];

    t0  = (unsigned __int128)a[0] * b[0];
    t0 += (unsigned __int128)a[1] * b4_19;
    t0 += (unsigned __int128)a[2] * b3_19;
    t0 += (unsigned __int128)a[3] * b2_19;
    t0 += (unsigned __int128)a[4] * b1_19;

    t1  = (unsigned __int128)a[0] * b[1];
    t1 += (unsigned __int128)a[1] * b[0];
    t1 += (unsigned __int128)a[2] * b4_19;
    t1 += (unsigned __int128)a[3] * b3_19;
    t1 += (unsigned __int128)a[4] * b2_19;

    t2  = (unsigned __int128)a[0] * b[2];
    t2 += (unsigned __int128)a[1] * b[1];
    t2 += (unsigned __int128)a[2] * b[0];
    t2 += (unsigned __int128)a[3] * b4_19;
    t2 += (unsigned __int128)a[4] * b3_19;

    t3  = (unsigned __int128)a[0] * b[3];
    t3 += (unsigned __int128)a[1] * b[2];
    t3 += (unsigned __int128)a[2] * b[1];
    t3 += (unsigned __int128)a[3] * b[0];
    t3 += (unsigned __int128)a[4] * b4_19;

    t4  = (unsigned __int128)a[0] * b[4];
    t4 += (unsigned __int128)a[1] * b[3];
    t4 += (unsigned __int128)a[2] * b[2];
    t4 += (unsigned __int128)a[3] * b[1];
    t4 += (unsigned __int128)a[4] * b[0];

    /* Carry chain: each carry is (t >> 51), assembled from the 128-bit halves. */
    out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 += c;
    out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 += c;
    out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 += c;
    out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 += c;
    out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
    out[0] += 19 * c;
    c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}

#else /* ESP32: fe_mul with 128-bit emulation (same schedule as above) */

static void fe_mul(fe out, const fe a, const fe b)
{
    u128 t0, t1, t2, t3, t4;
    uint64_t c;
    uint64_t b1_19 = 19 * b[1], b2_19 = 19 * b[2],
             b3_19 = 19 * b[3], b4_19 = 19 * b[4];

    t0 = u128_mul(a[0], b[0]);
    t0 = u128_add(t0, u128_mul(a[1], b4_19));
    t0 = u128_add(t0, u128_mul(a[2], b3_19));
    t0 = u128_add(t0, u128_mul(a[3], b2_19));
    t0 = u128_add(t0, u128_mul(a[4], b1_19));

    t1 = u128_mul(a[0], b[1]);
    t1 = u128_add(t1, u128_mul(a[1], b[0]));
    t1 = u128_add(t1, u128_mul(a[2], b4_19));
    t1 = u128_add(t1, u128_mul(a[3], b3_19));
    t1 = u128_add(t1, u128_mul(a[4], b2_19));

    t2 = u128_mul(a[0], b[2]);
    t2 = u128_add(t2, u128_mul(a[1], b[1]));
    t2 = u128_add(t2, u128_mul(a[2], b[0]));
    t2 = u128_add(t2, u128_mul(a[3], b4_19));
    t2 = u128_add(t2, u128_mul(a[4], b3_19));

    t3 = u128_mul(a[0], b[3]);
    t3 = u128_add(t3, u128_mul(a[1], b[2]));
    t3 = u128_add(t3, u128_mul(a[2], b[1]));
    t3 = u128_add(t3, u128_mul(a[3], b[0]));
    t3 = u128_add(t3, u128_mul(a[4], b4_19));

    t4 = u128_mul(a[0], b[4]);
    t4 = u128_add(t4, u128_mul(a[1], b[3]));
    t4 = u128_add(t4, u128_mul(a[2], b[2]));
    t4 = u128_add(t4, u128_mul(a[3], b[1]));
    t4 = u128_add(t4, u128_mul(a[4], b[0]));

    out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
    out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
    out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
    out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
    out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
    out[0] += 19 * c;
    c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}

#endif

/* fe_sq: out = a^2 mod p (optimized: symmetric partial products doubled) */
#if !SE050_X25519_ESP32

static void fe_sq(fe out, const fe a)
{
    unsigned __int128 t0, t1, t2, t3, t4;
    uint64_t c;
    uint64_t d1 = 2 * a[1], d2 = 2 * a[2], d3 = 2 * a[3];
    uint64_t a4_19 = 19 * a[4], d1_19 = 19 * d1,
             d2_19 = 19 * d2,   a3_19 = 19 * a[3];

    t0  = (unsigned __int128)a[0] * a[0];
    t0 += (unsigned __int128)d1_19 * a[4];
    t0 += (unsigned __int128)d2_19 * a[3];

    t1  = (unsigned __int128)a[0] * d1;
    t1 += (unsigned __int128)d2_19 * a[4];
    t1 += (unsigned __int128)a3_19 * a[3];

    t2  = (unsigned __int128)a[0] * d2;
    t2 += (unsigned __int128)a[1] * a[1];
    t2 += (unsigned __int128)d3 * a4_19;

    t3  = (unsigned __int128)a[0] * d3;
    t3 += (unsigned __int128)d1 * a[2];
    t3 += (unsigned __int128)a[4] * a4_19;

    t4  = (unsigned __int128)a[0] * (2 * a[4]);
    t4 += (unsigned __int128)d1 * a[3];
    t4 += (unsigned __int128)a[2] * a[2];

    out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 += c;
    out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 += c;
    out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 += c;
    out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 += c;
    out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
    out[0] += 19 * c;
    c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}

#else

static void fe_sq(fe out, const fe a)
{
    u128 t0, t1, t2, t3, t4;
    uint64_t c;
    uint64_t d1 = 2 * a[1], d2 = 2 * a[2], d3 = 2 * a[3];
    uint64_t a4_19 = 19 * a[4], d1_19 = 19 * d1,
             d2_19 = 19 * d2,   a3_19 = 19 * a[3];

    t0 = u128_mul(a[0], a[0]);
    t0 = u128_add(t0, u128_mul(d1_19, a[4]));
    t0 = u128_add(t0, u128_mul(d2_19, a[3]));

    t1 = u128_mul(a[0], d1);
    t1 = u128_add(t1, u128_mul(d2_19, a[4]));
    t1 = u128_add(t1, u128_mul(a3_19, a[3]));

    t2 = u128_mul(a[0], d2);
    t2 = u128_add(t2, u128_mul(a[1], a[1]));
    t2 = u128_add(t2, u128_mul(d3, a4_19));

    t3 = u128_mul(a[0], d3);
    t3 = u128_add(t3, u128_mul(d1, a[2]));
    t3 = u128_add(t3, u128_mul(a[4], a4_19));

    t4 = u128_mul(a[0], 2 * a[4]);
    t4 = u128_add(t4, u128_mul(d1, a[3]));
    t4 = u128_add(t4, u128_mul(a[2], a[2]));

    out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
    out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
    out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
    out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
    out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
    out[0] += 19 * c;
    c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}

#endif

/* fe_mul_small: out = f * n (n < 2^22, e.g. the a24 ladder constant) */
#if !SE050_X25519_ESP32

static void fe_mul_small(fe out, const fe f, uint64_t n)
{
    unsigned __int128 t0, t1, t2, t3, t4;
    uint64_t c;

    t0 = (unsigned __int128)f[0] * n;
    t1 = (unsigned __int128)f[1] * n;
    t2 = (unsigned __int128)f[2] * n;
    t3 = (unsigned __int128)f[3] * n;
    t4 = (unsigned __int128)f[4] * n;

    out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 += c;
    out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 += c;
    out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 += c;
    out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 += c;
    out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
    out[0] += 19 * c;
    c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}

#else

static void fe_mul_small(fe out, const fe f, uint64_t n)
{
    u128 t0, t1, t2, t3, t4;
    uint64_t c;

    t0 = u128_mul(f[0], n);
    t1 = u128_mul(f[1], n);
    t2 = u128_mul(f[2], n);
    t3 = u128_mul(f[3], n);
    t4 = u128_mul(f[4], n);

    out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
    out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
    out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
    out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
    out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
    out[0] += 19 * c;
    c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}

#endif

/* --- Inversion --- */

/* fe_invert: out = a^(-1) = a^(p-2) mod p, via a fixed addition chain
 * (constant sequence of squarings/multiplications -> constant time). */
static void fe_invert(fe out, const fe a)
{
    fe t0, t1, t2, t3;
    int i;

    fe_sq(t0, a);            /* t0 = a^2 */
    fe_sq(t1, t0);           /* t1 = a^4 */
    fe_sq(t1, t1);           /* t1 = a^8 */
    fe_mul(t1, t1, a);       /* t1 = a^9 */
    fe_mul(t0, t0, t1);      /* t0 = a^11 */
    fe_sq(t2, t0);           /* t2 = a^22 */
    fe_mul(t1, t1, t2);      /* t1 = a^31 = a^(2^5-1) */
    fe_sq(t2, t1);
    for (i = 1; i < 5; i++) fe_sq(t2, t2);
    fe_mul(t1, t2, t1);      /* t1 = a^(2^10-1) */
    fe_sq(t2, t1);
    for (i = 1; i < 10; i++) fe_sq(t2, t2);
    fe_mul(t2, t2, t1);      /* t2 = a^(2^20-1) */
    fe_sq(t3, t2);
    for (i = 1; i < 20; i++) fe_sq(t3, t3);
    fe_mul(t2, t3, t2);      /* t2 = a^(2^40-1) */
    fe_sq(t2, t2);
    for (i = 1; i < 10; i++) fe_sq(t2, t2);
    fe_mul(t1, t2, t1);      /* t1 = a^(2^50-1) */
    fe_sq(t2, t1);
    for (i = 1; i < 50; i++) fe_sq(t2, t2);
    fe_mul(t2, t2, t1);      /* t2 = a^(2^100-1) */
    fe_sq(t3, t2);
    for (i = 1; i < 100; i++) fe_sq(t3, t3);
    fe_mul(t2, t3, t2);      /* t2 = a^(2^200-1) */
    fe_sq(t2, t2);
    for (i = 1; i < 50; i++) fe_sq(t2, t2);
    fe_mul(t1, t2, t1);      /* t1 = a^(2^250-1) */
    fe_sq(t1, t1);
    fe_sq(t1, t1);
    fe_sq(t1, t1);
    fe_sq(t1, t1);
    fe_sq(t1, t1);           /* t1 = a^(2^255-2^5) */
    fe_mul(out, t1, t0);     /* out = a^(2^255-21) = a^(p-2) */
}

/* --- Byte conversion --- */

/* fe_from_bytes: 32-byte little-endian -> field element */
static void fe_from_bytes(fe out, const uint8_t in[32])
{
    uint8_t buf[32];
    memcpy(buf, in, 32);
    buf[31] &= 0x7f; /* clear top bit per RFC 7748 section 5 */

    out[0] = ((uint64_t)buf[ 0])       | ((uint64_t)buf[ 1] <<  8) |
             ((uint64_t)buf[ 2] << 16) | ((uint64_t)buf[ 3] << 24) |
             ((uint64_t)buf[ 4] << 32) | ((uint64_t)buf[ 5] << 40) |
             ((uint64_t)(buf[ 6] & 0x07) << 48);
    out[1] = ((uint64_t)buf[ 6] >>  3) | ((uint64_t)buf[ 7] <<  5) |
             ((uint64_t)buf[ 8] << 13) | ((uint64_t)buf[ 9] << 21) |
             ((uint64_t)buf[10] << 29) | ((uint64_t)buf[11] << 37) |
             ((uint64_t)(buf[12] & 0x3f) << 45);
    out[2] = ((uint64_t)buf[12] >>  6) | ((uint64_t)buf[13] <<  2) |
             ((uint64_t)buf[14] << 10) | ((uint64_t)buf[15] << 18) |
             ((uint64_t)buf[16] << 26) | ((uint64_t)buf[17] << 34) |
             ((uint64_t)buf[18] << 42) | ((uint64_t)(buf[19] & 0x01) << 50);
    out[3] = ((uint64_t)buf[19] >>  1) | ((uint64_t)buf[20] <<  7) |
             ((uint64_t)buf[21] << 15) | ((uint64_t)buf[22] << 23) |
             ((uint64_t)buf[23] << 31) | ((uint64_t)buf[24] << 39) |
             ((uint64_t)(buf[25] & 0x0f) << 47);
    out[4] = ((uint64_t)buf[25] >>  4) | ((uint64_t)buf[26] <<  4) |
             ((uint64_t)buf[27] << 12) | ((uint64_t)buf[28] << 20) |
             ((uint64_t)buf[29] << 28) | ((uint64_t)buf[30] << 36) |
             ((uint64_t)(buf[31] & 0x7f) << 44);
}

/* fe_to_bytes: field element -> 32-byte little-endian, fully reduced mod p */
static void fe_to_bytes(uint8_t out[32], const fe in)
{
    fe f;
    uint64_t c, t;

    fe_copy(f, in);
    fe_reduce(f);
    fe_reduce(f); /* two passes guarantee every limb < 2^51 */

    /* Constant-time conditional subtract of p = 2^255 - 19:
     * compute g = f + 19; if that carries out of bit 255, f >= p and the
     * reduced value is g with the carry dropped. Select via bitmask. */
    t = f[0] + 19; c = t >> 51; t &= MASK51; uint64_t g0 = t;
    t = f[1] +  c; c = t >> 51; t &= MASK51; uint64_t g1 = t;
    t = f[2] +  c; c = t >> 51; t &= MASK51; uint64_t g2 = t;
    t = f[3] +  c; c = t >> 51; t &= MASK51; uint64_t g3 = t;
    t = f[4] +  c;              uint64_t g4 = t & MASK51;
    uint64_t mask = -((t >> 51) & 1);
    f[0] = (f[0] & ~mask) | (g0 & mask);
    f[1] = (f[1] & ~mask) | (g1 & mask);
    f[2] = (f[2] & ~mask) | (g2 & mask);
    f[3] = (f[3] & ~mask) | (g3 & mask);
    f[4] = (f[4] & ~mask) | (g4 & mask);

    /* Unpack 5x51 bits to bytes */
    out[ 0] = (uint8_t)(f[0]);
    out[ 1] = (uint8_t)(f[0] >>  8);
    out[ 2] = (uint8_t)(f[0] >> 16);
    out[ 3] = (uint8_t)(f[0] >> 24);
    out[ 4] = (uint8_t)(f[0] >> 32);
    out[ 5] = (uint8_t)(f[0] >> 40);
    out[ 6] = (uint8_t)((f[0] >> 48) | (f[1] << 3));
    out[ 7] = (uint8_t)(f[1] >>  5);
    out[ 8] = (uint8_t)(f[1] >> 13);
    out[ 9] = (uint8_t)(f[1] >> 21);
    out[10] = (uint8_t)(f[1] >> 29);
    out[11] = (uint8_t)(f[1] >> 37);
    out[12] = (uint8_t)((f[1] >> 45) | (f[2] << 6));
    out[13] = (uint8_t)(f[2] >>  2);
    out[14] = (uint8_t)(f[2] >> 10);
    out[15] = (uint8_t)(f[2] >> 18);
    out[16] = (uint8_t)(f[2] >> 26);
    out[17] = (uint8_t)(f[2] >> 34);
    out[18] = (uint8_t)(f[2] >> 42);
    out[19] = (uint8_t)((f[2] >> 50) | (f[3] << 1));
    out[20] = (uint8_t)(f[3] >>  7);
    out[21] = (uint8_t)(f[3] >> 15);
    out[22] = (uint8_t)(f[3] >> 23);
    out[23] = (uint8_t)(f[3] >> 31);
    out[24] = (uint8_t)(f[3] >> 39);
    out[25] = (uint8_t)((f[3] >> 47) | (f[4] << 4));
    out[26] = (uint8_t)(f[4] >>  4);
    out[27] = (uint8_t)(f[4] >> 12);
    out[28] = (uint8_t)(f[4] >> 20);
    out[29] = (uint8_t)(f[4] >> 28);
    out[30] = (uint8_t)(f[4] >> 36);
    out[31] = (uint8_t)(f[4] >> 44);
}

/* --- Montgomery ladder --- */

#define A24 121665ULL /* (486662 - 2) / 4 for Curve25519 */

/* fe_cswap: constant-time conditional swap (swap iff low bit of 'swap' set) */
static void fe_cswap(fe a, fe b, uint64_t swap)
{
    uint64_t mask = -(swap & 1);
    for (int i = 0; i < NLIMBS; i++) {
        uint64_t t = mask & (a[i] ^ b[i]);
        a[i] ^= t;
        b[i] ^= t;
    }
}

/* ladder_step: one combined Montgomery differential double-and-add step
 * (RFC 7748 section 5 ladder body). Outputs must not alias the inputs. */
static void ladder_step(
    fe X2, fe Z2, fe X3, fe Z3,
    const fe X2_in, const fe Z2_in,
    const fe X3_in, const fe Z3_in,
    const fe x1)
{
    fe A, AA, B, BB, E, C, D, DA, CB, tmp, a24_E;

    fe_add(A, X2_in, Z2_in);
    fe_sq (AA, A);
    fe_sub(B, X2_in, Z2_in);
    fe_sq (BB, B);
    fe_sub(E, AA, BB);
    fe_add(C, X3_in, Z3_in);
    fe_sub(D, X3_in, Z3_in);
    fe_mul(DA, D, A);
    fe_mul(CB, C, B);
    fe_add(tmp, DA, CB);
    fe_sq (X3, tmp);
    fe_sub(tmp, DA, CB);
    fe_sq (tmp, tmp);
    fe_mul(Z3, tmp, x1);
    fe_mul(X2, AA, BB);
    fe_mul_small(a24_E, E, A24);
    fe_add(tmp, AA, a24_E);
    fe_mul(Z2, E, tmp);
}

/* --- Public API --- */

const uint8_t X25519_BASE_POINT[32] = { 9 }; /* u = 9, little-endian */

/**
 * x25519_sw - X25519 scalar multiplication (RFC 7748 section 5).
 *
 * @out:    32-byte output u-coordinate
 * @scalar: 32-byte scalar (clamped internally; caller's copy untouched)
 * @point:  32-byte input u-coordinate, or NULL for the base point (u = 9)
 *
 * Returns 0 on success, -1 if the result is all zero (low-order input
 * point; rejected per RFC 7748 section 6.1 guidance).
 */
int x25519_sw(uint8_t out[32], const uint8_t scalar[32], const uint8_t point[32])
{
    uint8_t e[32];
    fe x1, X2, Z2, X3, Z3;
    uint64_t prev_bit, swap;
    int i;

    /* Step 1: clamp scalar */
    memcpy(e, scalar, 32);
    e[ 0] &= 248;
    e[31] &= 127;
    e[31] |= 64;

    /* Step 2: decode u-coordinate */
    if (point == NULL)
        fe_from_bytes(x1, X25519_BASE_POINT);
    else
        fe_from_bytes(x1, point);

    /* Step 3: initialise projective points (X2:Z2) = infinity, (X3:Z3) = P */
    fe_one (X2);
    fe_zero(Z2);
    fe_copy(X3, x1);
    fe_one (Z3);

    /* Step 4: Montgomery ladder. Bit 254 is the highest possibly-set bit
     * after clamping; swaps are deferred/merged via prev_bit. */
    prev_bit = 0;
    for (i = 254; i >= 0; i--) {
        uint64_t bit = (e[i / 8] >> (i % 8)) & 1;
        swap = bit ^ prev_bit;
        prev_bit = bit;
        fe_cswap(X2, X3, swap);
        fe_cswap(Z2, Z3, swap);
        fe nX2, nZ2, nX3, nZ3;
        ladder_step(nX2, nZ2, nX3, nZ3, X2, Z2, X3, Z3, x1);
        fe_copy(X2, nX2);
        fe_copy(Z2, nZ2);
        fe_copy(X3, nX3);
        fe_copy(Z3, nZ3);
    }
    fe_cswap(X2, X3, prev_bit);
    fe_cswap(Z2, Z3, prev_bit);

    /* Step 5: convert from projective to affine: u = X2 / Z2 */
    fe Z2_inv;
    fe_invert(Z2_inv, Z2);
    fe_mul(X2, X2, Z2_inv);

    /* Step 6: encode result */
    fe_to_bytes(out, X2);

    /* Step 7: reject all-zero output (constant-time accumulate) */
    uint8_t acc = 0;
    for (i = 0; i < 32; i++)
        acc |= out[i];
    if (acc == 0)
        return -1;
    return 0;
}

/* Clamp a 32-byte X25519 private key in place (RFC 7748 section 5). */
void se050_x25519_sw_clamp(uint8_t *scalar)
{
    scalar[0]  &= 248;
    scalar[31] &= 127;
    scalar[31] |= 64;
}

/* Zeroize key material; memzero_explicit resists dead-store elimination. */
void se050_x25519_sw_zeroize(uint8_t *key, size_t len)
{
    memzero_explicit(key, len);
}

/**
 * se050_x25519_sw_generate_keypair - generate a fresh X25519 key pair.
 *
 * @keypair:  output private/public key pair
 * @rng_func: RNG callback filling a buffer; returns 0 on success
 * @rng_ctx:  opaque context passed to @rng_func
 *
 * Returns 0 on success, -1 on bad arguments or RNG failure.
 */
int se050_x25519_sw_generate_keypair(se050_x25519_sw_keypair_t *keypair,
                                     x25519_rng_func rng_func, void *rng_ctx)
{
    if (!keypair || !rng_func)
        return -1;
    if (rng_func(keypair->private_key, 32, rng_ctx) != 0)
        return -1;
    se050_x25519_sw_clamp(keypair->private_key);
    /* BUGFIX: NULL selects the standard base point (u = 9). Previously the
     * string literal "basepoint" was passed here, causing a 32-byte read
     * from a 10-byte literal (out-of-bounds) and a wrong public key. */
    x25519_sw(keypair->public_key, keypair->private_key, NULL);
    return 0;
}

/**
 * se050_x25519_sw_compute_shared_secret - ECDH: secret = sk * peer_pk.
 *
 * Clamps a local copy of the private key; the copy is zeroized before
 * return. Returns 0 on success, -1 on bad arguments.
 */
int se050_x25519_sw_compute_shared_secret(uint8_t *shared_secret,
                                          const uint8_t *private_key,
                                          const uint8_t *peer_public)
{
    if (!shared_secret || !private_key || !peer_public)
        return -1;
    uint8_t clamped[32];
    memcpy(clamped, private_key, 32);
    se050_x25519_sw_clamp(clamped);
    x25519_sw(shared_secret, clamped, peer_public);
    se050_x25519_sw_zeroize(clamped, 32);
    return 0;
}

/**
 * se050_x25519_sw_derive_public_key - public_key = private_key * basepoint.
 *
 * Returns 0 on success, -1 on bad arguments.
 */
int se050_x25519_sw_derive_public_key(uint8_t *public_key,
                                      const uint8_t *private_key)
{
    if (!public_key || !private_key)
        return -1;
    uint8_t clamped[32];
    memcpy(clamped, private_key, 32);
    se050_x25519_sw_clamp(clamped);
    /* BUGFIX: NULL selects the base point (was the string "basepoint",
     * an out-of-bounds read producing a wrong key — see generate_keypair). */
    x25519_sw(public_key, clamped, NULL);
    se050_x25519_sw_zeroize(clamped, 32);
    return 0;
}

#ifdef X25519_SW_TEST

#include <stdio.h>

/* RFC 7748 section 6.1 Test Vector 1 */
static const uint8_t RFC7748_SK_1[32] = {
    0xa5,0x46,0xe3,0x6b,0xf0,0x52,0x7c,0x9d,0x3b,0x16,0x15,0x4b,
    0x82,0x46,0x5e,0xdd,0x62,0x14,0x4c,0x0a,0xc1,0xfc,0x5a,0x18,
    0x50,0x6a,0x22,0x44,0xba,0x44,0x9a,0xc4
};
static const uint8_t RFC7748_PK_1[32] = {
    0xe6,0xdb,0x68,0x67,0x58,0x30,0x30,0xdb,0x35,0x94,0xc1,0xa4,
    0x24,0xb1,0x5f,0x7c,0x72,0x66,0x24,0xec,0x26,0xb3,0x35,0x3b,
    0x10,0xa9,0x03,0xa6,0xd0,0xab,0x1c,0x4c
};
static const uint8_t RFC7748_SS_1[32] = {
    0xc3,0xda,0x55,0x37,0x9d,0xe9,0xc6,0x90,0x8e,0x94,0xea,0x4d,
    0xf2,0x8d,0x08,0x4f,0x32,0xec,0xcf,0x03,0x49,0x1c,0x71,0xf7,
    0x54,0xb4,0x07,0x55,0x77,0xa2,0x85,0x52
};

static void print_hex(const char *label, const uint8_t *buf, size_t len)
{
    printf("%s: ", label);
    for (size_t i = 0; i < len; i++)
        printf("%02x", buf[i]);
    printf("\n");
}

int main(void)
{
    uint8_t shared_secret[32];

    printf("X25519 Software Implementation Test\n");
    printf("====================================\n\n");
    printf("RFC 7748 Test Vector 1:\n");
    print_hex("Scalar", RFC7748_SK_1, 32);
    print_hex("Point", RFC7748_PK_1, 32);
    /* BUGFIX: check the return value instead of discarding it. */
    if (x25519_sw(shared_secret, RFC7748_SK_1, RFC7748_PK_1) != 0) {
        printf("[FAIL] x25519_sw returned an error\n");
        return 1;
    }
    print_hex("Computed SS", shared_secret, 32);
    print_hex("Expected SS", RFC7748_SS_1, 32);
    if (memcmp(shared_secret, RFC7748_SS_1, 32) == 0) {
        printf("[PASS] RFC 7748 Test Vector 1\n");
        return 0;
    } else {
        printf("[FAIL] RFC 7748 Test Vector 1\n");
        return 1;
    }
}

#endif