diff --git a/src/se050_x25519_sw.c b/src/se050_x25519_sw.c
index deba93e..1b464df 100644
--- a/src/se050_x25519_sw.c
+++ b/src/se050_x25519_sw.c
@@ -9,6 +9,16 @@
 #include "se050_crypto_utils.h"
 #include <stdint.h>
 
+/* =========================================================================
+ * Platform detection
+ * ========================================================================= */
+
+#if defined(ESP_PLATFORM) || defined(__XTENSA__)
+#define SE050_X25519_ESP32 1
+#else
+#define SE050_X25519_ESP32 0
+#endif
+
 /* =========================================================================
  * Field GF(2^255-19)
  *
@@ -27,8 +37,34 @@ typedef uint64_t fe[NLIMBS]; /* field element */
 #define MASK51 (L51 - 1)
 
 /* 128-bit helpers */
+#if !SE050_X25519_ESP32
 static inline uint64_t u128_lo(unsigned __int128 x) { return (uint64_t)x; }
 static inline uint64_t u128_hi(unsigned __int128 x) { return (uint64_t)(x >> 64); }
+#else
+/* ESP32: 128-bit emulation using 64-bit arithmetic */
+typedef struct { uint64_t lo, hi; } u128;
+static inline u128 u128_mul(uint64_t a, uint64_t b) { /* full 64x64 -> 128-bit product */
+    u128 r;
+    uint64_t a_lo = a & 0xFFFFFFFFULL, a_hi = a >> 32;
+    uint64_t b_lo = b & 0xFFFFFFFFULL, b_hi = b >> 32;
+    uint64_t p0 = a_lo * b_lo;
+    uint64_t p1 = a_lo * b_hi;
+    uint64_t p2 = a_hi * b_lo;
+    uint64_t p3 = a_hi * b_hi;
+    uint64_t mid = p1 + p2; /* may wrap mod 2^64; carry recovered below */
+    r.lo = p0 + (mid << 32);
+    r.hi = p3 + (mid >> 32) + ((uint64_t)(mid < p1) << 32) + (r.lo < p0); /* FIX: carry out of p1+p2 has weight 2^96 and was dropped */
+    return r;
+}
+static inline uint64_t u128_lo(u128 x) { return x.lo; }
+static inline uint64_t u128_hi(u128 x) { return x.hi; }
+static inline u128 u128_add(u128 a, u128 b) { /* 128-bit add with carry into .hi */
+    u128 r;
+    r.lo = a.lo + b.lo;
+    r.hi = a.hi + b.hi + (r.lo < a.lo);
+    return r;
+}
+#endif
 
 /* --- Basic operations --- */
 
@@ -74,6 +110,7 @@ static void fe_reduce(fe f)
 
 /* --- Multiplication --- */
 
+#if !SE050_X25519_ESP32
 /* fe_mul: out = a * b mod p (128-bit accumulators) */
 static void fe_mul(fe out, const fe a, const fe b)
 {
@@ -120,8 +157,57 @@ static void fe_mul(fe out, const fe a, const fe b)
     c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
 }
+#else
+/* ESP32: fe_mul with 128-bit emulation */
+static void fe_mul(fe out, const fe a, const fe b)
+{
+    u128 t0, t1, t2, t3, t4;
+    uint64_t c;
+    uint64_t b1_19 = 19 * b[1], b2_19 = 19 * b[2], b3_19 = 19 * b[3], b4_19 = 19 * b[4];
+
+    t0 = u128_mul(a[0], b[0]);
+    t0 = u128_add(t0, u128_mul(a[1], b4_19));
+    t0 = u128_add(t0, u128_mul(a[2], b3_19));
+    t0 = u128_add(t0, u128_mul(a[3], b2_19));
+    t0 = u128_add(t0, u128_mul(a[4], b1_19));
+
+    t1 = u128_mul(a[0], b[1]);
+    t1 = u128_add(t1, u128_mul(a[1], b[0]));
+    t1 = u128_add(t1, u128_mul(a[2], b4_19));
+    t1 = u128_add(t1, u128_mul(a[3], b3_19));
+    t1 = u128_add(t1, u128_mul(a[4], b2_19));
+
+    t2 = u128_mul(a[0], b[2]);
+    t2 = u128_add(t2, u128_mul(a[1], b[1]));
+    t2 = u128_add(t2, u128_mul(a[2], b[0]));
+    t2 = u128_add(t2, u128_mul(a[3], b4_19));
+    t2 = u128_add(t2, u128_mul(a[4], b3_19));
+
+    t3 = u128_mul(a[0], b[3]);
+    t3 = u128_add(t3, u128_mul(a[1], b[2]));
+    t3 = u128_add(t3, u128_mul(a[2], b[1]));
+    t3 = u128_add(t3, u128_mul(a[3], b[0]));
+    t3 = u128_add(t3, u128_mul(a[4], b4_19));
+
+    t4 = u128_mul(a[0], b[4]);
+    t4 = u128_add(t4, u128_mul(a[1], b[3]));
+    t4 = u128_add(t4, u128_mul(a[2], b[2]));
+    t4 = u128_add(t4, u128_mul(a[3], b[1]));
+    t4 = u128_add(t4, u128_mul(a[4], b[0]));
+
+    out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
+    out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
+    out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
+    out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
+    out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
+    out[0] += 19 * c;
+
+    c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
+}
+#endif
 
 /* fe_sq: out = a^2 mod p (optimized) */
+#if !SE050_X25519_ESP32
 static void fe_sq(fe out, const fe a)
 {
     unsigned __int128 t0, t1, t2, t3, t4;
@@ -155,12 +241,48 @@ static void fe_sq(fe out, const fe a)
     out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 += c;
     out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
     out[0] += 19 * c;
-
-    /* Final carry from limb 0 */
     c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
 }
+#else /* ESP32: fe_sq with 128-bit emulation */
+static void fe_sq(fe out, const fe a)
+{
+    u128 t0, t1, t2, t3, t4;
+    uint64_t c;
+    uint64_t d1 = 2 * a[1], d2 = 2 * a[2], d3 = 2 * a[3];
+    uint64_t a4_19 = 19 * a[4], d1_19 = 19 * d1, d2_19 = 19 * d2, a3_19 = 19 * a[3];
+
+    t0 = u128_mul(a[0], a[0]);
+    t0 = u128_add(t0, u128_mul(d1_19, a[4]));
+    t0 = u128_add(t0, u128_mul(d2_19, a[3]));
+
+    t1 = u128_mul(a[0], d1);
+    t1 = u128_add(t1, u128_mul(d2_19, a[4]));
+    t1 = u128_add(t1, u128_mul(a3_19, a[3]));
+
+    t2 = u128_mul(a[0], d2);
+    t2 = u128_add(t2, u128_mul(a[1], a[1]));
+    t2 = u128_add(t2, u128_mul(d3, a4_19));
+
+    t3 = u128_mul(a[0], d3);
+    t3 = u128_add(t3, u128_mul(d1, a[2]));
+    t3 = u128_add(t3, u128_mul(a[4], a4_19));
+
+    t4 = u128_mul(a[0], 2 * a[4]);
+    t4 = u128_add(t4, u128_mul(d1, a[3]));
+    t4 = u128_add(t4, u128_mul(a[2], a[2]));
+
+    out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
+    out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
+    out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
+    out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
+    out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
+    out[0] += 19 * c;
+    c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
+}
+#endif
 
 /* fe_mul_small: out = f * n (n < 2^22) */
+#if !SE050_X25519_ESP32
 static void fe_mul_small(fe out, const fe f, uint64_t n)
 {
     unsigned __int128 t0, t1, t2, t3, t4;
@@ -178,6 +300,25 @@ static void fe_mul_small(fe out, const fe f, uint64_t n)
     out[0] += 19 * c;
     c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
 }
+#else /* ESP32: fe_mul_small with 128-bit emulation */
+static void fe_mul_small(fe out, const fe f, uint64_t n)
+{
+    u128 t0, t1, t2, t3, t4;
+    uint64_t c;
+    t0 = u128_mul(f[0], n);
+    t1 = u128_mul(f[1], n);
+    t2 = u128_mul(f[2], n);
+    t3 = u128_mul(f[3], n);
+    t4 = u128_mul(f[4], n);
+    out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
+    out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
+    out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
+    out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
+    out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
+    out[0] += 19 * c;
+    c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
+}
+#endif
 
 /* --- Inversion --- */