Add ESP32 support with 128-bit arithmetic emulation
- Detect ESP32 platform using ESP_PLATFORM and __XTENSA__ macros
- Implement 128-bit multiplication and addition using 64-bit arithmetic
- Wrap fe_mul(), fe_sq(), and fe_mul_small() with ESP32-specific code paths
- Standard platforms use native unsigned __int128 (faster)
- ESP32 uses 128-bit emulation (compatible with 32-bit architecture)
This commit is contained in:
+143
-2
@@ -9,6 +9,16 @@
|
|||||||
#include "se050_crypto_utils.h"
|
#include "se050_crypto_utils.h"
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
|
/* =========================================================================
 * Platform detection
 * ========================================================================= */

/*
 * Take the emulated 128-bit code paths on ESP32 (ESP-IDF defines
 * ESP_PLATFORM; Xtensa cores define __XTENSA__) and, more generally, on
 * any compiler that does not provide unsigned __int128 (no
 * __SIZEOF_INT128__), since the native paths below require it.
 */
#if defined(ESP_PLATFORM) || defined(__XTENSA__) || !defined(__SIZEOF_INT128__)
#define SE050_X25519_ESP32 1
#else
#define SE050_X25519_ESP32 0
#endif
|
||||||
|
|
||||||
/* =========================================================================
|
/* =========================================================================
|
||||||
* Field GF(2^255-19)
|
* Field GF(2^255-19)
|
||||||
*
|
*
|
||||||
@@ -27,8 +37,34 @@ typedef uint64_t fe[NLIMBS]; /* field element */
|
|||||||
#define MASK51 (L51 - 1)
|
#define MASK51 (L51 - 1)
|
||||||
|
|
||||||
/* 128-bit helpers */
#if !SE050_X25519_ESP32
/* Native path: compilers that provide unsigned __int128. */
/* u128_lo / u128_hi: split a 128-bit value into its low / high 64-bit words. */
static inline uint64_t u128_lo(unsigned __int128 x) { return (uint64_t)x; }
static inline uint64_t u128_hi(unsigned __int128 x) { return (uint64_t)(x >> 64); }
|
||||||
|
#else
|
||||||
|
/* ESP32: 128-bit emulation using 64-bit arithmetic */
typedef struct { uint64_t lo, hi; } u128;

/*
 * u128_mul: full 64x64 -> 128-bit product from four 32x32 partial products.
 *
 * The contributions to bits 32..63 are accumulated in `mid`, which is at
 * most 3*(2^32-1) and therefore never overflows, with the carry into the
 * high word sitting in its upper half.  This makes the helper correct for
 * ALL 64-bit inputs; the previous formulation (`p1 + p2`) could silently
 * drop a carry when the two cross products overflowed 64 bits.
 */
static inline u128 u128_mul(uint64_t a, uint64_t b) {
    u128 r;
    uint64_t a_lo = a & 0xFFFFFFFFULL, a_hi = a >> 32;
    uint64_t b_lo = b & 0xFFFFFFFFULL, b_hi = b >> 32;
    uint64_t p0 = a_lo * b_lo;   /* bits   0..63  */
    uint64_t p1 = a_lo * b_hi;   /* bits  32..95  */
    uint64_t p2 = a_hi * b_lo;   /* bits  32..95  */
    uint64_t p3 = a_hi * b_hi;   /* bits  64..127 */
    /* Everything landing in bits 32..63, plus the carry in the upper half. */
    uint64_t mid = (p0 >> 32) + (p1 & 0xFFFFFFFFULL) + (p2 & 0xFFFFFFFFULL);
    r.lo = (mid << 32) | (p0 & 0xFFFFFFFFULL);
    r.hi = p3 + (p1 >> 32) + (p2 >> 32) + (mid >> 32);
    return r;
}

/* u128_lo / u128_hi: split an emulated 128-bit value into 64-bit words. */
static inline uint64_t u128_lo(u128 x) { return x.lo; }
static inline uint64_t u128_hi(u128 x) { return x.hi; }

/* u128_add: 128-bit addition, propagating the carry from lo into hi. */
static inline u128 u128_add(u128 a, u128 b) {
    u128 r;
    r.lo = a.lo + b.lo;
    r.hi = a.hi + b.hi + (r.lo < a.lo);  /* unsigned wrap detects the carry */
    return r;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/* --- Basic operations --- */
|
/* --- Basic operations --- */
|
||||||
|
|
||||||
@@ -74,6 +110,7 @@ static void fe_reduce(fe f)
|
|||||||
|
|
||||||
/* --- Multiplication --- */
|
/* --- Multiplication --- */
|
||||||
|
|
||||||
|
#if !SE050_X25519_ESP32
|
||||||
/* fe_mul: out = a * b mod p (128-bit accumulators) */
|
/* fe_mul: out = a * b mod p (128-bit accumulators) */
|
||||||
static void fe_mul(fe out, const fe a, const fe b)
|
static void fe_mul(fe out, const fe a, const fe b)
|
||||||
{
|
{
|
||||||
@@ -120,8 +157,57 @@ static void fe_mul(fe out, const fe a, const fe b)
|
|||||||
|
|
||||||
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
|
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
/* ESP32: fe_mul with 128-bit emulation */
/*
 * fe_mul: out = a * b mod p, p = 2^255-19, 5x51-bit limb representation.
 *
 * Schoolbook multiplication; since 2^255 ≡ 19 (mod p), every partial
 * product that would land at limb index >= 5 wraps to index-5 scaled by 19,
 * hence the pre-computed b*_19 factors.  The trailing carry pass keeps each
 * output limb just above/below 2^51 (loosely reduced, not canonical).
 */
static void fe_mul(fe out, const fe a, const fe b)
{
    u128 t0, t1, t2, t3, t4;   /* per-limb 128-bit accumulators */
    uint64_t c;                /* inter-limb carry */
    /* Wrap factors for products overflowing limb 4. */
    uint64_t b1_19 = 19 * b[1], b2_19 = 19 * b[2], b3_19 = 19 * b[3], b4_19 = 19 * b[4];

    /* t0 = a0*b0 + 19*(a1*b4 + a2*b3 + a3*b2 + a4*b1) */
    t0 = u128_mul(a[0], b[0]);
    t0 = u128_add(t0, u128_mul(a[1], b4_19));
    t0 = u128_add(t0, u128_mul(a[2], b3_19));
    t0 = u128_add(t0, u128_mul(a[3], b2_19));
    t0 = u128_add(t0, u128_mul(a[4], b1_19));

    /* t1 = a0*b1 + a1*b0 + 19*(a2*b4 + a3*b3 + a4*b2) */
    t1 = u128_mul(a[0], b[1]);
    t1 = u128_add(t1, u128_mul(a[1], b[0]));
    t1 = u128_add(t1, u128_mul(a[2], b4_19));
    t1 = u128_add(t1, u128_mul(a[3], b3_19));
    t1 = u128_add(t1, u128_mul(a[4], b2_19));

    /* t2 = a0*b2 + a1*b1 + a2*b0 + 19*(a3*b4 + a4*b3) */
    t2 = u128_mul(a[0], b[2]);
    t2 = u128_add(t2, u128_mul(a[1], b[1]));
    t2 = u128_add(t2, u128_mul(a[2], b[0]));
    t2 = u128_add(t2, u128_mul(a[3], b4_19));
    t2 = u128_add(t2, u128_mul(a[4], b3_19));

    /* t3 = a0*b3 + a1*b2 + a2*b1 + a3*b0 + 19*a4*b4 */
    t3 = u128_mul(a[0], b[3]);
    t3 = u128_add(t3, u128_mul(a[1], b[2]));
    t3 = u128_add(t3, u128_mul(a[2], b[1]));
    t3 = u128_add(t3, u128_mul(a[3], b[0]));
    t3 = u128_add(t3, u128_mul(a[4], b4_19));

    /* t4 = a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0 */
    t4 = u128_mul(a[0], b[4]);
    t4 = u128_add(t4, u128_mul(a[1], b[3]));
    t4 = u128_add(t4, u128_mul(a[2], b[2]));
    t4 = u128_add(t4, u128_mul(a[3], b[1]));
    t4 = u128_add(t4, u128_mul(a[4], b[0]));

    /* Carry pass: keep the low 51 bits of each limb; the carry t >> 51 is
     * (lo >> 51) | (hi << (64-51)). */
    out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
    out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
    out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
    out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
    out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
    /* Carry out of limb 4 wraps back to limb 0 scaled by 19. */
    out[0] += 19 * c;

    /* Final carry from limb 0. */
    c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/* fe_sq: out = a^2 mod p (optimized) */
|
/* fe_sq: out = a^2 mod p (optimized) */
|
||||||
|
#if !SE050_X25519_ESP32
|
||||||
static void fe_sq(fe out, const fe a)
|
static void fe_sq(fe out, const fe a)
|
||||||
{
|
{
|
||||||
unsigned __int128 t0, t1, t2, t3, t4;
|
unsigned __int128 t0, t1, t2, t3, t4;
|
||||||
@@ -155,12 +241,48 @@ static void fe_sq(fe out, const fe a)
|
|||||||
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 += c;
|
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 += c;
|
||||||
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
|
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
|
||||||
out[0] += 19 * c;
|
out[0] += 19 * c;
|
||||||
|
|
||||||
/* Final carry from limb 0 */
|
|
||||||
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
|
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
/*
 * fe_sq: out = a^2 mod p (ESP32 build; emulated 128-bit accumulators).
 *
 * Squaring exploits symmetry (a_i*a_j appears twice for i != j) to nearly
 * halve the multiplications of fe_mul: d* are doubled limbs, *_19 factors
 * fold in the 2^255 ≡ 19 (mod p) wrap-around.
 */
static void fe_sq(fe out, const fe a)
{
    u128 t0, t1, t2, t3, t4;   /* per-limb 128-bit accumulators */
    uint64_t c;                /* inter-limb carry */
    uint64_t d1 = 2 * a[1], d2 = 2 * a[2], d3 = 2 * a[3];
    uint64_t a4_19 = 19 * a[4], d1_19 = 19 * d1, d2_19 = 19 * d2, a3_19 = 19 * a[3];

    /* t0 = a0^2 + 38*a1*a4 + 38*a2*a3 */
    t0 = u128_mul(a[0], a[0]);
    t0 = u128_add(t0, u128_mul(d1_19, a[4]));
    t0 = u128_add(t0, u128_mul(d2_19, a[3]));

    /* t1 = 2*a0*a1 + 38*a2*a4 + 19*a3^2 */
    t1 = u128_mul(a[0], d1);
    t1 = u128_add(t1, u128_mul(d2_19, a[4]));
    t1 = u128_add(t1, u128_mul(a3_19, a[3]));

    /* t2 = 2*a0*a2 + a1^2 + 38*a3*a4 */
    t2 = u128_mul(a[0], d2);
    t2 = u128_add(t2, u128_mul(a[1], a[1]));
    t2 = u128_add(t2, u128_mul(d3, a4_19));

    /* t3 = 2*a0*a3 + 2*a1*a2 + 19*a4^2 */
    t3 = u128_mul(a[0], d3);
    t3 = u128_add(t3, u128_mul(d1, a[2]));
    t3 = u128_add(t3, u128_mul(a[4], a4_19));

    /* t4 = 2*a0*a4 + 2*a1*a3 + a2^2 */
    t4 = u128_mul(a[0], 2 * a[4]);
    t4 = u128_add(t4, u128_mul(d1, a[3]));
    t4 = u128_add(t4, u128_mul(a[2], a[2]));

    /* Carry pass identical to fe_mul: keep 51 bits per limb. */
    out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
    out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
    out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
    out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
    out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
    /* Carry out of limb 4 wraps back to limb 0 scaled by 19. */
    out[0] += 19 * c;
    /* Final carry from limb 0. */
    c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/* fe_mul_small: out = f * n (n < 2^22) */
|
/* fe_mul_small: out = f * n (n < 2^22) */
|
||||||
|
#if !SE050_X25519_ESP32
|
||||||
static void fe_mul_small(fe out, const fe f, uint64_t n)
|
static void fe_mul_small(fe out, const fe f, uint64_t n)
|
||||||
{
|
{
|
||||||
unsigned __int128 t0, t1, t2, t3, t4;
|
unsigned __int128 t0, t1, t2, t3, t4;
|
||||||
@@ -178,6 +300,25 @@ static void fe_mul_small(fe out, const fe f, uint64_t n)
|
|||||||
out[0] += 19 * c;
|
out[0] += 19 * c;
|
||||||
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
|
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
/*
 * fe_mul_small: out = f * n, with n < 2^22 (ESP32 build).
 *
 * Scales each 51-bit limb by n with an emulated 128-bit product, then runs
 * the standard carry chain; the carry out of limb 4 wraps back into limb 0
 * multiplied by 19 (2^255 ≡ 19 mod p).
 */
static void fe_mul_small(fe out, const fe f, uint64_t n)
{
    u128 acc;
    uint64_t carry = 0;
    int i;

    for (i = 0; i < 5; i++) {
        acc = u128_mul(f[i], n);
        /* Fold in the carry from the previous limb (zero for limb 0). */
        acc = u128_add(acc, (u128){carry, 0});
        out[i] = u128_lo(acc) & MASK51;
        carry = u128_lo(acc) >> 51 | u128_hi(acc) << 13;
    }

    /* Wrap the carry out of limb 4 back into limb 0. */
    out[0] += 19 * carry;
    carry = out[0] >> 51;
    out[0] &= MASK51;
    out[1] += carry;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/* --- Inversion --- */
|
/* --- Inversion --- */
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user