Add ESP32 support with 128-bit arithmetic emulation

- Detect ESP32 platform using ESP_PLATFORM and __XTENSA__ macros
- Implement 128-bit multiplication and addition using 64-bit arithmetic
- Wrap fe_mul(), fe_sq(), and fe_mul_small() with ESP32-specific code paths
- Standard platforms use native unsigned __int128 (faster)
- ESP32 uses 128-bit emulation (compatible with 32-bit architecture)
This commit is contained in:
km
2026-03-28 07:40:38 +09:00
parent f6298c7725
commit a8d28882c7
+143 -2
View File
@@ -9,6 +9,16 @@
#include "se050_crypto_utils.h"
#include <string.h>
/* =========================================================================
* Platform detection
* ========================================================================= */
#if defined(ESP_PLATFORM) || defined(__XTENSA__)
#define SE050_X25519_ESP32 1
#else
#define SE050_X25519_ESP32 0
#endif
/* =========================================================================
* Field GF(2^255-19)
*
@@ -27,8 +37,34 @@ typedef uint64_t fe[NLIMBS]; /* field element */
#define MASK51 (L51 - 1)
/* 128-bit helpers */
#if !SE050_X25519_ESP32
/* Native path: split a compiler-provided 128-bit value into 64-bit halves. */
static inline uint64_t u128_lo(unsigned __int128 x) { return (uint64_t)x; }
static inline uint64_t u128_hi(unsigned __int128 x) { return (uint64_t)(x >> 64); }
#else
/* ESP32: 128-bit emulation using 64-bit arithmetic */
/* Emulated 128-bit unsigned integer: lo holds bits 0..63, hi bits 64..127. */
typedef struct { uint64_t lo, hi; } u128;

/*
 * u128_mul: full 64x64 -> 128-bit unsigned product using 32-bit half-words.
 *
 * The middle partial products are accumulated together with the high half
 * of p0 in 'mid'.  Each of the three terms added into 'mid' is < 2^32, so
 * the sum is < 3 * 2^32 and cannot wrap a uint64_t.  (The naive 'p1 + p2'
 * CAN wrap -- each partial product may be up to ~2^64 -- which silently
 * drops a 2^64 carry and corrupts the high word for large operands.)
 */
static inline u128 u128_mul(uint64_t a, uint64_t b) {
    u128 r;
    uint64_t a_lo = a & 0xFFFFFFFFULL, a_hi = a >> 32;
    uint64_t b_lo = b & 0xFFFFFFFFULL, b_hi = b >> 32;
    uint64_t p0 = a_lo * b_lo;
    uint64_t p1 = a_lo * b_hi;
    uint64_t p2 = a_hi * b_lo;
    uint64_t p3 = a_hi * b_hi;
    /* carry-safe accumulation of the middle column */
    uint64_t mid = (p0 >> 32) + (p1 & 0xFFFFFFFFULL) + (p2 & 0xFFFFFFFFULL);
    r.lo = (p0 & 0xFFFFFFFFULL) | (mid << 32);
    r.hi = p3 + (p1 >> 32) + (p2 >> 32) + (mid >> 32);
    return r;
}

/* Accessors matching the native-__int128 helpers above. */
static inline uint64_t u128_lo(u128 x) { return x.lo; }
static inline uint64_t u128_hi(u128 x) { return x.hi; }

/* u128_add: 128-bit addition; unsigned wraparound on lo detects the carry. */
static inline u128 u128_add(u128 a, u128 b) {
    u128 r;
    r.lo = a.lo + b.lo;
    r.hi = a.hi + b.hi + (r.lo < a.lo);
    return r;
}
#endif
/* --- Basic operations --- */
@@ -74,6 +110,7 @@ static void fe_reduce(fe f)
/* --- Multiplication --- */
#if !SE050_X25519_ESP32
/* fe_mul: out = a * b mod p (128-bit accumulators) */
static void fe_mul(fe out, const fe a, const fe b)
{
@@ -120,8 +157,57 @@ static void fe_mul(fe out, const fe a, const fe b)
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}
#else
/*
 * ESP32: fe_mul with 128-bit emulation.
 *
 * out = a * b mod p, p = 2^255 - 19, field elements as five 51-bit limbs
 * (base 2^51).  Because 2^255 = 19 (mod p), any cross term a[i]*b[j] with
 * i+j >= 5 is folded back by multiplying b[j] by 19 up front (b1_19..b4_19).
 * Accumulators t0..t4 are emulated 128-bit sums of up to five limb products.
 * Carry chain: each limb keeps its low 51 bits; the rest (lo >> 51 combined
 * with hi << 13) is added into the next accumulator.  The final carry out of
 * t4 is worth 2^255 per unit, i.e. 19 mod p, hence "out[0] += 19 * c".
 * NOTE(review): assumes inputs are reduced enough that each accumulator and
 * the 19*b[j] precomputations stay within range -- standard for 51-bit-limb
 * Curve25519 code, but confirm against the callers' reduction discipline.
 */
static void fe_mul(fe out, const fe a, const fe b)
{
u128 t0, t1, t2, t3, t4;
uint64_t c;
/* Pre-scale b[1..4] by 19 for the wrap-around (i+j >= 5) terms. */
uint64_t b1_19 = 19 * b[1], b2_19 = 19 * b[2], b3_19 = 19 * b[3], b4_19 = 19 * b[4];
t0 = u128_mul(a[0], b[0]);
t0 = u128_add(t0, u128_mul(a[1], b4_19));
t0 = u128_add(t0, u128_mul(a[2], b3_19));
t0 = u128_add(t0, u128_mul(a[3], b2_19));
t0 = u128_add(t0, u128_mul(a[4], b1_19));
t1 = u128_mul(a[0], b[1]);
t1 = u128_add(t1, u128_mul(a[1], b[0]));
t1 = u128_add(t1, u128_mul(a[2], b4_19));
t1 = u128_add(t1, u128_mul(a[3], b3_19));
t1 = u128_add(t1, u128_mul(a[4], b2_19));
t2 = u128_mul(a[0], b[2]);
t2 = u128_add(t2, u128_mul(a[1], b[1]));
t2 = u128_add(t2, u128_mul(a[2], b[0]));
t2 = u128_add(t2, u128_mul(a[3], b4_19));
t2 = u128_add(t2, u128_mul(a[4], b3_19));
t3 = u128_mul(a[0], b[3]);
t3 = u128_add(t3, u128_mul(a[1], b[2]));
t3 = u128_add(t3, u128_mul(a[2], b[1]));
t3 = u128_add(t3, u128_mul(a[3], b[0]));
t3 = u128_add(t3, u128_mul(a[4], b4_19));
t4 = u128_mul(a[0], b[4]);
t4 = u128_add(t4, u128_mul(a[1], b[3]));
t4 = u128_add(t4, u128_mul(a[2], b[2]));
t4 = u128_add(t4, u128_mul(a[3], b[1]));
t4 = u128_add(t4, u128_mul(a[4], b[0]));
/* Carry propagation: keep 51 bits per limb; (lo>>51 | hi<<13) is the
 * 128-bit value shifted right by 51, added into the next accumulator. */
out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
/* Carry out of the top limb re-enters at 2^255 = 19 (mod p). */
out[0] += 19 * c;
/* One more carry step to keep out[0] within 51 bits. */
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}
#endif
/* fe_sq: out = a^2 mod p (optimized) */
#if !SE050_X25519_ESP32
static void fe_sq(fe out, const fe a)
{
unsigned __int128 t0, t1, t2, t3, t4;
@@ -155,12 +241,48 @@ static void fe_sq(fe out, const fe a)
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 += c;
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
out[0] += 19 * c;
/* Final carry from limb 0 */
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}
#else
/*
 * ESP32 fe_sq: out = a^2 mod p (2^255 - 19), five 51-bit limbs, using the
 * emulated u128 type.  Squaring halves the multiplication count by reusing
 * symmetric cross terms: d1..d3 are doubled limbs, and the *_19 values fold
 * the 2^255 = 19 (mod p) wrap for terms with limb indices summing to >= 5.
 * Expanded accumulators:
 *   t0 = a0^2     + 38*a1*a4 + 38*a2*a3
 *   t1 = 2*a0*a1  + 38*a2*a4 + 19*a3^2
 *   t2 = 2*a0*a2  + a1^2     + 38*a3*a4
 *   t3 = 2*a0*a3  + 2*a1*a2  + 19*a4^2
 *   t4 = 2*a0*a4  + 2*a1*a3  + a2^2
 * Carry handling is identical to fe_mul.
 */
static void fe_sq(fe out, const fe a)
{
u128 t0, t1, t2, t3, t4;
uint64_t c;
/* Doubled limbs and 19-scaled combinations for the symmetric/wrapped terms. */
uint64_t d1 = 2 * a[1], d2 = 2 * a[2], d3 = 2 * a[3];
uint64_t a4_19 = 19 * a[4], d1_19 = 19 * d1, d2_19 = 19 * d2, a3_19 = 19 * a[3];
t0 = u128_mul(a[0], a[0]);
t0 = u128_add(t0, u128_mul(d1_19, a[4]));
t0 = u128_add(t0, u128_mul(d2_19, a[3]));
t1 = u128_mul(a[0], d1);
t1 = u128_add(t1, u128_mul(d2_19, a[4]));
t1 = u128_add(t1, u128_mul(a3_19, a[3]));
t2 = u128_mul(a[0], d2);
t2 = u128_add(t2, u128_mul(a[1], a[1]));
t2 = u128_add(t2, u128_mul(d3, a4_19));
t3 = u128_mul(a[0], d3);
t3 = u128_add(t3, u128_mul(d1, a[2]));
t3 = u128_add(t3, u128_mul(a[4], a4_19));
t4 = u128_mul(a[0], 2 * a[4]);
t4 = u128_add(t4, u128_mul(d1, a[3]));
t4 = u128_add(t4, u128_mul(a[2], a[2]));
/* Carry chain: keep 51 bits per limb, push the rest into the next limb. */
out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
/* Top-limb carry re-enters at 2^255 = 19 (mod p). */
out[0] += 19 * c;
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}
#endif
/* fe_mul_small: out = f * n (n < 2^22) */
#if !SE050_X25519_ESP32
static void fe_mul_small(fe out, const fe f, uint64_t n)
{
unsigned __int128 t0, t1, t2, t3, t4;
@@ -178,6 +300,25 @@ static void fe_mul_small(fe out, const fe f, uint64_t n)
out[0] += 19 * c;
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}
#else
/*
 * ESP32 fe_mul_small: out = f * n mod p for a small scalar n (per the shared
 * header comment above, n < 2^22 so each 51-bit limb times n fits the u128
 * accumulators comfortably).  Same carry chain and 2^255 = 19 (mod p)
 * wrap-around fold as fe_mul/fe_sq.
 */
static void fe_mul_small(fe out, const fe f, uint64_t n)
{
u128 t0, t1, t2, t3, t4;
uint64_t c;
t0 = u128_mul(f[0], n);
t1 = u128_mul(f[1], n);
t2 = u128_mul(f[2], n);
t3 = u128_mul(f[3], n);
t4 = u128_mul(f[4], n);
/* Keep 51 bits per limb; propagate (value >> 51) into the next limb. */
out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
/* Carry out of the top limb is worth 19 mod p. */
out[0] += 19 * c;
c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
}
#endif
/* --- Inversion --- */