diff --git a/src/se050_x25519_sw.c b/src/se050_x25519_sw.c
index deba93e..1b464df 100644
--- a/src/se050_x25519_sw.c
+++ b/src/se050_x25519_sw.c
@@ -9,6 +9,16 @@
 #include "se050_crypto_utils.h"
 #include <stdint.h>
 
+/* =========================================================================
+ * Platform detection
+ * ========================================================================= */
+
+#if defined(ESP_PLATFORM) || defined(__XTENSA__)
+#define SE050_X25519_ESP32 1
+#else
+#define SE050_X25519_ESP32 0
+#endif
+
 /* =========================================================================
  * Field GF(2^255-19)
  *
@@ -27,8 +37,34 @@ typedef uint64_t fe[NLIMBS]; /* field element */
 #define MASK51 (L51 - 1)
 
 /* 128-bit helpers */
+#if !SE050_X25519_ESP32
 static inline uint64_t u128_lo(unsigned __int128 x) { return (uint64_t)x; }
 static inline uint64_t u128_hi(unsigned __int128 x) { return (uint64_t)(x >> 64); }
+#else
+/* ESP32: 128-bit emulation using 64-bit arithmetic */
+typedef struct { uint64_t lo, hi; } u128;
+static inline u128 u128_mul(uint64_t a, uint64_t b) { /* full 64x64 -> 128-bit product */
+    u128 r;
+    uint64_t a_lo = a & 0xFFFFFFFFULL, a_hi = a >> 32;
+    uint64_t b_lo = b & 0xFFFFFFFFULL, b_hi = b >> 32;
+    uint64_t p0 = a_lo * b_lo;
+    uint64_t p1 = a_lo * b_hi;
+    uint64_t p2 = a_hi * b_lo;
+    uint64_t p3 = a_hi * b_hi;
+    uint64_t mid = p1 + p2; /* may wrap mod 2^64; carry recovered below */
+    r.lo = p0 + (mid << 32);
+    r.hi = p3 + (mid >> 32) + ((uint64_t)(mid < p1) << 32) + (r.lo < p0); /* FIX: carry out of p1+p2 has weight 2^96 and was dropped */
+    return r;
+}
+static inline uint64_t u128_lo(u128 x) { return x.lo; }
+static inline uint64_t u128_hi(u128 x) { return x.hi; }
+static inline u128 u128_add(u128 a, u128 b) { /* 128-bit add with carry into .hi */
+    u128 r;
+    r.lo = a.lo + b.lo;
+    r.hi = a.hi + b.hi + (r.lo < a.lo);
+    return r;
+}
+#endif
 
 /* --- Basic operations --- */
 
@@ -74,6 +110,7 @@ static void fe_reduce(fe f)
 
 /* --- Multiplication --- */
 
+#if !SE050_X25519_ESP32
 /* fe_mul: out = a * b mod p (128-bit accumulators) */
 static void fe_mul(fe out, const fe a, const fe b)
 {
@@ -120,8 +157,57 @@ static void fe_mul(fe out, const fe a, const fe b)
     c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
 }
+#else
+/* ESP32: fe_mul with 128-bit emulation */
+static void fe_mul(fe out, const fe a, const fe b)
+{
+    u128 t0, t1, t2, t3, t4;
+    uint64_t c;
+    uint64_t b1_19 = 19 * b[1], b2_19 = 19 * b[2], b3_19 = 19 * b[3], b4_19 = 19 * b[4];
+
+    t0 = u128_mul(a[0], b[0]);
+    t0 = u128_add(t0, u128_mul(a[1], b4_19));
+    t0 = u128_add(t0, u128_mul(a[2], b3_19));
+    t0 = u128_add(t0, u128_mul(a[3], b2_19));
+    t0 = u128_add(t0, u128_mul(a[4], b1_19));
+
+    t1 = u128_mul(a[0], b[1]);
+    t1 = u128_add(t1, u128_mul(a[1], b[0]));
+    t1 = u128_add(t1, u128_mul(a[2], b4_19));
+    t1 = u128_add(t1, u128_mul(a[3], b3_19));
+    t1 = u128_add(t1, u128_mul(a[4], b2_19));
+
+    t2 = u128_mul(a[0], b[2]);
+    t2 = u128_add(t2, u128_mul(a[1], b[1]));
+    t2 = u128_add(t2, u128_mul(a[2], b[0]));
+    t2 = u128_add(t2, u128_mul(a[3], b4_19));
+    t2 = u128_add(t2, u128_mul(a[4], b3_19));
+
+    t3 = u128_mul(a[0], b[3]);
+    t3 = u128_add(t3, u128_mul(a[1], b[2]));
+    t3 = u128_add(t3, u128_mul(a[2], b[1]));
+    t3 = u128_add(t3, u128_mul(a[3], b[0]));
+    t3 = u128_add(t3, u128_mul(a[4], b4_19));
+
+    t4 = u128_mul(a[0], b[4]);
+    t4 = u128_add(t4, u128_mul(a[1], b[3]));
+    t4 = u128_add(t4, u128_mul(a[2], b[2]));
+    t4 = u128_add(t4, u128_mul(a[3], b[1]));
+    t4 = u128_add(t4, u128_mul(a[4], b[0]));
+
+    out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
+    out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
+    out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
+    out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
+    out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
+    out[0] += 19 * c;
+
+    c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
+}
+#endif
 
 /* fe_sq: out = a^2 mod p (optimized) */
+#if !SE050_X25519_ESP32
 static void fe_sq(fe out, const fe a)
 {
     unsigned __int128 t0, t1, t2, t3, t4;
@@ -155,12 +241,48 @@ static void fe_sq(fe out, const fe a)
     out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 += c;
     out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
     out[0] += 19 * c;
-
-    /* Final carry from limb 0 */
     c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
 }
+#else /* ESP32: fe_sq with 128-bit emulation */
+static void fe_sq(fe out, const fe a)
+{
+    u128 t0, t1, t2, t3, t4;
+    uint64_t c;
+    uint64_t d1 = 2 * a[1], d2 = 2 * a[2], d3 = 2 * a[3];
+    uint64_t a4_19 = 19 * a[4], d1_19 = 19 * d1, d2_19 = 19 * d2, a3_19 = 19 * a[3];
+
+    t0 = u128_mul(a[0], a[0]);
+    t0 = u128_add(t0, u128_mul(d1_19, a[4]));
+    t0 = u128_add(t0, u128_mul(d2_19, a[3]));
+
+    t1 = u128_mul(a[0], d1);
+    t1 = u128_add(t1, u128_mul(d2_19, a[4]));
+    t1 = u128_add(t1, u128_mul(a3_19, a[3]));
+
+    t2 = u128_mul(a[0], d2);
+    t2 = u128_add(t2, u128_mul(a[1], a[1]));
+    t2 = u128_add(t2, u128_mul(d3, a4_19));
+
+    t3 = u128_mul(a[0], d3);
+    t3 = u128_add(t3, u128_mul(d1, a[2]));
+    t3 = u128_add(t3, u128_mul(a[4], a4_19));
+
+    t4 = u128_mul(a[0], 2 * a[4]);
+    t4 = u128_add(t4, u128_mul(d1, a[3]));
+    t4 = u128_add(t4, u128_mul(a[2], a[2]));
+
+    out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
+    out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
+    out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
+    out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
+    out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
+    out[0] += 19 * c;
+    c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
+}
+#endif
 
 /* fe_mul_small: out = f * n (n < 2^22) */
+#if !SE050_X25519_ESP32
 static void fe_mul_small(fe out, const fe f, uint64_t n)
 {
     unsigned __int128 t0, t1, t2, t3, t4;
@@ -178,6 +300,25 @@ static void fe_mul_small(fe out, const fe f, uint64_t n)
     out[0] += 19 * c;
     c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
 }
+#else /* ESP32: fe_mul_small with 128-bit emulation */
+static void fe_mul_small(fe out, const fe f, uint64_t n)
+{
+    u128 t0, t1, t2, t3, t4;
+    uint64_t c;
+    t0 = u128_mul(f[0], n);
+    t1 = u128_mul(f[1], n);
+    t2 = u128_mul(f[2], n);
+    t3 = u128_mul(f[3], n);
+    t4 = u128_mul(f[4], n);
+    out[0] = u128_lo(t0) & MASK51; c = u128_lo(t0) >> 51 | u128_hi(t0) << 13; t1 = u128_add(t1, (u128){c, 0});
+    out[1] = u128_lo(t1) & MASK51; c = u128_lo(t1) >> 51 | u128_hi(t1) << 13; t2 = u128_add(t2, (u128){c, 0});
+    out[2] = u128_lo(t2) & MASK51; c = u128_lo(t2) >> 51 | u128_hi(t2) << 13; t3 = u128_add(t3, (u128){c, 0});
+    out[3] = u128_lo(t3) & MASK51; c = u128_lo(t3) >> 51 | u128_hi(t3) << 13; t4 = u128_add(t4, (u128){c, 0});
+    out[4] = u128_lo(t4) & MASK51; c = u128_lo(t4) >> 51 | u128_hi(t4) << 13;
+    out[0] += 19 * c;
+    c = out[0] >> 51; out[0] &= MASK51; out[1] += c;
+}
+#endif
 
 /* --- Inversion --- */