/*
 * Copyright 2018-2021 NXP
 * All rights reserved.
 *
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include "fsl_casper.h"
#include <math.h> /* ceil TODO check if really need it */

/*******************************************************************************
 * Definitions
 ******************************************************************************/

/* Component ID definition, used by tools. */
#ifndef FSL_COMPONENT_ID
#define FSL_COMPONENT_ID "platform.drivers.casper"
#endif

/* Recoding length for the secure scalar multiplication:
 *  Use n=256 and w=4 --> compute ceil(256/3) = 86 + 1 digits
 *  Use n=384 and w=4 --> compute ceil(384/3) = 128 + 1 digits
 *  Use n=521 and w=4 --> compute ceil(521/3) = 174 + 1 digits
 */

/*!< Recoding length for the secure scalar multiplication */
enum _casper_ecc_recode_len
{
    kCASPER_ECC_P256_recode_len = 87u,
    kCASPER_ECC_P384_recode_len = 129u,
    kCASPER_ECC_P521_recode_len = 175u,
};

/* Operand size in bits per curve; P-521 is padded up to 576 bits (18 words). */
enum _casper_ecc_N_bitlen
{
    kCASPER_ECC_P256_N_bitlen = 256u,
    kCASPER_ECC_P384_N_bitlen = 384u,
    kCASPER_ECC_P521_N_bitlen = 576u,
};

/* Operand size in 32-bit words per curve. */
enum _casper_ecc_N_wordlen
{
    kCASPER_ECC_P256_wordlen = 256U / 32U,
    kCASPER_ECC_P384_wordlen = 384u / 32U,
    kCASPER_ECC_P521_wordlen = 576u / 32U,
};

#if defined(__GNUC__)
/* Enforce O1 optimize level, specifically to remove strict-aliasing option.
  (-fno-strict-aliasing is required for this driver). */
#pragma GCC push_options
#pragma GCC optimize("-O1")
#endif

#if (defined(__CC_ARM) || defined(__ARMCC_VERSION))
/* Enforce optimization off for clang, specifically to remove strict-aliasing option.
(-fno-strict-aliasing is required for this driver).
*/ #pragma clang optimize off #endif /* CASPER driver allows usage of 256, 384 and 521 ECC */ #define CASPER_MAX_ECC_SIZE_WORDLEN (576u / 32U) #define CASPER_RECODE_LENGTH_MAX 175 #define CASPER_RAM_BASE_NS (FSL_FEATURE_CASPER_RAM_BASE_ADDRESS) #if defined(FSL_FEATURE_CASPER_RAM_IS_INTERLEAVED) && FSL_FEATURE_CASPER_RAM_IS_INTERLEAVED #define CASPER_RAM_OFFSET (FSL_FEATURE_CASPER_RAM_OFFSET) #define INTERLEAVE(addr) \ (((((((addr) >> 2U) & 0x00000001U) << CASPER_RAM_OFFSET) + (((addr) >> 3U) << 2U) + ((addr)&0x00000003U)) & \ 0xFFFFU) | \ s_casperRamBase) #define DEINTERLEAVE(addr) INTERLEAVE(addr) #define GET_WORD(addr) (*((uint32_t *)DEINTERLEAVE((uint32_t)(addr)))) #define GET_DWORD(addr) (((uint64_t)GET_WORD(addr)) | (((uint64_t)GET_WORD(((uint32_t)(addr)) + 4U)) << 32U)) #define SET_WORD(addr, value) *((uint32_t *)INTERLEAVE((uint32_t)(addr))) = ((uint32_t)(value)) #define SET_DWORD(addr, value) \ do \ { \ SET_WORD(addr, (uint32_t)(value & 0xFFFFFFFFU)); \ SET_WORD(((uint32_t)(addr)) + 4U, (uint32_t)((value & 0xFFFFFFFF00000000U) >> 32U)); \ } while (false) /* memcopy is always word aligned */ /* interleaved to interleaved static void CASPER_MEMCPY_I2I(void *dst, const void *src, size_t siz) */ #define CASPER_MEMCPY_I2I(dst, src, siz) \ \ { \ uint32_t *dst32 = (uint32_t *)(dst); \ const uint32_t *src32 = (const uint32_t *)(const uint32_t *)(src); \ uint32_t i; \ for (i = 0U; i < (siz) / 4U; i++) \ { \ SET_WORD(&dst32[i], GET_WORD(&src32[i])); \ } \ } /* interleaved to non-interleaved static void CASPER_MEMCPY_I2N(void *dst, const void *src, size_t siz) */ #define CASPER_MEMCPY_I2N(dst, src, siz) \ \ { \ uint32_t *dst32 = (uint32_t *)(dst); \ const uint32_t *src32 = (const uint32_t *)(const uint32_t *)(src); \ uint32_t i; \ for (i = 0U; i < (siz) / 4U; i++) \ { \ dst32[i] = GET_WORD(&src32[i]); \ } \ } /* non-interleaved to interleaved static void CASPER_MEMCPY_N2I(void *dst, const void *src, size_t siz) */ #define CASPER_MEMCPY_N2I(dst, src, siz) \ \ { \ 
volatile uint32_t *dst32 = (uint32_t *)(dst); \ const uint32_t *src32 = (const uint32_t *)(const uint32_t *)(src); \ uint32_t i; \ for (i = 0U; i < (siz) / 4U; i++) \ { \ SET_WORD(&dst32[i], src32[i]); \ } \ } #else #define GET_WORD(addr) (*((uint32_t *)(uint32_t)(addr))) #define GET_DWORD(addr) (*((uint64_t *)(addr))) #define SET_WORD(addr, value) *((uint32_t *)(uint32_t)(addr)) = ((uint32_t)(value)) #define SET_DWORD(addr, value) *((uint64_t *)(addr)) = ((uint64_t)(value)) #define CASPER_MEMCPY_I2I(dst, src, siz) (void)memcpy(dst, src, siz) #define CASPER_MEMCPY_I2N(dst, src, siz) (void)memcpy(dst, src, siz) #define CASPER_MEMCPY_N2I(dst, src, siz) (void)memcpy(dst, src, siz) #endif #define WORK_BUFF_MUL4 (N_wordlen_max * 4 + 2) /* ! working buffer is 4xN_wordlen to allow in place math */ #define N_bytelen (N_wordlen * 4U) /* for memory copy and the like */ #define N_dwordlen (unsigned)(N_wordlen / 2U) #define PreZeroW(i, w_out) \ for ((i) = 0U; (i) < N_wordlen; (i) += 4U) \ { \ SET_WORD(&(w_out)[(i) + 0U], 0U); \ SET_WORD(&(w_out)[(i) + 1U], 0U); \ SET_WORD(&(w_out)[(i) + 2U], 0U); \ SET_WORD(&(w_out)[(i) + 3U], 0U); \ } /* unrolled partly */ #define PreZeroW2up(i, w_out) \ for (i = N_wordlen; i <= N_wordlen * 2U; i += 4U) \ { \ SET_WORD(&w_out[i + 0U], 0U); \ SET_WORD(&w_out[i + 1U], 0U); \ SET_WORD(&w_out[i + 2U], 0U); \ SET_WORD(&w_out[i + 3U], 0U); \ } /* unrolled partly */ /* Macros for the ECC component in Casper */ /* CASPER memory layout for ECC */ #define CASPER_MEM ((uint32_t *)msg_ret) /* Currently these macros work on 32-bit platforms */ #define add(c1, c0, a, b) \ \ do \ { \ uint32_t _t; \ _t = a + b; \ c1 = (uint32_t)(_t < a); \ c0 = _t; \ \ } while (false) #define add_cout(carry, c, a, b) add((carry), (c), (a), (b)) #define add_cout_cin(carryout, c, a, b, carryin) \ do \ { \ uint64_t _t = (uint64_t)(a) + (b) + (carryin); \ (c) = (uint32_t)_t; \ (carryout) = (uint32_t)(_t >> 32); \ } while (false) #define sub_borrowout(borrow, c, a, b) \ do \ { \ 
uint32_t _b = (uint32_t)((b) > (a)); \ (c) = (a) - (b); \ (borrow) = _b; \ } while (false) #define sub_borrowin_borrowout(borrowout, c, a, b, borrowin) \ do \ { \ uint32_t _t, _borrow1, _borrow2; \ sub_borrowout(_borrow1, _t, (a), (b)); \ sub_borrowout(_borrow2, (c), _t, (borrowin)); \ (borrowout) = _borrow1 + _borrow2; \ } while (false) #define sub_borrowout_1(borrow, c, a) \ do \ { \ uint32_t _b = 0; \ c = a - b; \ borrow = _b; \ } while (false) #define sub_borrowin_borrowout_1(borrowout, c, a, borrowin) \ do \ { \ uint32_t _t, _borrow1, _borrow2; \ sub_borrowout_1(_borrow1, _t, a); \ sub_borrowout(_borrow2, c, _t, borrowin); \ borrowout = _borrow1 + _borrow2; \ } while (false) /* 32 x 32 --> 64-bit multiplication * (c1,c0) = a * b */ #define mul(c1, c0, a, b) \ \ do \ { \ uint64_t __m; \ __m = (uint64_t)a * (uint64_t)b; \ c0 = (uint32_t)__m; \ c1 = (uint32_t)(__m >> (uint64_t)32); \ \ } while (false) /* Multiply-and-accumulate * (c1,c0) = a*b+c0 */ #define muladd(c1, c0, a, b) \ \ do \ { \ uint32_t __ma = c0; \ mul(c1, c0, a, b); \ c0 = c0 + __ma; \ c1 = c1 + (c0 < __ma); \ \ } while (0) /* Multiply-and-accumulate-accumulate * (c1,c0) = a*b+c0+c1 */ #define muladdadd(c1, c0, a, b) \ \ do \ { \ uint32_t __maa0 = c0, __maa1 = c1; \ mul(c1, c0, a, b); \ c0 = c0 + __maa0; \ c1 = c1 + (c0 < __maa0); \ c0 = c0 + __maa1; \ c1 = c1 + (c0 < __maa1); \ \ } while (0) #define square_casper(c, a) multiply_casper(c, a, a) #define sub_casper(c, a, b) CASPER_montsub(c, a, b, &CASPER_MEM[(N_wordlen + 4U)]) #define add_casper(c, a, b) CASPER_montadd(c, a, b, &CASPER_MEM[(N_wordlen + 4U)]) #define mul2_casper(c, a) add_casper(c, a, a) #define half(c, a, b) CASPER_half(c, a, b) /******************************************************************************* * Variables ******************************************************************************/ /* The model for this algo is that it can be implemented for a fixed size RSA key */ /* for max speed. 
If this is made into a variable (to allow varying size), then */
/*  it will be slower by a bit. */
/*  The file is compiled with N_bitlen passed in as number of bits of the RSA key */
/*  #define N_bitlen 2048 */
static size_t N_wordlen = 0U; /* ! number of words (e.g. 4096/32 is 128 words) */

/* Base address OR-ed into every interleaved CASPER RAM access and subtracted by CA_MK_OFF(). */
static uint32_t s_casperRamBase = CASPER_RAM_BASE_NS;
/* Start of the CASPER RAM working area (also reachable through the CASPER_MEM macro). */
static uint32_t *msg_ret = (uint32_t *)CASPER_RAM_BASE_NS;

/* All multi-word constants below are stored least significant 32-bit word first. */

/* NISTp-256 = 2^256-2^224+2^192+2^96-1 */
static uint32_t NISTp256[256 / 32u] = {0xffffffffU, 0xffffffffU, 0xffffffffU, 0x00000000,
                                       0x00000000,  0x00000000,  0x00000001,  0xffffffffU};

/* The cardinality of the curve E(F_p) */
static uint32_t NISTp256_q[256 / 32u] = {0xfc632551U, 0xf3b9cac2U, 0xa7179e84U, 0xbce6faadU,
                                         0xffffffffU, 0xffffffffU, 0x00000000,  0xffffffffU};

/* R = 2^256 mod p, the value "1" in Montgomery form. */
static uint32_t NISTr256[256 / 32u] = {0x00000001,  0x00000000,  0x00000000,  0xffffffffU,
                                       0xffffffffU, 0xffffffffU, 0xfffffffeU, 0x00000000};

/* -p^-1 mod 2^64 = 1 (the low 64 bits of p are all ones) */
static uint32_t Np256[2] = {1, 0};

/* NISTp-384 =  2^384 - 2^128 - 2^96 + 2^32 - 1 */
static uint32_t NISTp384[384 / 32u] = {0xffffffffU, 0x00000000,  0x00000000,  0xffffffffU, 0xfffffffeU, 0xffffffffU,
                                       0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU};

/* The cardinality of the curve E(F_p) */
static uint32_t NISTp384_q[384 / 32u] = {0xccc52973U, 0xecec196aU, 0x48b0a77aU, 0x581a0db2U, 0xf4372ddfU, 0xc7634d81U,
                                         0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU};

/* R = 2^384 mod p, the value "1" in Montgomery form.
*/
static uint32_t NISTr384[384 / 32u] = {0x00000001, 0xffffffffU, 0xffffffffU, 0x00000000, 0x1, 0, 0, 0, 0, 0, 0, 0};

// -p^-1 mod 2^64 = 0x100000001
static uint32_t Np384[2] = {1, 1};

/* NISTp-521 =  2^521 - 1, stored in 18 words (576 bits) */
static uint32_t NISTp521[576 / 32U] = {0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU,
                                       0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU,
                                       0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0x1ffU,      0};

/* The cardinality of the curve E(F_p) */
static uint32_t NISTp521_q[576 / 32U] = {0x91386409U, 0xbb6fb71eU, 0x899c47aeU, 0x3bb5c9b8U, 0xf709a5d0U, 0x7fcc0148U,
                                         0xbf2f966bU, 0x51868783U, 0xfffffffaU, 0xffffffffU, 0xffffffffU, 0xffffffffU,
                                         0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0x1ffU,      0};

/* R = 2^576 mod p, the value "1" in Montgomery form. */
static uint32_t NISTr521[576 / 32U] = {0, 0x800000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

/* -p^-1 mod 2^64 = 1 */
static uint32_t Np521[2] = {1, 0};

/*******************************************************************************
 * Prototypes
 ******************************************************************************/

/* Convert a projective point (X1 : Y1 : Z1)
 * to the affine point (X3, Y3) = (X1/Z1^2,Y1/Z1^3)
 * The memory of (X3, Y3) and (X1 : Y1 : Z1) should not overlap
 */
void Jac_toAffine(uint32_t *X3, uint32_t *Y3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1);

/* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2 : Y2 : Z2)
 *  where (X1: Y1: Z1) != (X2 : Y2 : Z2)
 * (X3 : Y3: Z3) may be the same as one of the inputs.
 */
void Jac_addition(uint32_t *X3,
                  uint32_t *Y3,
                  uint32_t *Z3,
                  uint32_t *X1,
                  uint32_t *Y1,
                  uint32_t *Z1,
                  uint32_t *X2,
                  uint32_t *Y2,
                  uint32_t *Z2);

/* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2, Y2)
 * where (X1: Y1: Z1) != (X2, Y2)
 * (X3 : Y3: Z3) may not overlap with (X1: Y1: Z1).
 * Source: 2004 Hankerson-Menezes-Vanstone, page 91.
*/
void Jac_add_affine(
    uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1, uint32_t *X2, uint32_t *Y2);

/* Point doubling from: 2004 Hankerson-Menezes-Vanstone, page 91.
 * Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X1 : Y1 : Z1)
 * (X3 : Y3: Z3) may be the same as the input.
 */
void Jac_double(uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1);

/* Constant time elliptic curve scalar multiplication.
 * Source: https://eprint.iacr.org/2014/130.pdf
 * when using w = 4.
 * Computes (X3 : Y3 : Z3) = k * (X1, Y1) \in E(F_p)
 * p is the prime used to define the finite field F_p
 * q is the (prime) order of the curve
 */
void Jac_scalar_multiplication(
    uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *k, uint32_t *p, uint32_t *q);

/* Compute the double scalar multiplication
 * (X3 : Y3 : Z3) = k1 * (X1, Y1) + k2 * (X2, Y2)
 * Using Shamir's trick and precomputing 16 points.
 * This code is *not* constant time since this is used
 * for verification only.
 */
void double_scalar_multiplication(uint32_t *X3,
                                  uint32_t *Y3,
                                  uint32_t *Z3,
                                  uint32_t *X1,
                                  uint32_t *Y1,
                                  uint32_t *k1,
                                  uint32_t *X2,
                                  uint32_t *Y2,
                                  uint32_t *k2);

/* Compute inversion modulo NIST-p384 using Fermat's little theorem.
 * Using c = a^(p-2) = a^(-1) mod p.
 * This computes the modular inversion if all arithmetic is "regular"
 * modular arithmetic or computes automatically the Montgomery inverse
 * if all arithmetic is Montgomery arithmetic.
*/
static void invert_mod_p384(uint32_t *c, uint32_t *a);

/* Modular inversion for NIST-P256 */
static void invert_mod_p256(uint32_t *c, uint32_t *a);

/* Modular inversion for NIST-P521 */
static void invert_mod_p521(uint32_t *c, uint32_t *a);

// A and C do not need to be in Casper memory
static void toMontgomery_ECC_P256(uint32_t *C, uint32_t *A);
static void toMontgomery_ECC_P384(uint32_t *C, uint32_t *A);
static void toMontgomery_ECC_P521(uint32_t *C, uint32_t *A);

/* Modular subtraction / addition helpers behind the sub_casper() and add_casper() macros. */
static void CASPER_montsub(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod);
static void CASPER_montadd(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod);

/* Compute c = a/2 mod p where b is scratch space. */
static void CASPER_half(uint32_t *c, uint32_t *a, uint32_t *b);

/* Word-wise copy of siz bytes that picks the interleaved or plain accessors per operand. */
void CASPER_MEMCPY(void *dst, const void *src, size_t siz);

static void multiply_casper(uint32_t w_out[], const uint32_t a[], const uint32_t b[]);

static uint8_t int8abs(int8_t v);

/* Constant time select c = a if m = 0 or
 *                      c = b if m = 1
 * a, b, c are n words
 */
static void casper_select(uint32_t *c, uint32_t *a, uint32_t *b, int m, int n);

/* Dumb n-limb addition of c=a+b (b is a single word), return carry. */
static uint32_t add_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n);

#if 0
/* Dumb n-limb addition of c=a+b, return carry. */
static uint32_t add_n(uint32_t *c, uint32_t *a, uint32_t *b, int n);

/* Dumb n-limb subtraction of c=a-b, return borrow. */
static uint32_t sub_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n);
#endif

/* Dumb n-limb subtraction of c=a-b, return borrow.
*/ static uint32_t sub_n(uint32_t *c, uint32_t *a, uint32_t *b, int n); int RSA_SignatureToPlaintextFast(const unsigned signature[N_wordlen_max], const unsigned exp_pubkey, const unsigned pubkey[N_wordlen_max], unsigned MsgRet[WORK_BUFF_MUL4]); int RSA_MontSignatureToPlaintextFast(const unsigned mont_signature[N_wordlen_max], const unsigned exp_pubkey, const unsigned pubkey[N_wordlen_max], unsigned MsgRet[WORK_BUFF_MUL4]); void MultprecMultiply(unsigned w_out[], const unsigned u[], const unsigned v[]); void MultprecGenNp64(const unsigned *Nmod, unsigned *np64_ret); void MultprecMontPrepareX(unsigned Xmont_out[], const unsigned x[], const unsigned Rp[], const unsigned Nmod[]); void MultprecModulo(unsigned r_out[], const unsigned v[], int top); void MultprecCiosMul( unsigned w_out[], const unsigned a[], const unsigned b[], const unsigned Nmod[], const unsigned *Np); void MultprecMontCalcRp(unsigned Rp[], const unsigned exp_pubkey, const unsigned Nmod[]); static void MultprecCiosMul_ct( uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np); static void MultprecCiosMul521_ct( uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np); static void shiftrightSysram(uint32_t *z, uint32_t *x, uint32_t c); static void shiftright(uint32_t *z, uint32_t *x, uint32_t c); static void shiftleft(uint32_t *z, uint32_t *x, uint32_t c); /******************************************************************************* * Code ******************************************************************************/ __STATIC_FORCEINLINE uint32_t CA_MK_OFF(const void *addr) { return ((uint32_t)(const uint32_t *)addr - s_casperRamBase); } #if 1 __STATIC_FORCEINLINE void Accel_done(void) { register uint32_t status; do { status = CASPER->STATUS; } while (0U == (status & CASPER_STATUS_DONE_MASK)); } __STATIC_FORCEINLINE void Accel_SetABCD_Addr(uint32_t ab, uint32_t cd) { CASPER->CTRL0 = ab | (cd << 16); /* CDoffset 
<< 16 | ABoffset */ } __STATIC_FORCEINLINE void Accel_crypto_mul(uint32_t ctrl1) { CASPER->CTRL1 = ctrl1; } #else #include "intrinsics.h" #define Accel_done() \ { \ register uint32_t status; \ do \ { \ status = CASPER_Rd32b(CASPER_CP_STATUS); \ } while (0 == (status & CASPER_STATUS_DONE_MASK)); \ } #if 0 __STATIC_FORCEINLINE void Accel_done(void) { register uint32_t status; do { status = CASPER->STATUS; } while (0 == (status & CASPER_STATUS_DONE_MASK)); } #endif #define Accel_SetABCD_Addr(ab, cd) CASPER_Wr32b((uint32_t)ab | ((uint32_t)cd << 16), CASPER_CP_CTRL0); #define Accel_crypto_mul(ctrl1) CASPER_Wr32b((uint32_t)ctrl1, CASPER_CP_CTRL1); #endif __STATIC_FORCEINLINE uint32_t Accel_IterOpcodeResaddr(uint32_t iter, uint32_t opcode, uint32_t resAddr) { return CASPER_CTRL1_ITER(iter) | CASPER_CTRL1_MODE(opcode) | (resAddr << 16); } void CASPER_MEMCPY(void *dst, const void *src, size_t siz) { bool bdst = ((((uint32_t)(uint32_t *)dst) | 0x10000000u) >= ((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) && (((uint32_t)(uint32_t *)dst) | 0x10000000u) < ((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) + 8u * 1024u); bool bsrc = ((((uint32_t)(const uint32_t *)src) | 0x10000000u) >= ((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) && (((uint32_t)(const uint32_t *)src) | 0x10000000u) < ((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) + 8u * 1024u); if (bdst && bsrc) { CASPER_MEMCPY_I2I(dst, src, siz); } else if (bdst && !bsrc) { CASPER_MEMCPY_N2I(dst, src, siz); } else if (!bdst && bsrc) { CASPER_MEMCPY_I2N(dst, src, siz); } else { (void)memcpy(dst, src, siz); } } /* Constant time select c = a if m = 0 or * c = b if m = 1 * a, b, c are n words */ static void casper_select(uint32_t *c, uint32_t *a, uint32_t *b, int m, int n) { uint32_t m1 = 0U - (uint32_t)m, m2 = ~m1; int i; for (i = 0; i < n; i++) { SET_WORD(&c[i], (GET_WORD(&a[i]) & m2) | (GET_WORD(&b[i]) & m1)); } } /* Compute R`, which is R mod N. 
This is done using subtraction */ /* R has 1 in N_wordlen, but we do not fill it in since borrowed. */ /* Exp-pubkey only used to optimize for exp=3 */ void MultprecMontCalcRp(unsigned Rp[], const unsigned exp_pubkey, const unsigned Nmod[]) { uint32_t i; /* R is 2^n where n is 1 bit longer than Nmod, so 1 followed by 32 or 64 0 words for example */ /* Note that Nmod's upper most bit has to be 1 by definition, so one subtract is enough. We */ /* do not set the 1 since it is "borrowed" so no point */ PreZeroW(i, Rp); Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0); Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(Rp))); Accel_done(); /* final borrow cannot happen since we know we started with a larger number */ } /* MultprecMultiply - multiple w=u*v (per Knuth) */ /* w_out is 2x the size of u and v */ void MultprecMultiply(unsigned w_out[], const unsigned u[], const unsigned v[]) { uint32_t i, j; /* Knuth 4.3.1 - Algorithm M */ /* Compute w = u * v */ /* u and v are N bits long in 32 bit word form */ /* w is 2*N bits long in 32 bit word form */ /* Note: We just multiply in place */ /* Step 1. Fill w[t-1:0] with 0s, the upper half will be written as we go */ PreZeroW(i, w_out); /* We do 1st pass NOSUM so we do not have to 0 output */ Accel_SetABCD_Addr(CA_MK_OFF(&v[0]), CA_MK_OFF(u)); Accel_crypto_mul( Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpMul6464NoSum, CA_MK_OFF(&w_out[0]))); Accel_done(); /* Step 2. iterate over N words of v using j */ for (j = 2U; j < N_wordlen; j += 2U) { /* Step 2b. Check for 0 on v word - skip if so since we 0ed already */ /* Step 3. 
Iterate over N words of u using i - perform Multiply-accumulate */ if (0U != (GET_WORD(&v[j])) || 0U != (GET_WORD(&v[j + 1U]))) { Accel_SetABCD_Addr(CA_MK_OFF(&v[j]), CA_MK_OFF(u)); Accel_crypto_mul( Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpMul6464Sum, CA_MK_OFF(&w_out[j]))); Accel_done(); } } } /* MultprecModulo performs divide to get remainer as needed for RSA */ /* This performs (q,r) = u/v, but we do not keep q */ /* r_out is module (remainder) and is 2*N */ /* u is in r_out (1st N) at start (passed in) */ /* v is N long */ void MultprecModulo(unsigned r_out[], const unsigned v[], int top) { uint64_t u64; /* use 64 bit math mixed with 32 bit */ unsigned u32; /* allows us to work on U in 32 bit */ unsigned u_n, ul16, uh16, *u_shft; /* u_shft is because r_out is u initially */ unsigned vl16, vh16, v_Nm1; unsigned q_hat, r_hat, q_over; unsigned borrow, carry; uint32_t i; int j, tmp; /* Knuth 4.3.1 - Algorithm D */ /* Compute q = u / v giving remainder r = u mod v */ /* -- we only want r, so we build qhat but do not store the Qs */ /* v is N long, with u,q,r 2N long because u is slowly replavced by r. */ /* We normalize/unnormlize per Knuth in the buffer (not copied) */ /* Step 1. Normalize value so MSb is in v[n-1]. Remember that v is */ /* the public key - to call it a 2048 bit number, they cannot have 0 */ /* in the MSb (or it would be less than 2048 bits) and so we know we */ /* are normalized already. Therefore, u is effectively shifted already. */ /* For u, we have it in r_out. u[n] holds any overflow */ /* Since divide on CM3/4 is 32/32=32, we break into 16 bit halves, but */ /* multiply can be 32x32=64. */ u_n = 0; u_shft = r_out; /* u (shifted) is in r_out */ v_Nm1 = GET_WORD(&v[N_wordlen - 1U]); /* MSw of public key */ vl16 = v_Nm1 & 0xFFFFU; /* lower 16 */ vh16 = v_Nm1 >> 16; /* upper 16 */ /* Step 2. Iterate j from m-n down to 0 (M selected per Knuth as 2*N) */ for (j = top; j >= 0; j--) { /* Step 3. 
estimate q_hat as (U[j+n]*B + U[j+n-1]) / V[n-1] */ /* Note: using subset of Knuth algo since v is 1/2 len of u (which is */ /* from multiply or x^2 leading into this). */ u32 = u_n; /* pickup u4u3u2, knowing u4 is 0 */ u64 = ((uint64_t)u_n << 32) | GET_WORD(&u_shft[(uint32_t)j + N_wordlen - 1U]); ul16 = (unsigned int)(u64 & 0xFFFFU); /* lower 16 */ uh16 = (unsigned int)((u64 >> 16) & 0xFFFFU); /* upper 16 */ /* we see if even possible (u large enough relative to v) */ if ((u32 - v_Nm1) <= u32) { u32 -= v_Nm1; q_over = 1; /* overflow from the sub */ } else { q_over = 0; } /* q_hat = u32 / vh16 -- is the upper partial value */ /* estimate; if too much, then back down by 1 or 2 */ q_hat = u32 / vh16; r_hat = u32 - (q_hat * vh16); /* see if Q is more than 16 bits or remainder is too large (over div) */ if ((q_hat == 0x10000U) || ((q_hat * vl16) > ((r_hat << 16) | uh16))) { /* too much - undo a division */ q_hat--; r_hat += vh16; /* check if still too much */ if ((r_hat < 0x10000U) && ((q_hat * vl16) > ((r_hat << 16) | uh16))) { q_hat--; /* yes, so undo a 2nd */ } } /* compose u3u2uh16, then sub q_hat*v if OK */ u64 = (((uint64_t)u32 << 16) | uh16) - ((uint64_t)q_hat * v_Nm1); if (0U != (u64 >> 48)) { /* no, so add v back */ u32 = (unsigned)(u64 + v_Nm1); q_hat--; } else { u32 = (unsigned)u64; } tmp = (int32_t)(uint32_t)(q_hat << 16); /* quotient upper part */ /* divide lower part: q = u2uh16ul16 / v. 
*/ /* estimate and add back if over divdied */ q_hat = u32 / vh16; r_hat = u32 - (q_hat * vh16); if ((q_hat == 0x10000U) || ((q_hat * vl16) > ((r_hat << 16) | ul16))) { /* too much - undo a division */ q_hat--; r_hat += vh16; /* check if still too much */ if ((r_hat < 0x10000U) && ((q_hat * vl16) > ((r_hat << 16) | ul16))) { q_hat--; /* yes, so undo a 2nd */ } } /* compose u2uh16ul16, then sub q_hat*v if OK */ u64 = (((uint64_t)u32 << 16) | ul16) - ((uint64_t)q_hat * v_Nm1); if (0U != (u64 >> 48)) { /* no, so add v back */ r_hat = (unsigned)(u64 + v_Nm1); q_hat--; } else { r_hat = (unsigned)u64; } q_hat |= (unsigned)tmp; /* other half of the quotient */ while ((q_over != 0U) || ((uint64_t)q_hat * GET_WORD(&v[N_wordlen - 2U])) > ((1ULL << 32) * r_hat) + (uint64_t)GET_WORD(&u_shft[(uint32_t)j + N_wordlen - 2U])) { /* if Qhat>b, then reduce to b-1, then adjust up Rhat */ q_hat--; r_hat += v_Nm1; if (r_hat < v_Nm1) { break; /* no overflow */ /* else repeat since Rhat >= b */ } } /* Step 4. Multiply and subtract. We know the amount, */ /* so we do the schoolboy math. Have to do on */ /* the large value. */ if (q_hat != 0U) { borrow = 0; for (i = 0; i < N_wordlen; i++) { u64 = (uint64_t)q_hat * GET_WORD(&v[i]) + borrow; borrow = (unsigned)(u64 >> 32); if (GET_WORD(&u_shft[i + (unsigned)j]) < (unsigned)u64) { borrow++; /* carry the overflow */ } SET_WORD(&u_shft[i + (unsigned)j], GET_WORD(&u_shft[i + (unsigned)j]) - (unsigned)u64); } u_n -= borrow; /* overflow from shift left does not fit otherwise */ } /* Store 5. (update Q - we don't), and add back V to remainder if we over-subtracted */ /* That restores remainder to correct (we could only be off by 1) */ /* This should happen very rarely. */ if (u_n != 0U) { carry = 0; for (i = 0; i < N_wordlen; i++) { SET_WORD(&u_shft[i + (unsigned)j], GET_WORD(&u_shft[i + (unsigned)j]) + carry); carry = (GET_WORD(&u_shft[i + (unsigned)j]) < carry) ? 
1U : 0U; SET_WORD(&u_shft[i + (unsigned)j], GET_WORD(&u_shft[i + (unsigned)j]) + GET_WORD(&v[i])); if (GET_WORD(&u_shft[i + (unsigned)j]) < GET_WORD(&v[i])) { carry++; } } } u_n = GET_WORD( &u_shft[(uint32_t)j + N_wordlen - 1U]); /* hold upper part of u to catch overflow (to borrow from) */ } /* low N bits of r are valid as remainder */ } /* We convert X into a Mont form number. Note length of arrays: */ /* x is N_wordlen, Nmod is N_wordlen */ /* Rp is N_wordlen (it is R` which is R mod N) */ /* Xmont_out is N_wordlen*2+1 */ void MultprecMontPrepareX(unsigned Xmont_out[], const unsigned x[], const unsigned Rp[], const unsigned Nmod[]) { MultprecMultiply(Xmont_out, x, Rp); MultprecModulo(Xmont_out, Nmod, (int32_t)N_wordlen); } void MultprecGenNp64(const unsigned *Nmod, unsigned *np64_ret) /* only pass the low order double word */ { uint64_t nprime, Nmod_0; Nmod_0 = GET_WORD(&Nmod[0]) | ((uint64_t)GET_WORD(&Nmod[1]) << 32); #define COMP_NPN_1 ((2U - Nmod_0 * nprime) * nprime) /* computes N`*N0=1 mod 2^P where P is the partial built up */ nprime = (((2U + Nmod_0) & 4U) << 1) + Nmod_0; /* mod 2^4 */ nprime = COMP_NPN_1; nprime = COMP_NPN_1; nprime = COMP_NPN_1; nprime = COMP_NPN_1; /* 8 multiplies of uint64_t */ *((uint64_t *)(uintptr_t)np64_ret) = (~0ULL - nprime) + 1ULL; } /* CIOS Multiply. This is the Coarse Integrated form where the values are */ /* multiplied and reduced for each step of "i". This uses less memory and */ /* is faster as a result. Note that this is used to square as well as mul, */ /* so not as fast as pure squaring could be. 
*/
/* w_out = a * b * R^-1 mod Nmod, or, when a is NULL, w_out = w_out * R^-1 mod Nmod (reduce only). */
/* w_out needs N_wordlen + 4 words: two extra double words above the result are used for carries. */
/* Np points at the 64-bit N` produced by MultprecGenNp64(). */
/* The final compare against Nmod is not constant time (it stops at the first differing word). */
void MultprecCiosMul(
    unsigned w_out[], const unsigned a[], const unsigned b[], const unsigned Nmod[], const unsigned *Np)
{
    int j;
    uint32_t i;
    uint64_t *m64 = (uint64_t *)(uintptr_t)&msg_ret[kCASPER_RamOffset_M64];
    uint64_t Np64;
    uint64_t carry;
    uint64_t *a64, *b64, *w64, *N64;

    /* Np is only guaranteed to be aligned for `unsigned`, so copy rather than dereference as uint64_t. */
    (void)memcpy(&Np64, Np, sizeof(Np64));

    a64 = (uint64_t *)(uintptr_t)a;
    b64 = (uint64_t *)(uintptr_t)b;
    w64 = (uint64_t *)(uintptr_t)w_out;
    N64 = (uint64_t *)(uintptr_t)Nmod;

    if (a != NULL)
    { /* if !a, we are reducing only */
        PreZeroW(i, w_out);
    }
    SET_DWORD(&w64[N_dwordlen], 0ULL);
    SET_DWORD(&w64[N_dwordlen + 1U], 0ULL);
    /* with accelerator */

    /* loop i and then reduce after each j round */
    for (i = 0; i < N_dwordlen; i++)
    {
        /* Step 3. Iterate over N words of u using i - perform Multiply-accumulate */
        /* push-pull: we do a*b and then separately m*n (reduce) */
        if (a != NULL)
        { /* if mul&reduce vs. reduce only */
            carry = GET_DWORD(&w64[N_dwordlen]);
            Accel_SetABCD_Addr(CA_MK_OFF(&b64[i]), CA_MK_OFF(a64));
            Accel_crypto_mul(
                Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(w64)));
            Accel_done();
            /* max carry is contained since ~0*~0=0xFFFE0001+0xFFFF=0xFFFF0000, */
            /* so max carry is 0xFFFF and 0xFFFF0000+0xFFFF=0xFFFFFFFF */
            /* accel took care of w_out[N_wordlen] & +1, so we just take care of the next double word if carry=1 */
            /* w64[N_dwordlen+1] = g_carry; */
            carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);
            SET_DWORD(&w64[N_dwordlen + 1U], carry);
        }
        SET_DWORD(&m64[0], GET_DWORD(&w64[0]) * Np64); /* prime for 1st; modulo a double-word */

        /* we are reducing, so the 1st [0th] 64 bit value product is tossed, but we */
        /* need its carry. We let the accel do this separately - really need a mode to */
        /* do this "reduce" since it is natural */
        carry = GET_DWORD(&w64[N_dwordlen]);
        Accel_SetABCD_Addr(CA_MK_OFF(m64), CA_MK_OFF(&N64[0]));
        Accel_crypto_mul(
            Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(&w64[0])));
        Accel_done();
        carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);

        /* shift the accumulator down by one double word (drops the now-zero lowest digit) */
        Accel_SetABCD_Addr(CA_MK_OFF(&w64[1]), 0);
        Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpCopy, CA_MK_OFF(&w64[0])));

        Accel_done();
        SET_DWORD(&w64[N_dwordlen], (GET_DWORD(&w64[N_dwordlen + 1U]) + carry));
    }

    /* now check if need to subtract Nmod */
    if (0U != (GET_WORD(&w_out[N_wordlen])))
    {
        j = 1; /* we have to subtract for sure if carry up */
    }
    else
    {
        /* Compare w_out with Nmod from the most significant word down to word 0. */
        /* If every word matches, w_out == Nmod and must be reduced as well, hence the default of 1. */
        j = 1;
        for (i = (uint32_t)N_wordlen; i > 0U; i--)
        {
            const uint32_t wWord = GET_WORD(&w_out[i - 1U]);
            const uint32_t nWord = GET_WORD(&Nmod[i - 1U]);

            if (wWord != nWord)
            {
                j = (int32_t)(wWord > nWord); /* if larger sub */
                break;                        /* we would remove the break if worrying about side channel */
            }
        }
    }
    if (0 == j)
    {
        return; /* Is smaller than Nmod, so done. */
    }
    Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(w_out)));
    Accel_done();
    /* last borrow is OK since we know the value could only be < 2N, so one subtraction suffices */
}

/* RSA_MontSignatureToPlaintextFast: */
/* MsgRet[] = Message return buffer - must be large enough to hold input and output (4*N+2) */
/* exp_pubkey = the "e" that the value is raised to. Usually 3 or 0x10001.
*/ /* signature = N bitpos len long "message" to process in Montgomery form - so saving conversion (divide) */ /* pubkey = N bitpos len long public key to process signature with */ /* returns: 0 */ /* */ /* Algo: compute M = signaturen^e mod public_key */ /* where M is original plaintext, signature is signed value */ /* note: e is usually either 0x3 or 0x10001 */ int RSA_MontSignatureToPlaintextFast(const unsigned mont_signature[N_wordlen_max], const unsigned exp_pubkey, const unsigned pubkey[N_wordlen_max], unsigned MsgRet[WORK_BUFF_MUL4]) { int bidx = 0; int bitpos; unsigned np64[2]; /* MsgRet working area: */ /* 0..N = RESULT, starting with S` */ /* N..N*2 = S` and then working BASE during math. */ /* N*2..N*4+2 = temp working area for Mont mul */ /* 1. Copy sig into MsgRet so we have one working result buffer */ CASPER_MEMCPY_I2I((uint32_t *)(uintptr_t)&MsgRet[kCASPER_RamOffset_Result], (const uint32_t *)(uintptr_t)mont_signature, N_bytelen); MultprecGenNp64(pubkey, np64); /* Generate N` from LSW of N (LSW being lowest 64b word) */ bitpos = (int8_t)(uint8_t)(31U - __CLZ(exp_pubkey)); /* count of bits after the left most 1 */ while (--bitpos >= 0) { /* This operates on: */ /* result = 1; */ /* base = signature */ /* loop while exponent bits from MSb to LSb */ /* if (exp bit is 1) */ /* result = result * base */ /* base = base^2 */ /* Because the MSb of exp is always 1 by definition, we can invert this a bit: */ /* base = signature` */ /* result = base; equivalent to result = 1*base from 1st pass, but now square is needed 1st */ /* loop while exponent bits from MSb-1 to LSb */ /* base = base^2 */ /* if (exp bit is 1) */ /* result = result * base */ /* This ends up doing the same thing but skips two wasteful steps of multiplying by 1 and */ /* a final squaring never used. */ /* */ /* Next we have the problem that CIOS mul needs a separate dest buffer. So, we bounce */ /* base between base and temp, and likewise for result. */ MultprecCiosMul(&MsgRet[(bidx != 0) ? 
kCASPER_RamOffset_Base : kCASPER_RamOffset_TempBase], &MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base], &MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base], pubkey, np64); if (0U != (exp_pubkey & (uint32_t)(uint8_t)(1U << (uint8_t)bitpos))) /* where e is 1 */ { /* result has result, so we need to work into other temp area */ MultprecCiosMul(&MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base], &MsgRet[kCASPER_RamOffset_Result], &MsgRet[(bidx != 0) ? kCASPER_RamOffset_Base : kCASPER_RamOffset_TempBase], pubkey, np64); /* we have to copy back to result */ // CASPER_MEMCPY_I2I(&MsgRet[kCASPER_RamOffset_Result], // &MsgRet[bidx ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base], N_bytelen); } else { bidx = (int32_t)(uint32_t) ~(unsigned)bidx; } } CASPER_MEMCPY_I2I((uint32_t *)(uintptr_t)&MsgRet[kCASPER_RamOffset_Result], (uint32_t *)(uintptr_t)&MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base], N_bytelen); /* final step is one more reduction to get back to normal form (ie. divide R out) */ MultprecCiosMul(&MsgRet[kCASPER_RamOffset_Result], NULL, NULL, pubkey, np64); return (0); /* always 0 */ } /* RSA_SignatureToPlaintextFast: */ /* MsgRet[] = Message return buffer - must be large enough to hold input and output (4*N+2) */ /* exp_pubkey = the "e" that the value is raised to. Usually 3 or 0x10001. 
*/
/* signature = N bitpos len long "message" to process in normal form - so converted to Mont form */
/* pubkey = N bitpos len long public key to process signature with */
/* returns: 0 */
/* */
/* Algo: compute M = signature^e mod public_key */
/*       where M is original plaintext, signature is signed value */
/* note: e is usually either 0x3 or 0x10001 */
int RSA_SignatureToPlaintextFast(const unsigned signature[N_wordlen_max],
                                 const unsigned exp_pubkey,
                                 const unsigned pubkey[N_wordlen_max],
                                 unsigned MsgRet[WORK_BUFF_MUL4])
{
    /* Layout of MsgRet while this runs: */
    /*   0..N        RESULT - holds R` only while S` is being built, then S` */
    /*   N..N*2      S`, later the working BASE (building S` may spill past N*2) */
    /*   N*2..N*4+2  scratch for the Montgomery multiply */
    unsigned *const resultArea = &MsgRet[kCASPER_RamOffset_Result];
    unsigned *const baseArea   = &MsgRet[kCASPER_RamOffset_Base];
    int status;

    /* R` (= R mod N) goes into the RESULT area */
    MultprecMontCalcRp(resultArea, exp_pubkey, pubkey);

    /* X*R1` mod N: the signature in Montgomery form goes into the BASE area */
    MultprecMontPrepareX(baseArea, signature, resultArea, pubkey);

    /* exponentiate the Montgomery-form signature; the plaintext ends up in MsgRet */
    status = RSA_MontSignatureToPlaintextFast(baseArea, exp_pubkey, pubkey, MsgRet);
    return (status);
}

/*!
 * brief Performs modular exponentiation - (A^E) mod N.
 *
 * This function performs modular exponentiation.
 *
 * The modulus and the signature are copied into CASPER RAM, RSA_SignatureToPlaintextFast()
 * is run on them, and N_bytelen bytes starting at msg_ret are copied out to plaintext.
 * The return value of RSA_SignatureToPlaintextFast() is discarded.
 *
 * param base CASPER base address (not referenced by this function)
 * param signature value that is raised to pubE (in little endian format).
 *        NOTE(review): this was described as "first addend", which does not match the operation.
 * param pubN modulus (in little endian format)
 * param wordLen Size of pubN. NOTE(review): this was documented as "in bytes", but the value is
 *        stored unchanged in the global N_wordlen, so it is presumably a count of 32-bit words - confirm.
 * param pubE exponent
 * param[out] plaintext Output array to store result of operation (in little endian format)
 */
void CASPER_ModExp(
    CASPER_Type *base, const uint8_t *signature, const uint8_t *pubN, size_t wordLen, uint32_t pubE, uint8_t *plaintext)
{
/* Locations inside CASPER RAM (msg_ret) where the modulus and the signature are staged. */
/* Note: these macros are not #undef'd and stay defined for the rest of the translation unit. */
#define PK_LOC  &msg_ret[kCASPER_RamOffset_Modulus]
#define SIG_LOC &msg_ret[(unsigned)kCASPER_RamOffset_Modulus + N_wordlen_max]

    N_wordlen = wordLen; /* set global variable for key length - used by RSA_SignatureToPlaintextFast() */

    /* stage modulus and signature: non-interleaved caller memory -> CASPER RAM */
    CASPER_MEMCPY_N2I(PK_LOC, (const uint32_t *)(uintptr_t)pubN, N_bytelen);
    CASPER_MEMCPY_N2I(SIG_LOC, (const uint32_t *)(uintptr_t)signature, N_bytelen);

    /* msg_ret doubles as the work buffer; the result is left at its start */
    (void)RSA_SignatureToPlaintextFast((const unsigned *)(uintptr_t)(SIG_LOC), pubE,
                                       (const unsigned *)(uintptr_t)(PK_LOC), (unsigned int *)(uintptr_t)msg_ret);

    /* copy the result out: CASPER RAM -> non-interleaved caller memory */
    CASPER_MEMCPY_I2N((uint32_t *)(uintptr_t)plaintext, msg_ret, N_bytelen);
}

/*!
 * brief Enables clock and disables reset for CASPER peripheral.
 *
 * Enable clock and disable reset for CASPER.
 *
 * param base CASPER base address
 */
void CASPER_Init(CASPER_Type *base)
{
#if !(defined(FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL) && FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL)
#if defined(CASPER_CLOCKS)
    CLOCK_EnableClock(kCLOCK_Casper);
#endif
#endif /* FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL */
#if defined(CASPER_RSTS)
    RESET_PeripheralReset(kCASPER_RST_SHIFT_RSTn);
#endif
#if defined(FSL_FEATURE_CASPER_RAM_HW_INTERLEAVE) && (FSL_FEATURE_CASPER_RAM_HW_INTERLEAVE > 0)
    /* Enable hardware interleaving to RAMX0 and RAMX1 for CASPER */
    SYSCON->CASPER_CTRL = SYSCON_CASPER_CTRL_INTERLEAVE(1);
#endif /* FSL_FEATURE_CASPER_RAM_HW_INTERLEAVE */
    /* If Casper init is called with secure address, use secure address also for accessing Casper RAM.
*/ s_casperRamBase = (unsigned)CASPER_RAM_BASE_NS | ((uint32_t)base & 0x10000000u); msg_ret = (uint32_t *)s_casperRamBase; } /*! * brief Disables clock for CASPER peripheral. * * Disable clock and enable reset. * * param base CASPER base address */ void CASPER_Deinit(CASPER_Type *base) { #if defined(CASPER_RSTS) RESET_SetPeripheralReset(kCASPER_RST_SHIFT_RSTn); #endif #if !(defined(FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL) && FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL) #if defined(CASPER_CLOCKS) CLOCK_DisableClock(kCLOCK_Casper); #endif #endif /* FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL */ } /* New ECC code which uses Casper. */ /* Set the prime modulus mod in Casper memory. */ void CASPER_ecc_init(casper_algo_t curve) { uint32_t *mod; if (curve == kCASPER_ECC_P256) { N_wordlen = 256U / 32U; mod = NISTp256; } if (curve == kCASPER_ECC_P384) { N_wordlen = 384U / 32U; mod = NISTp384; } if (curve == kCASPER_ECC_P521) { N_wordlen = 576U / 32U; mod = NISTp521; } CASPER_MEMCPY(&CASPER_MEM[(N_wordlen + 4U)], mod, N_wordlen * sizeof(uint32_t)); uint8_t a[((CASPER_MAX_ECC_SIZE_WORDLEN + 4U) - CASPER_MAX_ECC_SIZE_WORDLEN) * sizeof(uint32_t)] = {0}; CASPER_MEMCPY(&CASPER_MEM[(N_wordlen + 4U) + N_wordlen], a, ((N_wordlen + 4U) - N_wordlen) * sizeof(uint32_t)); } void CASPER_ECC_equal(int *res, uint32_t *op1, uint32_t *op2) { uint32_t a[CASPER_MAX_ECC_SIZE_WORDLEN] = {0}; uint32_t b[CASPER_MAX_ECC_SIZE_WORDLEN] = {0}; uint32_t c = 0; CASPER_MEMCPY(a, op1, N_wordlen * sizeof(uint32_t)); CASPER_MEMCPY(b, op2, N_wordlen * sizeof(uint32_t)); do { uint32_t _i; c = (a[0] ^ b[0]); for (_i = 1; _i < N_wordlen; _i++) { c |= (a[_i] ^ b[_i]); } } while (false); *res = (int32_t)c; } void CASPER_ECC_equal_to_zero(int *res, uint32_t *op1) { uint32_t a[CASPER_MAX_ECC_SIZE_WORDLEN] = {0}; uint32_t c = 0; CASPER_MEMCPY(a, op1, N_wordlen * sizeof(uint32_t)); do { uint32_t _i; c = a[0]; for (_i = 1; _i < N_wordlen; _i++) { c |= a[_i]; } } while (false); *res = (int32_t)c; } void CASPER_ECC_SECP256R1_Mul( 
CASPER_Type *base, uint32_t resX[8], uint32_t resY[8], uint32_t X[8], uint32_t Y[8], uint32_t scalar[8]) { uint32_t X1[8] = {0}; uint32_t Y1[8] = {0}; toMontgomery_ECC_P256(X1, X); toMontgomery_ECC_P256(Y1, Y); CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], X1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], Y1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t)); Jac_scalar_multiplication( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 7U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 8U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], scalar, NISTp256, NISTp256_q); Jac_toAffine( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 7U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 8U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]); /* Montgomery to Normal */ /* X_normal = 1 * X_montgomery; Y_normal = 1 * Y_montgomery */ uint32_t one[(kCASPER_ECC_P256_wordlen + 4U)] = {0x0}; one[0] = 0x1u; CASPER_MEMCPY( &CASPER_MEM[(2U * 
((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], one, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t)); multiply_casper( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]); multiply_casper( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]); /* copy out to result */ CASPER_MEMCPY( resX, &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( resY, &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t)); } void CASPER_ECC_SECP256R1_MulAdd(CASPER_Type *base, uint32_t resX[8], uint32_t resY[8], uint32_t X1[8], uint32_t Y1[8], uint32_t scalar1[8], uint32_t X2[8], uint32_t Y2[8], uint32_t scalar2[8]) { uint32_t zeroes[(kCASPER_ECC_P256_wordlen + 4U)] = {0}; CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], X1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], Y1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * 
((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], X2, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], Y2, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t)); toMontgomery_ECC_P256( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]); toMontgomery_ECC_P256( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]); toMontgomery_ECC_P256( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]); toMontgomery_ECC_P256( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]); CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], zeroes, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], zeroes, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], zeroes, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t)); double_scalar_multiplication( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * 
((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], scalar1, &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], scalar2); Jac_toAffine( &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]); uint32_t one[(kCASPER_ECC_P256_wordlen + 4U)] = {0x0}; one[0] = 0x1u; CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], one, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t)); multiply_casper( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]); multiply_casper( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen 
+ 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]); CASPER_MEMCPY(resX, (&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]), (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t)); CASPER_MEMCPY(resY, (&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]), (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t)); } void CASPER_ECC_SECP384R1_Mul( CASPER_Type *base, uint32_t resX[12], uint32_t resY[12], uint32_t X[12], uint32_t Y[12], uint32_t scalar[12]) { uint32_t X1[12] = {0}; uint32_t Y1[12] = {0}; toMontgomery_ECC_P384(X1, X); toMontgomery_ECC_P384(Y1, Y); CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], X1, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], Y1, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); Jac_scalar_multiplication( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 7U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 8U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], scalar, NISTp384, NISTp384_q); Jac_toAffine( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * 
((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 7U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 8U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]); /* Montgomery to Normal */ /* X_normal = 1 * X_montgomery; Y_normal = 1 * Y_montgomery */ uint32_t one[12] = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], one, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); multiply_casper( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]); multiply_casper( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]); /* copy out to result */ CASPER_MEMCPY( resX, &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( resY, &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); } void 
CASPER_ECC_SECP384R1_MulAdd(CASPER_Type *base, uint32_t resX[12], uint32_t resY[12], uint32_t X1[12], uint32_t Y1[12], uint32_t scalar1[12], uint32_t X2[12], uint32_t Y2[12], uint32_t scalar2[12]) { CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], X1, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], Y1, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], X2, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], Y2, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); toMontgomery_ECC_P384( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]); toMontgomery_ECC_P384( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]); toMontgomery_ECC_P384( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]); toMontgomery_ECC_P384( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]); double_scalar_multiplication( &CASPER_MEM[(2U * 
((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], scalar1, &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], scalar2); Jac_toAffine( &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]); uint32_t one[12] = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; CASPER_MEMCPY( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], one, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); multiply_casper( &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]); multiply_casper( 
&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)], &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]); CASPER_MEMCPY(resX, (&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]), (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); CASPER_MEMCPY(resY, (&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]), (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); } void CASPER_ECC_SECP521R1_Mul( CASPER_Type *base, uint32_t resX[18], uint32_t resY[18], uint32_t X[18], uint32_t Y[18], uint32_t scalar[18]) { uint32_t X1[18] = {0}; uint32_t Y1[18] = {0}; toMontgomery_ECC_P521(X1, X); toMontgomery_ECC_P521(Y1, Y); CASPER_MEMCPY( &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], X1, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], Y1, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t)); Jac_scalar_multiplication( &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 7U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 8U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * 
((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], scalar, NISTp521, NISTp521_q); Jac_toAffine( &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 7U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 8U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]); /* Montgomery to Normal */ /* X_normal = 1 * X_montgomery; Y_normal = 1 * Y_montgomery */ uint32_t one[18] = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; CASPER_MEMCPY( &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], one, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t)); multiply_casper( &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 0U * 
((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]); multiply_casper( &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]); /* copy out to result */ CASPER_MEMCPY( resX, &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( resY, &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) + 6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t)); } void CASPER_ECC_SECP521R1_MulAdd(CASPER_Type *base, uint32_t resX[18], uint32_t resY[18], uint32_t X1[18], uint32_t Y1[18], uint32_t scalar1[18], uint32_t X2[18], uint32_t Y2[18], uint32_t scalar2[18]) { uint32_t zeroes[(kCASPER_ECC_P521_wordlen + 4U)] = {0}; CASPER_MEMCPY( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], X1, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], Y1, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], X2, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], Y2, 
(uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t)); toMontgomery_ECC_P521( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]); toMontgomery_ECC_P521( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]); toMontgomery_ECC_P521( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]); toMontgomery_ECC_P521( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]); CASPER_MEMCPY( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], zeroes, ((uint32_t)kCASPER_ECC_P521_wordlen + 4U) * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], zeroes, ((uint32_t)kCASPER_ECC_P521_wordlen + 4U) * sizeof(uint32_t)); CASPER_MEMCPY( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], zeroes, ((uint32_t)kCASPER_ECC_P521_wordlen + 4U) * sizeof(uint32_t)); double_scalar_multiplication( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(2U * 
(uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], scalar1, &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], scalar2); Jac_toAffine( &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]); uint32_t one[(kCASPER_ECC_P521_wordlen + 4U)] = {0x0}; one[0] = 0x1u; CASPER_MEMCPY( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], one, ((uint32_t)kCASPER_ECC_P521_wordlen + 4U) * sizeof(uint32_t)); multiply_casper( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]); multiply_casper( &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)(uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)], &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]); CASPER_MEMCPY( 
resX, (&CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]), (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        resY, (&CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]),
        (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
}

// CIOS Multiply. This is the Coarse Integrated form where the values are
// multiplied and reduced for each step of "i". This uses less memory and
// is faster as a result. Note that this is used to square as well as mul,
// so not as fast as pure squaring could be.
/*
 * w_out : result limbs. Besides the N_dwordlen result double-words, the two
 *         double-words w64[N_dwordlen] and w64[N_dwordlen + 1] are written,
 *         so the buffer must provide that extra room.
 * a, b  : operands. When a is NULL the multiply-accumulate step is skipped and
 *         the current content of w_out is only reduced.
 * Nmod  : modulus limbs.
 * Np    : read as a single 64-bit value and multiplied with w[0] to form the
 *         per-round reduction factor m.
 *
 * CASPER_MEM[0..] (T1) is used as scratch for the final conditional subtraction.
 */
static void MultprecCiosMul_ct(
    uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np)
{
    uint32_t j;
    uint64_t *m64 = (uint64_t *)(uintptr_t)&msg_ret[kCASPER_RamOffset_M64];
    uint64_t Np64;
    uint64_t carry;
    uint64_t *a64, *b64, *w64, *N64;
    uint32_t *T1 = &CASPER_MEM[0], borrow;

    /* 64-bit views of the 32-bit limb arrays; all accesses go through GET_/SET_DWORD. */
    Np64 = *(uint64_t *)(uintptr_t)Np;
    a64  = (uint64_t *)(uintptr_t)a;
    b64  = (uint64_t *)(uintptr_t)b;
    w64  = (uint64_t *)(uintptr_t)w_out;
    N64  = (uint64_t *)(uintptr_t)Nmod;

    if (a != NULL)
    { /* if !a, we are reducing only */
        /* PreZeroW is defined elsewhere; presumably it clears w_out before accumulation. */
        PreZeroW(j, w_out);
    }
    /* Clear the two carry double-words above the result. */
    SET_DWORD(&w64[N_dwordlen], 0ULL);
    SET_DWORD(&w64[N_dwordlen + 1U], 0ULL);
    /* with accelerator */

    /* loop j and then reduce after each j round */
    for (j = 0; j < N_dwordlen; j++)
    {
        /* Step 3. Iterate over N words of u using j - perform Multiply-accumulate */
        /* push-pull: we do a*b and then separately m*n (reduce) */
        if (a != NULL)
        { /* if mul&reduce vs. reduce only */
            /* Remember the top double-word so an overflow out of it can be detected below. */
            carry = GET_DWORD(&w64[N_dwordlen]);
            Accel_SetABCD_Addr(CA_MK_OFF(&b64[j]), CA_MK_OFF(a64));
            Accel_crypto_mul(
                Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(w64)));
            Accel_done();
            /* max carry is contained since ~0*~0=0xFFFE0001+0xFFFF=0xFFFF0000, */
            /* so max carry is 0xFFFF and 0xFFFF0000+0xFFFF=0xFFFFFFFF */
            /* accel took care of w_out[N_wordlen] & +1, so we just take care of the next double word if carry=1 */
            /* w64[N_dwordlen+1] = g_carry; */
            /* The top double-word wrapped around iff it is now smaller than before. */
            carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);
            SET_DWORD(&w64[N_dwordlen + 1U], carry);
        }
        SET_DWORD(&m64[0], GET_DWORD(&w64[0]) * Np64); /* prime for 1st; modulo a double-word */

        /* we are reducing, so the 1st [0th] 64 bit value product is tossed, but we */
        /* need its carry. We let the accel do this separately - really need a mode to */
        /* do this "reduce" since it is natural */
        carry = GET_DWORD(&w64[N_dwordlen]);
        Accel_SetABCD_Addr(CA_MK_OFF(m64), CA_MK_OFF(&N64[0]));
        Accel_crypto_mul(
            Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(&w64[0])));
        Accel_done();
        carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);

        /* Copy w64[1..] down onto w64[0..], i.e. drop the lowest double-word. */
        Accel_SetABCD_Addr(CA_MK_OFF(&w64[1]), 0);
        Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpCopy, CA_MK_OFF(&w64[0])));
        Accel_done();
        /* Fold the saved overflow double-word and this round's carry into the new top double-word. */
        SET_DWORD(&w64[N_dwordlen], (GET_DWORD(&w64[N_dwordlen + 1U]) + carry));
    }

    /* now check if need to subtract Nmod */
    /* Keep the unreduced value (including the word at index N_wordlen) in T1. */
    CASPER_MEMCPY_I2I(T1, w_out, (N_wordlen + 1U) * sizeof(uint32_t));

    /* Compute w = w - N */
    Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(w_out)));
    Accel_done();

    // if w_out > T1 then there was a borrow
    borrow = (uint32_t)(GET_WORD(&((uint32_t *)w_out)[N_wordlen]) > GET_WORD(&T1[N_wordlen]));

    SET_WORD(&w_out[N_wordlen + 1U], 0);
    SET_WORD(&w_out[N_wordlen], 0);
    /* casper_select is defined elsewhere; from this call it appears to keep either the
     * subtracted value (w_out) or the saved one (T1) depending on borrow - confirm there. */
    casper_select(w_out, w_out, T1, (int32_t)borrow, (int16_t)(uint16_t)N_wordlen);
}

/* Compute
C = A - B % mod
 * Assumes all operands have two extra limbs to store carry.
 * CASPER_MEM[0..] is used as temporary storage (tmp).
 */
static void CASPER_montsub(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod)
{
    uint64_t *b64, *c64, *m64, *tmp;
    int borrow;

    b64 = (uint64_t *)(uintptr_t)B;
    c64 = (uint64_t *)(uintptr_t)C;
    m64 = (uint64_t *)(uintptr_t)mod;

    tmp = (uint64_t *)(uintptr_t)&CASPER_MEM[0];

    CASPER_MEMCPY(tmp, A, N_wordlen * sizeof(uint32_t));

    /* Compute tmp = A - B. */
    Accel_SetABCD_Addr(CA_MK_OFF(b64), 0);

    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(tmp)));
    Accel_done();

    /* A borrow is inferred from the most significant limb growing after the subtraction. */
    borrow = (int32_t)((GET_WORD(&((uint32_t *)(uintptr_t)tmp)[N_wordlen - 1U])) > GET_WORD(&A[N_wordlen - 1U]));
    CASPER_MEMCPY(c64, tmp, N_wordlen * sizeof(uint32_t));

    /* Compute C = Mod + tmp */
    Accel_SetABCD_Addr(CA_MK_OFF(m64), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpAdd64, CA_MK_OFF(c64)));
    Accel_done();

    /* Choose between tmp (A - B) and C (A - B + mod) according to borrow;
     * the exact argument convention of casper_select is defined elsewhere. */
    casper_select(C, (uint32_t *)(uintptr_t)tmp, C, borrow, (int16_t)(uint16_t)N_wordlen);
}

/* Compute C = A + B % mod
 * Assumes all operands have two extra limbs to store carry.
 * Note: the extra double-word limb of B and of mod is overwritten with zero here.
 */
static void CASPER_montadd(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod)
{
    uint64_t *b64, *c64, *m64, *tmp;
    int borrow;

    b64 = (uint64_t *)(uintptr_t)B;
    c64 = (uint64_t *)(uintptr_t)C;
    m64 = (uint64_t *)(uintptr_t)mod;

    tmp = (uint64_t *)(uintptr_t)&CASPER_MEM[0];

    CASPER_MEMCPY(tmp, A, N_wordlen * sizeof(uint32_t));
    /* Zero the carry limb of every operand taking part in the (N_wordlen / 2 + 1)-limb operations. */
    SET_DWORD(&tmp[N_wordlen / 2U], 0ULL);
    SET_DWORD(&b64[N_wordlen / 2U], 0ULL);
    SET_DWORD(&m64[N_wordlen / 2U], 0ULL);

    /* Compute tmp = A + B using one additional double-length limb. */
    Accel_SetABCD_Addr(CA_MK_OFF(b64), 0);

    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U, (uint32_t)kCASPER_OpAdd64, CA_MK_OFF(tmp)));
    Accel_done();

    CASPER_MEMCPY(c64, tmp, (N_wordlen + 2U) * sizeof(uint32_t));

    /* Compute C = Mod - tmp */
    /* NOTE(review): by analogy with CASPER_montsub ("tmp = A - B" with B as operand and tmp
     * as result) this sequence appears to compute C = tmp - Mod; confirm the operand order. */
    Accel_SetABCD_Addr(CA_MK_OFF(m64), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(c64)));
    Accel_done();

    // borrow = g_carry;
    /* The borrow is inferred from the carry limb growing after the subtraction. */
    borrow = (int32_t)(GET_WORD(&C[N_wordlen]) > GET_WORD(&(((uint32_t *)(uintptr_t)tmp)[N_wordlen])));
    casper_select(C, C, (uint32_t *)(uintptr_t)tmp, borrow, (int16_t)(uint16_t)N_wordlen);
}

/* Compute c = a/2 mod p where b is scratch space.
 * a is read again after c has been written, so c must not alias a;
 * b receives a >> 1 and must not alias a or c.
 * The limbs at CASPER_MEM[N_wordlen + 4] are added as p (multiply_casper passes the
 * same location as the modulus).
 */
static void CASPER_half(uint32_t *c, uint32_t *a, uint32_t *b)
{
    shiftright(b, a, 1U); /* Compute a/2 and (a+p)/2 */

    /* Compute tmp = a + p using one additional double-length limb. */
    CASPER_MEMCPY(c, a, N_wordlen * sizeof(uint32_t));
    SET_WORD(&c[N_wordlen], 0);
    SET_WORD(&c[N_wordlen + 1U], 0U);

    Accel_SetABCD_Addr(CA_MK_OFF(((uint64_t *)(uintptr_t)&CASPER_MEM[(N_wordlen + 4U)])), 0);
    Accel_crypto_mul(
        Accel_IterOpcodeResaddr(N_wordlen / 2U, (uint32_t)kCASPER_OpAdd64, CA_MK_OFF(((uint64_t *)(uintptr_t)c))));
    Accel_done();

    shiftright(c, c, 1U);
    /* shiftright only handles N_wordlen limbs: move the carry limb's low bit into the top bit by hand. */
    SET_WORD(&c[N_wordlen - 1U], GET_WORD(&c[N_wordlen - 1U]) | (GET_WORD(&c[N_wordlen]) << 31));
    SET_WORD(&c[N_wordlen], 0U);
    /* The parity of a decides between b (a/2) and c ((a+p)/2). */
    casper_select(c, b, c, (int32_t)(uint32_t)(GET_WORD(&a[0]) & 1U), (int16_t)(uint16_t)(N_wordlen));
}

/* Read one 32-bit word through the GET_WORD access macro. */
static uint32_t casper_get_word(uint32_t *addr)
{
    return GET_WORD(addr);
}

/* Shift right by 1 <= c <= 31. z[] and x[] in system RAM, no interleaving macros used.
*/
/* Limbs are processed from least to most significant, so z may be the same array as x.
 * For N_wordlen of 8, 12 or 18 all N_wordlen limbs are produced; for any other value
 * only limbs 0..6 are written (x[7] is still read for limb 6).
 */
static void shiftrightSysram(uint32_t *z, uint32_t *x, uint32_t c)
{
    const uint32_t fill = 32U - c; /* left shift that brings in bits from the next higher limb */
    uint32_t top        = 0U;      /* index of the most significant limb, 0 = unsupported size */
    uint32_t pairs;
    uint32_t i;

    if (N_wordlen == 18U)
    {
        top = 17U;
    }
    else if (N_wordlen == 12U)
    {
        top = 11U;
    }
    else if (N_wordlen == 8U)
    {
        top = 7U;
    }
    else
    {
        /* unsupported size: leave top at 0 */
    }

    /* Limbs that combine two neighbouring source limbs. */
    pairs = (top != 0U) ? top : 7U;
    for (i = 0U; i < pairs; i++)
    {
        z[i] = (x[i + 1U] << fill) | (x[i] >> c);
    }

    /* The most significant limb has no higher neighbour. */
    if (top != 0U)
    {
        z[top] = (x[top] >> c);
    }
}

/* Shift right by 1 <= c <= 31.
*/
/* Same operation as shiftrightSysram, but every limb is accessed through
 * GET_WORD / SET_WORD. Limbs are processed from least to most significant, so z may
 * be the same array as x. For N_wordlen of 8, 12 or 18 all N_wordlen limbs are
 * produced; for any other value only limbs 0..6 are written.
 */
static void shiftright(uint32_t *z, uint32_t *x, uint32_t c)
{
    const uint32_t fill = 32U - c; /* left shift that brings in bits from the next higher limb */
    uint32_t top        = 0U;      /* index of the most significant limb, 0 = unsupported size */
    uint32_t pairs;
    uint32_t i;

    if (N_wordlen == 18U)
    {
        top = 17U;
    }
    else if (N_wordlen == 12U)
    {
        top = 11U;
    }
    else if (N_wordlen == 8U)
    {
        top = 7U;
    }
    else
    {
        /* unsupported size: leave top at 0 */
    }

    /* Limbs that combine two neighbouring source limbs. */
    pairs = (top != 0U) ? top : 7U;
    for (i = 0U; i < pairs; i++)
    {
        SET_WORD(&z[i], (GET_WORD(&x[i + 1U]) << fill) | (GET_WORD(&x[i]) >> c));
    }

    /* The most significant limb has no higher neighbour. */
    if (top != 0U)
    {
        SET_WORD(&z[top], (GET_WORD(&x[top]) >> c));
    }
}

/* Shift left by 1 <= c <= 31.
*/ static void shiftleft(uint32_t *z, uint32_t *x, uint32_t c) { if (N_wordlen == 18U) { SET_WORD(&z[17], (GET_WORD(&x[17]) << (c)) | GET_WORD(&z[16]) >> (32U - (c))); SET_WORD(&z[16], (GET_WORD(&x[16]) << (c)) | GET_WORD(&z[15]) >> (32U - (c))); SET_WORD(&z[15], (GET_WORD(&x[15]) << (c)) | GET_WORD(&z[14]) >> (32U - (c))); SET_WORD(&z[14], (GET_WORD(&x[14]) << (c)) | GET_WORD(&z[13]) >> (32U - (c))); SET_WORD(&z[13], (GET_WORD(&x[13]) << (c)) | GET_WORD(&z[12]) >> (32U - (c))); SET_WORD(&z[12], (GET_WORD(&x[12]) << (c)) | GET_WORD(&z[11]) >> (32U - (c))); SET_WORD(&z[11], (GET_WORD(&x[11]) << (c)) | GET_WORD(&z[10]) >> (32U - (c))); SET_WORD(&z[10], (GET_WORD(&x[10]) << (c)) | GET_WORD(&z[9]) >> (32U - (c))); SET_WORD(&z[9], (GET_WORD(&x[9]) << (c)) | GET_WORD(&z[8]) >> (32U - (c))); SET_WORD(&z[8], (GET_WORD(&x[8]) << (c)) | GET_WORD(&z[7]) >> (32U - (c))); } if (N_wordlen == 12U) { SET_WORD(&z[11], (GET_WORD(&x[11]) << (c)) | GET_WORD(&z[10]) >> (32U - (c))); SET_WORD(&z[10], (GET_WORD(&x[10]) << (c)) | GET_WORD(&z[9]) >> (32U - (c))); SET_WORD(&z[9], (GET_WORD(&x[9]) << (c)) | GET_WORD(&z[8]) >> (32U - (c))); SET_WORD(&z[8], (GET_WORD(&x[8]) << (c)) | GET_WORD(&z[7]) >> (32U - (c))); } SET_WORD(&z[7], (GET_WORD(&x[7]) << (c)) | GET_WORD(&z[6]) >> (32U - (c))); SET_WORD(&z[6], (GET_WORD(&x[6]) << (c)) | GET_WORD(&z[5]) >> (32U - (c))); SET_WORD(&z[5], (GET_WORD(&x[5]) << (c)) | GET_WORD(&z[4]) >> (32U - (c))); SET_WORD(&z[4], (GET_WORD(&x[4]) << (c)) | GET_WORD(&z[3]) >> (32U - (c))); SET_WORD(&z[3], (GET_WORD(&x[3]) << (c)) | GET_WORD(&z[2]) >> (32U - (c))); SET_WORD(&z[2], (GET_WORD(&x[2]) << (c)) | GET_WORD(&z[1]) >> (32U - (c))); SET_WORD(&z[1], (GET_WORD(&x[1]) << (c)) | GET_WORD(&z[0]) >> (32U - (c))); SET_WORD(&z[0], (GET_WORD(&x[0]) << (c))); } static void multiply_casper(uint32_t w_out[], const uint32_t a[], const uint32_t b[]) { uint32_t *Np; if (N_wordlen == 8U) { Np = Np256; MultprecCiosMul_ct(w_out, a, b, &CASPER_MEM[(N_wordlen + 4U)], Np); } if 
(N_wordlen == 12U) { Np = Np384; MultprecCiosMul_ct(w_out, a, b, &CASPER_MEM[(N_wordlen + 4U)], Np); } if (N_wordlen == 18U) { Np = Np521; MultprecCiosMul521_ct(w_out, a, b, &CASPER_MEM[(N_wordlen + 4U)], Np); } } /* Convert a projective point (X1 : Y1 : Z1) * to the affine point (X3, Y3) = (X1/Z1^2,Y1/Z1^3) * The memory of (X3, Y3) and (X1 : Y1 : Z1) should not overlap */ void Jac_toAffine(uint32_t *X3, uint32_t *Y3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1) { uint32_t *T1, *T2; T1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)]; T2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)]; square_casper(T1, Z1); // Z^2 multiply_casper(T2, T1, Z1); // Z^3 // Montgomery inverse if (N_wordlen == 8U) { invert_mod_p256(T1, T2); } if (N_wordlen == 12U) { invert_mod_p384(T1, T2); } if (N_wordlen == 18U) { invert_mod_p521(T1, T2); } multiply_casper(Y3, Y1, T1); // Y3 = Y/Z^3 multiply_casper(T2, T1, Z1); // Z^-2 multiply_casper(X3, X1, T2); // X3 = X/Z^2 } /* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2 : Y2 : Z2) * where (X1: Y1: Z1) != (X2 : Y2 : Z2) * (X3 : Y3: Z3) may be the same as one of the inputs. 
*/
/* Scratch slots 0..8 of the temporary area at CASPER_MEM[11 * (N_wordlen + 4)] are used.
 * If the Z coordinate of one input is reported as zero, the other input is copied to the
 * output. If both inputs turn out to be the same point, Jac_double is used instead.
 */
void Jac_addition(uint32_t *X3,
                  uint32_t *Y3,
                  uint32_t *Z3,
                  uint32_t *X1,
                  uint32_t *Y1,
                  uint32_t *Z1,
                  uint32_t *X2,
                  uint32_t *Y2,
                  uint32_t *Z2)
{
    uint32_t *Z1Z1, *Z2Z2, *U1, *S1, *J, *H, *V, *t0, *t1;
    int m1, m2;

    Z1Z1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
    Z2Z2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];
    U1   = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 2U * (N_wordlen + 4U)];
    S1   = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 3U * (N_wordlen + 4U)];
    J    = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 4U * (N_wordlen + 4U)];
    H    = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 5U * (N_wordlen + 4U)];
    V    = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 6U * (N_wordlen + 4U)];
    t0   = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 7U * (N_wordlen + 4U)];
    t1   = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 8U * (N_wordlen + 4U)];

    /* A result of 0 is treated below as "Z is zero", i.e. that input is the point at infinity. */
    CASPER_ECC_equal_to_zero(&m1, Z1);
    CASPER_ECC_equal_to_zero(&m2, Z2);
    if (m1 == 0)
    {
        /* First input is the point at infinity: result is the second input. */
        CASPER_MEMCPY(X3, X2, N_wordlen * 4U);
        CASPER_MEMCPY(Y3, Y2, N_wordlen * 4U);
        CASPER_MEMCPY(Z3, Z2, N_wordlen * 4U);
        return;
    }
    if (m2 == 0)
    {
        /* Second input is the point at infinity: result is the first input. */
        CASPER_MEMCPY(X3, X1, N_wordlen * 4U);
        CASPER_MEMCPY(Y3, Y1, N_wordlen * 4U);
        CASPER_MEMCPY(Z3, Z1, N_wordlen * 4U);
        return;
    }

    square_casper(Z1Z1, Z1);
    square_casper(Z2Z2, Z2);
    multiply_casper(U1, X1, Z2Z2);
    multiply_casper(H, X2, Z1Z1); /* if H equals U1 then X's are the same */
    multiply_casper(t0, Z2, Z2Z2);
    multiply_casper(S1, Y1, t0);
    multiply_casper(t0, Z1, Z1Z1);
    multiply_casper(J, Y2, t0); /* if (S1 == J) then Y's are the same */

    CASPER_ECC_equal(&m1, H, U1); /* If H and U1 match then the X-coordinates are the same. */
    CASPER_ECC_equal(&m2, S1, J); /* If S1 and J match then the Y-coordinates are the same. */
    if (m1 == 0)
    {
        if (m2 == 0)
        {
            /* Both coordinates match: the inputs are the same point, so double instead. */
            Jac_double(X3, Y3, Z3, X1, Y1, Z1);
            return;
        }
        /* else {
            We work with the point at infinity.
            The Z-coordinate will be set to zero in this function.
        } */
    }

    sub_casper(H, H, U1);
    mul2_casper(t0, H);
    square_casper(t1, t0);
    sub_casper(t0, J, S1);
    multiply_casper(J, H, t1);
    multiply_casper(V, U1, t1);
    mul2_casper(U1, t0);
    square_casper(t0, U1);
    mul2_casper(t1, V);
    sub_casper(t0, t0, J);
    sub_casper(X3, t0, t1);
    sub_casper(t0, V, X3);
    multiply_casper(t1, S1, J);
    mul2_casper(t1, t1);
    multiply_casper(V, U1, t0);
    sub_casper(Y3, V, t1);
    /* Z1 and Z2 are still read here, after X3 and Y3 have been written. */
    add_casper(V, Z1, Z2);
    square_casper(t1, V);
    sub_casper(t1, t1, Z1Z1);
    sub_casper(t1, t1, Z2Z2);
    multiply_casper(Z3, t1, H);
}

/* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2, Y2)
 * where (X1: Y1: Z1) != (X2, Y2)
 * (X3 : Y3: Z3) may not overlap with (X1: Y1: Z1).
 * Source: 2004 Hankerson-Menezes-Vanstone, page 91.
 * Scratch slots 0..4 of the temporary area at CASPER_MEM[11 * (N_wordlen + 4)] are used.
 * NOTE(review): Z1 is copied to T5 before Z3 is written, and callers in this file do
 * pass the same buffers for input and output; confirm whether the no-overlap
 * restriction above is still required.
 */
void Jac_add_affine(
    uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1, uint32_t *X2, uint32_t *Y2)
{
    uint32_t *T1, *T2, *T3, *T4, *T5;
    uint32_t *ONE = NULL;
    int m1, m2;

    /* ONE is used as the Z coordinate when the affine input is returned as the result. */
    if (N_wordlen == 8U)
    {
        ONE = NISTr256;
    }
    if (N_wordlen == 12U)
    {
        ONE = NISTr384;
    }
    if (N_wordlen == 18U)
    {
        ONE = NISTr521;
    }

    T1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
    T2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];
    T3 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 2U * (N_wordlen + 4U)];
    T4 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 3U * (N_wordlen + 4U)];
    T5 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 4U * (N_wordlen + 4U)];

    CASPER_ECC_equal_to_zero(&m1, Z1);
    if (m1 == 0)
    {
        /* Projective input is the point at infinity: result is (X2, Y2) with Z = ONE. */
        CASPER_MEMCPY(X3, X2, N_wordlen * 4U);
        CASPER_MEMCPY(Y3, Y2, N_wordlen * 4U);
        CASPER_MEMCPY(Z3, ONE, N_wordlen * 4U);
        return;
    }

    /* Keep a private copy of Z1; it is the one multiplied into Z3 below. */
    CASPER_MEMCPY(T5, Z1, N_wordlen * sizeof(uint32_t));
    square_casper(T3, Z1);
    multiply_casper(T2, T3, Z1);
    multiply_casper(T4, T3, X2);
    multiply_casper(T3, T2, Y2);

    /* T4 and T3 are X2 and Y2 scaled by Z1^2 and Z1^3; compare them with X1 and Y1. */
    CASPER_ECC_equal(&m1, T4, X1);
    CASPER_ECC_equal(&m2, T3, Y1);
    if (m1 == 0)
    {
        if (m2 == 0)
        {
            /* Both inputs are the same point: double instead. */
            Jac_double(X3, Y3, Z3, X1, Y1, Z1);
            return;
        }
        /* else {
            We work with the point at infinity.
            The Z-coordinate will be set to zero in this function.
        } */
    }

    sub_casper(T1, T4, X1);
    sub_casper(T2, T3, Y1);
    multiply_casper(Z3, T5, T1);
    square_casper(T3, T1);
    multiply_casper(T4, T3, T1);
    multiply_casper(T5, T3, X1);
    mul2_casper(T1, T5);
    square_casper(X3, T2);
    sub_casper(X3, X3, T1);
    sub_casper(X3, X3, T4);
    sub_casper(T3, T5, X3);
    multiply_casper(T1, T3, T2);
    multiply_casper(T2, T4, Y1);
    sub_casper(Y3, T1, T2);
}

static uint32_t casper_get_word(uint32_t *addr);

/* Point doubling from: 2004 Hankerson-Menezes-Vanstone, page 91.
 * Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X1 : Y1 : Z1)
 * (X3 : Y3: Z3) may be the same as the input.
 * Scratch slots 0..4 of the temporary area at CASPER_MEM[11 * (N_wordlen + 4)] are used.
 */
void Jac_double(uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1)
{
    uint32_t *T1, *T2, *T3, *T4, *T5;

    T1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
    T2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];
    T3 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 2U * (N_wordlen + 4U)];
    T4 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 3U * (N_wordlen + 4U)];
    T5 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 4U * (N_wordlen + 4U)];

    square_casper(T1, Z1);
    sub_casper(T3, X1, T1);
    add_casper(T1, X1, T1);
    multiply_casper(T4, T3, T1);

    /* T2 = 2*T4 + T4 = 3*T4 */
    mul2_casper(T3, T4);

    add_casper(T2, T3, T4);

    mul2_casper(Y3, Y1);

    /* Copy Z1 first so that Z3 may be the same buffer as Z1. */
    CASPER_MEMCPY(T5, Z1, N_wordlen * sizeof(uint32_t));
    multiply_casper(Z3, Y3, T5);

    square_casper(T5, Y3);

    multiply_casper(T3, T5, X1);

    square_casper(Y3, T5);

    /* half is defined elsewhere; T4 is passed as its scratch argument. */
    half(T5, Y3, T4);

    square_casper(X3, T2);

    mul2_casper(T1, T3);

    sub_casper(X3, X3, T1);

    sub_casper(T1, T3, X3);

    multiply_casper(T3, T1, T2);

    sub_casper(Y3, T3, T5);
}

/* Recoding for a signed fixed window.
* Source: https://eprint.iacr.org/2014/130.pdf, Algorithm 6 * Recode the n-bit integer k into ciel(log2(n)/(w-1)) digits * where each digit is in * { +/- 1, +/- 3, ..., +/- 2^(w-1)-1 } * and put the result in c. */ static void recode(int8_t *c, uint32_t *k, int n, int w) { int i, t; uint32_t K[CASPER_MAX_ECC_SIZE_WORDLEN] = {0}; (void)memcpy(K, k, (size_t)ceil(((double)n / 8.))); t = (n + (w - 2)) / (w - 1); for (i = 0; i < t; i++) { c[i] = (int8_t)(uint8_t)((K[0] & ((uint32_t)(uint32_t)(1UL << (uint32_t)w) - 1UL)) - (uint32_t)(uint32_t)(1UL << ((uint32_t)w - 1UL))); shiftrightSysram(K, K, (unsigned)w - 1U); (void)add_n_1(K, K, (uint32_t)c[i] >> 31, (int16_t)(uint16_t)N_wordlen); } c[t] = (int8_t)K[0]; } static uint32_t sub_n(uint32_t *c, uint32_t *a, uint32_t *b, int n) { int i; uint32_t borrow; sub_borrowout(borrow, GET_WORD(&c[0]), a[0], GET_WORD(&b[0])); for (i = 1; i < n; i++) { sub_borrowin_borrowout(borrow, GET_WORD(&c[i]), a[i], GET_WORD(&b[i]), borrow); } return borrow; } #if 0 /* Dumb n-limb subtraction of c=a-b, return borrow. */ static uint32_t sub_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n) { int i; uint32_t borrow; sub_borrowout(borrow, c[0], a[0], b); for (i = 1; i < n; i++) { sub_borrowin_borrowout_1(borrow, c[i], a[i], borrow); } return borrow; } /* Dumb n-limb addition of c=a+b, return carry. */ static uint32_t add_n(uint32_t *c, uint32_t *a, uint32_t *b, int n) { int i; uint32_t carry; add_cout(carry, c[0], a[0], b[0]); for (i = 1; i < n; i++) { add_cout_cin(carry, c[i], a[i], b[i], carry); } return carry; } #endif /* Dumb n-limb addition of c=a+b, return carry. */ static uint32_t add_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n) { int i; uint32_t carry; add_cout(carry, c[0], a[0], b); for (i = 1; i < n; i++) { add_cout_cin(carry, c[i], a[i], 0U, carry); } return carry; } static uint8_t int8abs(int8_t v) { return ((v < 0) ? ((uint8_t)-v) : ((uint8_t)v)); } /* Constant time elliptic curve scalar multiplication. 
* Source: https://eprint.iacr.org/2014/130.pdf
 * when using w = 4.
 * Computes (X3 : Y3 : Z3) = k * (X1, Y1) \in E(F_p)
 * p is the prime used to define the finite field F_p
 * q is the (prime) order of the curve
 *
 * p and q are passed as the directly-read operand of sub_n (p - y and q - k).
 * The look-up table and the working variables live in CASPER_MEM starting at
 * index 20 * N_wordlen + 80.
 */
void Jac_scalar_multiplication(
    uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *k, uint32_t *p, uint32_t *q)
{
    uint32_t *scalar, *M, *X, *Y, *Z, *mem_loc;
    uint32_t *ONE = NULL;
    int i, sign, odd;
    uint8_t index;
    size_t recodeLength                  = 175u;
    size_t bitlen                        = 0u;
    int8_t rec[CASPER_RECODE_LENGTH_MAX] = {0};

    /* Per-curve parameters; ONE stays NULL if N_wordlen is none of 8, 12, 18. */
    if (N_wordlen == 8U)
    {
        recodeLength = (size_t)kCASPER_ECC_P256_recode_len;
        bitlen       = (size_t)kCASPER_ECC_P256_N_bitlen;
        ONE          = NISTr256;
    }

    if (N_wordlen == 12U)
    {
        recodeLength = (size_t)kCASPER_ECC_P384_recode_len;
        bitlen       = (size_t)kCASPER_ECC_P384_N_bitlen;
        ONE          = NISTr384;
    }

    if (N_wordlen == 18U)
    {
        recodeLength = (size_t)kCASPER_ECC_P521_recode_len;
        /* The true bit length is used here, not the 576-bit storage size. */
        bitlen = (size_t)521U;
        ONE    = NISTr521;
    }

    /* Point to the start of the LUT table space. */
    mem_loc = &CASPER_MEM[(20U * N_wordlen + 80U)];

    /* Working variables are placed after the 12 LUT slots (4 points x 3 coordinates). */
    scalar = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * (N_wordlen + 4U)];
    X      = &CASPER_MEM[(20U * N_wordlen + 80U) + 13U * (N_wordlen + 4U)];
    Y      = &CASPER_MEM[(20U * N_wordlen + 80U) + 14U * (N_wordlen + 4U)];
    Z      = &CASPER_MEM[(20U * N_wordlen + 80U) + 15U * (N_wordlen + 4U)];
    M      = &CASPER_MEM[(20U * N_wordlen + 80U) + 16U * (N_wordlen + 4U)];

    /* Point to memory the recoded scalar.
     */
    CASPER_MEMCPY(scalar, k, sizeof(uint32_t) * N_wordlen);

/* Precomputation: compute 1*P, 3*P, 5*P, and 7*P */
/* FSL_CASPER_LUT(P, x): address of coordinate x (0 = X, 1 = Y, 2 = Z) of the multiple P*point, P odd. */
#define FSL_CASPER_LUT(P, x) (mem_loc + (3U * ((P)-1U) / 2U + (x)) * (N_wordlen + 4U))

    /* Set 1*P */
    CASPER_MEMCPY(Z3, ONE, N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(FSL_CASPER_LUT(1U, 0U), X1, N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(FSL_CASPER_LUT(1U, 1U), Y1, N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(FSL_CASPER_LUT(1U, 2U), Z3, N_wordlen * sizeof(uint32_t));

    /* Compute 2*P */
    Jac_double(X3, Y3, Z3, X1, Y1, Z3);

    /* Compute 3*P = 2P + P */
    Jac_add_affine(FSL_CASPER_LUT(3U, 0U), FSL_CASPER_LUT(3U, 1U), FSL_CASPER_LUT(3U, 2U), X3, Y3, Z3, X1, Y1);

    /* Compute 5*P = 3P + 2P */
    Jac_addition(FSL_CASPER_LUT(5U, 0U), FSL_CASPER_LUT(5U, 1U), FSL_CASPER_LUT(5U, 2U), FSL_CASPER_LUT(3U, 0U),
                 FSL_CASPER_LUT(3U, 1U), FSL_CASPER_LUT(3U, 2U), X3, Y3, Z3);

    /* Compute 7*P = 5P + 2P */
    Jac_addition(FSL_CASPER_LUT(7U, 0U), FSL_CASPER_LUT(7U, 1U), FSL_CASPER_LUT(7U, 2U), FSL_CASPER_LUT(5U, 0U),
                 FSL_CASPER_LUT(5U, 1U), FSL_CASPER_LUT(5U, 2U), X3, Y3, Z3);

    /* Recode the scalar */
    /* The scalar's parity is remembered; M = q - scalar is computed and casper_select
     * (defined elsewhere) picks between M and scalar based on it. The same flag drives
     * the final selection between Y3 and p - Y3 at the end of the function. */
    odd = (int32_t)((uint32_t)(casper_get_word(&scalar[0]) & 1U));
    (void)sub_n(M, q, scalar, (int16_t)(uint16_t)N_wordlen); // todo!!!
    casper_select(scalar, M, scalar, odd, (int16_t)(uint16_t)N_wordlen);

    /* Use n=384 and w=4 --> compute ceil(384/3) = 128 + 1 digits */
    /* recode works on system RAM, so the scalar is copied out of CASPER memory first. */
    uint32_t scalarSysram[CASPER_MAX_ECC_SIZE_WORDLEN];
    CASPER_MEMCPY(scalarSysram, scalar, /*CASPER_*/ N_wordlen * sizeof(uint32_t));
    recode(rec, scalarSysram, (int32_t)bitlen, 4);

    /* Set the first value.
     */
    index = int8abs(rec[recodeLength - 1U]);
    sign  = (int32_t)(uint32_t)(uint8_t)(((uint8_t)rec[recodeLength - 1U]) >> 7);

    /* NOTE(review): these three copies address the LUT directly with the digit derived
     * from the scalar, and GET_LUT below writes the same destinations again. They look
     * redundant and are not constant-time - confirm whether they can be removed. */
    CASPER_MEMCPY(X3, FSL_CASPER_LUT((uint32_t)index, 0U), N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(Y3, FSL_CASPER_LUT((uint32_t)index, 1U), N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(Z3, FSL_CASPER_LUT((uint32_t)index, 2U), N_wordlen * sizeof(uint32_t));

/* Get the correct LUT element in constant time by touching
 * all elements and masking out the correct one.
 */
#define GET_LUT(x, y, z, index)                                                           \
    do                                                                                    \
    {                                                                                     \
        int m;                                                                            \
        CASPER_MEMCPY((x), FSL_CASPER_LUT(1U, 0U), N_wordlen * sizeof(uint32_t));         \
        CASPER_MEMCPY((y), FSL_CASPER_LUT(1U, 1U), N_wordlen * sizeof(uint32_t));         \
        CASPER_MEMCPY((z), FSL_CASPER_LUT(1U, 2U), N_wordlen * sizeof(uint32_t));         \
        m = (int32_t)((index) == 3U);                                                     \
        casper_select((x), (x), FSL_CASPER_LUT(3U, 0U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((y), (y), FSL_CASPER_LUT(3U, 1U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((z), (z), FSL_CASPER_LUT(3U, 2U), m, (int16_t)(uint16_t)N_wordlen); \
        m = (int32_t)((index) == 5U);                                                     \
        casper_select((x), (x), FSL_CASPER_LUT(5U, 0U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((y), (y), FSL_CASPER_LUT(5U, 1U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((z), (z), FSL_CASPER_LUT(5U, 2U), m, (int16_t)(uint16_t)N_wordlen); \
        m = (int32_t)((index) == 7U);                                                     \
        casper_select((x), (x), FSL_CASPER_LUT(7U, 0U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((y), (y), FSL_CASPER_LUT(7U, 1U), m, (int16_t)(uint16_t)N_wordlen); \
        casper_select((z), (z), FSL_CASPER_LUT(7U, 2U), m, (int16_t)(uint16_t)N_wordlen); \
    } while (false)

    GET_LUT(X3, Y3, Z3, index);

    /* Compute -y and select the positive or negative point. */
    (void)sub_n(M, p, Y3, (int16_t)(uint16_t)N_wordlen); // todo!!!
    casper_select(Y3, Y3, M, sign, (int16_t)(uint16_t)N_wordlen);

    /* Remaining digits, most significant first: three doublings (w - 1 = 3), then add the
     * table entry selected by the digit, with its y coordinate negated for negative digits. */
    for (i = (int)(uint32_t)(recodeLength - 2U); i >= 0; i--)
    {
        Jac_double(X3, Y3, Z3, X3, Y3, Z3);
        Jac_double(X3, Y3, Z3, X3, Y3, Z3);
        Jac_double(X3, Y3, Z3, X3, Y3, Z3);

        index = int8abs(rec[i]);
        sign  = (int32_t)(uint32_t)(uint8_t)(((uint8_t)rec[i]) >> 7);

        GET_LUT(X, Y, Z, index);

        /* Compute -y and select the positive or negative point. */
        /* The scalar buffer is no longer needed at this point and is reused for the selected y. */
        (void)sub_n(scalar, p, Y, (int16_t)(uint16_t)N_wordlen); // todo!!!
        casper_select(scalar, Y, scalar, sign, (int16_t)(uint16_t)N_wordlen);

        Jac_addition(X3, Y3, Z3, X3, Y3, Z3, X, scalar, Z);
    }

    /* Final correction for the parity-dependent scalar substitution made before recoding. */
    (void)sub_n(M, p, Y3, (int16_t)(uint16_t)N_wordlen); // todo!!!

    casper_select(Y3, M, Y3, odd, (int16_t)(uint16_t)N_wordlen);
}

#undef FSL_CASPER_LUT
#undef GET_LUT

/*
 * Pre-compute the following 16 points:
 * 00 00 = 0*P + 0*Q  <-- Not needed when using sliding windows
 * 00 01 = 0*P + 1*Q  <-- Not needed when using sliding windows
 * 00 10 = 0*P + 2*Q
 * 00 11 = 0*P + 3*Q
 *
 * 01 00 = 1*P + 0*Q  <-- Not needed when using sliding windows
 * 01 01 = 1*P + 1*Q  <-- Not needed when using sliding windows
 * 01 10 = 1*P + 2*Q
 * 01 11 = 1*P + 3*Q
 *
 * 10 00 = 2*P + 0*Q
 * 10 01 = 2*P + 1*Q
 * 10 10 = 2*P + 2*Q
 * 10 11 = 2*P + 3*Q
 *
 * 11 00 = 3*P + 0*Q
 * 11 01 = 3*P + 1*Q
 * 11 10 = 3*P + 2*Q
 * 11 11 = 3*P + 3*Q
 *
 * index = (bitsi||bitsj)-2 - (biti != 0)*2
 *
 * Input:   P = (X1 : Y1 : Z1) and
 *          Q = (X2 : Y2 : Z2)
 * Output:  mem_loc, memory location for the LUT.
*/ static void precompute_double_scalar_LUT16(uint32_t *Px, uint32_t *Py, uint32_t *Qx, uint32_t *Qy) { uint32_t *Q2x, *Q2y, *Q2z, *P2x, *P2y, *P2z, *Z, *mem_loc; uint32_t *ONE = NULL; uint32_t index = 0; if (N_wordlen == 8U) { ONE = NISTr256; } if (N_wordlen == 12U) { ONE = NISTr384; } Q2x = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 0U * (N_wordlen + 4U)]; Q2y = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 1U * (N_wordlen + 4U)]; Q2z = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 2U * (N_wordlen + 4U)]; /* Re-use memory from different scratch space since no * projective point addition is used below. */ P2x = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 5U * (N_wordlen + 4U)]; P2z = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 6U * (N_wordlen + 4U)]; P2y = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 7U * (N_wordlen + 4U)]; Z = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 8U * (N_wordlen + 4U)]; mem_loc = &CASPER_MEM[(20U * N_wordlen + 80U)]; CASPER_MEMCPY(Z, ONE, N_wordlen * sizeof(uint32_t)); // 00 10 = 0*P + 2*Q Jac_double(Q2x, Q2y, Q2z, Qx, Qy, Z); CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t)); index += N_wordlen; // 00 11 = 0*P + 3*Q Jac_add_affine(P2x, P2y, P2z, Q2x, Q2y, Q2z, Qx, Qy); CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t)); index += N_wordlen; // 01 10 = 1*P + 2*Q Jac_add_affine(P2x, P2y, P2z, Q2x, Q2y, Q2z, Px, Py); CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * 
sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t)); index += N_wordlen; // 01 11 = 1*P + 3*Q Jac_add_affine(P2x, P2y, P2z, P2x, P2y, P2z, Qx, Qy); CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t)); index += N_wordlen; // 10 00 = 2*P + 0*Q Jac_double(P2x, P2y, P2z, Px, Py, Z); CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t)); index += N_wordlen; // 10 01 = 2*P + 1*Q Jac_add_affine(Q2x, Q2y, Q2z, P2x, P2y, P2z, Qx, Qy); CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t)); index += N_wordlen; // 10 10 = 2*P + 2*Q Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy); CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t)); index += N_wordlen; // 10 11 = 2*P + 3*Q Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy); CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t)); index += N_wordlen; // 11 00 = 3*P + 0*Q Jac_add_affine(P2x, P2y, P2z, P2x, P2y, P2z, Px, Py); CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * 
sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t)); index += N_wordlen; // 11 01 = 3*P + 1*Q Jac_add_affine(Q2x, Q2y, Q2z, P2x, P2y, P2z, Qx, Qy); CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t)); index += N_wordlen; // 11 10 = 3*P + 2*Q Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy); CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t)); index += N_wordlen; // 11 11 = 3*P + 3*Q Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy); CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t)); index += N_wordlen; } /* * Pre-compute the following 4 points: * 0 0 = 0*P + 0*Q <-- Not needed when using sliding windows * 0 1 = 0*P + 1*Q * * 1 0 = 1*P + 0*Q * 1 1 = 1*P + 1*Q * * index = (bitsj+1) & (0-bitsi) * * Input: P = (X1 : Y1 : Z1) and * Q = (X2 : Y2 : Z2) * Output: mem_loc, memory location for the LUT. */ static void precompute_double_scalar_LUT4(uint32_t *Px, uint32_t *Py, uint32_t *Qx, uint32_t *Qy) { uint32_t *Z, *mem_loc, *ONE; uint32_t index = 0; ONE = NISTr521; /* Re-use memory from different scratch space since no * projective point addition is used below. 
*/ Z = &CASPER_MEM[(11U * N_wordlen + 4U) + 5U * (N_wordlen + 4U)]; mem_loc = &CASPER_MEM[(20U * N_wordlen + 80U)]; CASPER_MEMCPY(Z, ONE, N_wordlen * sizeof(uint32_t)); // 0*P + 1*Q CASPER_MEMCPY(&mem_loc[index], Qx, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Qy, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Z, N_wordlen * sizeof(uint32_t)); index += N_wordlen; // 1*P + 0*Q CASPER_MEMCPY(&mem_loc[index], Px, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Py, N_wordlen * sizeof(uint32_t)); index += N_wordlen; CASPER_MEMCPY(&mem_loc[index], Z, N_wordlen * sizeof(uint32_t)); index += N_wordlen; // 1*P + 1*Q Jac_add_affine(&mem_loc[index], &mem_loc[index + N_wordlen], &mem_loc[index + 2U * N_wordlen], Px, Py, Z, Qx, Qy); } #define GETLUTX(x) (3U * (x)*N_wordlen) #define GETLUTY(x) (3U * (x)*N_wordlen + 1U * N_wordlen) #define GETLUTZ(x) (3U * (x)*N_wordlen + 2U * N_wordlen) /* Compute the double scalar multiplication * (X3 : Y3 : Z3) = k1 * (X1, Y1) + k2 * (X2, Y2) * Using Shamir's trick and precomputing 16 points. * This code is *not* constant time since this is used * for verification only. 
*/ void double_scalar_multiplication(uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *k1, uint32_t *X2, uint32_t *Y2, uint32_t *k2) { uint32_t index = 0, c = 0; uint32_t *p1 = NULL, *p2 = NULL, x1, x2, *lut, *Tx = NULL, *Ty = NULL, *Tz = NULL; size_t bitlen, shiftr, shiftl = 0u; if (N_wordlen == 8U) { bitlen = (size_t)kCASPER_ECC_P256_N_bitlen; precompute_double_scalar_LUT16(X1, Y1, X2, Y2); shiftr = 30U; shiftl = 2U; } if (N_wordlen == 12U) { bitlen = (size_t)kCASPER_ECC_P384_N_bitlen; precompute_double_scalar_LUT16(X1, Y1, X2, Y2); shiftr = 30U; shiftl = 2U; } if (N_wordlen == 18U) { bitlen = (size_t)kCASPER_ECC_P521_N_bitlen; precompute_double_scalar_LUT4(X1, Y1, X2, Y2); shiftr = 31U; shiftl = 1U; } lut = &CASPER_MEM[(20U * N_wordlen + 80U)]; if (N_wordlen == 8U || N_wordlen == 12U) { p1 = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen]; p2 = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 1U * (N_wordlen + 4U)]; Tx = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 2U * (N_wordlen + 4U)]; Ty = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 3U * (N_wordlen + 4U)]; Tz = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 4U * (N_wordlen + 4U)]; } if (N_wordlen == 18U) { p1 = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen]; p2 = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 1U * (N_wordlen + 4U)]; Tx = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 2U * (N_wordlen + 4U)]; Ty = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 3U * (N_wordlen + 4U)]; Tz = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 4U * (N_wordlen + 4U)]; } CASPER_MEMCPY(p1, k1, sizeof(uint32_t) * N_wordlen); CASPER_MEMCPY(p2, k2, sizeof(uint32_t) * N_wordlen); /* Check if we can slide. */ while (((casper_get_word(&p1[N_wordlen - 1U]) | casper_get_word(&p2[N_wordlen - 1U])) >> 31) == 0U && c < bitlen) { shiftleft(p1, p1, 1U); shiftleft(p2, p2, 1U); c++; /* No doubling needed. 
*/ } /* Set the first value. */ x1 = casper_get_word(&p1[N_wordlen - 1U]) >> shiftr; x2 = casper_get_word(&p2[N_wordlen - 1U]) >> shiftr; if (N_wordlen == 8U || N_wordlen == 12U) { index = (x2 | (x1 << 2)) - 2U - (uint32_t)(x1 != 0U) * 2U; } if (N_wordlen == 18U) { index = (((x2) + 1U) & (0U - (x1))); } shiftleft(p1, p1, shiftl); shiftleft(p2, p2, shiftl); CASPER_MEMCPY(X3, &lut[GETLUTX(index)], N_wordlen * sizeof(uint32_t)); CASPER_MEMCPY(Y3, &lut[GETLUTY(index)], N_wordlen * sizeof(uint32_t)); CASPER_MEMCPY(Z3, &lut[GETLUTZ(index)], N_wordlen * sizeof(uint32_t)); c += shiftl; // todo: create an is_zero function while ((casper_get_word(&p1[0]) | casper_get_word(&p1[1]) | casper_get_word(&p1[2]) | casper_get_word(&p1[3]) | casper_get_word(&p1[4]) | casper_get_word(&p1[5]) | casper_get_word(&p1[6]) | casper_get_word(&p1[7]) | casper_get_word(&p1[8]) | casper_get_word(&p1[9]) | casper_get_word(&p1[10]) | casper_get_word(&p1[11]) | casper_get_word(&p1[12]) | casper_get_word(&p1[13]) | casper_get_word(&p1[14]) | casper_get_word(&p1[15]) | casper_get_word(&p1[16]) | casper_get_word(&p1[17]) | casper_get_word(&p2[0]) | casper_get_word(&p2[1]) | casper_get_word(&p2[2]) | casper_get_word(&p2[3]) | casper_get_word(&p2[4]) | casper_get_word(&p2[5]) | casper_get_word(&p2[6]) | casper_get_word(&p2[7]) | casper_get_word(&p2[8]) | casper_get_word(&p2[9]) | casper_get_word(&p2[10]) | casper_get_word(&p2[11]) | casper_get_word(&p2[12]) | casper_get_word(&p2[13]) | casper_get_word(&p2[14]) | casper_get_word(&p2[15]) | casper_get_word(&p2[16]) | casper_get_word(&p2[17])) != 0U) { /* Check if we can slide. 
*/ while (((casper_get_word(&p1[N_wordlen - 1U]) | casper_get_word(&p2[N_wordlen - 1U])) >> 31) == 0U && c < bitlen) { shiftleft(p1, p1, 1U); shiftleft(p2, p2, 1U); Jac_double(X3, Y3, Z3, X3, Y3, Z3); c++; } if (c >= (bitlen - 1U)) { break; } for (uint32_t i = 0; i < shiftl; i++) { Jac_double(X3, Y3, Z3, X3, Y3, Z3); } x1 = casper_get_word(&p1[N_wordlen - 1U]) >> shiftr; x2 = casper_get_word(&p2[N_wordlen - 1U]) >> shiftr; if (N_wordlen == 8U || N_wordlen == 12U) { index = (x2 | (x1 << 2)) - 2U - (uint32_t)(x1 != 0U) * 2U; } if (N_wordlen == 18U) { index = (((x2) + 1U) & (0U - (x1))); } shiftleft(p1, p1, shiftl); shiftleft(p2, p2, shiftl); CASPER_MEMCPY(Tx, &lut[GETLUTX(index)], N_wordlen * sizeof(uint32_t)); CASPER_MEMCPY(Ty, &lut[GETLUTY(index)], N_wordlen * sizeof(uint32_t)); CASPER_MEMCPY(Tz, &lut[GETLUTZ(index)], N_wordlen * sizeof(uint32_t)); Jac_addition(X3, Y3, Z3, X3, Y3, Z3, Tx, Ty, Tz); //&lut[GETLUTX(index)], &lut[GETLUTY(index)], &lut[GETLUTZ(index)]); c += shiftl; } /* Special case in the end. */ if (c == (bitlen - 1U)) { Jac_double(X3, Y3, Z3, X3, Y3, Z3); x1 = casper_get_word(&p1[N_wordlen - 1U]) >> 31; x2 = casper_get_word(&p2[N_wordlen - 1U]) >> 31; if (0U != x1) { Jac_add_affine(X3, Y3, Z3, X3, Y3, Z3, X1, Y1); } if (x2 != 0U) { Jac_add_affine(X3, Y3, Z3, X3, Y3, Z3, X2, Y2); } c++; } while (c < bitlen) { Jac_double(X3, Y3, Z3, X3, Y3, Z3); c++; } } static void invert_mod_p256(uint32_t *c, uint32_t *a) { int i; uint32_t *t, *t2, *s1, *s2, *s4, *s8, *tmp; /* Assuming it is safe to use the ECC scratch size. 
*/ t = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]; t2 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]; s1 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]; s2 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]; s4 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]; s8 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) + 7U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]; tmp = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) + 8U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]; // t2 = n^(2^1)*n # 11 square_casper(tmp, a); multiply_casper(t2, tmp, a); // s1 = t2^(2^2)*t2 # F square_casper(s1, t2); square_casper(tmp, s1); multiply_casper(s1, tmp, t2); // s2 = s1^(2^4)*s1 # FF square_casper(s2, s1); // for (i = 1; i < 4; i++) square(s2, s2); square_casper(tmp, s2); square_casper(s2, tmp); square_casper(tmp, s2); multiply_casper(s2, tmp, s1); // s4 = s2^(2^8)*s2 # FFFF square_casper(s4, s2); for (i = 1; i < 7; i += 2) { square_casper(tmp, s4); square_casper(s4, tmp); } square_casper(tmp, s4); multiply_casper(s4, tmp, s2); // s8 = s4^(2^16)*s4 # FFFFFFFF square_casper(s8, s4); for (i = 1; i < 15; i += 2) { square_casper(tmp, s8); square_casper(s8, tmp); } square_casper(tmp, s8); multiply_casper(s8, tmp, s4); // t = s8^(2^32)*n # ffffffff00000001 square_casper(tmp, s8); for 
(i = 1; i < 31; i += 2) { square_casper(t, tmp); square_casper(tmp, t); } square_casper(t, tmp); multiply_casper(tmp, t, a); // t = t^(2^128)*s8 # ffffffff00000001000000000000000000000000ffffffff for (i = 0; i < 128; i += 2) { square_casper(t, tmp); square_casper(tmp, t); } multiply_casper(t, tmp, s8); // t = t^(2^32)*s8 # ffffffff00000001000000000000000000000000ffffffffffffffff for (i = 0; i < 32; i += 2) { square_casper(tmp, t); square_casper(t, tmp); } multiply_casper(tmp, t, s8); // t = t^(2^16)*s4 # ffffffff00000001000000000000000000000000ffffffffffffffffffff for (i = 0; i < 16; i += 2) { square_casper(t, tmp); square_casper(tmp, t); } multiply_casper(t, tmp, s4); // t = t^(2^8)*s2 # ffffffff00000001000000000000000000000000ffffffffffffffffffffff for (i = 0; i < 8; i += 2) { square_casper(tmp, t); square_casper(t, tmp); } multiply_casper(tmp, t, s2); // t = t^(2^4)*s1 # ffffffff00000001000000000000000000000000fffffffffffffffffffffff for (i = 0; i < 4; i += 2) { square_casper(t, tmp); square_casper(tmp, t); } multiply_casper(t, tmp, s1); // t = t^(2^2)*t2 square_casper(tmp, t); square_casper(t, tmp); multiply_casper(tmp, t, t2); // t = t^(2^2)*n # ffffffff00000001000000000000000000000000fffffffffffffffffffffffd square_casper(t, tmp); square_casper(tmp, t); multiply_casper(c, tmp, a); } // A and C do not need to be in Casper memory static void toMontgomery_ECC_P256(uint32_t *C, uint32_t *A) { /* R^2 = 2^512 mod p, used to convert values to Montgomery form. 
*/ uint32_t R2[kCASPER_ECC_P256_wordlen] = {0x00000003, 0x00000000, 0xffffffffU, 0xfffffffbU, 0xfffffffeU, 0xffffffffU, 0xfffffffdU, 0x4}; uint32_t *T1, *T2, *T3; T1 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]; T2 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]; T3 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]; CASPER_MEMCPY(T1, R2, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t)); CASPER_MEMCPY(T2, A, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t)); multiply_casper(T3, T2, T1); CASPER_MEMCPY(C, T3, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t)); } /* Compute inversion modulo NIST-p384 using Fermats little theorem. * Using c = a^(p-2) = a^(-1) mod p. * This computes the modular inversion if all arithmetic is "regular" * modular arithmetic or computes automatically the Montgomery inverse * if all arithmetic is Montgomery arithmetic. */ static void invert_mod_p384(uint32_t *c, uint32_t *a) { int i; uint32_t *e, *d, *tmp, *t0, *t1, *t2, *t3, *t4, *t5, *t6; // 10 residues needed /* Assuming it is safe to use the LUT scratch size. * Hence, do not invert while elements in the LUT are needed. 
*/ e = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]; d = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]; tmp = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]; t0 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]; t1 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]; t2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]; t3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]; t4 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 7U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]; t5 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 8U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]; t6 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]; square_casper(tmp, a); // 2 square_casper(t1, tmp); // 4 square_casper(tmp, t1); // 8 multiply_casper(t2, tmp, t1); // 12 multiply_casper(d, a, t2); // 13 multiply_casper(e, d, a); // 14 multiply_casper(t0, e, a); // 15 // t1 = t0^(2^4)*t0 # ff square_casper(tmp, t0); square_casper(t1, tmp); square_casper(tmp, t1); square_casper(t2, tmp); multiply_casper(t1, t2, t0); // t2 = t1^(2^8)*t1 # 4f square_casper(tmp, t1); for (i = 0; i < 3; i++) { square_casper(t3, tmp); square_casper(tmp, t3); } square_casper(t3, tmp); multiply_casper(t2, t3, t1); // t3 = t2^(2^16)*t2 # 8f square_casper(tmp, t2); for (i = 0; i < 7; i++) { square_casper(t4, tmp); square_casper(tmp, t4); } square_casper(t4, tmp); multiply_casper(t3, t4, t2); // t4 = t3^(2^32)*t3 # 16f square_casper(tmp, t3); for (i = 0; i < 15; i++) { square_casper(t5, 
tmp); square_casper(tmp, t5); } square_casper(t5, tmp); multiply_casper(t4, t5, t3); // t5 = t4^(2^64)*t4 # 32f square_casper(tmp, t4); for (i = 0; i < 31; i++) { square_casper(t6, tmp); square_casper(tmp, t6); } square_casper(t6, tmp); multiply_casper(t5, t6, t4); // t5 = t5^(2^64)*t4 # 48f square_casper(tmp, t5); for (i = 0; i < 31; i++) { square_casper(t6, tmp); square_casper(tmp, t6); } square_casper(t6, tmp); multiply_casper(t5, t6, t4); // t5 = t5^(2^32)*t3 # 56f square_casper(tmp, t5); for (i = 0; i < 15; i++) { square_casper(t6, tmp); square_casper(tmp, t6); } square_casper(t6, tmp); multiply_casper(t5, t6, t3); // t5 = t5^(2^16)*t2 # 60f square_casper(tmp, t5); for (i = 0; i < 7; i++) { square_casper(t6, tmp); square_casper(tmp, t6); } square_casper(t6, tmp); multiply_casper(t5, t6, t2); // t5 = t5^(2^8)*t1 # 62f square_casper(tmp, t5); for (i = 0; i < 3; i++) { square_casper(t6, tmp); square_casper(tmp, t6); } square_casper(t6, tmp); multiply_casper(t5, t6, t1); // n = t5^(2^4)*t0 # 63f square_casper(tmp, t5); for (i = 0; i < 1; i++) { square_casper(t6, tmp); square_casper(tmp, t6); } square_casper(t6, tmp); multiply_casper(t5, t6, t0); // n = n^(2^4)*e square_casper(tmp, t5); for (i = 0; i < 1; i++) { square_casper(t6, tmp); square_casper(tmp, t6); } square_casper(t6, tmp); multiply_casper(t5, t6, e); // n = n^(2^32)*t3 square_casper(tmp, t5); for (i = 0; i < 15; i++) { square_casper(t6, tmp); square_casper(tmp, t6); } square_casper(t6, tmp); multiply_casper(t5, t6, t3); // n = n^(2^64) square_casper(tmp, t5); for (i = 0; i < 31; i++) { square_casper(t6, tmp); square_casper(tmp, t6); } square_casper(t5, tmp); // n = n^(2^16)*t2 square_casper(tmp, t5); for (i = 0; i < 7; i++) { square_casper(t6, tmp); square_casper(tmp, t6); } square_casper(t6, tmp); multiply_casper(t5, t6, t2); // n = n^(2^8)*t1 square_casper(tmp, t5); for (i = 0; i < 3; i++) { square_casper(t6, tmp); square_casper(tmp, t6); } square_casper(t6, tmp); multiply_casper(t5, t6, t1); // n = 
n^(2^4)*t0 square_casper(tmp, t5); for (i = 0; i < 1; i++) { square_casper(t6, tmp); square_casper(tmp, t6); } square_casper(t6, tmp); multiply_casper(t5, t6, t0); // n = n^(2^4)*d square_casper(tmp, t5); for (i = 0; i < 1; i++) { square_casper(t6, tmp); square_casper(tmp, t6); } square_casper(t6, tmp); multiply_casper(c, t6, d); } // A and C do not need to be in Casper memory static void toMontgomery_ECC_P384(uint32_t *C, uint32_t *A) { /* R^2 = 2^768 mod p, used to convert values to Montgomery form. */ uint32_t R2[kCASPER_ECC_P384_wordlen] = {0x00000001, 0xfffffffeU, 0x00000000, 0x00000002, 0x00000000, 0xfffffffeU, 0x00000000, 0x00000002, 0x1, 0x0, 0x0, 0x0}; uint32_t *T1, *T2, *T3; T1 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + (9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U))) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]; T2 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + (9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U))) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]; T3 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + (9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U))) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]; CASPER_MEMCPY(T1, R2, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); CASPER_MEMCPY(T2, A, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); multiply_casper(T3, T2, T1); CASPER_MEMCPY(C, T3, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t)); } static void invert_mod_p521(uint32_t *c, uint32_t *a) { int i; uint32_t *e3, *d2, *d3, *d4, *T2, *T4; // 6 residues needed /* Assuming it is safe to use the LUT scratch size. * Hence, do not invert while elements in the LUT are needed. 
*/ e3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]; d2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]; d3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]; d4 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]; T2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]; T4 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]; square_casper(d2, a); multiply_casper(T2, d2, a); // d3 = 2^2 * T2 square_casper(d3, T2); square_casper(e3, d3); multiply_casper(T4, e3, T2); // d3 = 2^4 * T4 square_casper(d3, T4); square_casper(e3, d3); square_casper(d3, e3); square_casper(e3, d3); multiply_casper(d2, e3, T4); // d3 = 2^8 * d2 square_casper(d3, d2); square_casper(e3, d3); for (i = 0; i < 3; i++) { square_casper(d3, e3); square_casper(e3, d3); } multiply_casper(d4, e3, d2); // d3 = 2^16 * d2 square_casper(d3, d4); square_casper(e3, d3); for (i = 0; i < 7; i++) { square_casper(d3, e3); square_casper(e3, d3); } multiply_casper(d2, e3, d4); // d3 = 2^32 * d2 square_casper(d3, d2); square_casper(e3, d3); for (i = 0; i < 15; i++) { square_casper(d3, e3); square_casper(e3, d3); } multiply_casper(d4, e3, d2); // d3 = 2^64 * d2 square_casper(d3, d4); square_casper(e3, d3); for (i = 0; i < 31; i++) { square_casper(d3, e3); square_casper(e3, d3); } multiply_casper(d2, e3, d4); // d3 = 2^128 * d2 square_casper(d3, d2); square_casper(e3, d3); for (i = 0; i < 63; i++) { square_casper(d3, e3); square_casper(e3, d3); } multiply_casper(d4, e3, d2); // d3 = 2^256 * d2 square_casper(d3, d4); square_casper(e3, d3); for (i = 0; i < 127; i++) { square_casper(d3, e3); square_casper(e3, d3); } multiply_casper(d2, e3, d4); // d3 
= 2^2 * d2 square_casper(d3, d2); square_casper(e3, d3); multiply_casper(d2, e3, T2); // d3 = 2^4 * d2 square_casper(d3, d2); square_casper(e3, d3); square_casper(d3, e3); square_casper(e3, d3); multiply_casper(d2, e3, T4); square_casper(d3, d2); multiply_casper(d2, d3, a); // d3 = 2 ^ 2 * d2 square_casper(d3, d2); square_casper(e3, d3); multiply_casper(c, e3, a); } static void toMontgomery_ECC_P521(uint32_t *C, uint32_t *A) { /* R^2 = 2^1088 mod p, used to convert values to Montgomery form. */ // uint32_t R2[NUM_LIMBS] = { 0x00000000, 0x4000, 0, 0, // 0, 0, 0, 0, // 0, 0, 0, 0, // 0 }; /* R^2 = 2^1152 mod p, used to convert values to Montgomery form. */ uint32_t R2[kCASPER_ECC_P521_wordlen] = {0, 0, 0, 0x4000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; uint32_t *T1, *T2, *T3; T1 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]; T2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]; T3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]; CASPER_MEMCPY(T1, R2, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t)); CASPER_MEMCPY(T2, A, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t)); multiply_casper(T3, T2, T1); CASPER_MEMCPY(C, T3, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t)); } static void MultprecCiosMul521_ct( uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np) { uint32_t j; uint64_t carry; uint64_t *a64, *b64, *w64; uint32_t *T1 = &CASPER_MEM[0], borrow; a64 = (uint64_t *)(uintptr_t)a; b64 = (uint64_t *)(uintptr_t)b; w64 = (uint64_t *)(uintptr_t)w_out; if (a != NULL) { /* if !a, we are reducing only */ PreZeroW(j, w_out); } SET_DWORD(&w64[N_dwordlen], 0ULL); SET_DWORD(&w64[N_dwordlen + 1U], 0ULL); /* with accelerator */ /* loop j and then reduce after each j round */ for (j = 0; j < N_dwordlen; j++) { /* Step 3. 
Iterate over N words of u using j - perform Multiply-accumulate */ /* push-pull: we do a*b and then separately m*n (reduce) */ if (a != NULL) { /* if mul&reduce vs. reduce only */ carry = GET_DWORD(&w64[N_dwordlen]); Accel_SetABCD_Addr(CA_MK_OFF(&b64[j]), CA_MK_OFF(a64)); Accel_crypto_mul( Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(w64))); Accel_done(); /* max carry is contained since ~0*~0=0xFFFE0001+0xFFFF=0xFFFF0000, */ /* so max carry is 0xFFFF and 0xFFFF0000+0xFFFF=0xFFFFFFFF */ /* accel took care of w_out[N_wordlen] & +1, so we just take care of the next double word if carry=1 */ /* w64[N_dwordlen+1] = g_carry; */ carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry); SET_DWORD(&w64[N_dwordlen + 1U], carry); } /* Fast reduction using only shifts for this special shape: * (c - (-p^-1*c mod 2^64) * p)/2^64 = * (c - c_0 * p)/2^64 = * (\sum_{j=0}^9 c_i*2^64 - c_0 * p)/2^64 = * (\sum_{j=0}^9 c_i*2^64 - c_0 * (2^521-1))/2^64 = * (\sum_{j=0}^9 c_i*2^64 - c_0 * 2^521 - c_0)/2^64 = * c_1 + c_2*2^64 + c_3*2^128 + c_4*2^192 + c_5*2^256 + c_6*2^320 + c_7*2^384 + c_8*2^448 + c_9*2^512 + c_0 * * 2^{448 + 9} so one only needs to compute this 128-bit addition: [c_8, c_9] + c_0 * 2^9 */ uint64_t *p64 = (uint64_t *)(uintptr_t)T1; /* p64[0] = w64[0] << 9;*/ SET_DWORD(&p64[0], GET_DWORD(&w64[0]) << 9U); /* p64[1] = w64[0] >> (64 - 9); */ SET_DWORD(&p64[1], GET_DWORD(&w64[0]) >> (64 - 9)); /* w64[0] = w64[1]; */ SET_DWORD(&w64[0], GET_DWORD(&w64[1])); /* w64[1] = w64[2]; */ SET_DWORD(&w64[1], GET_DWORD(&w64[2])); /* w64[2] = w64[3]; */ SET_DWORD(&w64[2], GET_DWORD(&w64[3])); /* w64[3] = w64[4]; */ SET_DWORD(&w64[3], GET_DWORD(&w64[4])); /* w64[4] = w64[5]; */ SET_DWORD(&w64[4], GET_DWORD(&w64[5])); /* w64[5] = w64[6]; */ SET_DWORD(&w64[5], GET_DWORD(&w64[6])); /* w64[6] = w64[7]; */ SET_DWORD(&w64[6], GET_DWORD(&w64[7])); /* Compute p64 = p64 + {w64[8], w64[9]} using one additonal double-length limb, * where p64 = w64[0] * 2^9. 
*/ Accel_SetABCD_Addr(CA_MK_OFF(&w64[8]), 0); Accel_crypto_mul(Accel_IterOpcodeResaddr(2, (uint32_t)kCASPER_OpAdd64, /* kCASPER_OpAdd64, */ CA_MK_OFF(p64))); Accel_done(); /* w64[7] = p64[0]; */ SET_DWORD(&w64[7], GET_DWORD(&p64[0])); /* w64[8] = p64[1]; */ SET_DWORD(&w64[8], GET_DWORD(&p64[1])); /* w64[9] = 0; */ SET_DWORD(&w64[9], (uint64_t)0U); } /* memcpy(T1, w_out, (NUM_LIMBS + 1) * sizeof(uint32_t)); */ /* now check if need to subtract Nmod */ CASPER_MEMCPY_I2I(T1, w_out, (N_wordlen + 1U) * sizeof(uint32_t)); /* Compute w = w - N */ Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0); Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(w_out))); Accel_done(); /* if w_out > T1 then there was a borrow */ /* borrow = (((uint32_t*)w_out)[NUM_LIMBS] > T1[NUM_LIMBS]); */ borrow = (uint32_t)(GET_WORD(&((uint32_t *)w_out)[N_wordlen]) > GET_WORD(&T1[N_wordlen])); SET_WORD(&w_out[N_wordlen + 1U], 0); SET_WORD(&w_out[N_wordlen], 0); /* w_out[NUM_LIMBS + 1] = 0; */ /* w_out[NUM_LIMBS] = 0; */ casper_select(w_out, w_out, T1, (int32_t)borrow, (int32_t)N_wordlen); } #if defined(__GNUC__) /* End of enforcing O1 optimize level for gcc*/ #pragma GCC pop_options #endif #if (defined(__CC_ARM) || defined(__ARMCC_VERSION)) // End of enforcing optimize off for clang #pragma clang optimize on #endif