3434 lines
141 KiB
C
3434 lines
141 KiB
C
/*
|
|
* Copyright 2018-2021 NXP
|
|
* All rights reserved.
|
|
*
|
|
*
|
|
* SPDX-License-Identifier: BSD-3-Clause
|
|
*/
|
|
|
|
#include "fsl_casper.h"
|
|
#include <math.h> /* ceil TODO check if really need it */
|
|
|
|
/*******************************************************************************
|
|
* Definitions
|
|
******************************************************************************/
|
|
|
|
/* Component ID definition, used by tools. */
|
|
#ifndef FSL_COMPONENT_ID
|
|
#define FSL_COMPONENT_ID "platform.drivers.casper"
|
|
#endif
|
|
|
|
/* Recoding length for the secure scalar multiplication:
 * Use n=256 and w=4 --> compute ceil(256/3) = 86 + 1 digits
 * Use n=384 and w=4 --> compute ceil(384/3) = 128 + 1 digits
 * Use n=521 and w=4 --> compute ceil(521/3) = 174 + 1 digits
 */
|
|
|
|
/*!< Recoding length for the secure scalar multiplication */
|
|
enum _casper_ecc_recode_len
|
|
{
|
|
kCASPER_ECC_P256_recode_len = 87u,
|
|
kCASPER_ECC_P384_recode_len = 129u,
|
|
kCASPER_ECC_P521_recode_len = 175u,
|
|
};
|
|
|
|
enum _casper_ecc_N_bitlen
|
|
{
|
|
kCASPER_ECC_P256_N_bitlen = 256u,
|
|
kCASPER_ECC_P384_N_bitlen = 384u,
|
|
kCASPER_ECC_P521_N_bitlen = 576u,
|
|
};
|
|
|
|
enum _casper_ecc_N_wordlen
|
|
{
|
|
kCASPER_ECC_P256_wordlen = 256U / 32U,
|
|
kCASPER_ECC_P384_wordlen = 384u / 32U,
|
|
kCASPER_ECC_P521_wordlen = 576u / 32U,
|
|
};
|
|
|
|
#if defined(__GNUC__)
|
|
/* Enforce O1 optimize level, specifically to remove strict-aliasing option.
|
|
(-fno-strict-aliasing is required for this driver). */
|
|
#pragma GCC push_options
|
|
#pragma GCC optimize("-O1")
|
|
#endif
|
|
|
|
#if (defined(__CC_ARM) || defined(__ARMCC_VERSION))
|
|
/* Enforce optimization off for clang, specifically to remove strict-aliasing option.
|
|
(-fno-strict-aliasing is required for this driver). */
|
|
#pragma clang optimize off
|
|
#endif
|
|
|
|
/* CASPER driver allows usage of 256, 384 and 521 ECC */
|
|
#define CASPER_MAX_ECC_SIZE_WORDLEN (576u / 32U)
|
|
#define CASPER_RECODE_LENGTH_MAX 175
|
|
|
|
#define CASPER_RAM_BASE_NS (FSL_FEATURE_CASPER_RAM_BASE_ADDRESS)
|
|
|
|
#if defined(FSL_FEATURE_CASPER_RAM_IS_INTERLEAVED) && FSL_FEATURE_CASPER_RAM_IS_INTERLEAVED
|
|
#define CASPER_RAM_OFFSET (FSL_FEATURE_CASPER_RAM_OFFSET)
|
|
#define INTERLEAVE(addr) \
|
|
(((((((addr) >> 2U) & 0x00000001U) << CASPER_RAM_OFFSET) + (((addr) >> 3U) << 2U) + ((addr)&0x00000003U)) & \
|
|
0xFFFFU) | \
|
|
s_casperRamBase)
|
|
#define DEINTERLEAVE(addr) INTERLEAVE(addr)
|
|
#define GET_WORD(addr) (*((uint32_t *)DEINTERLEAVE((uint32_t)(addr))))
|
|
#define GET_DWORD(addr) (((uint64_t)GET_WORD(addr)) | (((uint64_t)GET_WORD(((uint32_t)(addr)) + 4U)) << 32U))
|
|
#define SET_WORD(addr, value) *((uint32_t *)INTERLEAVE((uint32_t)(addr))) = ((uint32_t)(value))
|
|
#define SET_DWORD(addr, value) \
|
|
do \
|
|
{ \
|
|
SET_WORD(addr, (uint32_t)(value & 0xFFFFFFFFU)); \
|
|
SET_WORD(((uint32_t)(addr)) + 4U, (uint32_t)((value & 0xFFFFFFFF00000000U) >> 32U)); \
|
|
} while (false)
|
|
|
|
/* memcopy is always word aligned */
|
|
/* interleaved to interleaved
|
|
static void CASPER_MEMCPY_I2I(void *dst, const void *src, size_t siz)
|
|
*/
|
|
#define CASPER_MEMCPY_I2I(dst, src, siz) \
|
|
\
|
|
{ \
|
|
uint32_t *dst32 = (uint32_t *)(dst); \
|
|
const uint32_t *src32 = (const uint32_t *)(const uint32_t *)(src); \
|
|
uint32_t i; \
|
|
for (i = 0U; i < (siz) / 4U; i++) \
|
|
{ \
|
|
SET_WORD(&dst32[i], GET_WORD(&src32[i])); \
|
|
} \
|
|
}
|
|
|
|
/* interleaved to non-interleaved
|
|
static void CASPER_MEMCPY_I2N(void *dst, const void *src, size_t siz)
|
|
*/
|
|
#define CASPER_MEMCPY_I2N(dst, src, siz) \
|
|
\
|
|
{ \
|
|
uint32_t *dst32 = (uint32_t *)(dst); \
|
|
const uint32_t *src32 = (const uint32_t *)(const uint32_t *)(src); \
|
|
uint32_t i; \
|
|
for (i = 0U; i < (siz) / 4U; i++) \
|
|
{ \
|
|
dst32[i] = GET_WORD(&src32[i]); \
|
|
} \
|
|
}
|
|
|
|
/* non-interleaved to interleaved
|
|
static void CASPER_MEMCPY_N2I(void *dst, const void *src, size_t siz)
|
|
*/
|
|
#define CASPER_MEMCPY_N2I(dst, src, siz) \
|
|
\
|
|
{ \
|
|
volatile uint32_t *dst32 = (uint32_t *)(dst); \
|
|
const uint32_t *src32 = (const uint32_t *)(const uint32_t *)(src); \
|
|
uint32_t i; \
|
|
for (i = 0U; i < (siz) / 4U; i++) \
|
|
{ \
|
|
SET_WORD(&dst32[i], src32[i]); \
|
|
} \
|
|
}
|
|
#else
|
|
#define GET_WORD(addr) (*((uint32_t *)(uint32_t)(addr)))
|
|
#define GET_DWORD(addr) (*((uint64_t *)(addr)))
|
|
#define SET_WORD(addr, value) *((uint32_t *)(uint32_t)(addr)) = ((uint32_t)(value))
|
|
#define SET_DWORD(addr, value) *((uint64_t *)(addr)) = ((uint64_t)(value))
|
|
|
|
#define CASPER_MEMCPY_I2I(dst, src, siz) (void)memcpy(dst, src, siz)
|
|
#define CASPER_MEMCPY_I2N(dst, src, siz) (void)memcpy(dst, src, siz)
|
|
#define CASPER_MEMCPY_N2I(dst, src, siz) (void)memcpy(dst, src, siz)
|
|
#endif
|
|
|
|
#define WORK_BUFF_MUL4 (N_wordlen_max * 4 + 2) /* ! working buffer is 4xN_wordlen to allow in place math */
|
|
#define N_bytelen (N_wordlen * 4U) /* for memory copy and the like */
|
|
#define N_dwordlen (unsigned)(N_wordlen / 2U)
|
|
|
|
#define PreZeroW(i, w_out) \
|
|
for ((i) = 0U; (i) < N_wordlen; (i) += 4U) \
|
|
{ \
|
|
SET_WORD(&(w_out)[(i) + 0U], 0U); \
|
|
SET_WORD(&(w_out)[(i) + 1U], 0U); \
|
|
SET_WORD(&(w_out)[(i) + 2U], 0U); \
|
|
SET_WORD(&(w_out)[(i) + 3U], 0U); \
|
|
} /* unrolled partly */
|
|
#define PreZeroW2up(i, w_out) \
|
|
for (i = N_wordlen; i <= N_wordlen * 2U; i += 4U) \
|
|
{ \
|
|
SET_WORD(&w_out[i + 0U], 0U); \
|
|
SET_WORD(&w_out[i + 1U], 0U); \
|
|
SET_WORD(&w_out[i + 2U], 0U); \
|
|
SET_WORD(&w_out[i + 3U], 0U); \
|
|
} /* unrolled partly */
|
|
|
|
/* Macros for the ECC component in Casper */
|
|
|
|
/* CASPER memory layout for ECC */
|
|
|
|
#define CASPER_MEM ((uint32_t *)msg_ret)
|
|
|
|
/* Currently these macros work on 32-bit platforms */
|
|
|
|
/* 32-bit addition with carry out:
 * (c1, c0) = a + b, where c1 receives the carry (0 or 1) and c0 the low word.
 * All arguments are parenthesized so expression arguments expand safely
 * (the original used bare `a + b`, a macro-precedence hazard). */
#define add(c1, c0, a, b)            \
    do                               \
    {                                \
        uint32_t _t;                 \
        _t   = (a) + (b);            \
        (c1) = (uint32_t)(_t < (a)); \
        (c0) = _t;                   \
    } while (false)

/* Alias: addition returning only carry out. */
#define add_cout(carry, c, a, b) add((carry), (c), (a), (b))

/* Addition with carry in and carry out:
 * (carryout, c) = a + b + carryin, done in 64-bit so the carry is exact. */
#define add_cout_cin(carryout, c, a, b, carryin)       \
    do                                                 \
    {                                                  \
        uint64_t _t = (uint64_t)(a) + (b) + (carryin); \
        (c)         = (uint32_t)_t;                    \
        (carryout)  = (uint32_t)(_t >> 32);            \
    } while (false)
|
|
|
|
/* 32-bit subtraction with borrow out:
 * (borrow, c) = a - b, borrow is 1 when b > a. */
#define sub_borrowout(borrow, c, a, b)       \
    do                                       \
    {                                        \
        uint32_t _b = (uint32_t)((b) > (a)); \
        (c)         = (a) - (b);             \
        (borrow)    = _b;                    \
    } while (false)

/* Subtraction with borrow in and borrow out:
 * (borrowout, c) = a - b - borrowin. */
#define sub_borrowin_borrowout(borrowout, c, a, b, borrowin) \
    do                                                       \
    {                                                        \
        uint32_t _t, _borrow1, _borrow2;                     \
        sub_borrowout(_borrow1, _t, (a), (b));               \
        sub_borrowout(_borrow2, (c), _t, (borrowin));        \
        (borrowout) = _borrow1 + _borrow2;                   \
    } while (false)

/* Variant with an implicit zero subtrahend: (borrow, c) = a - 0.
 * FIX: the original expansion read "c = a - b", but "b" is NOT a parameter
 * of this macro — it silently captured whatever identifier "b" happened to
 * be in scope at the expansion site. The local _b (== 0) is what was
 * intended, consistent with the explicit "_b = 0" initializer. */
#define sub_borrowout_1(borrow, c, a) \
    do                                \
    {                                 \
        uint32_t _b = 0;              \
        (c)         = (a) - _b;       \
        (borrow)    = _b;             \
    } while (false)

/* Borrow propagation only: (borrowout, c) = a - borrowin. */
#define sub_borrowin_borrowout_1(borrowout, c, a, borrowin) \
    do                                                      \
    {                                                       \
        uint32_t _t, _borrow1, _borrow2;                    \
        sub_borrowout_1(_borrow1, _t, (a));                 \
        sub_borrowout(_borrow2, (c), _t, (borrowin));       \
        (borrowout) = _borrow1 + _borrow2;                  \
    } while (false)
|
|
|
|
/* 32 x 32 --> 64-bit multiplication
 * (c1,c0) = a * b
 * Arguments are parenthesized BEFORE the uint64_t cast so that an
 * expression argument such as "x + y" is widened as a whole (the original
 * "(uint64_t)a * (uint64_t)b" would cast only the first token). */
#define mul(c1, c0, a, b)                     \
    do                                        \
    {                                         \
        uint64_t __m;                         \
        __m  = (uint64_t)(a) * (uint64_t)(b); \
        (c0) = (uint32_t)__m;                 \
        (c1) = (uint32_t)(__m >> 32U);        \
    } while (false)

/* Multiply-and-accumulate
 * (c1,c0) = a*b + c0
 * (uses while (false) for consistency with the other arithmetic macros) */
#define muladd(c1, c0, a, b)                   \
    do                                         \
    {                                          \
        uint32_t __ma = (c0);                  \
        mul((c1), (c0), (a), (b));             \
        (c0) = (c0) + __ma;                    \
        (c1) = (c1) + (uint32_t)((c0) < __ma); \
    } while (false)

/* Multiply-and-accumulate-accumulate
 * (c1,c0) = a*b + c0 + c1
 * Each accumulate can carry at most 1 into the high word; the sum cannot
 * overflow 64 bits since a*b <= (2^32-1)^2. */
#define muladdadd(c1, c0, a, b)                  \
    do                                           \
    {                                            \
        uint32_t __maa0 = (c0), __maa1 = (c1);   \
        mul((c1), (c0), (a), (b));               \
        (c0) = (c0) + __maa0;                    \
        (c1) = (c1) + (uint32_t)((c0) < __maa0); \
        (c0) = (c0) + __maa1;                    \
        (c1) = (c1) + (uint32_t)((c0) < __maa1); \
    } while (false)

/* Field-arithmetic shorthands built on the CASPER Montgomery primitives.
 * The curve modulus is kept at CASPER_MEM[N_wordlen + 4]. */
#define square_casper(c, a) multiply_casper(c, a, a)
#define sub_casper(c, a, b) CASPER_montsub(c, a, b, &CASPER_MEM[(N_wordlen + 4U)])
#define add_casper(c, a, b) CASPER_montadd(c, a, b, &CASPER_MEM[(N_wordlen + 4U)])
#define mul2_casper(c, a) add_casper(c, a, a)
#define half(c, a, b) CASPER_half(c, a, b)
|
|
/*******************************************************************************
|
|
* Variables
|
|
******************************************************************************/
|
|
|
|
/* The model for this algo is that it can be implemented for a fixed size RSA key */
|
|
/* for max speed. If this is made into a variable (to allow varying size), then */
|
|
/* it will be slower by a bit. */
|
|
/* The file is compiled with N_bitlen passed in as number of bits of the RSA key */
|
|
/* #define N_bitlen 2048 */
|
|
static size_t N_wordlen = 0U; /* ! number of words (e.g. 4096/32 is 128 words) */
|
|
|
|
static uint32_t s_casperRamBase = CASPER_RAM_BASE_NS;
|
|
static uint32_t *msg_ret = (uint32_t *)CASPER_RAM_BASE_NS;
|
|
|
|
/* NISTp-256 = 2^256-2^224+2^192+2^96-1 */
|
|
static uint32_t NISTp256[256 / 32u] = {0xffffffffU, 0xffffffffU, 0xffffffffU, 0x00000000,
|
|
0x00000000, 0x00000000, 0x00000001, 0xffffffffU};
|
|
|
|
/* The cardinality of the curve E(F_p) */
|
|
static uint32_t NISTp256_q[256 / 32u] = {0xfc632551U, 0xf3b9cac2U, 0xa7179e84U, 0xbce6faadU,
|
|
0xffffffffU, 0xffffffffU, 0x00000000, 0xffffffffU};
|
|
|
|
/* R = 2^256 mod p, the value "1" in Montgomery form. */
|
|
static uint32_t NISTr256[256 / 32u] = {0x00000001, 0x00000000, 0x00000000, 0xffffffffU,
|
|
0xffffffffU, 0xffffffffU, 0xfffffffeU, 0x00000000};
|
|
|
|
static uint32_t Np256[2] = {1, 0};
|
|
|
|
/* NISTp-384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 */
|
|
static uint32_t NISTp384[384 / 32u] = {0xffffffffU, 0x00000000, 0x00000000, 0xffffffffU, 0xfffffffeU, 0xffffffffU,
|
|
0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU};
|
|
|
|
/* The cardinality of the curve E(F_p) */
|
|
static uint32_t NISTp384_q[384 / 32u] = {0xccc52973U, 0xecec196aU, 0x48b0a77aU, 0x581a0db2U, 0xf4372ddfU, 0xc7634d81U,
|
|
0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU};
|
|
|
|
/* R = 2^384 mod p, the value "1" in Montgomery form. */
|
|
static uint32_t NISTr384[384 / 32u] = {0x00000001, 0xffffffffU, 0xffffffffU, 0x00000000, 0x1, 0, 0, 0, 0, 0, 0, 0};
|
|
|
|
// -p^-1 mod 2^64 = 0x100000001
|
|
static uint32_t Np384[2] = {1, 1};
|
|
|
|
/* NISTp-521 = 2^521 - 1 */
|
|
static uint32_t NISTp521[576 / 32U] = {0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU,
|
|
0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU,
|
|
0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0x1ffU, 0};
|
|
|
|
/* The cardinality of the curve E(F_p) */
|
|
static uint32_t NISTp521_q[576 / 32U] = {0x91386409U, 0xbb6fb71eU, 0x899c47aeU, 0x3bb5c9b8U, 0xf709a5d0U, 0x7fcc0148U,
|
|
0xbf2f966bU, 0x51868783U, 0xfffffffaU, 0xffffffffU, 0xffffffffU, 0xffffffffU,
|
|
0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU, 0x1ffU, 0};
|
|
|
|
/* R = 2^576 mod p, the value "1" in Montgomery form. */
|
|
static uint32_t NISTr521[576 / 32U] = {0, 0x800000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
|
|
|
/* -p^-1 mod 2^64 = 1 */
|
|
static uint32_t Np521[2] = {1, 0};
|
|
|
|
/*******************************************************************************
|
|
* Prototypes
|
|
******************************************************************************/
|
|
|
|
/* Convert a projective point (X1 : Y1 : Z1)
|
|
* to the affine point (X3, Y3) = (X1/Z1^2,Y1/Z1^3)
|
|
* The memory of (X3, Y3) and (X1 : Y1 : Z1) should not overlap
|
|
*/
|
|
void Jac_toAffine(uint32_t *X3, uint32_t *Y3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1);
|
|
|
|
/* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2 : Y2 : Z2)
|
|
* where (X1: Y1: Z1) != (X2 : Y2 : Z2)
|
|
* (X3 : Y3: Z3) may be the same as one of the inputs.
|
|
*/
|
|
void Jac_addition(uint32_t *X3,
|
|
uint32_t *Y3,
|
|
uint32_t *Z3,
|
|
uint32_t *X1,
|
|
uint32_t *Y1,
|
|
uint32_t *Z1,
|
|
uint32_t *X2,
|
|
uint32_t *Y2,
|
|
uint32_t *Z2);
|
|
|
|
/* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2, Y2)
|
|
* where (X1: Y1: Z1) != (X2, Y2)
|
|
* (X3 : Y3: Z3) may not overlap with (X1: Y1: Z1).
|
|
 * Source: 2004 Hankerson-Menezes-Vanstone, page 91.
|
|
*/
|
|
void Jac_add_affine(
|
|
uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1, uint32_t *X2, uint32_t *Y2);
|
|
|
|
/* Point doubling from: 2004 Hankerson-Menezes-Vanstone, page 91.
|
|
* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X1 : Y1 : Z1)
|
|
* (X3 : Y3: Z3) may be the same as the input.
|
|
*/
|
|
void Jac_double(uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1);
|
|
|
|
/* Constant time elliptic curve scalar multiplication.
|
|
* Source: https://eprint.iacr.org/2014/130.pdf
|
|
* when using w = 4.
|
|
* Computes (X3 : Y3 : Z3) = k * (X1, Y1) \in E(F_p)
|
|
* p is the prime used to define the finite field F_p
|
|
* q is the (prime) order of the curve
|
|
*/
|
|
void Jac_scalar_multiplication(
|
|
uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *k, uint32_t *p, uint32_t *q);
|
|
|
|
/* Compute the double scalar multiplication
|
|
* (X3 : Y3 : Z3) = k1 * (X1, Y1) + k2 * (X2, Y2)
|
|
* Using Shamir's trick and precomputing 16 points.
|
|
* This code is *not* constant time since this is used
|
|
* for verification only.
|
|
*/
|
|
void double_scalar_multiplication(uint32_t *X3,
|
|
uint32_t *Y3,
|
|
uint32_t *Z3,
|
|
uint32_t *X1,
|
|
uint32_t *Y1,
|
|
uint32_t *k1,
|
|
uint32_t *X2,
|
|
uint32_t *Y2,
|
|
uint32_t *k2);
|
|
|
|
/* Compute inversion modulo NIST-p384 using Fermats little theorem.
|
|
* Using c = a^(p-2) = a^(-1) mod p.
|
|
* This computes the modular inversion if all arithmetic is "regular"
|
|
* modular arithmetic or computes automatically the Montgomery inverse
|
|
* if all arithmetic is Montgomery arithmetic.
|
|
*/
|
|
static void invert_mod_p384(uint32_t *c, uint32_t *a);
|
|
|
|
/* Modular inversion for NIST-P256 */
|
|
static void invert_mod_p256(uint32_t *c, uint32_t *a);
|
|
|
|
/* Modular inversion for NIST-P521 */
|
|
static void invert_mod_p521(uint32_t *c, uint32_t *a);
|
|
|
|
// A and C do not need to be in Casper memory
|
|
static void toMontgomery_ECC_P256(uint32_t *C, uint32_t *A);
|
|
static void toMontgomery_ECC_P384(uint32_t *C, uint32_t *A);
|
|
static void toMontgomery_ECC_P521(uint32_t *C, uint32_t *A);
|
|
|
|
static void CASPER_montsub(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod);
|
|
static void CASPER_montadd(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod);
|
|
|
|
/* Compute c = a/2 mod p where b is scratch space. */
|
|
static void CASPER_half(uint32_t *c, uint32_t *a, uint32_t *b);
|
|
|
|
void CASPER_MEMCPY(void *dst, const void *src, size_t siz);
|
|
|
|
static void multiply_casper(uint32_t w_out[], const uint32_t a[], const uint32_t b[]);
|
|
|
|
static uint8_t int8abs(int8_t v);
|
|
|
|
/* Constant time select c = a if m = 0 or
|
|
* c = b if m = 1
|
|
* a, b, c are n words
|
|
*/
|
|
static void casper_select(uint32_t *c, uint32_t *a, uint32_t *b, int m, int n);
|
|
|
|
/* Dumb n-limb addition of c=a+b, return carry. */
|
|
static uint32_t add_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n);
|
|
|
|
#if 0
|
|
/* Dumb n-limb addition of c=a+b, return carry. */
|
|
static uint32_t add_n(uint32_t *c, uint32_t *a, uint32_t *b, int n);
|
|
|
|
/* Dumb n-limb subtraction of c=a-b, return borrow. */
|
|
static uint32_t sub_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n);
|
|
#endif
|
|
|
|
/* Dumb n-limb subtraction of c=a-b, return borrow. */
|
|
static uint32_t sub_n(uint32_t *c, uint32_t *a, uint32_t *b, int n);
|
|
|
|
int RSA_SignatureToPlaintextFast(const unsigned signature[N_wordlen_max],
|
|
const unsigned exp_pubkey,
|
|
const unsigned pubkey[N_wordlen_max],
|
|
unsigned MsgRet[WORK_BUFF_MUL4]);
|
|
|
|
int RSA_MontSignatureToPlaintextFast(const unsigned mont_signature[N_wordlen_max],
|
|
const unsigned exp_pubkey,
|
|
const unsigned pubkey[N_wordlen_max],
|
|
unsigned MsgRet[WORK_BUFF_MUL4]);
|
|
|
|
void MultprecMultiply(unsigned w_out[], const unsigned u[], const unsigned v[]);
|
|
|
|
void MultprecGenNp64(const unsigned *Nmod, unsigned *np64_ret);
|
|
void MultprecMontPrepareX(unsigned Xmont_out[], const unsigned x[], const unsigned Rp[], const unsigned Nmod[]);
|
|
void MultprecModulo(unsigned r_out[], const unsigned v[], int top);
|
|
void MultprecCiosMul(
|
|
unsigned w_out[], const unsigned a[], const unsigned b[], const unsigned Nmod[], const unsigned *Np);
|
|
void MultprecMontCalcRp(unsigned Rp[], const unsigned exp_pubkey, const unsigned Nmod[]);
|
|
|
|
static void MultprecCiosMul_ct(
|
|
uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np);
|
|
|
|
static void MultprecCiosMul521_ct(
|
|
uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np);
|
|
|
|
static void shiftrightSysram(uint32_t *z, uint32_t *x, uint32_t c);
|
|
static void shiftright(uint32_t *z, uint32_t *x, uint32_t c);
|
|
static void shiftleft(uint32_t *z, uint32_t *x, uint32_t c);
|
|
|
|
/*******************************************************************************
|
|
* Code
|
|
******************************************************************************/
|
|
|
|
/* Translate a pointer into CASPER RAM to a byte offset from the RAM base,
 * the form expected by the accelerator CTRL0/CTRL1 address fields. */
__STATIC_FORCEINLINE uint32_t CA_MK_OFF(const void *addr)
{
    uint32_t byteAddr = (uint32_t)(const uint32_t *)addr;
    return byteAddr - s_casperRamBase;
}
|
|
|
|
#if 1
|
|
/* Busy-wait until the CASPER accelerator reports completion of the
 * previously issued operation (DONE bit set in the STATUS register). */
__STATIC_FORCEINLINE void Accel_done(void)
{
    register uint32_t status;
    do
    {
        /* poll the hardware status register */
        status = CASPER->STATUS;
    } while (0U == (status & CASPER_STATUS_DONE_MASK));
}
|
|
|
|
/* Program operand addresses (as CASPER-RAM byte offsets, see CA_MK_OFF) for
 * the next accelerator operation: AB in the low half-word of CTRL0, CD in
 * the high half-word. */
__STATIC_FORCEINLINE void Accel_SetABCD_Addr(uint32_t ab, uint32_t cd)
{
    CASPER->CTRL0 = ab | (cd << 16); /* CDoffset << 16 | ABoffset */
}
|
|
|
|
/* Start an accelerator operation by writing the packed
 * iteration/opcode/result-offset command word (built with
 * Accel_IterOpcodeResaddr) to CTRL1. Completion is polled via Accel_done. */
__STATIC_FORCEINLINE void Accel_crypto_mul(uint32_t ctrl1)
{
    CASPER->CTRL1 = ctrl1;
}
|
|
#else
|
|
#include "intrinsics.h"
|
|
#define Accel_done() \
|
|
{ \
|
|
register uint32_t status; \
|
|
do \
|
|
{ \
|
|
status = CASPER_Rd32b(CASPER_CP_STATUS); \
|
|
} while (0 == (status & CASPER_STATUS_DONE_MASK)); \
|
|
}
|
|
#if 0
|
|
__STATIC_FORCEINLINE void Accel_done(void)
|
|
{
|
|
register uint32_t status;
|
|
do
|
|
{
|
|
status = CASPER->STATUS;
|
|
} while (0 == (status & CASPER_STATUS_DONE_MASK));
|
|
}
|
|
#endif
|
|
#define Accel_SetABCD_Addr(ab, cd) CASPER_Wr32b((uint32_t)ab | ((uint32_t)cd << 16), CASPER_CP_CTRL0);
|
|
#define Accel_crypto_mul(ctrl1) CASPER_Wr32b((uint32_t)ctrl1, CASPER_CP_CTRL1);
|
|
#endif
|
|
|
|
/* Pack a CTRL1 command word: iteration count (ITER field), operation code
 * (MODE field), and the result's CASPER-RAM byte offset in the upper
 * half-word. */
__STATIC_FORCEINLINE uint32_t Accel_IterOpcodeResaddr(uint32_t iter, uint32_t opcode, uint32_t resAddr)
{
    return CASPER_CTRL1_ITER(iter) | CASPER_CTRL1_MODE(opcode) | (resAddr << 16);
}
|
|
|
|
void CASPER_MEMCPY(void *dst, const void *src, size_t siz)
|
|
{
|
|
bool bdst =
|
|
((((uint32_t)(uint32_t *)dst) | 0x10000000u) >= ((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) &&
|
|
(((uint32_t)(uint32_t *)dst) | 0x10000000u) <
|
|
((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) + 8u * 1024u);
|
|
|
|
bool bsrc = ((((uint32_t)(const uint32_t *)src) | 0x10000000u) >=
|
|
((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) &&
|
|
(((uint32_t)(const uint32_t *)src) | 0x10000000u) <
|
|
((unsigned)FSL_FEATURE_CASPER_RAM_BASE_ADDRESS | 0x10000000u) + 8u * 1024u);
|
|
|
|
if (bdst && bsrc)
|
|
{
|
|
CASPER_MEMCPY_I2I(dst, src, siz);
|
|
}
|
|
else if (bdst && !bsrc)
|
|
{
|
|
CASPER_MEMCPY_N2I(dst, src, siz);
|
|
}
|
|
else if (!bdst && bsrc)
|
|
{
|
|
CASPER_MEMCPY_I2N(dst, src, siz);
|
|
}
|
|
else
|
|
{
|
|
(void)memcpy(dst, src, siz);
|
|
}
|
|
}
|
|
|
|
/* Constant time select c = a if m = 0 or
 * c = b if m = 1
 * a, b, c are n words
 */
static void casper_select(uint32_t *c, uint32_t *a, uint32_t *b, int m, int n)
{
    /* maskB is all-ones when m == 1, all-zeros when m == 0; maskA is its
     * complement. The word-wise blend below is branch-free so execution
     * time does not depend on m. */
    uint32_t maskB = 0U - (uint32_t)m;
    uint32_t maskA = ~maskB;
    int idx;

    for (idx = 0; idx < n; idx++)
    {
        uint32_t word = (GET_WORD(&a[idx]) & maskA) | (GET_WORD(&b[idx]) & maskB);
        SET_WORD(&c[idx], word);
    }
}
|
|
|
|
/* Compute R`, which is R mod N. This is done using subtraction */
/* R has 1 in N_wordlen, but we do not fill it in since borrowed. */
/* Exp-pubkey only used to optimize for exp=3 (currently unused in this body) */
void MultprecMontCalcRp(unsigned Rp[], const unsigned exp_pubkey, const unsigned Nmod[])
{
    uint32_t i;

    /* R is 2^n where n is 1 bit longer than Nmod, so 1 followed by 32 or 64 0 words for example */
    /* Note that Nmod's upper most bit has to be 1 by definition, so one subtract is enough. We */
    /* do not set the 1 since it is "borrowed" so no point */
    PreZeroW(i, Rp);
    /* Accelerator computes Rp = 0 - Nmod over N_dwordlen double-words,
     * i.e. R - Nmod with the implicit top bit supplying the borrow. */
    Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(Rp)));
    Accel_done();
    /* final borrow cannot happen since we know we started with a larger number */
}
|
|
|
|
/* MultprecMultiply - multiply w = u * v (per Knuth) */
/* w_out is 2x the size of u and v */
/* Runs one accelerator 64x64 multiply-accumulate pass per double-word of v. */
void MultprecMultiply(unsigned w_out[], const unsigned u[], const unsigned v[])
{
    uint32_t i, j;

    /* Knuth 4.3.1 - Algorithm M */
    /* Compute w = u * v */
    /* u and v are N bits long in 32 bit word form */
    /* w is 2*N bits long in 32 bit word form */
    /* Note: We just multiply in place */

    /* Step 1. Fill w[t-1:0] with 0s, the upper half will be written as we go */
    PreZeroW(i, w_out);

    /* We do 1st pass NOSUM so we do not have to 0 output */
    Accel_SetABCD_Addr(CA_MK_OFF(&v[0]), CA_MK_OFF(u));
    Accel_crypto_mul(
        Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpMul6464NoSum, CA_MK_OFF(&w_out[0])));
    Accel_done();
    /* Step 2. iterate over N words of v using j (two words = one accel double-word per pass) */
    for (j = 2U; j < N_wordlen; j += 2U)
    {
        /* Step 2b. Check for 0 on v word - skip if so since we 0ed already */
        /* Step 3. Iterate over N words of u using i - perform Multiply-accumulate */
        if (0U != (GET_WORD(&v[j])) || 0U != (GET_WORD(&v[j + 1U])))
        {
            Accel_SetABCD_Addr(CA_MK_OFF(&v[j]), CA_MK_OFF(u));
            Accel_crypto_mul(
                Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpMul6464Sum, CA_MK_OFF(&w_out[j])));
            Accel_done();
        }
    }
}
|
|
|
|
/* MultprecModulo performs divide to get remainder as needed for RSA */
/* This performs (q,r) = u/v, but we do not keep q */
/* r_out is modulo (remainder) and is 2*N */
/* u is in r_out (1st N) at start (passed in) */
/* v is N long */
/* Knuth TAOCP Vol.2, 4.3.1 Algorithm D, specialized: quotient digits are */
/* estimated 16 bits at a time because the Cortex-M divide is 32/32=32. */
void MultprecModulo(unsigned r_out[], const unsigned v[], int top)
{
    uint64_t u64;                      /* use 64 bit math mixed with 32 bit */
    unsigned u32;                      /* allows us to work on U in 32 bit */
    unsigned u_n, ul16, uh16, *u_shft; /* u_shft is because r_out is u initially */
    unsigned vl16, vh16, v_Nm1;
    unsigned q_hat, r_hat, q_over;
    unsigned borrow, carry;
    uint32_t i;
    int j, tmp;

    /* Knuth 4.3.1 - Algorithm D */
    /* Compute q = u / v giving remainder r = u mod v */
    /* -- we only want r, so we build qhat but do not store the Qs */
    /* v is N long, with u,q,r 2N long because u is slowly replaced by r. */
    /* We normalize/unnormalize per Knuth in the buffer (not copied) */

    /* Step 1. Normalize value so MSb is in v[n-1]. Remember that v is */
    /* the public key - to call it a 2048 bit number, they cannot have 0 */
    /* in the MSb (or it would be less than 2048 bits) and so we know we */
    /* are normalized already. Therefore, u is effectively shifted already. */
    /* For u, we have it in r_out. u[n] holds any overflow */
    /* Since divide on CM3/4 is 32/32=32, we break into 16 bit halves, but */
    /* multiply can be 32x32=64. */
    u_n = 0;
    u_shft = r_out; /* u (shifted) is in r_out */

    v_Nm1 = GET_WORD(&v[N_wordlen - 1U]); /* MSw of public key */
    vl16 = v_Nm1 & 0xFFFFU;               /* lower 16 */
    vh16 = v_Nm1 >> 16;                   /* upper 16 */
    /* Step 2. Iterate j from m-n down to 0 (M selected per Knuth as 2*N) */
    for (j = top; j >= 0; j--)
    {
        /* Step 3. estimate q_hat as (U[j+n]*B + U[j+n-1]) / V[n-1] */
        /* Note: using subset of Knuth algo since v is 1/2 len of u (which is */
        /* from multiply or x^2 leading into this). */
        u32 = u_n; /* pickup u4u3u2, knowing u4 is 0 */
        u64 = ((uint64_t)u_n << 32) | GET_WORD(&u_shft[(uint32_t)j + N_wordlen - 1U]);
        ul16 = (unsigned int)(u64 & 0xFFFFU);         /* lower 16 */
        uh16 = (unsigned int)((u64 >> 16) & 0xFFFFU); /* upper 16 */

        /* we see if even possible (u large enough relative to v) */
        if ((u32 - v_Nm1) <= u32)
        {
            u32 -= v_Nm1;
            q_over = 1; /* overflow from the sub */
        }
        else
        {
            q_over = 0;
        }
        /* q_hat = u32 / vh16 -- is the upper partial value */
        /* estimate; if too much, then back down by 1 or 2 */
        q_hat = u32 / vh16;
        r_hat = u32 - (q_hat * vh16);
        /* see if Q is more than 16 bits or remainder is too large (over div) */
        if ((q_hat == 0x10000U) || ((q_hat * vl16) > ((r_hat << 16) | uh16)))
        {
            /* too much - undo a division */
            q_hat--;
            r_hat += vh16;
            /* check if still too much */
            if ((r_hat < 0x10000U) && ((q_hat * vl16) > ((r_hat << 16) | uh16)))
            {
                q_hat--; /* yes, so undo a 2nd */
            }
        }

        /* compose u3u2uh16, then sub q_hat*v if OK */
        u64 = (((uint64_t)u32 << 16) | uh16) - ((uint64_t)q_hat * v_Nm1);
        if (0U != (u64 >> 48))
        {
            /* no, so add v back */
            u32 = (unsigned)(u64 + v_Nm1);
            q_hat--;
        }
        else
        {
            u32 = (unsigned)u64;
        }
        tmp = (int32_t)(uint32_t)(q_hat << 16); /* quotient upper part */

        /* divide lower part: q = u2uh16ul16 / v. */
        /* estimate and add back if over divided */
        q_hat = u32 / vh16;
        r_hat = u32 - (q_hat * vh16);
        if ((q_hat == 0x10000U) || ((q_hat * vl16) > ((r_hat << 16) | ul16)))
        {
            /* too much - undo a division */
            q_hat--;
            r_hat += vh16;
            /* check if still too much */
            if ((r_hat < 0x10000U) && ((q_hat * vl16) > ((r_hat << 16) | ul16)))
            {
                q_hat--; /* yes, so undo a 2nd */
            }
        }

        /* compose u2uh16ul16, then sub q_hat*v if OK */
        u64 = (((uint64_t)u32 << 16) | ul16) - ((uint64_t)q_hat * v_Nm1);
        if (0U != (u64 >> 48))
        {
            /* no, so add v back */
            r_hat = (unsigned)(u64 + v_Nm1);
            q_hat--;
        }
        else
        {
            r_hat = (unsigned)u64;
        }
        q_hat |= (unsigned)tmp; /* other half of the quotient */
        /* Knuth step D3 correction: lower q_hat while the two-word test shows it
         * is still too large. */
        while ((q_over != 0U) || ((uint64_t)q_hat * GET_WORD(&v[N_wordlen - 2U])) >
                                     ((1ULL << 32) * r_hat) + (uint64_t)GET_WORD(&u_shft[(uint32_t)j + N_wordlen - 2U]))
        { /* if Qhat>b, then reduce to b-1, then adjust up Rhat */
            q_hat--;
            r_hat += v_Nm1;
            if (r_hat < v_Nm1)
            {
                break; /* no overflow */
                /* else repeat since Rhat >= b */
            }
        }

        /* Step 4. Multiply and subtract. We know the amount, */
        /* so we do the schoolboy math. Have to do on */
        /* the large value. */
        if (q_hat != 0U)
        {
            borrow = 0;
            for (i = 0; i < N_wordlen; i++)
            {
                u64 = (uint64_t)q_hat * GET_WORD(&v[i]) + borrow;
                borrow = (unsigned)(u64 >> 32);
                if (GET_WORD(&u_shft[i + (unsigned)j]) < (unsigned)u64)
                {
                    borrow++; /* carry the overflow */
                }
                SET_WORD(&u_shft[i + (unsigned)j], GET_WORD(&u_shft[i + (unsigned)j]) - (unsigned)u64);
            }
            u_n -= borrow; /* overflow from shift left does not fit otherwise */
        }

        /* Store 5. (update Q - we don't), and add back V to remainder if we over-subtracted */
        /* That restores remainder to correct (we could only be off by 1) */
        /* This should happen very rarely. */
        if (u_n != 0U)
        {
            carry = 0;
            for (i = 0; i < N_wordlen; i++)
            {
                SET_WORD(&u_shft[i + (unsigned)j], GET_WORD(&u_shft[i + (unsigned)j]) + carry);
                carry = (GET_WORD(&u_shft[i + (unsigned)j]) < carry) ? 1U : 0U;
                SET_WORD(&u_shft[i + (unsigned)j], GET_WORD(&u_shft[i + (unsigned)j]) + GET_WORD(&v[i]));
                if (GET_WORD(&u_shft[i + (unsigned)j]) < GET_WORD(&v[i]))
                {
                    carry++;
                }
            }
        }
        u_n = GET_WORD(
            &u_shft[(uint32_t)j + N_wordlen - 1U]); /* hold upper part of u to catch overflow (to borrow from) */
    }
    /* low N bits of r are valid as remainder */
}
|
|
|
|
/* We convert X into a Mont form number. Note length of arrays: */
/* x is N_wordlen, Nmod is N_wordlen */
/* Rp is N_wordlen (it is R` which is R mod N) */
/* Xmont_out is N_wordlen*2+1 */
void MultprecMontPrepareX(unsigned Xmont_out[], const unsigned x[], const unsigned Rp[], const unsigned Nmod[])
{
    /* X_mont = X * R mod N: multiply by R' (= R mod N), then reduce mod N. */
    MultprecMultiply(Xmont_out, x, Rp);
    MultprecModulo(Xmont_out, Nmod, (int32_t)N_wordlen);
}
|
|
|
|
/* Compute the Montgomery constant N' = -N^(-1) mod 2^64 from the low
 * double-word of the modulus. The inverse is built by Hensel lifting:
 * starting from an inverse mod 2^4, each COMP_NPN_1 step doubles the
 * number of correct low bits (4 -> 8 -> 16 -> 32 -> 64), then the result
 * is negated (two's complement). */
void MultprecGenNp64(const unsigned *Nmod, unsigned *np64_ret) /* only pass the low order double word */
{
    uint64_t nprime, Nmod_0;
    Nmod_0 = GET_WORD(&Nmod[0]) | ((uint64_t)GET_WORD(&Nmod[1]) << 32);

#define COMP_NPN_1 ((2U - Nmod_0 * nprime) * nprime) /* computes N`*N0=1 mod 2^P where P is the partial built up */
    nprime = (((2U + Nmod_0) & 4U) << 1) + Nmod_0; /* mod 2^4 */
    nprime = COMP_NPN_1;
    nprime = COMP_NPN_1;
    nprime = COMP_NPN_1;
    nprime = COMP_NPN_1;
    /* 8 multiplies of uint64_t */
    /* negate: N' = 2^64 - nprime */
    *((uint64_t *)(uintptr_t)np64_ret) = (~0ULL - nprime) + 1ULL;
}
|
|
|
|
/* CIOS Multiply. This is the Coarse Integrated form where the values are */
/* multiplied and reduced for each step of "i". This uses less memory and */
/* is faster as a result. Note that this is used to square as well as mul, */
/* so not as fast as pure squaring could be. */
/* Montgomery multiplication w_out = a * b * R^(-1) mod Nmod, with the */
/* per-iteration reduction factor m = w[0] * Np mod 2^64 (double-word CIOS). */
/* Passing a == NULL performs reduction only on the value already in w_out. */
void MultprecCiosMul(
    unsigned w_out[], const unsigned a[], const unsigned b[], const unsigned Nmod[], const unsigned *Np)
{
    int j;
    uint32_t i;
    uint64_t *m64 = (uint64_t *)(uintptr_t)&msg_ret[kCASPER_RamOffset_M64];
    uint64_t Np64;
    uint64_t carry;
    uint64_t *a64, *b64, *w64, *N64;

    Np64 = *(uint64_t *)(uintptr_t)Np;

    /* Re-view the 32-bit operand arrays as 64-bit limbs for the accelerator. */
    a64 = (uint64_t *)(uintptr_t)a;
    b64 = (uint64_t *)(uintptr_t)b;
    w64 = (uint64_t *)(uintptr_t)w_out;
    N64 = (uint64_t *)(uintptr_t)Nmod;

    if (a != NULL)
    { /* if !a, we are reducing only */
        PreZeroW(i, w_out);
    }
    SET_DWORD(&w64[N_dwordlen], 0ULL);
    SET_DWORD(&w64[N_dwordlen + 1U], 0ULL);
    /* with accelerator */

    /* loop i and then reduce after each j round */
    for (i = 0; i < N_dwordlen; i++)
    {
        /* Step 3. Iterate over N words of u using i - perform Multiply-accumulate */
        /* push-pull: we do a*b and then separately m*n (reduce) */
        if (a != NULL)
        { /* if mul&reduce vs. reduce only */
            carry = GET_DWORD(&w64[N_dwordlen]);
            Accel_SetABCD_Addr(CA_MK_OFF(&b64[i]), CA_MK_OFF(a64));
            Accel_crypto_mul(
                Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(w64)));
            Accel_done();
            /* max carry is contained since ~0*~0=0xFFFE0001+0xFFFF=0xFFFF0000, */
            /* so max carry is 0xFFFF and 0xFFFF0000+0xFFFF=0xFFFFFFFF */
            /* accel took care of w_out[N_wordlen] & +1, so we just take care of the next double word if carry=1 */
            /* w64[N_dwordlen+1] = g_carry; */
            carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);
            SET_DWORD(&w64[N_dwordlen + 1U], carry);
        }
        SET_DWORD(&m64[0], GET_DWORD(&w64[0]) * Np64); /* prime for 1st; modulo a double-word */

        /* we are reducing, so the 1st [0th] 64 bit value product is tossed, but we */
        /* need its carry. We let the accel do this separately - really need a mode to */
        /* do this "reduce" since it is natural */
        carry = GET_DWORD(&w64[N_dwordlen]);
        Accel_SetABCD_Addr(CA_MK_OFF(m64), CA_MK_OFF(&N64[0]));
        Accel_crypto_mul(
            Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(&w64[0])));
        Accel_done();
        carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);

        /* shift w right by one double-word (drop the now-zero limb 0) */
        Accel_SetABCD_Addr(CA_MK_OFF(&w64[1]), 0);
        Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpCopy, CA_MK_OFF(&w64[0])));

        Accel_done();
        SET_DWORD(&w64[N_dwordlen], (GET_DWORD(&w64[N_dwordlen + 1U]) + carry));
    }

    /* now check if need to subtract Nmod (final conditional subtraction) */
    if (0U != (GET_WORD(&w_out[N_wordlen])))
    {
        j = 1; /* we have to subtract for sure if carry up */
    }
    else
    {
        j = 0;
        /* compare w_out against Nmod from the most significant word down */
        for (i = N_wordlen - 1U; i > 0U; i--)
        {
            if (GET_WORD(&w_out[i]) != GET_WORD(&Nmod[i]))
            {
                j = (int32_t)(GET_WORD(&w_out[i]) > GET_WORD(&Nmod[i])); /* if larger sub */
                break; /* we would remove the break if worrying about side channel */
            }
        }
    }
    if (0 == j)
    {
        return; /* Is smaller than Nmod, so done. */
    }
    Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(w_out)));
    Accel_done();
    /* last borrow is OK since we know it could only be <2N and */
}
|
|
|
|
/* RSA_MontSignatureToPlaintextFast: */
|
|
/* MsgRet[] = Message return buffer - must be large enough to hold input and output (4*N+2) */
|
|
/* exp_pubkey = the "e" that the value is raised to. Usually 3 or 0x10001. */
|
|
/* signature = N bitpos len long "message" to process in Montgomery form - so saving conversion (divide) */
|
|
/* pubkey = N bitpos len long public key to process signature with */
|
|
/* returns: 0 */
|
|
/* */
|
|
/* Algo: compute M = signaturen^e mod public_key */
|
|
/* where M is original plaintext, signature is signed value */
|
|
/* note: e is usually either 0x3 or 0x10001 */
|
|
/* Modular exponentiation in the Montgomery domain:
 * computes Result = mont_signature^exp_pubkey mod pubkey using left-to-right
 * square-and-multiply, then one extra CIOS pass to divide R back out.
 * The "base" value ping-pongs between the Base and TempBase regions of MsgRet
 * (selected by bidx) because the CIOS multiplier needs a distinct destination. */
int RSA_MontSignatureToPlaintextFast(const unsigned mont_signature[N_wordlen_max],
                                     const unsigned exp_pubkey,
                                     const unsigned pubkey[N_wordlen_max],
                                     unsigned MsgRet[WORK_BUFF_MUL4])
{
    int bidx = 0;     /* selects which of Base/TempBase currently holds the live base */
    int bitpos;       /* index of the exponent bit being processed (MSb-1 down to 0) */
    unsigned np64[2]; /* N` = -N^-1 mod 2^64, derived from the low 64 bits of pubkey */

    /* MsgRet working area: */
    /* 0..N = RESULT, starting with S` */
    /* N..N*2 = S` and then working BASE during math. */
    /* N*2..N*4+2 = temp working area for Mont mul */

    /* 1. Copy sig into MsgRet so we have one working result buffer */
    CASPER_MEMCPY_I2I((uint32_t *)(uintptr_t)&MsgRet[kCASPER_RamOffset_Result],
                      (const uint32_t *)(uintptr_t)mont_signature, N_bytelen);
    MultprecGenNp64(pubkey, np64); /* Generate N` from LSW of N (LSW being lowest 64b word) */
    bitpos = (int8_t)(uint8_t)(31U - __CLZ(exp_pubkey)); /* count of bits after the left most 1 */
    while (--bitpos >= 0)
    {
        /* This operates on: */
        /* result = 1; */
        /* base = signature */
        /* loop while exponent bits from MSb to LSb */
        /* if (exp bit is 1) */
        /* result = result * base */
        /* base = base^2 */
        /* Because the MSb of exp is always 1 by definition, we can invert this a bit: */
        /* base = signature` */
        /* result = base; equivalent to result = 1*base from 1st pass, but now square is needed 1st */
        /* loop while exponent bits from MSb-1 to LSb */
        /* base = base^2 */
        /* if (exp bit is 1) */
        /* result = result * base */
        /* This ends up doing the same thing but skips two wasteful steps of multiplying by 1 and */
        /* a final squaring never used. */
        /* */
        /* Next we have the problem that CIOS mul needs a separate dest buffer. So, we bounce */
        /* base between base and temp, and likewise for result. */

        /* Square the live base into the other buffer (dest must differ from sources). */
        MultprecCiosMul(&MsgRet[(bidx != 0) ? kCASPER_RamOffset_Base : kCASPER_RamOffset_TempBase],
                        &MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base],
                        &MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base], pubkey, np64);
        if (0U != (exp_pubkey & (uint32_t)(uint8_t)(1U << (uint8_t)bitpos))) /* where e is 1 */
        {
            /* result has result, so we need to work into other temp area */
            MultprecCiosMul(&MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base],
                            &MsgRet[kCASPER_RamOffset_Result],
                            &MsgRet[(bidx != 0) ? kCASPER_RamOffset_Base : kCASPER_RamOffset_TempBase], pubkey, np64);
            /* we have to copy back to result */

            // CASPER_MEMCPY_I2I(&MsgRet[kCASPER_RamOffset_Result],
            //                   &MsgRet[bidx ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base], N_bytelen);
        }
        else
        {
            /* Exponent bit is 0: just flip which buffer holds the base for the next pass. */
            bidx = (int32_t)(uint32_t) ~(unsigned)bidx;
        }
    }

    /* Pull the final value (whichever buffer it landed in) into the Result region. */
    CASPER_MEMCPY_I2I((uint32_t *)(uintptr_t)&MsgRet[kCASPER_RamOffset_Result],
                      (uint32_t *)(uintptr_t)&MsgRet[(bidx != 0) ? kCASPER_RamOffset_TempBase : kCASPER_RamOffset_Base],
                      N_bytelen);

    /* final step is one more reduction to get back to normal form (ie. divide R out) */
    MultprecCiosMul(&MsgRet[kCASPER_RamOffset_Result], NULL, NULL, pubkey, np64);
    return (0); /* always 0 */
}
|
|
|
|
/* RSA_SignatureToPlaintextFast: */
|
|
/* MsgRet[] = Message return buffer - must be large enough to hold input and output (4*N+2) */
|
|
/* exp_pubkey = the "e" that the value is raised to. Usually 3 or 0x10001. */
|
|
/* signature = N bitpos len long "message" to process in normal form - so converted to Mont form */
|
|
/* pubkey = N bitpos len long public key to process signature with */
|
|
/* returns: 0 */
|
|
/* */
|
|
/* Algo: compute M = signaturen^e mod public_key */
|
|
/* where M is original plaintext, signature is signed value */
|
|
/* note: e is usually either 0x3 or 0x10001 */
|
|
int RSA_SignatureToPlaintextFast(const unsigned signature[N_wordlen_max],
|
|
const unsigned exp_pubkey,
|
|
const unsigned pubkey[N_wordlen_max],
|
|
unsigned MsgRet[WORK_BUFF_MUL4])
|
|
{
|
|
/* MsgRet working area: */
|
|
/* 0..N = RESULT, starting with S`; it is used for R` just during creation of S` */
|
|
/* N..N*2 = S` and then working BASE during math. Note overflow beyond N*2 when making S` */
|
|
/* N*2..N*4+2 = temp working area for Mont mul */
|
|
|
|
MultprecMontCalcRp(&MsgRet[kCASPER_RamOffset_Result], exp_pubkey, pubkey); /* calculate R` (=R mod N) */
|
|
MultprecMontPrepareX(&MsgRet[kCASPER_RamOffset_Base], signature, &MsgRet[kCASPER_RamOffset_Result],
|
|
pubkey); /* X*R1` mod N */
|
|
return (RSA_MontSignatureToPlaintextFast(&MsgRet[kCASPER_RamOffset_Base], exp_pubkey, pubkey, MsgRet));
|
|
}
|
|
|
|
/*!
|
|
* brief Performs modular exponentiation - (A^E) mod N.
|
|
*
|
|
* This function performs modular exponentiation.
|
|
*
|
|
* param base CASPER base address
|
|
* param signature first addend (in little endian format)
|
|
* param pubN modulus (in little endian format)
|
|
* param wordLen Size of pubN in bytes
|
|
* param pubE exponent
|
|
* param[out] plaintext Output array to store result of operation (in little endian format)
|
|
*/
|
|
void CASPER_ModExp(
    CASPER_Type *base, const uint8_t *signature, const uint8_t *pubN, size_t wordLen, uint32_t pubE, uint8_t *plaintext)
{
/* Staging locations inside CASPER RAM (msg_ret) for the modulus and signature.
   NOTE(review): these macros remain defined for the rest of the file (no #undef);
   later code may depend on them, so they are left in place. */
#define PK_LOC &msg_ret[kCASPER_RamOffset_Modulus]
#define SIG_LOC &msg_ret[(unsigned)kCASPER_RamOffset_Modulus + N_wordlen_max]

    N_wordlen = wordLen; /* set global variable for key length - used by RSA_SignatureToPlaintextFast() */
    /* Copy operands from normal memory into CASPER-visible RAM. */
    CASPER_MEMCPY_N2I(PK_LOC, (const uint32_t *)(uintptr_t)pubN, N_bytelen);
    CASPER_MEMCPY_N2I(SIG_LOC, (const uint32_t *)(uintptr_t)signature, N_bytelen);
    /* plaintext = signature^pubE mod pubN; result lands at the start of msg_ret. */
    (void)RSA_SignatureToPlaintextFast((const unsigned *)(uintptr_t)(SIG_LOC), pubE,
                                       (const unsigned *)(uintptr_t)(PK_LOC), (unsigned int *)(uintptr_t)msg_ret);

    /* Copy the result back out of CASPER RAM. */
    CASPER_MEMCPY_I2N((uint32_t *)(uintptr_t)plaintext, msg_ret, N_bytelen);
}
|
|
|
|
/*!
|
|
* brief Enables clock and disables reset for CASPER peripheral.
|
|
*
|
|
* Enable clock and disable reset for CASPER.
|
|
*
|
|
* param base CASPER base address
|
|
*/
|
|
/* Enable clock / release reset for CASPER and set up the driver's view of
 * CASPER RAM. Must be called before any other CASPER API. */
void CASPER_Init(CASPER_Type *base)
{
#if !(defined(FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL) && FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL)
#if defined(CASPER_CLOCKS)
    CLOCK_EnableClock(kCLOCK_Casper);
#endif
#endif /* FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL */
#if defined(CASPER_RSTS)
    RESET_PeripheralReset(kCASPER_RST_SHIFT_RSTn);
#endif
#if defined(FSL_FEATURE_CASPER_RAM_HW_INTERLEAVE) && (FSL_FEATURE_CASPER_RAM_HW_INTERLEAVE > 0)
    /* Enable hardware interleaving to RAMX0 and RAMX1 for CASPER */
    SYSCON->CASPER_CTRL = SYSCON_CASPER_CTRL_INTERLEAVE(1);
#endif /* FSL_FEATURE_CASPER_RAM_HW_INTERLEAVE */
    /* If Casper init is called with secure address, use secure addres also for accessing Casper RAM. */
    /* Bit 28 of the peripheral base address distinguishes the secure alias;
       it is OR'ed into the non-secure RAM base to form the working RAM address. */
    s_casperRamBase = (unsigned)CASPER_RAM_BASE_NS | ((uint32_t)base & 0x10000000u);
    msg_ret         = (uint32_t *)s_casperRamBase;
}
|
|
|
|
/*!
|
|
* brief Disables clock for CASPER peripheral.
|
|
*
|
|
* Disable clock and enable reset.
|
|
*
|
|
* param base CASPER base address
|
|
*/
|
|
/* Put CASPER back into reset and gate its clock (reverse of CASPER_Init). */
void CASPER_Deinit(CASPER_Type *base)
{
#if defined(CASPER_RSTS)
    /* Assert reset first so the block is quiescent before the clock is removed. */
    RESET_SetPeripheralReset(kCASPER_RST_SHIFT_RSTn);
#endif
#if !(defined(FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL) && FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL)
#if defined(CASPER_CLOCKS)
    CLOCK_DisableClock(kCLOCK_Casper);
#endif
#endif /* FSL_SDK_DISABLE_DRIVER_CLOCK_CONTROL */
}
|
|
|
|
/* New ECC code which uses Casper. */
|
|
|
|
/* Set the prime modulus mod in Casper memory.
|
|
*/
|
|
/* Select the ECC curve and load its prime modulus into CASPER memory.
 * Sets the global N_wordlen, copies the curve prime to CASPER_MEM at word
 * offset N_wordlen + 4, and zeroes the 4 guard words that follow it.
 *
 * Fix: the original left `mod` uninitialized when `curve` matched none of the
 * supported values and then dereferenced it (undefined behavior). Now an
 * unsupported curve leaves CASPER state untouched and returns early. */
void CASPER_ecc_init(casper_algo_t curve)
{
    uint32_t *mod = NULL;

    if (curve == kCASPER_ECC_P256)
    {
        N_wordlen = 256U / 32U;
        mod       = NISTp256;
    }
    else if (curve == kCASPER_ECC_P384)
    {
        N_wordlen = 384U / 32U;
        mod       = NISTp384;
    }
    else if (curve == kCASPER_ECC_P521)
    {
        N_wordlen = 576U / 32U;
        mod       = NISTp521;
    }
    else
    {
        /* Unsupported curve: do not touch CASPER memory. */
        return;
    }

    /* Place the modulus at word offset (N_wordlen + 4) in CASPER memory. */
    CASPER_MEMCPY(&CASPER_MEM[(N_wordlen + 4U)], mod, N_wordlen * sizeof(uint32_t));
    /* Clear the 4 guard words directly after the modulus. */
    uint8_t a[((CASPER_MAX_ECC_SIZE_WORDLEN + 4U) - CASPER_MAX_ECC_SIZE_WORDLEN) * sizeof(uint32_t)] = {0};
    CASPER_MEMCPY(&CASPER_MEM[(N_wordlen + 4U) + N_wordlen], a, ((N_wordlen + 4U) - N_wordlen) * sizeof(uint32_t));
}
|
|
|
|
void CASPER_ECC_equal(int *res, uint32_t *op1, uint32_t *op2)
|
|
{
|
|
uint32_t a[CASPER_MAX_ECC_SIZE_WORDLEN] = {0};
|
|
uint32_t b[CASPER_MAX_ECC_SIZE_WORDLEN] = {0};
|
|
uint32_t c = 0;
|
|
CASPER_MEMCPY(a, op1, N_wordlen * sizeof(uint32_t));
|
|
CASPER_MEMCPY(b, op2, N_wordlen * sizeof(uint32_t));
|
|
|
|
do
|
|
{
|
|
uint32_t _i;
|
|
c = (a[0] ^ b[0]);
|
|
for (_i = 1; _i < N_wordlen; _i++)
|
|
{
|
|
c |= (a[_i] ^ b[_i]);
|
|
}
|
|
} while (false);
|
|
|
|
*res = (int32_t)c;
|
|
}
|
|
|
|
void CASPER_ECC_equal_to_zero(int *res, uint32_t *op1)
|
|
{
|
|
uint32_t a[CASPER_MAX_ECC_SIZE_WORDLEN] = {0};
|
|
uint32_t c = 0;
|
|
CASPER_MEMCPY(a, op1, N_wordlen * sizeof(uint32_t));
|
|
|
|
do
|
|
{
|
|
uint32_t _i;
|
|
c = a[0];
|
|
for (_i = 1; _i < N_wordlen; _i++)
|
|
{
|
|
c |= a[_i];
|
|
}
|
|
} while (false);
|
|
|
|
*res = (int32_t)c;
|
|
}
|
|
|
|
/* P-256 scalar multiplication: (resX, resY) = scalar * (X, Y).
 * CASPER scratch memory is addressed in "slots" of (wordlen + 4) words
 * starting at word offset 2*(wordlen + 4); "slot k" below means
 * &CASPER_MEM[2*(wordlen+4) + k*(wordlen+4)].
 * Slot usage: 0/1 = input point (Montgomery), 6/7/8 = Jacobian X:Y:Z result,
 * 3/4 = affine result (Montgomery), 5/6 = final normal-form x/y. */
void CASPER_ECC_SECP256R1_Mul(
    CASPER_Type *base, uint32_t resX[8], uint32_t resY[8], uint32_t X[8], uint32_t Y[8], uint32_t scalar[8])
{
    uint32_t X1[8] = {0};
    uint32_t Y1[8] = {0};
    /* Convert the affine input point into the Montgomery domain. */
    toMontgomery_ECC_P256(X1, X);
    toMontgomery_ECC_P256(Y1, Y);

    /* Load the Montgomery-form point into slots 0 (X) and 1 (Y). */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        X1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        Y1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));

    /* Jacobian scalar multiplication: slots 6/7/8 = scalar * (slot0, slot1). */
    Jac_scalar_multiplication(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 7U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 8U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        scalar, NISTp256, NISTp256_q);

    /* Convert the Jacobian result (slots 6/7/8) to affine form in slots 3/4. */
    Jac_toAffine(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 7U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 8U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* Montgomery to Normal */
    /* X_normal = 1 * X_montgomery; Y_normal = 1 * Y_montgomery */
    uint32_t one[(kCASPER_ECC_P256_wordlen + 4U)] = {0x0};
    one[0] = 0x1u;
    /* Load the constant 1 (including the 4 guard words) into slot 0. */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        one, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    /* slot5 = slot3 * 1, slot6 = slot4 * 1 -- strips the Montgomery factor. */
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* copy out to result */
    CASPER_MEMCPY(
        resX,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        resY,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
}
|
|
|
|
/* P-256 double scalar multiplication:
 * (resX, resY) = scalar1 * (X1, Y1) + scalar2 * (X2, Y2).
 * "slot k" below means &CASPER_MEM[2*(wordlen+4) + k*(wordlen+4)].
 * Slots 0/1 = point 1, 2/3 = point 2 (all Montgomery form);
 * slots 4/5/6 = Jacobian accumulator; the affine result is built in a
 * separate region at word offset 20*wordlen + 80. */
void CASPER_ECC_SECP256R1_MulAdd(CASPER_Type *base,
                                 uint32_t resX[8],
                                 uint32_t resY[8],
                                 uint32_t X1[8],
                                 uint32_t Y1[8],
                                 uint32_t scalar1[8],
                                 uint32_t X2[8],
                                 uint32_t Y2[8],
                                 uint32_t scalar2[8])
{
    uint32_t zeroes[(kCASPER_ECC_P256_wordlen + 4U)] = {0};

    /* Load both input points into slots 0/1 (P1) and 2/3 (P2). */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        X1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        Y1, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));

    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        X2, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        Y2, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));

    /* Convert all four coordinates to the Montgomery domain in place. */
    toMontgomery_ECC_P256(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    toMontgomery_ECC_P256(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    toMontgomery_ECC_P256(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    toMontgomery_ECC_P256(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* Clear the Jacobian accumulator slots 4/5/6 (including guard words). */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    /* slots 4/5/6 (Jacobian) = scalar1*(slot0,slot1) + scalar2*(slot2,slot3). */
    double_scalar_multiplication(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        scalar1,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        scalar2);

    /* Affine conversion into the high scratch region at 20*wordlen + 80. */
    Jac_toAffine(
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* Montgomery to normal: multiply each coordinate by the constant 1. */
    uint32_t one[(kCASPER_ECC_P256_wordlen + 4U)] = {0x0};
    one[0] = 0x1u;
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        one, ((uint32_t)kCASPER_ECC_P256_wordlen + 4U) * sizeof(uint32_t));
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P256_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]);

    /* Copy the normal-form x/y (slots 1 and 2) out to the caller. */
    CASPER_MEMCPY(resX,
                  (&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                               1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]),
                  (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(resY,
                  (&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                               2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)]),
                  (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
}
|
|
|
|
/* P-384 scalar multiplication: (resX, resY) = scalar * (X, Y).
 * "slot k" below means &CASPER_MEM[2*(wordlen+4) + k*(wordlen+4)].
 * Slot usage mirrors the P-256 variant: 0/1 = input point (Montgomery),
 * 6/7/8 = Jacobian result, 3/4 = affine (Montgomery), 5/6 = final x/y. */
void CASPER_ECC_SECP384R1_Mul(
    CASPER_Type *base, uint32_t resX[12], uint32_t resY[12], uint32_t X[12], uint32_t Y[12], uint32_t scalar[12])
{
    uint32_t X1[12] = {0};
    uint32_t Y1[12] = {0};
    /* Convert the affine input point into the Montgomery domain. */
    toMontgomery_ECC_P384(X1, X);
    toMontgomery_ECC_P384(Y1, Y);

    /* Load the Montgomery-form point into slots 0 (X) and 1 (Y). */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        X1, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        Y1, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));

    /* Jacobian scalar multiplication: slots 6/7/8 = scalar * (slot0, slot1). */
    Jac_scalar_multiplication(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 7U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 8U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        scalar, NISTp384, NISTp384_q);

    /* Convert the Jacobian result (slots 6/7/8) to affine form in slots 3/4. */
    Jac_toAffine(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 7U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 8U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);

    /* Montgomery to Normal */
    /* X_normal = 1 * X_montgomery; Y_normal = 1 * Y_montgomery */
    uint32_t one[12] = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
    /* NOTE(review): unlike the P-256 variant, only wordlen words of `one` are
       copied, leaving the 4 guard words of slot 0 unwritten -- presumably they
       are already zero at this point; confirm against the memory layout. */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        one, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    /* slot5 = slot3 * 1, slot6 = slot4 * 1 -- strips the Montgomery factor. */
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);

    /* copy out to result */
    CASPER_MEMCPY(
        resX,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        resY,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
}
|
|
|
|
/* P-384 double scalar multiplication:
 * (resX, resY) = scalar1 * (X1, Y1) + scalar2 * (X2, Y2).
 * "slot k" below means &CASPER_MEM[2*(wordlen+4) + k*(wordlen+4)].
 * Slots 0/1 = point 1, 2/3 = point 2 (all Montgomery form);
 * slots 4/5/6 = Jacobian accumulator; affine result at 20*wordlen + 80.
 * NOTE(review): unlike the P-256 variant, slots 4/5/6 are not pre-zeroed
 * here -- confirm double_scalar_multiplication fully writes them. */
void CASPER_ECC_SECP384R1_MulAdd(CASPER_Type *base,
                                 uint32_t resX[12],
                                 uint32_t resY[12],
                                 uint32_t X1[12],
                                 uint32_t Y1[12],
                                 uint32_t scalar1[12],
                                 uint32_t X2[12],
                                 uint32_t Y2[12],
                                 uint32_t scalar2[12])
{
    /* Load both input points into slots 0/1 (P1) and 2/3 (P2). */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        X1, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        Y1, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));

    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        X2, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        Y2, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));

    /* Convert all four coordinates to the Montgomery domain in place. */
    toMontgomery_ECC_P384(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);
    toMontgomery_ECC_P384(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);
    toMontgomery_ECC_P384(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);
    toMontgomery_ECC_P384(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);

    /* slots 4/5/6 (Jacobian) = scalar1*(slot0,slot1) + scalar2*(slot2,slot3). */
    double_scalar_multiplication(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        scalar1,
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        scalar2);

    /* Affine conversion into the high scratch region at 20*wordlen + 80. */
    Jac_toAffine(
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);

    /* Montgomery to normal: multiply each coordinate by the constant 1. */
    uint32_t one[12] = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        one, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);
    multiply_casper(
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)],
        &CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]);

    /* Copy the normal-form x/y (slots 1 and 2) out to the caller. */
    CASPER_MEMCPY(resX,
                  (&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) +
                               1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]),
                  (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(resY,
                  (&CASPER_MEM[(2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) +
                               2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)]),
                  (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
}
|
|
|
|
/* P-521 scalar multiplication: (resX, resY) = scalar * (X, Y).
 * The base offset is written as (wordlen+4) + 1*(wordlen+4), i.e. the same
 * 2*(wordlen+4) origin as the other curves; "slot k" below means
 * &CASPER_MEM[2*(wordlen+4) + k*(wordlen+4)].
 * Slot usage mirrors the P-256 variant: 0/1 = input point (Montgomery),
 * 6/7/8 = Jacobian result, 3/4 = affine (Montgomery), 5/6 = final x/y. */
void CASPER_ECC_SECP521R1_Mul(
    CASPER_Type *base, uint32_t resX[18], uint32_t resY[18], uint32_t X[18], uint32_t Y[18], uint32_t scalar[18])
{
    uint32_t X1[18] = {0};
    uint32_t Y1[18] = {0};
    /* Convert the affine input point into the Montgomery domain. */
    toMontgomery_ECC_P521(X1, X);
    toMontgomery_ECC_P521(Y1, Y);

    /* Load the Montgomery-form point into slots 0 (X) and 1 (Y). */
    CASPER_MEMCPY(
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        X1, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        Y1, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));

    /* Jacobian scalar multiplication: slots 6/7/8 = scalar * (slot0, slot1). */
    Jac_scalar_multiplication(
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    7U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    8U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        scalar, NISTp521, NISTp521_q);

    /* Convert the Jacobian result (slots 6/7/8) to affine form in slots 3/4. */
    Jac_toAffine(
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    7U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    8U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);

    /* Montgomery to Normal */
    /* X_normal = 1 * X_montgomery; Y_normal = 1 * Y_montgomery */
    uint32_t one[18] = {0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
    /* NOTE(review): as in the P-384 variant, only wordlen words of `one` are
       copied (guard words of slot 0 are not rewritten here). */
    CASPER_MEMCPY(
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        one, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    /* slot5 = slot3 * 1, slot6 = slot4 * 1 -- strips the Montgomery factor. */
    multiply_casper(
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);
    multiply_casper(
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);

    /* copy out to result */
    CASPER_MEMCPY(
        resX,
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        resY,
        &CASPER_MEM[(((uint32_t)kCASPER_ECC_P521_wordlen + 4U) + (1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U))) +
                    6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
}
|
|
|
|
/* Compute (resX, resY) = scalar1 * (X1, Y1) + scalar2 * (X2, Y2) on NIST P-521.
 *
 * All coordinate and scalar buffers hold kCASPER_ECC_P521_wordlen (18) words.
 * Flow: stage both points in CASPER work RAM -> convert to Montgomery domain
 * -> double_scalar_multiplication() produces a Jacobian-coordinate result ->
 * convert back to affine -> leave the Montgomery domain by multiplying with 1
 * -> copy the affine result out.
 *
 * base: CASPER peripheral base address (not referenced by this implementation).
 */
void CASPER_ECC_SECP521R1_MulAdd(CASPER_Type *base,
                                 uint32_t resX[18],
                                 uint32_t resY[18],
                                 uint32_t X1[18],
                                 uint32_t Y1[18],
                                 uint32_t scalar1[18],
                                 uint32_t X2[18],
                                 uint32_t Y2[18],
                                 uint32_t scalar2[18])
{
    uint32_t zeroes[(kCASPER_ECC_P521_wordlen + 4U)] = {0};

    /* Stage P1 = (X1, Y1) into work-RAM slots 0 and 1 (each slot is wordlen + 4 words). */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        X1, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        Y1, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));

    /* Stage P2 = (X2, Y2) into slots 2 and 3. */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        X2, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        Y2, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));

    /* Convert all four staged coordinates to the Montgomery domain, in place. */
    toMontgomery_ECC_P521(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);
    toMontgomery_ECC_P521(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);
    toMontgomery_ECC_P521(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);
    toMontgomery_ECC_P521(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);

    /* Clear slots 4..6, which receive the Jacobian result (X : Y : Z). */
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P521_wordlen + 4U) * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P521_wordlen + 4U) * sizeof(uint32_t));
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        zeroes, ((uint32_t)kCASPER_ECC_P521_wordlen + 4U) * sizeof(uint32_t));
    /* (slot4 : slot5 : slot6) = scalar1 * P1 + scalar2 * P2, in Jacobian form. */
    double_scalar_multiplication(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        scalar1,
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        scalar2);

    /* Jacobian -> affine; the affine result lands in a separate work area. */
    Jac_toAffine(
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 6U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);

    /* Leave the Montgomery domain: X_normal = 1 * X_montgomery (likewise for Y). */
    uint32_t one[(kCASPER_ECC_P521_wordlen + 4U)] = {0x0};
    one[0] = 0x1u;
    CASPER_MEMCPY(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        one, ((uint32_t)kCASPER_ECC_P521_wordlen + 4U) * sizeof(uint32_t));
    multiply_casper(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);
    multiply_casper(
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) +
                    2U * ((uint32_t)(uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)],
        &CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]);

    /* Copy the normalized affine coordinates out to the caller. */
    CASPER_MEMCPY(
        resX,
        (&CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]),
        (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(
        resY,
        (&CASPER_MEM[(2U * (uint32_t)kCASPER_ECC_P521_wordlen + 8U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)]),
        (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
}
|
|
|
|
// CIOS Multiply. This is the Coarse Integrated form where the values are
// multiplied and reduced for each step of "i". This uses less memory and
// is faster as a result. Note that this is used to square as well as mul,
// so not as fast as pure squaring could be.
/* Constant-time Montgomery multiplication on the CASPER accelerator:
 * w_out = a * b * R^-1 mod Nmod.
 *  - a == NULL means "reduce only": w_out already holds the value to reduce.
 *  - Np points to the 64-bit constant used to derive each reduction
 *    multiplier m (m = w[0] * Np64 mod 2^64, see SET_DWORD below).
 * The final conditional subtraction of Nmod is done with a constant-time
 * select (casper_select) rather than a data-dependent branch.
 */
static void MultprecCiosMul_ct(
    uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np)
{
    uint32_t j;
    uint64_t *m64 = (uint64_t *)(uintptr_t)&msg_ret[kCASPER_RamOffset_M64]; /* scratch for the multiplier m */
    uint64_t Np64;
    uint64_t carry;
    uint64_t *a64, *b64, *w64, *N64;
    uint32_t *T1 = &CASPER_MEM[0], borrow;

    Np64 = *(uint64_t *)(uintptr_t)Np;

    /* Reinterpret the operands as 64-bit limbs for the accelerator. */
    a64 = (uint64_t *)(uintptr_t)a;
    b64 = (uint64_t *)(uintptr_t)b;
    w64 = (uint64_t *)(uintptr_t)w_out;
    N64 = (uint64_t *)(uintptr_t)Nmod;

    if (a != NULL)
    { /* if !a, we are reducing only */
        PreZeroW(j, w_out);
    }
    SET_DWORD(&w64[N_dwordlen], 0ULL);
    SET_DWORD(&w64[N_dwordlen + 1U], 0ULL);
    /* with accelerator */

    /* loop j and then reduce after each j round */
    for (j = 0; j < N_dwordlen; j++)
    {
        /* Step 3. Iterate over N words of u using j - perform Multiply-accumulate */
        /* push-pull: we do a*b and then separately m*n (reduce) */
        if (a != NULL)
        { /* if mul&reduce vs. reduce only */
            carry = GET_DWORD(&w64[N_dwordlen]);
            Accel_SetABCD_Addr(CA_MK_OFF(&b64[j]), CA_MK_OFF(a64));
            Accel_crypto_mul(
                Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(w64)));
            Accel_done();
            /* max carry is contained since ~0*~0=0xFFFE0001+0xFFFF=0xFFFF0000, */
            /* so max carry is 0xFFFF and 0xFFFF0000+0xFFFF=0xFFFFFFFF */
            /* accel took care of w_out[N_wordlen] & +1, so we just take care of the next double word if carry=1 */
            /* w64[N_dwordlen+1] = g_carry; */
            carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);
            SET_DWORD(&w64[N_dwordlen + 1U], carry);
        }
        SET_DWORD(&m64[0], GET_DWORD(&w64[0]) * Np64); /* prime for 1st; modulo a double-word */

        /* we are reducing, so the 1st [0th] 64 bit value product is tossed, but we */
        /* need its carry. We let the accel do this separately - really need a mode to */
        /* do this "reduce" since it is natural */
        carry = GET_DWORD(&w64[N_dwordlen]);
        Accel_SetABCD_Addr(CA_MK_OFF(m64), CA_MK_OFF(&N64[0]));
        Accel_crypto_mul(
            Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(&w64[0])));
        Accel_done();
        carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);

        /* Shift w right by one 64-bit limb: copy w64[1..] down onto w64[0..]. */
        Accel_SetABCD_Addr(CA_MK_OFF(&w64[1]), 0);
        Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpCopy, CA_MK_OFF(&w64[0])));

        Accel_done();
        SET_DWORD(&w64[N_dwordlen], (GET_DWORD(&w64[N_dwordlen + 1U]) + carry));
    }

    /* now check if need to subtract Nmod */
    CASPER_MEMCPY_I2I(T1, w_out, (N_wordlen + 1U) * sizeof(uint32_t));

    /* Compute w = w - N */
    Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(w_out)));
    Accel_done();

    // if w_out > T1 then there was a borrow
    borrow = (uint32_t)(GET_WORD(&((uint32_t *)w_out)[N_wordlen]) > GET_WORD(&T1[N_wordlen]));

    /* Constant-time select: keep w - N, unless the subtraction borrowed (then keep T1 = w). */
    SET_WORD(&w_out[N_wordlen + 1U], 0);
    SET_WORD(&w_out[N_wordlen], 0);
    casper_select(w_out, w_out, T1, (int32_t)borrow, (int16_t)(uint16_t)N_wordlen);
}
|
|
|
|
/* Compute C = (A - B) mod "mod", in constant time.
 * Assumes all operands have two extra limbs to store carry.
 * Uses CASPER_MEM[0..] as scratch for the raw difference.
 */
static void CASPER_montsub(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod)
{
    uint64_t *b64, *c64, *m64, *tmp;
    int borrow;

    b64 = (uint64_t *)(uintptr_t)B;
    c64 = (uint64_t *)(uintptr_t)C;
    m64 = (uint64_t *)(uintptr_t)mod;

    tmp = (uint64_t *)(uintptr_t)&CASPER_MEM[0];

    CASPER_MEMCPY(tmp, A, N_wordlen * sizeof(uint32_t));

    /* Compute tmp = A - B. */
    Accel_SetABCD_Addr(CA_MK_OFF(b64), 0);

    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(tmp)));
    Accel_done();

    /* Borrow occurred iff the top word of the result exceeds the top word of A. */
    borrow = (int32_t)((GET_WORD(&((uint32_t *)(uintptr_t)tmp)[N_wordlen - 1U])) > GET_WORD(&A[N_wordlen - 1U]));
    CASPER_MEMCPY(c64, tmp, N_wordlen * sizeof(uint32_t));

    /* Compute C = mod + tmp (the corrected value, used when A - B wrapped below zero). */
    Accel_SetABCD_Addr(CA_MK_OFF(m64), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U - 1U, (uint32_t)kCASPER_OpAdd64, CA_MK_OFF(c64)));
    Accel_done();

    /* Constant-time select: C = borrow ? (mod + tmp) : tmp. */
    casper_select(C, (uint32_t *)(uintptr_t)tmp, C, borrow, (int16_t)(uint16_t)N_wordlen);
}
|
|
|
|
/* Compute C = (A + B) mod "mod", in constant time.
 * Assumes all operands have two extra limbs to store carry.
 * Note: clears the extra (carry) double-word of B and mod in place.
 */
static void CASPER_montadd(uint32_t *C, uint32_t *A, uint32_t *B, uint32_t *mod)
{
    uint64_t *b64, *c64, *m64, *tmp;
    int borrow;

    b64 = (uint64_t *)(uintptr_t)B;
    c64 = (uint64_t *)(uintptr_t)C;
    m64 = (uint64_t *)(uintptr_t)mod;

    tmp = (uint64_t *)(uintptr_t)&CASPER_MEM[0];

    CASPER_MEMCPY(tmp, A, N_wordlen * sizeof(uint32_t));
    /* Zero the extra carry double-word of each operand before the full-width add. */
    SET_DWORD(&tmp[N_wordlen / 2U], 0ULL);
    SET_DWORD(&b64[N_wordlen / 2U], 0ULL);
    SET_DWORD(&m64[N_wordlen / 2U], 0ULL);

    /* Compute tmp = A + B using one additonal double-length limb. */
    Accel_SetABCD_Addr(CA_MK_OFF(b64), 0);

    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U, (uint32_t)kCASPER_OpAdd64, CA_MK_OFF(tmp)));
    Accel_done();

    CASPER_MEMCPY(c64, tmp, (N_wordlen + 2U) * sizeof(uint32_t));

    /* Compute C = tmp - mod (the candidate reduced sum). */
    Accel_SetABCD_Addr(CA_MK_OFF(m64), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_wordlen / 2U, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(c64)));
    Accel_done();

    // borrow = g_carry;
    /* If the subtraction borrowed, the sum was already < mod: keep tmp, else keep C. */
    borrow = (int32_t)(GET_WORD(&C[N_wordlen]) > GET_WORD(&(((uint32_t *)(uintptr_t)tmp)[N_wordlen])));
    casper_select(C, C, (uint32_t *)(uintptr_t)tmp, borrow, (int16_t)(uint16_t)N_wordlen);
}
|
|
|
|
/* Compute c = a/2 mod p, where b is scratch space.
 * b receives a >> 1 and c receives (a + p) >> 1; a constant-time select then
 * picks (a+p)/2 when a is odd and a/2 when a is even.
 * The modulus is read from CASPER_MEM[N_wordlen + 4] (the same slot that
 * multiply_casper() passes as Nmod).
 */
static void CASPER_half(uint32_t *c, uint32_t *a, uint32_t *b)
{
    shiftright(b, a, 1U); /* Compute a/2 and (a+p)/2 */

    /* Compute tmp = a + p using one additonal double-length limb. */
    CASPER_MEMCPY(c, a, N_wordlen * sizeof(uint32_t));
    SET_WORD(&c[N_wordlen], 0);
    SET_WORD(&c[N_wordlen + 1U], 0U);

    Accel_SetABCD_Addr(CA_MK_OFF(((uint64_t *)(uintptr_t)&CASPER_MEM[(N_wordlen + 4U)])), 0);
    Accel_crypto_mul(
        Accel_IterOpcodeResaddr(N_wordlen / 2U, (uint32_t)kCASPER_OpAdd64, CA_MK_OFF(((uint64_t *)(uintptr_t)c))));
    Accel_done();

    /* c = (a + p) >> 1, folding the carry word's low bit into the top limb. */
    shiftright(c, c, 1U);
    SET_WORD(&c[N_wordlen - 1U], GET_WORD(&c[N_wordlen - 1U]) | (GET_WORD(&c[N_wordlen]) << 31));
    SET_WORD(&c[N_wordlen], 0U);
    /* Odd a -> take (a+p)/2 (in c); even a -> take a/2 (in b). */
    casper_select(c, b, c, (int32_t)(uint32_t)(GET_WORD(&a[0]) & 1U), (int16_t)(uint16_t)(N_wordlen));
}
|
|
|
|
/* Read one 32-bit word from CASPER (interleaved) memory.
 * Thin function wrapper around the GET_WORD() macro.
 */
static uint32_t casper_get_word(uint32_t *addr)
{
    return GET_WORD(addr);
}
|
|
|
|
/* Shift right by 1 <= c <= 31. z[] and x[] in system RAM, no interleaving macros used. */
|
|
static void shiftrightSysram(uint32_t *z, uint32_t *x, uint32_t c)
|
|
{
|
|
z[0] = (x[1] << (32U - (c))) | (x[0] >> (c));
|
|
z[1] = (x[2] << (32U - (c))) | (x[1] >> (c));
|
|
z[2] = (x[3] << (32U - (c))) | (x[2] >> (c));
|
|
z[3] = (x[4] << (32U - (c))) | (x[3] >> (c));
|
|
z[4] = (x[5] << (32U - (c))) | (x[4] >> (c));
|
|
z[5] = (x[6] << (32U - (c))) | (x[5] >> (c));
|
|
z[6] = (x[7] << (32U - (c))) | (x[6] >> (c));
|
|
|
|
if (N_wordlen == 18U)
|
|
{
|
|
z[7] = (x[8] << (32U - (c))) | (x[7] >> (c));
|
|
z[8] = (x[9] << (32U - (c))) | (x[8] >> (c));
|
|
z[9] = (x[10] << (32U - (c))) | (x[9] >> (c));
|
|
z[10] = (x[11] << (32U - (c))) | (x[10] >> (c));
|
|
z[11] = (x[12] << (32U - (c))) | (x[11] >> (c));
|
|
z[12] = (x[13] << (32U - (c))) | (x[12] >> (c));
|
|
z[13] = (x[14] << (32U - (c))) | (x[13] >> (c));
|
|
z[14] = (x[15] << (32U - (c))) | (x[14] >> (c));
|
|
z[15] = (x[16] << (32U - (c))) | (x[15] >> (c));
|
|
z[16] = (x[17] << (32U - (c))) | (x[16] >> (c));
|
|
z[17] = (x[17] >> (c));
|
|
}
|
|
|
|
if (N_wordlen == 12U)
|
|
{
|
|
z[7] = (x[8] << (32U - (c))) | (x[7] >> (c));
|
|
z[8] = (x[9] << (32U - (c))) | (x[8] >> (c));
|
|
z[9] = (x[10] << (32U - (c))) | (x[9] >> (c));
|
|
z[10] = (x[11] << (32U - (c))) | (x[10] >> (c));
|
|
z[11] = (x[11] >> (c));
|
|
}
|
|
if (N_wordlen == 8U)
|
|
{
|
|
z[7] = (x[7] >> (c));
|
|
}
|
|
}
|
|
/* Shift right by 1 <= c <= 31. */
|
|
static void shiftright(uint32_t *z, uint32_t *x, uint32_t c)
|
|
{
|
|
SET_WORD(&z[0], (GET_WORD(&x[1]) << (32U - (c))) | (GET_WORD(&x[0]) >> (c)));
|
|
SET_WORD(&z[1], (GET_WORD(&x[2]) << (32U - (c))) | (GET_WORD(&x[1]) >> (c)));
|
|
SET_WORD(&z[2], (GET_WORD(&x[3]) << (32U - (c))) | (GET_WORD(&x[2]) >> (c)));
|
|
SET_WORD(&z[3], (GET_WORD(&x[4]) << (32U - (c))) | (GET_WORD(&x[3]) >> (c)));
|
|
SET_WORD(&z[4], (GET_WORD(&x[5]) << (32U - (c))) | (GET_WORD(&x[4]) >> (c)));
|
|
SET_WORD(&z[5], (GET_WORD(&x[6]) << (32U - (c))) | (GET_WORD(&x[5]) >> (c)));
|
|
SET_WORD(&z[6], (GET_WORD(&x[7]) << (32U - (c))) | (GET_WORD(&x[6]) >> (c)));
|
|
|
|
if (N_wordlen == 18U)
|
|
{
|
|
SET_WORD(&z[7], (GET_WORD(&x[8]) << (32U - (c))) | (GET_WORD(&x[7]) >> (c)));
|
|
SET_WORD(&z[8], (GET_WORD(&x[9]) << (32U - (c))) | (GET_WORD(&x[8]) >> (c)));
|
|
SET_WORD(&z[9], (GET_WORD(&x[10]) << (32U - (c))) | (GET_WORD(&x[9]) >> (c)));
|
|
SET_WORD(&z[10], (GET_WORD(&x[11]) << (32U - (c))) | (GET_WORD(&x[10]) >> (c)));
|
|
SET_WORD(&z[11], (GET_WORD(&x[12]) << (32U - (c))) | (GET_WORD(&x[11]) >> (c)));
|
|
SET_WORD(&z[12], (GET_WORD(&x[13]) << (32U - (c))) | (GET_WORD(&x[12]) >> (c)));
|
|
SET_WORD(&z[13], (GET_WORD(&x[14]) << (32U - (c))) | (GET_WORD(&x[13]) >> (c)));
|
|
SET_WORD(&z[14], (GET_WORD(&x[15]) << (32U - (c))) | (GET_WORD(&x[14]) >> (c)));
|
|
SET_WORD(&z[15], (GET_WORD(&x[16]) << (32U - (c))) | (GET_WORD(&x[15]) >> (c)));
|
|
SET_WORD(&z[16], (GET_WORD(&x[17]) << (32U - (c))) | (GET_WORD(&x[16]) >> (c)));
|
|
SET_WORD(&z[17], (GET_WORD(&x[17]) >> (c)));
|
|
}
|
|
if (N_wordlen == 12U)
|
|
{
|
|
SET_WORD(&z[7], (GET_WORD(&x[8]) << (32U - (c))) | (GET_WORD(&x[7]) >> (c)));
|
|
SET_WORD(&z[8], (GET_WORD(&x[9]) << (32U - (c))) | (GET_WORD(&x[8]) >> (c)));
|
|
SET_WORD(&z[9], (GET_WORD(&x[10]) << (32U - (c))) | (GET_WORD(&x[9]) >> (c)));
|
|
SET_WORD(&z[10], (GET_WORD(&x[11]) << (32U - (c))) | (GET_WORD(&x[10]) >> (c)));
|
|
SET_WORD(&z[11], (GET_WORD(&x[11]) >> (c)));
|
|
}
|
|
if (N_wordlen == 8U)
|
|
{
|
|
SET_WORD((&z[7]), (GET_WORD(&x[7]) >> (c)));
|
|
}
|
|
}
|
|
/* Shift left by 1 <= c <= 31.
 * NOTE(review): the carried-in low bits of each word are read from z[] (not
 * x[]), and words are written in DESCENDING order. The result is therefore
 * only correct when the shift is in-place (z == x) or z already holds a copy
 * of x — confirm against callers before reusing with distinct buffers.
 */
static void shiftleft(uint32_t *z, uint32_t *x, uint32_t c)
{
    if (N_wordlen == 18U)
    {
        SET_WORD(&z[17], (GET_WORD(&x[17]) << (c)) | GET_WORD(&z[16]) >> (32U - (c)));
        SET_WORD(&z[16], (GET_WORD(&x[16]) << (c)) | GET_WORD(&z[15]) >> (32U - (c)));
        SET_WORD(&z[15], (GET_WORD(&x[15]) << (c)) | GET_WORD(&z[14]) >> (32U - (c)));
        SET_WORD(&z[14], (GET_WORD(&x[14]) << (c)) | GET_WORD(&z[13]) >> (32U - (c)));
        SET_WORD(&z[13], (GET_WORD(&x[13]) << (c)) | GET_WORD(&z[12]) >> (32U - (c)));
        SET_WORD(&z[12], (GET_WORD(&x[12]) << (c)) | GET_WORD(&z[11]) >> (32U - (c)));
        SET_WORD(&z[11], (GET_WORD(&x[11]) << (c)) | GET_WORD(&z[10]) >> (32U - (c)));
        SET_WORD(&z[10], (GET_WORD(&x[10]) << (c)) | GET_WORD(&z[9]) >> (32U - (c)));
        SET_WORD(&z[9], (GET_WORD(&x[9]) << (c)) | GET_WORD(&z[8]) >> (32U - (c)));
        SET_WORD(&z[8], (GET_WORD(&x[8]) << (c)) | GET_WORD(&z[7]) >> (32U - (c)));
    }
    if (N_wordlen == 12U)
    {
        SET_WORD(&z[11], (GET_WORD(&x[11]) << (c)) | GET_WORD(&z[10]) >> (32U - (c)));
        SET_WORD(&z[10], (GET_WORD(&x[10]) << (c)) | GET_WORD(&z[9]) >> (32U - (c)));
        SET_WORD(&z[9], (GET_WORD(&x[9]) << (c)) | GET_WORD(&z[8]) >> (32U - (c)));
        SET_WORD(&z[8], (GET_WORD(&x[8]) << (c)) | GET_WORD(&z[7]) >> (32U - (c)));
    }
    /* Common low words for all supported lengths (8, 12, 18). */
    SET_WORD(&z[7], (GET_WORD(&x[7]) << (c)) | GET_WORD(&z[6]) >> (32U - (c)));
    SET_WORD(&z[6], (GET_WORD(&x[6]) << (c)) | GET_WORD(&z[5]) >> (32U - (c)));
    SET_WORD(&z[5], (GET_WORD(&x[5]) << (c)) | GET_WORD(&z[4]) >> (32U - (c)));
    SET_WORD(&z[4], (GET_WORD(&x[4]) << (c)) | GET_WORD(&z[3]) >> (32U - (c)));
    SET_WORD(&z[3], (GET_WORD(&x[3]) << (c)) | GET_WORD(&z[2]) >> (32U - (c)));
    SET_WORD(&z[2], (GET_WORD(&x[2]) << (c)) | GET_WORD(&z[1]) >> (32U - (c)));
    SET_WORD(&z[1], (GET_WORD(&x[1]) << (c)) | GET_WORD(&z[0]) >> (32U - (c)));
    SET_WORD(&z[0], (GET_WORD(&x[0]) << (c)));
}
|
|
|
|
/* Montgomery multiplication dispatcher: w_out = a * b * R^-1 mod N.
 * Picks the per-curve Np constant and the reduction routine that matches the
 * active word length. The modulus itself lives at CASPER_MEM[N_wordlen + 4].
 */
static void multiply_casper(uint32_t w_out[], const uint32_t a[], const uint32_t b[])
{
    uint32_t *modulus = &CASPER_MEM[(N_wordlen + 4U)];

    if (N_wordlen == 8U)
    {
        /* P-256 */
        MultprecCiosMul_ct(w_out, a, b, modulus, Np256);
    }
    else if (N_wordlen == 12U)
    {
        /* P-384 */
        MultprecCiosMul_ct(w_out, a, b, modulus, Np384);
    }
    else if (N_wordlen == 18U)
    {
        /* P-521 uses its dedicated reduction routine. */
        MultprecCiosMul521_ct(w_out, a, b, modulus, Np521);
    }
    else
    {
        /* Unsupported word length: no operation (matches original behavior). */
    }
}
|
|
/* Convert a projective point (X1 : Y1 : Z1)
|
|
* to the affine point (X3, Y3) = (X1/Z1^2,Y1/Z1^3)
|
|
* The memory of (X3, Y3) and (X1 : Y1 : Z1) should not overlap
|
|
*/
|
|
void Jac_toAffine(uint32_t *X3, uint32_t *Y3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1)
|
|
{
|
|
uint32_t *T1, *T2;
|
|
|
|
T1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
|
|
T2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];
|
|
|
|
square_casper(T1, Z1); // Z^2
|
|
multiply_casper(T2, T1, Z1); // Z^3
|
|
|
|
// Montgomery inverse
|
|
if (N_wordlen == 8U)
|
|
{
|
|
invert_mod_p256(T1, T2);
|
|
}
|
|
|
|
if (N_wordlen == 12U)
|
|
{
|
|
invert_mod_p384(T1, T2);
|
|
}
|
|
|
|
if (N_wordlen == 18U)
|
|
{
|
|
invert_mod_p521(T1, T2);
|
|
}
|
|
|
|
multiply_casper(Y3, Y1, T1); // Y3 = Y/Z^3
|
|
multiply_casper(T2, T1, Z1); // Z^-2
|
|
multiply_casper(X3, X1, T2); // X3 = X/Z^2
|
|
}
|
|
|
|
/* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2 : Y2 : Z2)
 * where (X1: Y1: Z1) != (X2 : Y2 : Z2)
 * (X3 : Y3: Z3) may be the same as one of the inputs.
 * Special cases handled below: either operand at infinity (Z == 0) returns
 * the other operand; equal points fall back to Jac_double(); inverse points
 * (same X, different Y) degenerate to Z3 = 0 because H becomes zero.
 */
void Jac_addition(uint32_t *X3,
                  uint32_t *Y3,
                  uint32_t *Z3,
                  uint32_t *X1,
                  uint32_t *Y1,
                  uint32_t *Z1,
                  uint32_t *X2,
                  uint32_t *Y2,
                  uint32_t *Z2)
{
    uint32_t *Z1Z1, *Z2Z2, *U1, *S1, *J, *H, *V, *t0, *t1;
    int m1, m2;

    /* Temporaries live in a fixed scratch region of CASPER work RAM. */
    Z1Z1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
    Z2Z2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];
    U1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 2U * (N_wordlen + 4U)];
    S1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 3U * (N_wordlen + 4U)];
    J = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 4U * (N_wordlen + 4U)];
    H = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 5U * (N_wordlen + 4U)];
    V = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 6U * (N_wordlen + 4U)];
    t0 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 7U * (N_wordlen + 4U)];
    t1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 8U * (N_wordlen + 4U)];

    /* Point-at-infinity handling: if one operand has Z == 0, return the other. */
    CASPER_ECC_equal_to_zero(&m1, Z1);
    CASPER_ECC_equal_to_zero(&m2, Z2);
    if (m1 == 0)
    {
        CASPER_MEMCPY(X3, X2, N_wordlen * 4U);
        CASPER_MEMCPY(Y3, Y2, N_wordlen * 4U);
        CASPER_MEMCPY(Z3, Z2, N_wordlen * 4U);
        return;
    }
    if (m2 == 0)
    {
        CASPER_MEMCPY(X3, X1, N_wordlen * 4U);
        CASPER_MEMCPY(Y3, Y1, N_wordlen * 4U);
        CASPER_MEMCPY(Z3, Z1, N_wordlen * 4U);
        return;
    }

    square_casper(Z1Z1, Z1);
    square_casper(Z2Z2, Z2);
    multiply_casper(U1, X1, Z2Z2);
    multiply_casper(H, X2, Z1Z1); /* if H equals U1 then X's are the same */
    multiply_casper(t0, Z2, Z2Z2);
    multiply_casper(S1, Y1, t0);
    multiply_casper(t0, Z1, Z1Z1);
    multiply_casper(J, Y2, t0); /* if (S1 == J) then Y's are the same */

    CASPER_ECC_equal(&m1, H, U1); /* If H and U1 match then the X-coordinates are the same. */
    CASPER_ECC_equal(&m2, S1, J); /* If S1 and J match then the Y-coordinates are the same. */
    if (m1 == 0)
    {
        if (m2 == 0)
        {
            /* Same point: the addition formula degenerates; use doubling. */
            Jac_double(X3, Y3, Z3, X1, Y1, Z1);
            return;
        }
        /* else {
              We work with the point at infinity.
              The Z-coordinate will be set to zero in this function.
           } */
    }

    /* Main Jacobian addition formulas (H = U2 - U1, r = 2*(S2 - S1), ...). */
    sub_casper(H, H, U1);
    mul2_casper(t0, H);
    square_casper(t1, t0);
    sub_casper(t0, J, S1);
    multiply_casper(J, H, t1);
    multiply_casper(V, U1, t1);
    mul2_casper(U1, t0);
    square_casper(t0, U1);
    mul2_casper(t1, V);
    sub_casper(t0, t0, J);
    sub_casper(X3, t0, t1);
    sub_casper(t0, V, X3);
    multiply_casper(t1, S1, J);
    mul2_casper(t1, t1);
    multiply_casper(V, U1, t0);
    sub_casper(Y3, V, t1);
    /* Z3 = ((Z1 + Z2)^2 - Z1^2 - Z2^2) * H = 2 * Z1 * Z2 * H. */
    add_casper(V, Z1, Z2);
    square_casper(t1, V);
    sub_casper(t1, t1, Z1Z1);
    sub_casper(t1, t1, Z2Z2);
    multiply_casper(Z3, t1, H);
}
|
|
|
|
/* Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X2, Y2) (mixed Jacobian + affine
 * addition), where (X1: Y1: Z1) != (X2, Y2).
 * (X3 : Y3: Z3) may not overlap with (X1: Y1: Z1).
 * Source: 2004 Hankerson-Menezes-Vanstone, "Guide to Elliptic Curve
 * Cryptography", page 91.
 */
void Jac_add_affine(
    uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1, uint32_t *X2, uint32_t *Y2)
{
    uint32_t *T1, *T2, *T3, *T4, *T5;
    uint32_t *ONE = NULL; /* per-curve NISTr* constant; presumably 1 in Montgomery form — used as Z of an affine point */
    int m1, m2;

    if (N_wordlen == 8U)
    {
        ONE = NISTr256;
    }
    if (N_wordlen == 12U)
    {
        ONE = NISTr384;
    }
    if (N_wordlen == 18U)
    {
        ONE = NISTr521;
    }

    /* Temporaries live in a fixed scratch region of CASPER work RAM. */
    T1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
    T2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];
    T3 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 2U * (N_wordlen + 4U)];
    T4 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 3U * (N_wordlen + 4U)];
    T5 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 4U * (N_wordlen + 4U)];

    /* If the Jacobian operand is the point at infinity, return (X2, Y2, 1). */
    CASPER_ECC_equal_to_zero(&m1, Z1);
    if (m1 == 0)
    {
        CASPER_MEMCPY(X3, X2, N_wordlen * 4U);
        CASPER_MEMCPY(Y3, Y2, N_wordlen * 4U);
        CASPER_MEMCPY(Z3, ONE, N_wordlen * 4U);
        return;
    }

    CASPER_MEMCPY(T5, Z1, N_wordlen * sizeof(uint32_t));
    square_casper(T3, Z1);       /* T3 = Z1^2 */
    multiply_casper(T2, T3, Z1); /* T2 = Z1^3 */
    multiply_casper(T4, T3, X2); /* T4 = X2 * Z1^2 (U2) */
    multiply_casper(T3, T2, Y2); /* T3 = Y2 * Z1^3 (S2) */

    CASPER_ECC_equal(&m1, T4, X1); /* X-coordinates equal? */
    CASPER_ECC_equal(&m2, T3, Y1); /* Y-coordinates equal? */
    if (m1 == 0)
    {
        if (m2 == 0)
        {
            /* Same point: use the doubling routine. */
            Jac_double(X3, Y3, Z3, X1, Y1, Z1);
            return;
        }
        /* else {
              We work with the point at infinity.
              The Z-coordinate will be set to zero in this function.
           } */
    }

    sub_casper(T1, T4, X1);      /* T1 = U2 - X1 (H) */
    sub_casper(T2, T3, Y1);      /* T2 = S2 - Y1 (r) */
    multiply_casper(Z3, T5, T1); /* Z3 = Z1 * H */
    square_casper(T3, T1);
    multiply_casper(T4, T3, T1);
    multiply_casper(T5, T3, X1);
    mul2_casper(T1, T5);
    square_casper(X3, T2);
    sub_casper(X3, X3, T1);
    sub_casper(X3, X3, T4);
    sub_casper(T3, T5, X3);
    multiply_casper(T1, T3, T2);
    multiply_casper(T2, T4, Y1);
    sub_casper(Y3, T1, T2);
}
|
|
|
|
static uint32_t casper_get_word(uint32_t *addr);
|
|
|
|
/* Point doubling from: 2004 Hankerson-Menezes-Vanstone, "Guide to Elliptic
 * Curve Cryptography", page 91.
 * Compute (X3 : Y3: Z3) = (X1: Y1: Z1) + (X1 : Y1 : Z1)
 * (X3 : Y3: Z3) may be the same as the input.
 * Uses the a = -3 shortcut of the NIST curves:
 * M = 3*(X1 - Z1^2)*(X1 + Z1^2) = 3*X1^2 + a*Z1^4.
 */
void Jac_double(uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *Z1)
{
    uint32_t *T1, *T2, *T3, *T4, *T5;

    /* Temporaries live in a fixed scratch region of CASPER work RAM. */
    T1 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 0U * (N_wordlen + 4U)];
    T2 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 1U * (N_wordlen + 4U)];
    T3 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 2U * (N_wordlen + 4U)];
    T4 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 3U * (N_wordlen + 4U)];
    T5 = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 4U * (N_wordlen + 4U)];

    square_casper(T1, Z1);       /* T1 = Z1^2 */
    sub_casper(T3, X1, T1);      /* T3 = X1 - Z1^2 */
    add_casper(T1, X1, T1);      /* T1 = X1 + Z1^2 */
    multiply_casper(T4, T3, T1); /* T4 = X1^2 - Z1^4 */

    mul2_casper(T3, T4);

    add_casper(T2, T3, T4);      /* T2 = M = 3*(X1^2 - Z1^4) */

    mul2_casper(Y3, Y1);         /* Y3 = 2*Y1 */

    CASPER_MEMCPY(T5, Z1, N_wordlen * sizeof(uint32_t));
    multiply_casper(Z3, Y3, T5); /* Z3 = 2*Y1*Z1 */

    square_casper(T5, Y3);       /* T5 = 4*Y1^2 */

    multiply_casper(T3, T5, X1); /* T3 = S = 4*X1*Y1^2 */

    square_casper(Y3, T5);       /* Y3 = 16*Y1^4 */

    half(T5, Y3, T4);            /* T5 = 8*Y1^4 */

    square_casper(X3, T2);       /* X3 = M^2 */

    mul2_casper(T1, T3);         /* T1 = 2*S */

    sub_casper(X3, X3, T1);      /* X3 = M^2 - 2*S */

    sub_casper(T1, T3, X3);      /* T1 = S - X3 */

    multiply_casper(T3, T1, T2); /* T3 = M*(S - X3) */

    sub_casper(Y3, T3, T5);      /* Y3 = M*(S - X3) - 8*Y1^4 */
}
|
|
|
|
/* Recoding for a signed fixed window.
|
|
* Source: https://eprint.iacr.org/2014/130.pdf, Algorithm 6
|
|
* Recode the n-bit integer k into ciel(log2(n)/(w-1)) digits
|
|
* where each digit is in
|
|
* { +/- 1, +/- 3, ..., +/- 2^(w-1)-1 }
|
|
* and put the result in c.
|
|
*/
|
|
static void recode(int8_t *c, uint32_t *k, int n, int w)
|
|
{
|
|
int i, t;
|
|
uint32_t K[CASPER_MAX_ECC_SIZE_WORDLEN] = {0};
|
|
(void)memcpy(K, k, (size_t)ceil(((double)n / 8.)));
|
|
t = (n + (w - 2)) / (w - 1);
|
|
for (i = 0; i < t; i++)
|
|
{
|
|
c[i] = (int8_t)(uint8_t)((K[0] & ((uint32_t)(uint32_t)(1UL << (uint32_t)w) - 1UL)) -
|
|
(uint32_t)(uint32_t)(1UL << ((uint32_t)w - 1UL)));
|
|
shiftrightSysram(K, K, (unsigned)w - 1U);
|
|
(void)add_n_1(K, K, (uint32_t)c[i] >> 31, (int16_t)(uint16_t)N_wordlen);
|
|
}
|
|
c[t] = (int8_t)K[0];
|
|
}
|
|
|
|
/* Multi-limb subtraction c = a - b over n 32-bit limbs; returns the final borrow.
 * Mixed access: a[] is read directly (system RAM), while b[] and c[] go through
 * GET_WORD (CASPER interleaved memory). NOTE(review): the GET_WORD(&c[i])
 * argument is the macro's output position, so GET_WORD must expand to an
 * lvalue here — confirm against the macro definition.
 */
static uint32_t sub_n(uint32_t *c, uint32_t *a, uint32_t *b, int n)
{
    int i;
    uint32_t borrow;
    sub_borrowout(borrow, GET_WORD(&c[0]), a[0], GET_WORD(&b[0]));
    for (i = 1; i < n; i++)
    {
        sub_borrowin_borrowout(borrow, GET_WORD(&c[i]), a[i], GET_WORD(&b[i]), borrow);
    }
    return borrow;
}
|
|
|
|
#if 0
|
|
/* Dumb n-limb subtraction of c=a-b, return borrow. */
|
|
static uint32_t sub_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n) {
|
|
int i;
|
|
uint32_t borrow;
|
|
sub_borrowout(borrow, c[0], a[0], b);
|
|
for (i = 1; i < n; i++) {
|
|
sub_borrowin_borrowout_1(borrow, c[i], a[i], borrow);
|
|
}
|
|
return borrow;
|
|
}
|
|
|
|
/* Dumb n-limb addition of c=a+b, return carry. */
|
|
static uint32_t add_n(uint32_t *c, uint32_t *a, uint32_t *b, int n) {
|
|
int i;
|
|
uint32_t carry;
|
|
add_cout(carry, c[0], a[0], b[0]);
|
|
for (i = 1; i < n; i++) {
|
|
add_cout_cin(carry, c[i], a[i], b[i], carry);
|
|
}
|
|
return carry;
|
|
}
|
|
#endif
|
|
|
|
/* Dumb n-limb addition of a single word: c = a + b, where the 32-bit value b
 * is added to the least-significant limb and the carry is rippled through the
 * remaining limbs; returns the final carry. Operates on plain system-RAM
 * arrays (no GET_WORD/SET_WORD).
 */
static uint32_t add_n_1(uint32_t *c, uint32_t *a, uint32_t b, int n)
{
    int i;
    uint32_t carry;
    add_cout(carry, c[0], a[0], b);
    for (i = 1; i < n; i++)
    {
        add_cout_cin(carry, c[i], a[i], 0U, carry);
    }
    return carry;
}
|
|
|
|
/* Absolute value of a signed 8-bit integer.
 * The negation happens after integer promotion, so INT8_MIN (-128) safely
 * maps to 128, which still fits in the uint8_t result.
 */
static uint8_t int8abs(int8_t v)
{
    if (v >= 0)
    {
        return (uint8_t)v;
    }
    return (uint8_t)-v;
}
|
|
|
|
/* Constant time elliptic curve scalar multiplication.
|
|
* Source: https://eprint.iacr.org/2014/130.pdf
|
|
* when using w = 4.
|
|
* Computes (X3 : Y3 : Z3) = k * (X1, Y1) \in E(F_p)
|
|
* p is the prime used to define the finite field F_p
|
|
* q is the (prime) order of the curve
|
|
*/
|
|
void Jac_scalar_multiplication(
|
|
uint32_t *X3, uint32_t *Y3, uint32_t *Z3, uint32_t *X1, uint32_t *Y1, uint32_t *k, uint32_t *p, uint32_t *q)
|
|
{
|
|
uint32_t *scalar, *M, *X, *Y, *Z, *mem_loc;
|
|
uint32_t *ONE = NULL;
|
|
int i, sign, odd;
|
|
uint8_t index;
|
|
size_t recodeLength = 175u;
|
|
size_t bitlen = 0u;
|
|
int8_t rec[CASPER_RECODE_LENGTH_MAX] = {0};
|
|
|
|
if (N_wordlen == 8U)
|
|
{
|
|
recodeLength = (size_t)kCASPER_ECC_P256_recode_len;
|
|
bitlen = (size_t)kCASPER_ECC_P256_N_bitlen;
|
|
ONE = NISTr256;
|
|
}
|
|
|
|
if (N_wordlen == 12U)
|
|
{
|
|
recodeLength = (size_t)kCASPER_ECC_P384_recode_len;
|
|
bitlen = (size_t)kCASPER_ECC_P384_N_bitlen;
|
|
ONE = NISTr384;
|
|
}
|
|
|
|
if (N_wordlen == 18U)
|
|
{
|
|
recodeLength = (size_t)kCASPER_ECC_P521_recode_len;
|
|
bitlen = (size_t)521U;
|
|
ONE = NISTr521;
|
|
}
|
|
|
|
/* Point to the start of the LUT table space. */
|
|
mem_loc = &CASPER_MEM[(20U * N_wordlen + 80U)];
|
|
|
|
scalar = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * (N_wordlen + 4U)];
|
|
X = &CASPER_MEM[(20U * N_wordlen + 80U) + 13U * (N_wordlen + 4U)];
|
|
Y = &CASPER_MEM[(20U * N_wordlen + 80U) + 14U * (N_wordlen + 4U)];
|
|
Z = &CASPER_MEM[(20U * N_wordlen + 80U) + 15U * (N_wordlen + 4U)];
|
|
M = &CASPER_MEM[(20U * N_wordlen + 80U) + 16U * (N_wordlen + 4U)];
|
|
|
|
/* Point to memory the recoded scalar.
|
|
*/
|
|
CASPER_MEMCPY(scalar, k, sizeof(uint32_t) * N_wordlen);
|
|
|
|
/* Precomputation: compute 1*P, 3*P, 5*P, and 7*P */
|
|
#define FSL_CASPER_LUT(P, x) (mem_loc + (3U * ((P)-1U) / 2U + (x)) * (N_wordlen + 4U))
|
|
|
|
/* Set 1*P */
|
|
CASPER_MEMCPY(Z3, ONE, N_wordlen * sizeof(uint32_t));
|
|
CASPER_MEMCPY(FSL_CASPER_LUT(1U, 0U), X1, N_wordlen * sizeof(uint32_t));
|
|
CASPER_MEMCPY(FSL_CASPER_LUT(1U, 1U), Y1, N_wordlen * sizeof(uint32_t));
|
|
CASPER_MEMCPY(FSL_CASPER_LUT(1U, 2U), Z3, N_wordlen * sizeof(uint32_t));
|
|
|
|
/* Compute 2*P */
|
|
Jac_double(X3, Y3, Z3, X1, Y1, Z3);
|
|
|
|
/* Compute 3*P = 2P + P */
|
|
Jac_add_affine(FSL_CASPER_LUT(3U, 0U), FSL_CASPER_LUT(3U, 1U), FSL_CASPER_LUT(3U, 2U), X3, Y3, Z3, X1, Y1);
|
|
|
|
/* Compute 5*P = 3P + 2P */
|
|
Jac_addition(FSL_CASPER_LUT(5U, 0U), FSL_CASPER_LUT(5U, 1U), FSL_CASPER_LUT(5U, 2U), FSL_CASPER_LUT(3U, 0U),
|
|
FSL_CASPER_LUT(3U, 1U), FSL_CASPER_LUT(3U, 2U), X3, Y3, Z3);
|
|
|
|
/* Compute 7*P = 5P + 2P */
|
|
Jac_addition(FSL_CASPER_LUT(7U, 0U), FSL_CASPER_LUT(7U, 1U), FSL_CASPER_LUT(7U, 2U), FSL_CASPER_LUT(5U, 0U),
|
|
FSL_CASPER_LUT(5U, 1U), FSL_CASPER_LUT(5U, 2U), X3, Y3, Z3);
|
|
|
|
/* Recode the scalar */
|
|
odd = (int32_t)((uint32_t)(casper_get_word(&scalar[0]) & 1U));
|
|
(void)sub_n(M, q, scalar, (int16_t)(uint16_t)N_wordlen); // todo!!!
|
|
casper_select(scalar, M, scalar, odd, (int16_t)(uint16_t)N_wordlen);
|
|
|
|
/* Use n=384 and w=4 --> compute ciel(384/3) = 128 + 1 digits */
|
|
uint32_t scalarSysram[CASPER_MAX_ECC_SIZE_WORDLEN];
|
|
CASPER_MEMCPY(scalarSysram, scalar, /*CASPER_*/ N_wordlen * sizeof(uint32_t));
|
|
recode(rec, scalarSysram, (int32_t)bitlen, 4);
|
|
|
|
/* Set the first value. */
|
|
index = int8abs(rec[recodeLength - 1U]);
|
|
sign = (int32_t)(uint32_t)(uint8_t)(((uint8_t)rec[recodeLength - 1U]) >> 7);
|
|
|
|
CASPER_MEMCPY(X3, FSL_CASPER_LUT((uint32_t)index, 0U), N_wordlen * sizeof(uint32_t));
|
|
CASPER_MEMCPY(Y3, FSL_CASPER_LUT((uint32_t)index, 1U), N_wordlen * sizeof(uint32_t));
|
|
CASPER_MEMCPY(Z3, FSL_CASPER_LUT((uint32_t)index, 2U), N_wordlen * sizeof(uint32_t));
|
|
|
|
/* Get the correct LUT element in constant time by touching
|
|
* all elements and masking out the correct one.
|
|
*/
|
|
|
|
#define GET_LUT(x, y, z, index) \
|
|
do \
|
|
{ \
|
|
int m; \
|
|
CASPER_MEMCPY((x), FSL_CASPER_LUT(1U, 0U), N_wordlen * sizeof(uint32_t)); \
|
|
CASPER_MEMCPY((y), FSL_CASPER_LUT(1U, 1U), N_wordlen * sizeof(uint32_t)); \
|
|
CASPER_MEMCPY((z), FSL_CASPER_LUT(1U, 2U), N_wordlen * sizeof(uint32_t)); \
|
|
m = (int32_t)((index) == 3U); \
|
|
casper_select((x), (x), FSL_CASPER_LUT(3U, 0U), m, (int16_t)(uint16_t)N_wordlen); \
|
|
casper_select((y), (y), FSL_CASPER_LUT(3U, 1U), m, (int16_t)(uint16_t)N_wordlen); \
|
|
casper_select((z), (z), FSL_CASPER_LUT(3U, 2U), m, (int16_t)(uint16_t)N_wordlen); \
|
|
m = (int32_t)((index) == 5U); \
|
|
casper_select((x), (x), FSL_CASPER_LUT(5U, 0U), m, (int16_t)(uint16_t)N_wordlen); \
|
|
casper_select((y), (y), FSL_CASPER_LUT(5U, 1U), m, (int16_t)(uint16_t)N_wordlen); \
|
|
casper_select((z), (z), FSL_CASPER_LUT(5U, 2U), m, (int16_t)(uint16_t)N_wordlen); \
|
|
m = (int32_t)((index) == 7U); \
|
|
casper_select((x), (x), FSL_CASPER_LUT(7U, 0U), m, (int16_t)(uint16_t)N_wordlen); \
|
|
casper_select((y), (y), FSL_CASPER_LUT(7U, 1U), m, (int16_t)(uint16_t)N_wordlen); \
|
|
casper_select((z), (z), FSL_CASPER_LUT(7U, 2U), m, (int16_t)(uint16_t)N_wordlen); \
|
|
} while (false)
|
|
|
|
GET_LUT(X3, Y3, Z3, index);
|
|
|
|
/* Compute -y and select the positive or negative point. */
|
|
(void)sub_n(M, p, Y3, (int16_t)(uint16_t)N_wordlen); // todo!!!
|
|
casper_select(Y3, Y3, M, sign, (int16_t)(uint16_t)N_wordlen);
|
|
|
|
for (i = (int)(uint32_t)(recodeLength - 2U); i >= 0; i--)
|
|
{
|
|
Jac_double(X3, Y3, Z3, X3, Y3, Z3);
|
|
Jac_double(X3, Y3, Z3, X3, Y3, Z3);
|
|
Jac_double(X3, Y3, Z3, X3, Y3, Z3);
|
|
|
|
index = int8abs(rec[i]);
|
|
sign = (int32_t)(uint32_t)(uint8_t)(((uint8_t)rec[i]) >> 7);
|
|
|
|
GET_LUT(X, Y, Z, index);
|
|
|
|
/* Compute -y and select the positive or negative point. */
|
|
(void)sub_n(scalar, p, Y, (int16_t)(uint16_t)N_wordlen); // todo!!!
|
|
casper_select(scalar, Y, scalar, sign, (int16_t)(uint16_t)N_wordlen);
|
|
|
|
Jac_addition(X3, Y3, Z3, X3, Y3, Z3, X, scalar, Z);
|
|
}
|
|
|
|
(void)sub_n(M, p, Y3, (int16_t)(uint16_t)N_wordlen); // todo!!!
|
|
|
|
casper_select(Y3, M, Y3, odd, (int16_t)(uint16_t)N_wordlen);
|
|
}
|
|
|
|
#undef FSL_CASPER_LUT
|
|
#undef GET_LUT
|
|
|
|
/*
|
|
* Pre-compute the following 16 points:
|
|
* 00 00 = 0*P + 0*Q <-- Not needed when using sliding windows
|
|
* 00 01 = 0*P + 1*Q <-- Not needed when using sliding windows
|
|
* 00 10 = 0*P + 2*Q
|
|
* 00 11 = 0*P + 3*Q
|
|
*
|
|
* 01 00 = 1*P + 0*Q <-- Not needed when using sliding windows
|
|
* 01 01 = 1*P + 1*Q <-- Not needed when using sliding windows
|
|
* 01 10 = 1*P + 2*Q
|
|
* 01 11 = 1*P + 3*Q
|
|
*
|
|
* 10 00 = 2*P + 0*Q
|
|
* 10 01 = 2*P + 1*Q
|
|
* 10 10 = 2*P + 2*Q
|
|
* 10 11 = 2*P + 3*Q
|
|
*
|
|
* 11 00 = 3*P + 0*Q
|
|
* 11 01 = 3*P + 1*Q
|
|
* 11 10 = 3*P + 2*Q
|
|
* 11 11 = 3*P + 3*Q
|
|
*
|
|
 * index = (bitsi||bitsj)-2 - (bitsi != 0)*2
|
|
*
|
|
* Input: P = (X1 : Y1 : Z1) and
|
|
* Q = (X2 : Y2 : Z2)
|
|
* Output: mem_loc, memory location for the LUT.
|
|
*/
|
|
|
|
/* Fill the 12-entry Jacobian-point LUT used by the sliding-window double
 * scalar multiplication for P-256/P-384 (index layout documented in the
 * comment block above). Inputs P = (Px, Py) and Q = (Qx, Qy) are affine
 * points in CASPER memory. The entries
 *   {2Q, 3Q, P+2Q, P+3Q, 2P, 2P+Q, 2P+2Q, 2P+3Q, 3P, 3P+Q, 3P+2Q, 3P+3Q}
 * are written back-to-back (X, Y, Z words per entry) starting at
 * CASPER_MEM[20*N_wordlen + 80]. */
static void precompute_double_scalar_LUT16(uint32_t *Px, uint32_t *Py, uint32_t *Qx, uint32_t *Qy)
{
    uint32_t *Q2x, *Q2y, *Q2z, *P2x, *P2y, *P2z, *Z, *mem_loc;
    uint32_t *ONE = NULL; /* Montgomery representation of 1 for the active curve. */
    uint32_t index = 0;   /* Running word offset of the next LUT slot. */

    if (N_wordlen == 8U)
    {
        ONE = NISTr256;
    }

    if (N_wordlen == 12U)
    {
        ONE = NISTr384;
    }

    /* NOTE(review): ONE remains NULL for any other word length; this helper
     * appears to be called only for P-256/P-384 -- confirm at call sites. */

    Q2x = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 0U * (N_wordlen + 4U)];
    Q2y = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 1U * (N_wordlen + 4U)];
    Q2z = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 2U * (N_wordlen + 4U)];

    /* Re-use memory from different scratch space since no
     * projective point addition is used below. */
    P2x = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 5U * (N_wordlen + 4U)];
    P2z = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 6U * (N_wordlen + 4U)];
    P2y = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 7U * (N_wordlen + 4U)];
    Z = &CASPER_MEM[((2U * (N_wordlen + 4U)) + (9U * (N_wordlen + 4U))) + 8U * (N_wordlen + 4U)];

    mem_loc = &CASPER_MEM[(20U * N_wordlen + 80U)];

    /* Z = 1 (Montgomery form): used as the Z coordinate of affine inputs. */
    CASPER_MEMCPY(Z, ONE, N_wordlen * sizeof(uint32_t));

    // 00 10 = 0*P + 2*Q
    Jac_double(Q2x, Q2y, Q2z, Qx, Qy, Z);
    CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;

    // 00 11 = 0*P + 3*Q (Q2 holds 2Q)
    Jac_add_affine(P2x, P2y, P2z, Q2x, Q2y, Q2z, Qx, Qy);
    CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;

    // 01 10 = 1*P + 2*Q (Q2 still holds 2Q)
    Jac_add_affine(P2x, P2y, P2z, Q2x, Q2y, Q2z, Px, Py);
    CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;

    // 01 11 = 1*P + 3*Q (P2 currently holds P + 2Q)
    Jac_add_affine(P2x, P2y, P2z, P2x, P2y, P2z, Qx, Qy);
    CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;

    // 10 00 = 2*P + 0*Q
    Jac_double(P2x, P2y, P2z, Px, Py, Z);
    CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;

    // 10 01 = 2*P + 1*Q (P2 holds 2P)
    Jac_add_affine(Q2x, Q2y, Q2z, P2x, P2y, P2z, Qx, Qy);
    CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;

    // 10 10 = 2*P + 2*Q (Q2 currently holds 2P + Q)
    Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy);
    CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;

    // 10 11 = 2*P + 3*Q (Q2 currently holds 2P + 2Q)
    Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy);
    CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;

    // 11 00 = 3*P + 0*Q (P2 still holds 2P)
    Jac_add_affine(P2x, P2y, P2z, P2x, P2y, P2z, Px, Py);
    CASPER_MEMCPY(&mem_loc[index], P2x, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], P2y, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], P2z, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;

    // 11 01 = 3*P + 1*Q (P2 holds 3P)
    Jac_add_affine(Q2x, Q2y, Q2z, P2x, P2y, P2z, Qx, Qy);
    CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;

    // 11 10 = 3*P + 2*Q (Q2 currently holds 3P + Q)
    Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy);
    CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;

    // 11 11 = 3*P + 3*Q (Q2 currently holds 3P + 2Q)
    Jac_add_affine(Q2x, Q2y, Q2z, Q2x, Q2y, Q2z, Qx, Qy);
    CASPER_MEMCPY(&mem_loc[index], Q2x, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], Q2y, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
    CASPER_MEMCPY(&mem_loc[index], Q2z, N_wordlen * sizeof(uint32_t));
    index += N_wordlen;
}
|
|
|
|
/*
|
|
* Pre-compute the following 4 points:
|
|
* 0 0 = 0*P + 0*Q <-- Not needed when using sliding windows
|
|
* 0 1 = 0*P + 1*Q
|
|
*
|
|
* 1 0 = 1*P + 0*Q
|
|
* 1 1 = 1*P + 1*Q
|
|
*
|
|
* index = (bitsj+1) & (0-bitsi)
|
|
*
|
|
* Input: P = (X1 : Y1 : Z1) and
|
|
* Q = (X2 : Y2 : Z2)
|
|
* Output: mem_loc, memory location for the LUT.
|
|
*/
|
|
|
|
static void precompute_double_scalar_LUT4(uint32_t *Px, uint32_t *Py, uint32_t *Qx, uint32_t *Qy)
|
|
{
|
|
uint32_t *Z, *mem_loc, *ONE;
|
|
uint32_t index = 0;
|
|
|
|
ONE = NISTr521;
|
|
|
|
/* Re-use memory from different scratch space since no
|
|
* projective point addition is used below. */
|
|
Z = &CASPER_MEM[(11U * N_wordlen + 4U) + 5U * (N_wordlen + 4U)];
|
|
mem_loc = &CASPER_MEM[(20U * N_wordlen + 80U)];
|
|
|
|
CASPER_MEMCPY(Z, ONE, N_wordlen * sizeof(uint32_t));
|
|
|
|
// 0*P + 1*Q
|
|
CASPER_MEMCPY(&mem_loc[index], Qx, N_wordlen * sizeof(uint32_t));
|
|
index += N_wordlen;
|
|
CASPER_MEMCPY(&mem_loc[index], Qy, N_wordlen * sizeof(uint32_t));
|
|
index += N_wordlen;
|
|
CASPER_MEMCPY(&mem_loc[index], Z, N_wordlen * sizeof(uint32_t));
|
|
index += N_wordlen;
|
|
|
|
// 1*P + 0*Q
|
|
CASPER_MEMCPY(&mem_loc[index], Px, N_wordlen * sizeof(uint32_t));
|
|
index += N_wordlen;
|
|
CASPER_MEMCPY(&mem_loc[index], Py, N_wordlen * sizeof(uint32_t));
|
|
index += N_wordlen;
|
|
CASPER_MEMCPY(&mem_loc[index], Z, N_wordlen * sizeof(uint32_t));
|
|
index += N_wordlen;
|
|
|
|
// 1*P + 1*Q
|
|
Jac_add_affine(&mem_loc[index], &mem_loc[index + N_wordlen], &mem_loc[index + 2U * N_wordlen], Px, Py, Z, Qx, Qy);
|
|
}
|
|
|
|
/* Word offsets of the X/Y/Z coordinates of LUT entry x: each entry
 * occupies three consecutive N_wordlen-word fields. */
#define GETLUTX(x) (3U * (x)*N_wordlen)
#define GETLUTY(x) (3U * (x)*N_wordlen + 1U * N_wordlen)
#define GETLUTZ(x) (3U * (x)*N_wordlen + 2U * N_wordlen)
|
|
|
|
/* Compute the double scalar multiplication
|
|
* (X3 : Y3 : Z3) = k1 * (X1, Y1) + k2 * (X2, Y2)
|
|
* Using Shamir's trick and precomputing 16 points.
|
|
* This code is *not* constant time since this is used
|
|
* for verification only.
|
|
*/
|
|
void double_scalar_multiplication(uint32_t *X3,
                                  uint32_t *Y3,
                                  uint32_t *Z3,
                                  uint32_t *X1,
                                  uint32_t *Y1,
                                  uint32_t *k1,
                                  uint32_t *X2,
                                  uint32_t *Y2,
                                  uint32_t *k2)
{
    uint32_t index = 0, c = 0; /* c counts processed scalar bits */
    uint32_t *p1 = NULL, *p2 = NULL, x1, x2, *lut, *Tx = NULL, *Ty = NULL, *Tz = NULL;
    size_t bitlen, shiftr, shiftl = 0u;

    /* Per-curve setup: window width is 2 bits for P-256/P-384 (16-point LUT)
     * and 1 bit for P-521 (4-point LUT). shiftr extracts the top window from
     * the most significant word; shiftl advances the scalars by one window.
     * NOTE(review): bitlen/shiftr stay uninitialized if N_wordlen is not
     * 8/12/18 -- callers appear to guarantee one of these; confirm. */
    if (N_wordlen == 8U)
    {
        bitlen = (size_t)kCASPER_ECC_P256_N_bitlen;
        precompute_double_scalar_LUT16(X1, Y1, X2, Y2);
        shiftr = 30U;
        shiftl = 2U;
    }

    if (N_wordlen == 12U)
    {
        bitlen = (size_t)kCASPER_ECC_P384_N_bitlen;
        precompute_double_scalar_LUT16(X1, Y1, X2, Y2);
        shiftr = 30U;
        shiftl = 2U;
    }

    if (N_wordlen == 18U)
    {
        bitlen = (size_t)kCASPER_ECC_P521_N_bitlen;
        precompute_double_scalar_LUT4(X1, Y1, X2, Y2);
        shiftr = 31U;
        shiftl = 1U;
    }

    lut = &CASPER_MEM[(20U * N_wordlen + 80U)];

    /* Working copies of the scalars (p1, p2) and a temporary point (T),
     * placed after the LUT in CASPER memory. */
    if (N_wordlen == 8U || N_wordlen == 12U)
    {
        p1 = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen];
        p2 = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 1U * (N_wordlen + 4U)];

        Tx = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 2U * (N_wordlen + 4U)];
        Ty = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 3U * (N_wordlen + 4U)];
        Tz = &CASPER_MEM[(20U * N_wordlen + 80U) + 48U * N_wordlen + 4U * (N_wordlen + 4U)];
    }

    if (N_wordlen == 18U)
    {
        p1 = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen];
        p2 = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 1U * (N_wordlen + 4U)];

        Tx = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 2U * (N_wordlen + 4U)];
        Ty = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 3U * (N_wordlen + 4U)];
        Tz = &CASPER_MEM[(20U * N_wordlen + 80U) + 12U * N_wordlen + 4U * (N_wordlen + 4U)];
    }

    CASPER_MEMCPY(p1, k1, sizeof(uint32_t) * N_wordlen);
    CASPER_MEMCPY(p2, k2, sizeof(uint32_t) * N_wordlen);

    /* Check if we can slide: skip leading zero bits of both scalars. */
    while (((casper_get_word(&p1[N_wordlen - 1U]) | casper_get_word(&p2[N_wordlen - 1U])) >> 31) == 0U && c < bitlen)
    {
        shiftleft(p1, p1, 1U);
        shiftleft(p2, p2, 1U);
        c++;
        /* No doubling needed. */
    }

    /* Set the first value: take the top window from each scalar and map
     * (x1, x2) to a LUT index (formulas documented above the LUT builders). */
    x1 = casper_get_word(&p1[N_wordlen - 1U]) >> shiftr;
    x2 = casper_get_word(&p2[N_wordlen - 1U]) >> shiftr;
    if (N_wordlen == 8U || N_wordlen == 12U)
    {
        index = (x2 | (x1 << 2)) - 2U - (uint32_t)(x1 != 0U) * 2U;
    }

    if (N_wordlen == 18U)
    {
        index = (((x2) + 1U) & (0U - (x1)));
    }
    shiftleft(p1, p1, shiftl);
    shiftleft(p2, p2, shiftl);

    CASPER_MEMCPY(X3, &lut[GETLUTX(index)], N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(Y3, &lut[GETLUTY(index)], N_wordlen * sizeof(uint32_t));
    CASPER_MEMCPY(Z3, &lut[GETLUTZ(index)], N_wordlen * sizeof(uint32_t));
    c += shiftl;

    // todo: create an is_zero function

    /* NOTE(review): this zero test always reads 18 words of p1/p2 even for
     * P-256/P-384 (N_wordlen 8/12), i.e. it also reads adjacent scratch
     * words; termination then relies on the c >= bitlen-1 break below --
     * confirm the scratch layout makes this benign. */
    while ((casper_get_word(&p1[0]) | casper_get_word(&p1[1]) | casper_get_word(&p1[2]) | casper_get_word(&p1[3]) |
            casper_get_word(&p1[4]) | casper_get_word(&p1[5]) | casper_get_word(&p1[6]) | casper_get_word(&p1[7]) |
            casper_get_word(&p1[8]) | casper_get_word(&p1[9]) | casper_get_word(&p1[10]) | casper_get_word(&p1[11]) |
            casper_get_word(&p1[12]) | casper_get_word(&p1[13]) | casper_get_word(&p1[14]) | casper_get_word(&p1[15]) |
            casper_get_word(&p1[16]) | casper_get_word(&p1[17]) | casper_get_word(&p2[0]) | casper_get_word(&p2[1]) |
            casper_get_word(&p2[2]) | casper_get_word(&p2[3]) | casper_get_word(&p2[4]) | casper_get_word(&p2[5]) |
            casper_get_word(&p2[6]) | casper_get_word(&p2[7]) | casper_get_word(&p2[8]) | casper_get_word(&p2[9]) |
            casper_get_word(&p2[10]) | casper_get_word(&p2[11]) | casper_get_word(&p2[12]) | casper_get_word(&p2[13]) |
            casper_get_word(&p2[14]) | casper_get_word(&p2[15]) | casper_get_word(&p2[16]) |
            casper_get_word(&p2[17])) != 0U)
    {
        /* Check if we can slide: zero windows only cost doublings. */
        while (((casper_get_word(&p1[N_wordlen - 1U]) | casper_get_word(&p2[N_wordlen - 1U])) >> 31) == 0U &&
               c < bitlen)
        {
            shiftleft(p1, p1, 1U);
            shiftleft(p2, p2, 1U);
            Jac_double(X3, Y3, Z3, X3, Y3, Z3);
            c++;
        }

        if (c >= (bitlen - 1U))
        {
            break;
        }

        /* One doubling per window bit before adding the LUT entry. */
        for (uint32_t i = 0; i < shiftl; i++)
        {
            Jac_double(X3, Y3, Z3, X3, Y3, Z3);
        }

        x1 = casper_get_word(&p1[N_wordlen - 1U]) >> shiftr;
        x2 = casper_get_word(&p2[N_wordlen - 1U]) >> shiftr;

        if (N_wordlen == 8U || N_wordlen == 12U)
        {
            index = (x2 | (x1 << 2)) - 2U - (uint32_t)(x1 != 0U) * 2U;
        }

        if (N_wordlen == 18U)
        {
            index = (((x2) + 1U) & (0U - (x1)));
        }

        shiftleft(p1, p1, shiftl);
        shiftleft(p2, p2, shiftl);

        CASPER_MEMCPY(Tx, &lut[GETLUTX(index)], N_wordlen * sizeof(uint32_t));
        CASPER_MEMCPY(Ty, &lut[GETLUTY(index)], N_wordlen * sizeof(uint32_t));
        CASPER_MEMCPY(Tz, &lut[GETLUTZ(index)], N_wordlen * sizeof(uint32_t));

        Jac_addition(X3, Y3, Z3, X3, Y3, Z3, Tx, Ty,
                     Tz); //&lut[GETLUTX(index)], &lut[GETLUTY(index)], &lut[GETLUTZ(index)]);
        c += shiftl;
    }

    /* Special case in the end: a single bit of each scalar may remain. */
    if (c == (bitlen - 1U))
    {
        Jac_double(X3, Y3, Z3, X3, Y3, Z3);
        x1 = casper_get_word(&p1[N_wordlen - 1U]) >> 31;
        x2 = casper_get_word(&p2[N_wordlen - 1U]) >> 31;
        if (0U != x1)
        {
            Jac_add_affine(X3, Y3, Z3, X3, Y3, Z3, X1, Y1);
        }
        if (x2 != 0U)
        {
            Jac_add_affine(X3, Y3, Z3, X3, Y3, Z3, X2, Y2);
        }
        c++;
    }

    /* Flush any remaining (zero) scalar bits with doublings. */
    while (c < bitlen)
    {
        Jac_double(X3, Y3, Z3, X3, Y3, Z3);
        c++;
    }
}
|
|
|
|
/* Compute c = a^(-1) mod p for NIST P-256 via Fermat's little theorem,
 * c = a^(p-2), using a fixed addition chain for the exponent
 * p-2 = ffffffff00000001000000000000000000000000fffffffffffffffffffffffd
 * (tracked step by step in the comments below). The chain uses ping-pong
 * squarings between a target and tmp, so statement order is significant. */
static void invert_mod_p256(uint32_t *c, uint32_t *a)
{
    int i;
    uint32_t *t, *t2, *s1, *s2, *s4, *s8, *tmp;

    /* Assuming it is safe to use the ECC scratch size. */
    t = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                     (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
                    2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
    t2 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                      (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
                     3U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
    s1 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                      (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
                     4U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
    s2 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                      (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
                     5U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
    s4 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                      (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
                     6U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
    s8 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                      (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
                     7U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
    tmp = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
                       (9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
                      8U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];

    // t2 = n^(2^1)*n # 11
    square_casper(tmp, a);
    multiply_casper(t2, tmp, a);

    // s1 = t2^(2^2)*t2 # F
    square_casper(s1, t2);
    square_casper(tmp, s1);
    multiply_casper(s1, tmp, t2);

    // s2 = s1^(2^4)*s1 # FF
    square_casper(s2, s1);
    // for (i = 1; i < 4; i++) square(s2, s2);
    square_casper(tmp, s2);
    square_casper(s2, tmp);
    square_casper(tmp, s2);
    multiply_casper(s2, tmp, s1);

    // s4 = s2^(2^8)*s2 # FFFF
    /* Each i += 2 loop iteration performs two squarings (ping-pong via tmp). */
    square_casper(s4, s2);
    for (i = 1; i < 7; i += 2)
    {
        square_casper(tmp, s4);
        square_casper(s4, tmp);
    }
    square_casper(tmp, s4);
    multiply_casper(s4, tmp, s2);

    // s8 = s4^(2^16)*s4 # FFFFFFFF
    square_casper(s8, s4);
    for (i = 1; i < 15; i += 2)
    {
        square_casper(tmp, s8);
        square_casper(s8, tmp);
    }
    square_casper(tmp, s8);
    multiply_casper(s8, tmp, s4);

    // t = s8^(2^32)*n # ffffffff00000001
    square_casper(tmp, s8);
    for (i = 1; i < 31; i += 2)
    {
        square_casper(t, tmp);
        square_casper(tmp, t);
    }
    square_casper(t, tmp);
    multiply_casper(tmp, t, a);

    // t = t^(2^128)*s8 # ffffffff00000001000000000000000000000000ffffffff
    for (i = 0; i < 128; i += 2)
    {
        square_casper(t, tmp);
        square_casper(tmp, t);
    }
    multiply_casper(t, tmp, s8);

    // t = t^(2^32)*s8 # ffffffff00000001000000000000000000000000ffffffffffffffff
    for (i = 0; i < 32; i += 2)
    {
        square_casper(tmp, t);
        square_casper(t, tmp);
    }
    multiply_casper(tmp, t, s8);

    // t = t^(2^16)*s4 # ffffffff00000001000000000000000000000000ffffffffffffffffffff
    for (i = 0; i < 16; i += 2)
    {
        square_casper(t, tmp);
        square_casper(tmp, t);
    }
    multiply_casper(t, tmp, s4);

    // t = t^(2^8)*s2 # ffffffff00000001000000000000000000000000ffffffffffffffffffffff
    for (i = 0; i < 8; i += 2)
    {
        square_casper(tmp, t);
        square_casper(t, tmp);
    }
    multiply_casper(tmp, t, s2);

    // t = t^(2^4)*s1 # ffffffff00000001000000000000000000000000fffffffffffffffffffffff
    for (i = 0; i < 4; i += 2)
    {
        square_casper(t, tmp);
        square_casper(tmp, t);
    }
    multiply_casper(t, tmp, s1);

    // t = t^(2^2)*t2
    square_casper(tmp, t);
    square_casper(t, tmp);
    multiply_casper(tmp, t, t2);

    // t = t^(2^2)*n # ffffffff00000001000000000000000000000000fffffffffffffffffffffffd
    square_casper(t, tmp);
    square_casper(tmp, t);
    multiply_casper(c, tmp, a);
}
|
|
|
|
// A and C do not need to be in Casper memory
|
|
static void toMontgomery_ECC_P256(uint32_t *C, uint32_t *A)
|
|
{
|
|
/* R^2 = 2^512 mod p, used to convert values to Montgomery form. */
|
|
uint32_t R2[kCASPER_ECC_P256_wordlen] = {0x00000003, 0x00000000, 0xffffffffU, 0xfffffffbU,
|
|
0xfffffffeU, 0xffffffffU, 0xfffffffdU, 0x4};
|
|
uint32_t *T1, *T2, *T3;
|
|
T1 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
|
|
(9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
|
|
0U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
|
|
T2 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
|
|
(9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
|
|
1U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
|
|
T3 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)) +
|
|
(9U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U))) +
|
|
2U * ((uint32_t)kCASPER_ECC_P256_wordlen + 4U)];
|
|
|
|
CASPER_MEMCPY(T1, R2, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
|
|
CASPER_MEMCPY(T2, A, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
|
|
|
|
multiply_casper(T3, T2, T1);
|
|
CASPER_MEMCPY(C, T3, (uint32_t)kCASPER_ECC_P256_wordlen * sizeof(uint32_t));
|
|
}
|
|
|
|
/* Compute inversion modulo NIST-p384 using Fermats little theorem.
|
|
* Using c = a^(p-2) = a^(-1) mod p.
|
|
* This computes the modular inversion if all arithmetic is "regular"
|
|
* modular arithmetic or computes automatically the Montgomery inverse
|
|
* if all arithmetic is Montgomery arithmetic.
|
|
*/
|
|
|
|
/* Compute c = a^(p-2) = a^(-1) mod p for NIST P-384 with a fixed addition
 * chain (see the header comment above). In the step comments, "# Nf" tracks
 * the exponent built so far as a run of N hexadecimal 'f' digits. Squarings
 * ping-pong between a target residue and tmp, so statement order matters. */
static void invert_mod_p384(uint32_t *c, uint32_t *a)
{
    int i;
    uint32_t *e, *d, *tmp, *t0, *t1, *t2, *t3, *t4, *t5, *t6; // 10 residues needed

    /* Assuming it is safe to use the LUT scratch size.
     * Hence, do not invert while elements in the LUT are needed.
     */
    e = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
    d = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
    tmp =
        &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
    t0 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 3U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
    t1 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 4U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
    t2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 5U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
    t3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 6U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
    t4 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 7U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
    t5 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 8U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
    t6 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P384_wordlen + 80U) + 9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];

    /* Small powers of a (exponents noted on the right). */
    square_casper(tmp, a);        // 2
    square_casper(t1, tmp);       // 4
    square_casper(tmp, t1);       // 8
    multiply_casper(t2, tmp, t1); // 12
    multiply_casper(d, a, t2);    // 13
    multiply_casper(e, d, a);     // 14
    multiply_casper(t0, e, a);    // 15

    // t1 = t0^(2^4)*t0 # ff
    square_casper(tmp, t0);
    square_casper(t1, tmp);
    square_casper(tmp, t1);
    square_casper(t2, tmp);
    multiply_casper(t1, t2, t0);

    // t2 = t1^(2^8)*t1 # 4f
    square_casper(tmp, t1);
    for (i = 0; i < 3; i++)
    {
        square_casper(t3, tmp);
        square_casper(tmp, t3);
    }
    square_casper(t3, tmp);
    multiply_casper(t2, t3, t1);

    // t3 = t2^(2^16)*t2 # 8f
    square_casper(tmp, t2);
    for (i = 0; i < 7; i++)
    {
        square_casper(t4, tmp);
        square_casper(tmp, t4);
    }
    square_casper(t4, tmp);
    multiply_casper(t3, t4, t2);

    // t4 = t3^(2^32)*t3 # 16f
    square_casper(tmp, t3);
    for (i = 0; i < 15; i++)
    {
        square_casper(t5, tmp);
        square_casper(tmp, t5);
    }
    square_casper(t5, tmp);
    multiply_casper(t4, t5, t3);

    // t5 = t4^(2^64)*t4 # 32f
    square_casper(tmp, t4);
    for (i = 0; i < 31; i++)
    {
        square_casper(t6, tmp);
        square_casper(tmp, t6);
    }
    square_casper(t6, tmp);
    multiply_casper(t5, t6, t4);

    // t5 = t5^(2^64)*t4 # 48f
    square_casper(tmp, t5);
    for (i = 0; i < 31; i++)
    {
        square_casper(t6, tmp);
        square_casper(tmp, t6);
    }
    square_casper(t6, tmp);
    multiply_casper(t5, t6, t4);

    // t5 = t5^(2^32)*t3 # 56f
    square_casper(tmp, t5);
    for (i = 0; i < 15; i++)
    {
        square_casper(t6, tmp);
        square_casper(tmp, t6);
    }
    square_casper(t6, tmp);
    multiply_casper(t5, t6, t3);

    // t5 = t5^(2^16)*t2 # 60f
    square_casper(tmp, t5);
    for (i = 0; i < 7; i++)
    {
        square_casper(t6, tmp);
        square_casper(tmp, t6);
    }
    square_casper(t6, tmp);
    multiply_casper(t5, t6, t2);

    // t5 = t5^(2^8)*t1 # 62f
    square_casper(tmp, t5);
    for (i = 0; i < 3; i++)
    {
        square_casper(t6, tmp);
        square_casper(tmp, t6);
    }
    square_casper(t6, tmp);
    multiply_casper(t5, t6, t1);

    // n = t5^(2^4)*t0 # 63f
    square_casper(tmp, t5);
    for (i = 0; i < 1; i++)
    {
        square_casper(t6, tmp);
        square_casper(tmp, t6);
    }
    square_casper(t6, tmp);
    multiply_casper(t5, t6, t0);

    // n = n^(2^4)*e
    square_casper(tmp, t5);
    for (i = 0; i < 1; i++)
    {
        square_casper(t6, tmp);
        square_casper(tmp, t6);
    }
    square_casper(t6, tmp);
    multiply_casper(t5, t6, e);

    // n = n^(2^32)*t3
    square_casper(tmp, t5);
    for (i = 0; i < 15; i++)
    {
        square_casper(t6, tmp);
        square_casper(tmp, t6);
    }
    square_casper(t6, tmp);
    multiply_casper(t5, t6, t3);

    // n = n^(2^64)  (64 squarings, no multiply in this window of zeros)
    square_casper(tmp, t5);
    for (i = 0; i < 31; i++)
    {
        square_casper(t6, tmp);
        square_casper(tmp, t6);
    }
    square_casper(t5, tmp);

    // n = n^(2^16)*t2
    square_casper(tmp, t5);
    for (i = 0; i < 7; i++)
    {
        square_casper(t6, tmp);
        square_casper(tmp, t6);
    }
    square_casper(t6, tmp);
    multiply_casper(t5, t6, t2);

    // n = n^(2^8)*t1
    square_casper(tmp, t5);
    for (i = 0; i < 3; i++)
    {
        square_casper(t6, tmp);
        square_casper(tmp, t6);
    }
    square_casper(t6, tmp);
    multiply_casper(t5, t6, t1);

    // n = n^(2^4)*t0
    square_casper(tmp, t5);
    for (i = 0; i < 1; i++)
    {
        square_casper(t6, tmp);
        square_casper(tmp, t6);
    }
    square_casper(t6, tmp);
    multiply_casper(t5, t6, t0);

    // n = n^(2^4)*d  (final step writes the inverse to c)
    square_casper(tmp, t5);
    for (i = 0; i < 1; i++)
    {
        square_casper(t6, tmp);
        square_casper(tmp, t6);
    }
    square_casper(t6, tmp);
    multiply_casper(c, t6, d);
}
|
|
|
|
// A and C do not need to be in Casper memory
|
|
static void toMontgomery_ECC_P384(uint32_t *C, uint32_t *A)
|
|
{
|
|
/* R^2 = 2^768 mod p, used to convert values to Montgomery form. */
|
|
uint32_t R2[kCASPER_ECC_P384_wordlen] = {0x00000001, 0xfffffffeU, 0x00000000, 0x00000002, 0x00000000, 0xfffffffeU,
|
|
0x00000000, 0x00000002, 0x1, 0x0, 0x0, 0x0};
|
|
uint32_t *T1, *T2, *T3;
|
|
T1 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) +
|
|
(9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U))) +
|
|
0U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
|
|
T2 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) +
|
|
(9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U))) +
|
|
1U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
|
|
T3 = &CASPER_MEM[((2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)) +
|
|
(9U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U))) +
|
|
2U * ((uint32_t)kCASPER_ECC_P384_wordlen + 4U)];
|
|
|
|
CASPER_MEMCPY(T1, R2, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
|
|
CASPER_MEMCPY(T2, A, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
|
|
|
|
multiply_casper(T3, T2, T1);
|
|
CASPER_MEMCPY(C, T3, (uint32_t)kCASPER_ECC_P384_wordlen * sizeof(uint32_t));
|
|
}
|
|
|
|
/* Compute c = a^(p-2) = a^(-1) mod p for NIST P-521 with a fixed addition
 * chain (same Fermat approach as invert_mod_p384 above). Each "d3 = 2^k * x"
 * comment marks a run of k squarings, implemented by ping-ponging between
 * d3 and e3; statement order is significant. */
static void invert_mod_p521(uint32_t *c, uint32_t *a)
{
    int i;
    uint32_t *e3, *d2, *d3, *d4, *T2, *T4; // 6 residues needed

    /* Assuming it is safe to use the LUT scratch size.
     * Hence, do not invert while elements in the LUT are needed.
     */
    e3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
    d2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
    d3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
    d4 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 3U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
    T2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 4U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
    T4 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 5U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];

    /* T2 = a^3 (two set bits). */
    square_casper(d2, a);
    multiply_casper(T2, d2, a);

    // d3 = 2^2 * T2  -> T4 = a^(2^4 - 1)
    square_casper(d3, T2);
    square_casper(e3, d3);
    multiply_casper(T4, e3, T2);

    // d3 = 2^4 * T4  -> d2 = a^(2^8 - 1)
    square_casper(d3, T4);
    square_casper(e3, d3);
    square_casper(d3, e3);
    square_casper(e3, d3);
    multiply_casper(d2, e3, T4);

    // d3 = 2^8 * d2
    square_casper(d3, d2);
    square_casper(e3, d3);
    for (i = 0; i < 3; i++)
    {
        square_casper(d3, e3);
        square_casper(e3, d3);
    }
    multiply_casper(d4, e3, d2);

    // d3 = 2^16 * d2
    square_casper(d3, d4);
    square_casper(e3, d3);
    for (i = 0; i < 7; i++)
    {
        square_casper(d3, e3);
        square_casper(e3, d3);
    }
    multiply_casper(d2, e3, d4);

    // d3 = 2^32 * d2
    square_casper(d3, d2);
    square_casper(e3, d3);
    for (i = 0; i < 15; i++)
    {
        square_casper(d3, e3);
        square_casper(e3, d3);
    }
    multiply_casper(d4, e3, d2);

    // d3 = 2^64 * d2
    square_casper(d3, d4);
    square_casper(e3, d3);
    for (i = 0; i < 31; i++)
    {
        square_casper(d3, e3);
        square_casper(e3, d3);
    }
    multiply_casper(d2, e3, d4);

    // d3 = 2^128 * d2
    square_casper(d3, d2);
    square_casper(e3, d3);
    for (i = 0; i < 63; i++)
    {
        square_casper(d3, e3);
        square_casper(e3, d3);
    }
    multiply_casper(d4, e3, d2);

    // d3 = 2^256 * d2
    square_casper(d3, d4);
    square_casper(e3, d3);
    for (i = 0; i < 127; i++)
    {
        square_casper(d3, e3);
        square_casper(e3, d3);
    }
    multiply_casper(d2, e3, d4);

    // d3 = 2^2 * d2
    square_casper(d3, d2);
    square_casper(e3, d3);
    multiply_casper(d2, e3, T2);

    // d3 = 2^4 * d2
    square_casper(d3, d2);
    square_casper(e3, d3);
    square_casper(d3, e3);
    square_casper(e3, d3);
    multiply_casper(d2, e3, T4);

    /* One more squaring and multiply by a. */
    square_casper(d3, d2);
    multiply_casper(d2, d3, a);

    // d3 = 2 ^ 2 * d2  (final step writes the inverse to c)
    square_casper(d3, d2);
    square_casper(e3, d3);
    multiply_casper(c, e3, a);
}
|
|
|
|
static void toMontgomery_ECC_P521(uint32_t *C, uint32_t *A)
|
|
{
|
|
/* R^2 = 2^1088 mod p, used to convert values to Montgomery form. */
|
|
// uint32_t R2[NUM_LIMBS] = { 0x00000000, 0x4000, 0, 0,
|
|
// 0, 0, 0, 0,
|
|
// 0, 0, 0, 0,
|
|
// 0 };
|
|
/* R^2 = 2^1152 mod p, used to convert values to Montgomery form. */
|
|
uint32_t R2[kCASPER_ECC_P521_wordlen] = {0, 0, 0, 0x4000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
|
|
|
uint32_t *T1, *T2, *T3;
|
|
T1 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 0U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
|
|
T2 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 1U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
|
|
T3 = &CASPER_MEM[(20U * (uint32_t)kCASPER_ECC_P521_wordlen + 80U) + 2U * ((uint32_t)kCASPER_ECC_P521_wordlen + 4U)];
|
|
|
|
CASPER_MEMCPY(T1, R2, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
|
|
CASPER_MEMCPY(T2, A, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
|
|
|
|
multiply_casper(T3, T2, T1);
|
|
CASPER_MEMCPY(C, T3, (uint32_t)kCASPER_ECC_P521_wordlen * sizeof(uint32_t));
|
|
}
|
|
|
|
/*!
 * @brief Constant-time Montgomery CIOS multiplication specialized for P-521.
 *
 * Computes w_out = a * b * R^(-1) mod Nmod in one limb-by-limb CIOS pass on
 * the CASPER accelerator, followed by a constant-time conditional final
 * subtraction (casper_select). When a == NULL the multiply phase is skipped
 * and the routine only Montgomery-reduces the value already in w_out.
 *
 * The per-iteration reduction exploits the special shape p = 2^521 - 1: it is
 * done with two shifts, a limb rotation and one accelerator 128-bit addition
 * instead of the generic m = c * Np mod 2^64 Montgomery step. As a result the
 * Np parameter is never referenced by this implementation; it is kept for
 * signature compatibility with the generic CIOS routines.
 *
 * @param w_out Result buffer (also the input when reducing only); the code
 *              writes up to and including limb index N_wordlen + 1.
 * @param a     First operand (CASPER memory), or NULL for "reduce only" mode.
 * @param b     Second operand (CASPER memory).
 * @param Nmod  Modulus (the P-521 prime) in CASPER memory.
 * @param Np    -N^(-1) mod 2^64; unused here (see above).
 */
static void MultprecCiosMul521_ct(
    uint32_t w_out[], const uint32_t a[], const uint32_t b[], const uint32_t Nmod[], const uint32_t *Np)
{
    uint32_t j;
    uint64_t carry;
    uint64_t *a64, *b64, *w64;

    /* T1 sits at the base of CASPER memory; it keeps the pre-subtraction value
     * for the constant-time select at the end. */
    uint32_t *T1 = &CASPER_MEM[0], borrow;

    /* The accelerator works on 64-bit limbs; alias the 32-bit buffers. */
    a64 = (uint64_t *)(uintptr_t)a;
    b64 = (uint64_t *)(uintptr_t)b;
    w64 = (uint64_t *)(uintptr_t)w_out;

    if (a != NULL)
    { /* if !a, we are reducing only */
        PreZeroW(j, w_out);
    }
    /* Clear the two overflow limbs above the N_dwordlen-limb accumulator. */
    SET_DWORD(&w64[N_dwordlen], 0ULL);
    SET_DWORD(&w64[N_dwordlen + 1U], 0ULL);
    /* with accelerator */

    /* loop j and then reduce after each j round */
    for (j = 0; j < N_dwordlen; j++)
    {
        /* Step 3. Iterate over N words of u using j - perform Multiply-accumulate */
        /* push-pull: we do a*b and then separately m*n (reduce) */
        if (a != NULL)
        { /* if mul&reduce vs. reduce only */
            /* Snapshot the top limb so the post-accumulate carry-out can be detected. */
            carry = GET_DWORD(&w64[N_dwordlen]);
            Accel_SetABCD_Addr(CA_MK_OFF(&b64[j]), CA_MK_OFF(a64));
            Accel_crypto_mul(
                Accel_IterOpcodeResaddr(N_dwordlen - 1U, (uint32_t)kCASPER_OpMul6464FullSum, CA_MK_OFF(w64)));
            Accel_done();
            /* max carry is contained since ~0*~0=0xFFFE0001+0xFFFF=0xFFFF0000, */
            /* so max carry is 0xFFFF and 0xFFFF0000+0xFFFF=0xFFFFFFFF */
            /* accel took care of w_out[N_wordlen] & +1, so we just take care of the next double word if carry=1 */
            /* w64[N_dwordlen+1] = g_carry; */
            /* Wrap-around of the top limb means a carry propagated out of it. */
            carry = (uint64_t)(GET_DWORD(&w64[N_dwordlen]) < carry);
            SET_DWORD(&w64[N_dwordlen + 1U], carry);
        }

        /* Fast reduction using only shifts for this special shape:
         * (c - (-p^-1*c mod 2^64) * p)/2^64 =
         * (c - c_0 * p)/2^64 =
         * (\sum_{j=0}^9 c_i*2^64 - c_0 * p)/2^64 =
         * (\sum_{j=0}^9 c_i*2^64 - c_0 * (2^521-1))/2^64 =
         * (\sum_{j=0}^9 c_i*2^64 - c_0 * 2^521 - c_0)/2^64 =
         * c_1 + c_2*2^64 + c_3*2^128 + c_4*2^192 + c_5*2^256 + c_6*2^320 + c_7*2^384 + c_8*2^448 + c_9*2^512 + c_0 *
         * 2^{448 + 9} so one only needs to compute this 128-bit addition: [c_8, c_9] + c_0 * 2^9
         */

        uint64_t *p64 = (uint64_t *)(uintptr_t)T1;

        /* p64 = c_0 * 2^9 as a 128-bit value split across two limbs. */
        /* p64[0] = w64[0] << 9;*/
        SET_DWORD(&p64[0], GET_DWORD(&w64[0]) << 9U);
        /* p64[1] = w64[0] >> (64 - 9); */
        SET_DWORD(&p64[1], GET_DWORD(&w64[0]) >> (64 - 9));
        /* Divide by 2^64: shift the remaining limbs down by one position. */
        /* w64[0] = w64[1]; */
        SET_DWORD(&w64[0], GET_DWORD(&w64[1]));
        /* w64[1] = w64[2]; */
        SET_DWORD(&w64[1], GET_DWORD(&w64[2]));
        /* w64[2] = w64[3]; */
        SET_DWORD(&w64[2], GET_DWORD(&w64[3]));
        /* w64[3] = w64[4]; */
        SET_DWORD(&w64[3], GET_DWORD(&w64[4]));
        /* w64[4] = w64[5]; */
        SET_DWORD(&w64[4], GET_DWORD(&w64[5]));
        /* w64[5] = w64[6]; */
        SET_DWORD(&w64[5], GET_DWORD(&w64[6]));
        /* w64[6] = w64[7]; */
        SET_DWORD(&w64[6], GET_DWORD(&w64[7]));

        /* Compute p64 = p64 + {w64[8], w64[9]} using one additonal double-length limb,
         * where p64 = w64[0] * 2^9.
         */
        Accel_SetABCD_Addr(CA_MK_OFF(&w64[8]), 0);
        Accel_crypto_mul(Accel_IterOpcodeResaddr(2, (uint32_t)kCASPER_OpAdd64, /* kCASPER_OpAdd64, */
                                                 CA_MK_OFF(p64)));
        Accel_done();

        /* Write the 128-bit sum back as the new top limbs and clear the guard. */
        /* w64[7] = p64[0]; */
        SET_DWORD(&w64[7], GET_DWORD(&p64[0]));
        /* w64[8] = p64[1]; */
        SET_DWORD(&w64[8], GET_DWORD(&p64[1]));
        /* w64[9] = 0; */
        SET_DWORD(&w64[9], (uint64_t)0U);
    }

    /* memcpy(T1, w_out, (NUM_LIMBS + 1) * sizeof(uint32_t)); */
    /* now check if need to subtract Nmod */
    CASPER_MEMCPY_I2I(T1, w_out, (N_wordlen + 1U) * sizeof(uint32_t));

    /* Compute w = w - N */
    Accel_SetABCD_Addr(CA_MK_OFF(Nmod), 0);
    Accel_crypto_mul(Accel_IterOpcodeResaddr(N_dwordlen, (uint32_t)kCASPER_OpSub64, CA_MK_OFF(w_out)));
    Accel_done();

    /* if w_out > T1 then there was a borrow */
    /* borrow = (((uint32_t*)w_out)[NUM_LIMBS] > T1[NUM_LIMBS]); */
    borrow = (uint32_t)(GET_WORD(&((uint32_t *)w_out)[N_wordlen]) > GET_WORD(&T1[N_wordlen]));
    SET_WORD(&w_out[N_wordlen + 1U], 0);
    SET_WORD(&w_out[N_wordlen], 0);
    /* w_out[NUM_LIMBS + 1] = 0; */
    /* w_out[NUM_LIMBS] = 0; */
    /* Constant-time select: keep w - N if no borrow, else restore the saved w. */
    casper_select(w_out, w_out, T1, (int32_t)borrow, (int32_t)N_wordlen);
}
|
|
|
|
#if defined(__GNUC__)
|
|
/* End of enforcing O1 optimize level for gcc*/
|
|
#pragma GCC pop_options
|
|
#endif
|
|
|
|
#if (defined(__CC_ARM) || defined(__ARMCC_VERSION))
|
|
// End of enforcing optimize off for clang
|
|
#pragma clang optimize on
|
|
#endif
|