/* sp.c
 *
 * Copyright (C) 2006-2021 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

/* Implementation by Sean Parkinson. */

#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfssl/wolfcrypt/settings.h>

#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) || \
    defined(WOLFSSL_HAVE_SP_ECC)

#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/cpuid.h>
#ifdef NO_INLINE
    #include <wolfssl/wolfcrypt/misc.h>
#else
    #define WOLFSSL_MISC_INCLUDED
    #include <wolfcrypt/src/misc.c>
#endif

#ifdef RSA_LOW_MEM
#ifndef WOLFSSL_SP_SMALL
#define WOLFSSL_SP_SMALL
#endif
#endif

#include <wolfssl/wolfcrypt/sp.h>

#ifdef WOLFSSL_SP_ARM32_ASM
/* Debug helper: write a multi-word number to stderr as "<name>=0x<words>\n".
 *
 * Words are printed from index ((bits + 31) / 32) - 1 down to index 0, each
 * formatted with SP_PRINT_FMT (defined outside this section; presumably the
 * printf conversion for one sp_digit).
 *
 * var    Array of words to print.
 * name   String literal; it is concatenated with the "=0x" literal.
 * total  Unused by this implementation.
 * words  Unused by this implementation.
 * bits   Number of bits to print; rounded up to a whole number of 32-bit
 *        words. Parenthesized in the expansion so that any expression may be
 *        passed.
 */
#define SP_PRINT_NUM(var, name, total, words, bits)         \
    do {                                                    \
        int ii;                                             \
        fprintf(stderr, name "=0x");                        \
        for (ii = (((bits) + 31) / 32) - 1; ii >= 0; ii--)  \
            fprintf(stderr, SP_PRINT_FMT, (var)[ii]);       \
        fprintf(stderr, "\n");                              \
    } while (0)

/* Debug helper: write a single value to stderr as "<name>=0x<value>\n".
 *
 * var   Value to print; formatted with SP_PRINT_FMT (defined outside this
 *       section).
 * name  String literal; it is concatenated with the format string.
 */
#define SP_PRINT_VAL(var, name)                             \
    fprintf(stderr, name "=0x" SP_PRINT_FMT "\n", var)

/* Debug helper: write an integer to stderr as "<name>=<value>\n".
 *
 * var   Value to print; formatted with "%d".
 * name  String literal; it is concatenated with the format string.
 */
#define SP_PRINT_INT(var, name)                             \
    fprintf(stderr, name "=%d\n", var)

#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)
#ifndef WOLFSSL_SP_NO_2048
/* Read big endian unsigned byte array into r.
 *
 * The last byte of a becomes the least significant byte of r[0]. Words of r
 * not covered by the input are set to zero. At most size words are written;
 * if the input is longer than that, only its least significant size words
 * are kept.
 *
 * r     A single precision integer (array of at least size words).
 * size  Number of sp_digit words available in r.
 * a     Byte array.
 * n     Number of bytes in array to read.
 */
static void sp_2048_from_bin(sp_digit* r, int size, const byte* a, int n)
{
    int i;
    int j = 0;

    /* Whole words: take four bytes at a time, starting from the end of the
     * array (least significant bytes). Stop once r is full. */
    for (i = n - 1; (i >= 3) && (j < size); i -= 4) {
        r[j]  = ((sp_digit)a[i - 0] <<  0) |
                ((sp_digit)a[i - 1] <<  8) |
                ((sp_digit)a[i - 2] << 16) |
                ((sp_digit)a[i - 3] << 24);
        j++;
    }

    /* Partial top word: the i + 1 (one to three) leading bytes a[0..i].
     * Built with shifts so the result does not depend on host byte order. */
    if ((i >= 0) && (i < 3) && (j < size)) {
        sp_digit top = 0;
        int k;

        for (k = 0; k <= i; k++) {
            top = (top << 8) | (sp_digit)a[k];
        }
        r[j] = top;
        j++;
    }

    /* Zero the remaining, more significant, words. */
    for (; j < size; j++) {
        r[j] = 0;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * The mp_int digits (DIGIT_BIT bits each) are repacked into 32-bit words.
 * At most size words of r are written; words beyond the converted value are
 * set to zero.
 *
 * r     A single precision integer (array of at least size words).
 * size  Number of sp_digit words available in r.
 * a     A multi-precision integer.
 */
static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a)
{
#if DIGIT_BIT == 32
    /* Digits already have the word size: copy them directly. */
    int j;
    int used = a->used;

    /* Never copy more words than r can hold; the other DIGIT_BIT variants
     * below stop at size as well. */
    if (used > size) {
        used = size;
    }
    XMEMCPY(r, a->dp, sizeof(sp_digit) * used);

    /* Zero the words above the copied digits. */
    for (j = used; j < size; j++) {
        r[j] = 0;
    }
#elif DIGIT_BIT > 32
    /* Each mp digit is wider than a word: spread it over several words. */
    int i;
    int j = 0;
    word32 s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < size; i++) {
        /* Merge the low bits of this digit into the current word. */
        r[j] |= ((sp_digit)a->dp[i] << s);
        r[j] &= 0xffffffff;
        s = 32U - s;
        if (j + 1 >= size) {
            break;
        }
        /* lint allow cast of mismatch word32 and mp_digit */
        r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
        /* Emit further whole words while the digit still has 32 bits left. */
        while ((s + 32U) <= (word32)DIGIT_BIT) {
            s += 32U;
            r[j] &= 0xffffffff;
            if (j + 1 >= size) {
                break;
            }
            if (s < (word32)DIGIT_BIT) {
                /* lint allow cast of mismatch word32 and mp_digit */
                r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
            }
            else {
                r[++j] = (sp_digit)0;
            }
        }
        /* Number of bits of this digit held in the current word. */
        s = (word32)DIGIT_BIT - s;
    }

    /* Zero the words above the last one written. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#else
    /* Each mp digit is narrower than a word: pack several into each word. */
    int i;
    int j = 0;
    int s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 32) {
            /* The current word is complete; move to the next one. */
            r[j] &= 0xffffffff;
            if (j + 1 >= size) {
                break;
            }
            s = 32 - s;
            if (s == DIGIT_BIT) {
                /* Digit ended exactly on the word boundary. */
                r[++j] = 0;
                s = 0;
            }
            else {
                /* Carry the digit's remaining high bits into the next word. */
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else {
            s += DIGIT_BIT;
        }
    }

    /* Zero the words above the last one written. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#endif
}

/* Serialize r into a as a big endian byte string.
 * Always writes exactly 256 bytes (64 words of 4 bytes each).
 *
 * r  A single precision integer (64 words, r[0] least significant).
 * a  Byte array receiving the output.
 */
static void sp_2048_to_bin_64(sp_digit* r, byte* a)
{
    byte* out = a;
    int w;

    /* Walk the words from most significant (index 63) down to index 0 and
     * emit each one high byte first. */
    for (w = 64; w-- > 0; ) {
        sp_digit v = r[w];

        out[0] = (byte)(v >> 24);
        out[1] = (byte)(v >> 16);
        out[2] = (byte)(v >> 8);
        out[3] = (byte)(v >> 0);
        out += 4;
    }
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH)
/* Normalize the values in each word to 32 bits.
 * The macro expands to nothing in this implementation.
 *
 * a  Array of sp_digit to normalize.
 */
#define sp_2048_norm_64(a)

#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */
/* Normalize the values in each word to 32 bits.
 * The macro expands to nothing in this implementation.
 *
 * a  Array of sp_digit to normalize.
 */
#define sp_2048_norm_64(a)

#ifndef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 */
static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        "sub	sp, sp, #32\n\t"
        "mov	r10, #0\n\t"
        /* A[0] * B[0] */
        "ldr	r11, [%[a]]\n\t"
        "ldr	r12, [%[b]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r3, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r3, r3, #16\n\t"
        "mul	r3, r6, r3\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r4, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "add	r4, r4, r7\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adc	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
#else
        "umull	r3, r4, r11, r12\n\t"
        "mov	r5, #0\n\t"
#endif
        "str	r3, [sp]\n\t"
        /* A[0] * B[1] */
        "ldr	r9, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[1] * B[0] */
        "ldr	r8, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [sp, #4]\n\t"
        /* A[2] * B[0] */
        "ldr	r8, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[1] * B[1] */
        "ldr	r11, [%[a], #4]\n\t"
        "ldr	r12, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[0] * B[2] */
        "ldr	r8, [%[a]]\n\t"
        "ldr	r9, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [sp, #8]\n\t"
        /* A[0] * B[3] */
        "ldr	r9, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[1] * B[2] */
        "ldr	r9, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[2] * B[1] */
        "ldr	r8, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[3] * B[0] */
        "ldr	r8, [%[a], #12]\n\t"
        "ldr	r9, [%[b]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [sp, #12]\n\t"
        /* A[4] * B[0] */
        "ldr	r8, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[3] * B[1] */
        "ldr	r8, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[2] * B[2] */
        "ldr	r11, [%[a], #8]\n\t"
        "ldr	r12, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[1] * B[3] */
        "ldr	r8, [%[a], #4]\n\t"
        "ldr	r9, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[0] * B[4] */
        "ldr	r8, [%[a]]\n\t"
        "ldr	r9, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [sp, #16]\n\t"
        /* A[0] * B[5] */
        "ldr	r9, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[1] * B[4] */
        "ldr	r8, [%[a], #4]\n\t"
        "ldr	r9, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[2] * B[3] */
        "ldr	r9, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[3] * B[2] */
        "ldr	r8, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[4] * B[1] */
        "ldr	r8, [%[a], #16]\n\t"
        "ldr	r9, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[5] * B[0] */
        "ldr	r8, [%[a], #20]\n\t"
        "ldr	r9, [%[b]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [sp, #20]\n\t"
        /* A[6] * B[0] */
        "ldr	r8, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[5] * B[1] */
        "ldr	r8, [%[a], #20]\n\t"
        "ldr	r9, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[4] * B[2] */
        "ldr	r8, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[3] * B[3] */
        "ldr	r11, [%[a], #12]\n\t"
        "ldr	r12, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[2] * B[4] */
        "ldr	r8, [%[a], #8]\n\t"
        "ldr	r9, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[1] * B[5] */
        "ldr	r8, [%[a], #4]\n\t"
        "ldr	r9, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[0] * B[6] */
        "ldr	r8, [%[a]]\n\t"
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [sp, #24]\n\t"
        /* A[0] * B[7] */
        "ldr	r9, [%[b], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[1] * B[6] */
        "ldr	r8, [%[a], #4]\n\t"
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[2] * B[5] */
        "ldr	r8, [%[a], #8]\n\t"
        "ldr	r9, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[3] * B[4] */
        "ldr	r9, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[4] * B[3] */
        "ldr	r8, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[5] * B[2] */
        "ldr	r8, [%[a], #20]\n\t"
        "ldr	r9, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[6] * B[1] */
        "ldr	r8, [%[a], #24]\n\t"
        "ldr	r9, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[7] * B[0] */
        "ldr	r8, [%[a], #28]\n\t"
        "ldr	r9, [%[b]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [sp, #28]\n\t"
        /* A[7] * B[1] */
        "ldr	r9, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[6] * B[2] */
        "ldr	r8, [%[a], #24]\n\t"
        "ldr	r9, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[5] * B[3] */
        "ldr	r8, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[4] * B[4] */
        "ldr	r11, [%[a], #16]\n\t"
        "ldr	r12, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[3] * B[5] */
        "ldr	r8, [%[a], #12]\n\t"
        "ldr	r9, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[2] * B[6] */
        "ldr	r8, [%[a], #8]\n\t"
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[1] * B[7] */
        "ldr	r8, [%[a], #4]\n\t"
        "ldr	r9, [%[b], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r], #32]\n\t"
        /* A[2] * B[7] */
        "ldr	r8, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[3] * B[6] */
        "ldr	r8, [%[a], #12]\n\t"
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[4] * B[5] */
        "ldr	r9, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[5] * B[4] */
        "ldr	r8, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[6] * B[3] */
        "ldr	r8, [%[a], #24]\n\t"
        "ldr	r9, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[7] * B[2] */
        "ldr	r8, [%[a], #28]\n\t"
        "ldr	r9, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r], #36]\n\t"
        /* A[7] * B[3] */
        "ldr	r9, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[6] * B[4] */
        "ldr	r8, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[5] * B[5] */
        "ldr	r11, [%[a], #20]\n\t"
        "ldr	r12, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[4] * B[6] */
        "ldr	r8, [%[a], #16]\n\t"
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[3] * B[7] */
        "ldr	r8, [%[a], #12]\n\t"
        "ldr	r9, [%[b], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r], #40]\n\t"
        /* A[4] * B[7] */
        "ldr	r8, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[5] * B[6] */
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[6] * B[5] */
        "ldr	r8, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[7] * B[4] */
        "ldr	r8, [%[a], #28]\n\t"
        "ldr	r9, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r], #44]\n\t"
        /* A[7] * B[5] */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[6] * B[6] */
        "ldr	r11, [%[a], #24]\n\t"
        "ldr	r12, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[5] * B[7] */
        "ldr	r8, [%[a], #20]\n\t"
        "ldr	r9, [%[b], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r], #48]\n\t"
        /* A[6] * B[7] */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[7] * B[6] */
        "ldr	r8, [%[a], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r], #52]\n\t"
        /* A[7] * B[7] */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adc	r3, r3, r7\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "add	r3, r3, r7\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adc	r3, r3, r7\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adc	r3, r3, r7\n\t"
#endif
        "str	r5, [%[r], #56]\n\t"
        "str	r3, [%[r], #60]\n\t"
        "ldm	sp!, {r3, r4, r5, r6}\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        "ldm	sp!, {r3, r4, r5, r6}\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12"
    );
}

/* Add b to a into r. (r = a + b)
 *
 * Adds two 8-word numbers, starting at the lowest address and propagating
 * the carry through all eight words.
 *
 * r  A single precision integer. Receives the 8-word sum.
 * a  A single precision integer. 8 words are read.
 * b  A single precision integer. 8 words are read.
 * returns the carry out of the last (highest addressed) word: 0 or 1.
 */
static sp_digit sp_2048_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r12 stays zero so the final carry can be extracted with adc. */
        "mov	r12, #0\n\t"
        /* Words 0..3: adds starts the carry chain. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4..7: adcs continues the carry from the previous group (the
         * load/store-multiple instructions do not modify the flags). */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r = 0 + 0 + carry: the result pointer register is reused to hold
         * the carry out. */
        "adc	%[r], r12, r12\n\t"
        /* All three operands are read-write: the asm advances the pointers. */
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12"
    );
    /* r no longer holds a pointer here; it carries the 0/1 carry value. */
    return (uint32_t)(size_t)r;
}

/* Sub b from a into a. (a -= b)
 *
 * Subtracts a 16-word number from another in place, starting at the lowest
 * address and propagating the borrow through all sixteen words.
 *
 * a  A single precision integer and result. 16 words are read and written.
 * b  A single precision integer. 16 words are read.
 * returns 0 when the subtraction did not borrow out of the last word, and
 *         all ones (-1 as a two's complement value) when it did.
 */
static sp_digit sp_2048_sub_in_place_16(sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Words 0..3: a is loaded without writeback so the following stm
         * writes the result back to the same location and then advances. */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        /* subs starts the borrow chain; sbcs continues it. */
        "subs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 4..7 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 8..11 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 12..15 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* r9 - r9 - borrow: the value in r9 is irrelevant, only the final
         * borrow survives, giving 0 or all ones in the reused a register. */
        "sbc	%[a], r9, r9\n\t"
        /* Both operands are read-write: the asm advances the pointers. */
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
    );
    /* a no longer holds a pointer here; it carries the borrow mask. */
    return (uint32_t)(size_t)a;
}

/* Add b to a into r. (r = a + b)
 *
 * Adds two 16-word numbers, starting at the lowest address and propagating
 * the carry through all sixteen words.
 *
 * r  A single precision integer. Receives the 16-word sum.
 * a  A single precision integer. 16 words are read.
 * b  A single precision integer. 16 words are read.
 * returns the carry out of the last (highest addressed) word: 0 or 1.
 */
static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r12 stays zero so the final carry can be extracted with adc. */
        "mov	r12, #0\n\t"
        /* Words 0..3: adds starts the carry chain. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4..7: adcs continues the carry from the previous group. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8..11 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 12..15 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r = 0 + 0 + carry: the result pointer register is reused to hold
         * the carry out. */
        "adc	%[r], r12, r12\n\t"
        /* All three operands are read-write: the asm advances the pointers. */
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12"
    );
    /* r no longer holds a pointer here; it carries the 0/1 carry value. */
    return (uint32_t)(size_t)r;
}

/* Apply the mask m to every one of the 8 words of a, writing to r.
 *
 * Each output word is the bitwise AND of the matching input word and m.
 * Words are processed in ascending index order.
 *
 * r  A single precision integer. Receives the 8 masked words.
 * a  A single precision integer. 8 words are read.
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_8(sp_digit* r, const sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    /* Compact form: one word per iteration. */
    int idx = 0;

    while (idx < 8) {
        r[idx] = m & a[idx];
        idx++;
    }
#else
    /* Unrolled form: all eight words written out explicitly. */
    r[0] = m & a[0];
    r[1] = m & a[1];
    r[2] = m & a[2];
    r[3] = m & a[3];
    r[4] = m & a[4];
    r[5] = m & a[5];
    r[6] = m & a[6];
    r[7] = m & a[7];
#endif
}

/* Multiply a and b into r. (r = a * b)
 *
 * One level of Karatsuba: the 16-word operands are split into 8-word halves
 * and three 8x8 word products are combined:
 *   low  = a_lo * b_lo                      (stored in r[0..15])
 *   high = a_hi * b_hi                      (stored in r[16..31])
 *   mid  = (a_lo + a_hi) * (b_lo + b_hi) - low - high
 * The half sums are kept in 8 words; their carry-outs are accounted for by
 * conditionally (via masking) adding the other half sum into the upper part
 * of mid, and by the running value 'top'.
 *
 * r  A single precision integer. Receives 32 words.
 * a  A single precision integer. 16 words are read.
 * b  A single precision integer. 16 words are read.
 */
SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit  mid[16];
    sp_digit  sum_a[8];
    sp_digit  sum_b[8];
    sp_digit* low  = &r[0];
    sp_digit* high = &r[16];
    sp_digit  carry_a;
    sp_digit  carry_b;
    sp_digit  top;

    /* Half sums and their carry-outs. */
    carry_a = sp_2048_add_8(sum_a, &a[0], &a[8]);
    carry_b = sp_2048_add_8(sum_b, &b[0], &b[8]);
    top = carry_a & carry_b;

    /* The three half-size products. */
    sp_2048_mul_8(high, &a[8], &b[8]);
    sp_2048_mul_8(low, a, b);
    sp_2048_mul_8(mid, sum_a, sum_b);

    /* mid -= low; mid -= high. Each call's return value is folded into
     * top. */
    top += sp_2048_sub_in_place_16(mid, low);
    top += sp_2048_sub_in_place_16(mid, high);

    /* Add sum_a to the upper half of mid only when sum_b carried, and
     * vice versa; the mask is all ones or zero. */
    sp_2048_mask_8(sum_a, sum_a, 0 - carry_b);
    top += sp_2048_add_8(&mid[8], &mid[8], sum_a);
    sp_2048_mask_8(sum_b, sum_b, 0 - carry_a);
    top += sp_2048_add_8(&mid[8], &mid[8], sum_b);

    /* Fold the middle term into the result at an offset of 8 words. */
    top += sp_2048_add_16(&r[8], &r[8], mid);

    /* Propagate the accumulated top word into r[24..31], reusing sum_a as
     * an 8-word number whose only non-zero word is the lowest. */
    sum_a[0] = top;
    XMEMSET(&sum_a[1], 0, sizeof(sum_a) - sizeof(sum_a[0]));
    (void)sp_2048_add_8(&r[24], &r[24], sum_a);
}

/* Sub b from a into a. (a -= b)
 *
 * Subtracts a 32-word number from another in place, starting at the lowest
 * address and propagating the borrow through all thirty-two words.
 *
 * a  A single precision integer and result. 32 words are read and written.
 * b  A single precision integer. 32 words are read.
 * returns 0 when the subtraction did not borrow out of the last word, and
 *         all ones (-1 as a two's complement value) when it did.
 */
static sp_digit sp_2048_sub_in_place_32(sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Words 0..3: a is loaded without writeback so the following stm
         * writes the result back to the same location and then advances. */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        /* subs starts the borrow chain; sbcs continues it. */
        "subs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 4..7 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 8..11 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 12..15 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 16..19 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 20..23 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 24..27 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 28..31 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* r9 - r9 - borrow: the value in r9 is irrelevant, only the final
         * borrow survives, giving 0 or all ones in the reused a register. */
        "sbc	%[a], r9, r9\n\t"
        /* Both operands are read-write: the asm advances the pointers. */
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
    );
    /* a no longer holds a pointer here; it carries the borrow mask. */
    return (uint32_t)(size_t)a;
}

/* Add b to a into r. (r = a + b)
 *
 * Adds two 32-word numbers, starting at the lowest address and propagating
 * the carry through all thirty-two words.
 *
 * r  A single precision integer. Receives the 32-word sum.
 * a  A single precision integer. 32 words are read.
 * b  A single precision integer. 32 words are read.
 * returns the carry out of the last (highest addressed) word: 0 or 1.
 */
static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r12 stays zero so the final carry can be extracted with adc. */
        "mov	r12, #0\n\t"
        /* Words 0..3: adds starts the carry chain. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4..7: adcs continues the carry from the previous group. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8..11 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 12..15 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 16..19 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 20..23 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 24..27 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 28..31 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r = 0 + 0 + carry: the result pointer register is reused to hold
         * the carry out. */
        "adc	%[r], r12, r12\n\t"
        /* All three operands are read-write: the asm advances the pointers. */
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12"
    );
    /* r no longer holds a pointer here; it carries the 0/1 carry value. */
    return (uint32_t)(size_t)r;
}

/* Apply the mask m to every one of the 16 words of a, writing to r.
 *
 * Each output word is the bitwise AND of the matching input word and m.
 * Words are processed in ascending index order.
 *
 * r  A single precision integer. Receives the 16 masked words.
 * a  A single precision integer. 16 words are read.
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    /* Compact form: one word per iteration. */
    int idx = 0;

    while (idx < 16) {
        r[idx] = m & a[idx];
        idx++;
    }
#else
    /* Partially unrolled form: two passes of eight words each. */
    int base = 0;

    do {
        sp_digit* out = &r[base];
        const sp_digit* in = &a[base];

        out[0] = m & in[0];
        out[1] = m & in[1];
        out[2] = m & in[2];
        out[3] = m & in[3];
        out[4] = m & in[4];
        out[5] = m & in[5];
        out[6] = m & in[6];
        out[7] = m & in[7];
        base += 8;
    } while (base < 16);
#endif
}

/* Multiply a and b into r. (r = a * b)
 *
 * One level of Karatsuba: the 32-word operands are split into 16-word halves
 * and three 16x16 word products are combined:
 *   low  = a_lo * b_lo                      (stored in r[0..31])
 *   high = a_hi * b_hi                      (stored in r[32..63])
 *   mid  = (a_lo + a_hi) * (b_lo + b_hi) - low - high
 * The half sums are kept in 16 words; their carry-outs are accounted for by
 * conditionally (via masking) adding the other half sum into the upper part
 * of mid, and by the running value 'top'.
 *
 * r  A single precision integer. Receives 64 words.
 * a  A single precision integer. 32 words are read.
 * b  A single precision integer. 32 words are read.
 */
SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit  mid[32];
    sp_digit  sum_a[16];
    sp_digit  sum_b[16];
    sp_digit* low  = &r[0];
    sp_digit* high = &r[32];
    sp_digit  carry_a;
    sp_digit  carry_b;
    sp_digit  top;

    /* Half sums and their carry-outs. */
    carry_a = sp_2048_add_16(sum_a, &a[0], &a[16]);
    carry_b = sp_2048_add_16(sum_b, &b[0], &b[16]);
    top = carry_a & carry_b;

    /* The three half-size products. */
    sp_2048_mul_16(high, &a[16], &b[16]);
    sp_2048_mul_16(low, a, b);
    sp_2048_mul_16(mid, sum_a, sum_b);

    /* mid -= low; mid -= high. Each call's return value is folded into
     * top. */
    top += sp_2048_sub_in_place_32(mid, low);
    top += sp_2048_sub_in_place_32(mid, high);

    /* Add sum_a to the upper half of mid only when sum_b carried, and
     * vice versa; the mask is all ones or zero. */
    sp_2048_mask_16(sum_a, sum_a, 0 - carry_b);
    top += sp_2048_add_16(&mid[16], &mid[16], sum_a);
    sp_2048_mask_16(sum_b, sum_b, 0 - carry_a);
    top += sp_2048_add_16(&mid[16], &mid[16], sum_b);

    /* Fold the middle term into the result at an offset of 16 words. */
    top += sp_2048_add_32(&r[16], &r[16], mid);

    /* Propagate the accumulated top word into r[48..63], reusing sum_a as
     * a 16-word number whose only non-zero word is the lowest. */
    sum_a[0] = top;
    XMEMSET(&sum_a[1], 0, sizeof(sum_a) - sizeof(sum_a[0]));
    (void)sp_2048_add_16(&r[48], &r[48], sum_a);
}

/* Sub b from a into a. (a -= b)
 *
 * Subtracts a 64-word number from another in place, starting at the lowest
 * address and propagating the borrow through all sixty-four words.
 *
 * a  A single precision integer and result. 64 words are read and written.
 * b  A single precision integer. 64 words are read.
 * returns 0 when the subtraction did not borrow out of the last word, and
 *         all ones (-1 as a two's complement value) when it did.
 */
static sp_digit sp_2048_sub_in_place_64(sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Words 0..3: a is loaded without writeback so the following stm
         * writes the result back to the same location and then advances. */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        /* subs starts the borrow chain; sbcs continues it. */
        "subs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 4..7 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 8..11 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 12..15 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 16..19 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 20..23 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 24..27 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 28..31 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 32..35 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 36..39 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 40..43 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 44..47 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 48..51 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 52..55 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 56..59 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 60..63 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* r9 - r9 - borrow: the value in r9 is irrelevant, only the final
         * borrow survives, giving 0 or all ones in the reused a register. */
        "sbc	%[a], r9, r9\n\t"
        /* Both operands are read-write: the asm advances the pointers. */
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
    );
    /* a no longer holds a pointer here; it carries the borrow mask. */
    return (uint32_t)(size_t)a;
}

/* Add b to a into r. (r = a + b)
 *
 * Fully unrolled for 64 digits: 16 groups of four digits are loaded, added
 * and stored, with the carry flag chained from one group to the next.
 *
 * r  A single precision integer. Receives the 64-digit sum.
 * a  A single precision integer. 64 digits are read.
 * b  A single precision integer. 64 digits are read.
 * returns the carry out of the most significant digit (0 or 1).
 */
static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r12 = 0: used at the very end to turn the carry flag into a value. */
        "mov	r12, #0\n\t"
        /* Digits 0-3: 'adds' starts a fresh carry chain. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 4-7: from here on every add is 'adcs', so the carry left by
         * the previous group is consumed and a new one is produced. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 8-11 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 12-15 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 16-19 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 20-23 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 24-27 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 28-31 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 32-35 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 36-39 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 40-43 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 44-47 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 48-51 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 52-55 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 56-59 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 60-63 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r = 0 + 0 + carry. The register that held the result pointer is
         * reused to carry the return value out of the asm block. */
        "adc	%[r], r12, r12\n\t"
        /* All three pointers are read-write operands: the '!' writeback on
         * ldm/stm advances them, and r is finally overwritten with the carry. */
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12"
    );
    /* r no longer holds an address here; it holds the carry produced above. */
    return (uint32_t)(size_t)r;
}

/* Mask a 32-digit number. (r[i] = a[i] & m for each of the 32 digits)
 *
 * r  A single precision integer. Receives the 32 masked digits; it may be
 *    the same buffer as a.
 * a  A single precision integer. 32 digits are read.
 * m  Mask that is ANDed with every digit.
 */
static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m)
{
    int j = 0;

#ifdef WOLFSSL_SP_SMALL
    /* Compact build: one digit per iteration. */
    while (j < 32) {
        r[j] = a[j] & m;
        j++;
    }
#else
    /* Default build: eight digits per iteration, lowest index first. */
    while (j < 32) {
        sp_digit* out = r + j;
        const sp_digit* in = a + j;

        out[0] = in[0] & m;
        out[1] = in[1] & m;
        out[2] = in[2] & m;
        out[3] = in[3] & m;
        out[4] = in[4] & m;
        out[5] = in[5] & m;
        out[6] = in[6] & m;
        out[7] = in[7] & m;

        j += 8;
    }
#endif
}

/* Multiply a and b into r. (r = a * b)
 *
 * Each 64-digit operand is split into a low and a high 32-digit half. Three
 * 32-digit multiplications are performed (low * low, high * high and the
 * product of the two half sums) and then combined into the result.
 *
 * r  A single precision integer. Receives the product; the last step updates
 *    the digits starting at r + 96, so r is assumed to hold 128 digits.
 * a  A single precision integer. 64 digits are read.
 * b  A single precision integer. 64 digits are read.
 */
SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit  mid[64];
    sp_digit  sumA[32];
    sp_digit  sumB[32];
    sp_digit* lo = r;
    sp_digit* hi = r + 64;
    sp_digit  carryA;
    sp_digit  carryB;
    sp_digit  top;

    /* Half sums are taken before anything is written through r. */
    carryA = sp_2048_add_32(sumA, a, a + 32);
    carryB = sp_2048_add_32(sumB, b, b + 32);
    /* Both sums overflowing contributes one unit to the top accumulator. */
    top = carryA & carryB;

    /* High product first, then low product, then the middle term. */
    sp_2048_mul_32(hi, a + 32, b + 32);
    sp_2048_mul_32(lo, a, b);
    sp_2048_mul_32(mid, sumA, sumB);

    /* mid -= lo; mid -= hi; the returned borrows are folded into top. */
    top += sp_2048_sub_in_place_64(mid, lo);
    top += sp_2048_sub_in_place_64(mid, hi);
    /* Add the cross terms for the overflow bits of the half sums: sumA is
     * kept only when carryB is 1 and sumB only when carryA is 1, as
     * (0 - carry) is then an all-ones mask and zero otherwise. */
    sp_2048_mask_32(sumA, sumA, 0 - carryB);
    top += sp_2048_add_32(mid + 32, mid + 32, sumA);
    sp_2048_mask_32(sumB, sumB, 0 - carryA);
    top += sp_2048_add_32(mid + 32, mid + 32, sumB);

    /* Fold the middle term into the result at an offset of 32 digits. */
    top += sp_2048_add_64(r + 32, r + 32, mid);
    /* Reuse sumA as a 32-digit number holding just the accumulated carry and
     * add it into the top quarter of the result. */
    XMEMSET(sumA + 1, 0, sizeof(sp_digit) * (32 - 1));
    sumA[0] = top;
    (void)sp_2048_add_32(r + 96, r + 96, sumA);
}

/* Square a and put result in r. (r = a * a)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 */
static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
{
    __asm__ __volatile__ (
        "sub	sp, sp, #32\n\t"
        /* A[0] * A[0] */
        "ldr	r10, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r9, r10, #16\n\t"
        "lsl	r2, r10, #16\n\t"
        "lsr	r2, r2, #16\n\t"
        "mul	r8, r2, r2\n\t"
        "mul	r3, r9, r9\n\t"
        "mul	r2, r9, r2\n\t"
        "lsr	r9, r2, #15\n\t"
        "lsl	r2, r2, #17\n\t"
        "adds	r8, r8, r2\n\t"
        "adc	r3, r3, r9\n\t"
#else
        "umull	r8, r3, r10, r10\n\t"
#endif
        "mov	r4, #0\n\t"
        "str	r8, [sp]\n\t"
        /* A[0] * A[1] */
        "ldr	r10, [%[a], #4]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        "str	r3, [sp, #4]\n\t"
        /* A[0] * A[2] */
        "ldr	r10, [%[a], #8]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adcs	r2, r2, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adcs	r2, r2, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[1] * A[1] */
        "ldr	r10, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [sp, #8]\n\t"
        /* A[0] * A[3] */
        "ldr	r10, [%[a], #12]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[1] * A[2] */
        "ldr	r10, [%[a], #8]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r2, [sp, #12]\n\t"
        /* A[0] * A[4] */
        "ldr	r10, [%[a], #16]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        /* A[1] * A[3] */
        "ldr	r10, [%[a], #12]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        /* A[2] * A[2] */
        "ldr	r10, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        "str	r3, [sp, #16]\n\t"
        /* A[0] * A[5] */
        "ldr	r10, [%[a], #20]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r3, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[1] * A[4] */
        "ldr	r10, [%[a], #16]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[2] * A[3] */
        "ldr	r10, [%[a], #12]\n\t"
        "ldr	r12, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r4, r4, r5\n\t"
        "adcs	r2, r2, r6\n\t"
        "adc	r3, r3, r7\n\t"
        "str	r4, [sp, #20]\n\t"
        /* A[0] * A[6] */
        "ldr	r10, [%[a], #24]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r4, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[1] * A[5] */
        "ldr	r10, [%[a], #20]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[2] * A[4] */
        "ldr	r10, [%[a], #16]\n\t"
        "ldr	r12, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[3] * A[3] */
        "ldr	r10, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "adds	r2, r2, r5\n\t"
        "adcs	r3, r3, r6\n\t"
        "adc	r4, r4, r7\n\t"
        "str	r2, [sp, #24]\n\t"
        /* A[0] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r2, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[1] * A[6] */
        "ldr	r10, [%[a], #24]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[2] * A[5] */
        "ldr	r10, [%[a], #20]\n\t"
        "ldr	r12, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[3] * A[4] */
        "ldr	r10, [%[a], #16]\n\t"
        "ldr	r12, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r3, r3, r5\n\t"
        "adcs	r4, r4, r6\n\t"
        "adc	r2, r2, r7\n\t"
        "str	r3, [sp, #28]\n\t"
        /* A[1] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r3, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[2] * A[6] */
        "ldr	r10, [%[a], #24]\n\t"
        "ldr	r12, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[3] * A[5] */
        "ldr	r10, [%[a], #20]\n\t"
        "ldr	r12, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[4] * A[4] */
        "ldr	r10, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "adds	r4, r4, r5\n\t"
        "adcs	r2, r2, r6\n\t"
        "adc	r3, r3, r7\n\t"
        "str	r4, [%[r], #32]\n\t"
        /* A[2] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
        "ldr	r12, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r4, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[3] * A[6] */
        "ldr	r10, [%[a], #24]\n\t"
        "ldr	r12, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[4] * A[5] */
        "ldr	r10, [%[a], #20]\n\t"
        "ldr	r12, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r2, r2, r5\n\t"
        "adcs	r3, r3, r6\n\t"
        "adc	r4, r4, r7\n\t"
        "str	r2, [%[r], #36]\n\t"
        /* A[3] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
        "ldr	r12, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        /* A[4] * A[6] */
        "ldr	r10, [%[a], #24]\n\t"
        "ldr	r12, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        /* A[5] * A[5] */
        "ldr	r10, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        "str	r3, [%[r], #40]\n\t"
        /* A[4] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
        "ldr	r12, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adcs	r2, r2, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adcs	r2, r2, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[5] * A[6] */
        "ldr	r10, [%[a], #24]\n\t"
        "ldr	r12, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adcs	r2, r2, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adcs	r2, r2, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r], #44]\n\t"
        /* A[5] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
        "ldr	r12, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[6] * A[6] */
        "ldr	r10, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r2, [%[r], #48]\n\t"
        /* A[6] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
        "ldr	r12, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        "str	r3, [%[r], #52]\n\t"
        /* A[7] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r4, r4, r8\n\t"
        "adc	r2, r2, r9\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r4, r4, r8\n\t"
        "adc	r2, r2, r9\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r4, r4, r8\n\t"
        "adc	r2, r2, r9\n\t"
#endif
        "str	r4, [%[r], #56]\n\t"
        "str	r2, [%[r], #60]\n\t"
        "ldm	sp!, {r2, r3, r4, r8}\n\t"
        "stm	%[r]!, {r2, r3, r4, r8}\n\t"
        "ldm	sp!, {r2, r3, r4, r8}\n\t"
        "stm	%[r]!, {r2, r3, r4, r8}\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12"
    );
}

/* Sub b from a into r. (r = a - b)
 *
 * Subtracts two 8-word numbers with ARM32 inline assembly. The words are
 * processed four at a time and the borrow is carried between words in the
 * processor's carry flag.
 *
 * r  A single precision integer. Receives the 8-word difference.
 * a  A single precision integer. The 8-word value to subtract from.
 * b  A single precision integer. The 8-word value to subtract.
 * returns 0 when there is no borrow out of the top word and a value with all
 *         bits set when there is.
 */
static sp_digit sp_2048_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Words 0-3: SUBS starts the borrow chain, SBCS continues it. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "subs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4-7: LDM/STM leave the flags alone, so the borrow from the
         * previous group is still available to SBCS. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r6 - r6 - borrow: 0 without a borrow, all ones with one. The
         * register that held the r pointer is reused to pass the result out. */
        "sbc	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
    );
    /* r no longer holds an address here; it holds the borrow mask. */
    return (uint32_t)(size_t)r;
}

/* Square a and put result in r. (r = a * a)
 *
 * Splits the 16-word input into two 8-word halves and builds the 32-word
 * result from three 8-word squarings: low half, high half and the absolute
 * difference of the halves.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 */
SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a)
{
    sp_digit mid[16];
    sp_digit zeros[8];
    sp_digit* lo = r;
    sp_digit* hi = r + 16;
    sp_digit* diff = mid;
    sp_digit* minuend;
    sp_digit* subtrahend;
    sp_digit borrow;
    sp_digit adjust;
    sp_digit zerosAddr;
    sp_digit diffAddr;

    XMEMSET(zeros, 0, sizeof(zeros));

    /* diff = low half - high half; borrow is all ones when that went
     * negative and zero otherwise. */
    borrow = sp_2048_sub_8(diff, a, a + 8);
    /* Pick the operands of the second subtraction with masks rather than a
     * branch: no borrow gives (diff - 0), borrow gives (0 - diff). Either way
     * diff ends up holding the absolute difference. */
    zerosAddr = (sp_digit)zeros;
    diffAddr = (sp_digit)diff;
    minuend = (sp_digit*)((zerosAddr & borrow) | (diffAddr & (~borrow)));
    subtrahend = (sp_digit*)((zerosAddr & (~borrow)) | (diffAddr & borrow));
    (void)sp_2048_sub_8(diff, minuend, subtrahend);

    /* High half first, then low half, then the difference.
     * NOTE(review): the order is kept from the original; presumably it lets
     * r alias a without the low square overwriting a[8..15] - confirm. */
    sp_2048_sqr_8(hi, a + 8);
    sp_2048_sqr_8(lo, a);
    sp_2048_sqr_8(mid, diff);

    /* mid = mid - hi - lo, then r[8..] -= mid. The helper return values are
     * folded into adjust, which is then added into the top 8 words.
     * NOTE(review): assumes the sub_in_place helper returns 0 or all ones
     * like sp_2048_sub_8 - confirm against its definition. */
    adjust = 0;
    adjust -= sp_2048_sub_in_place_16(mid, hi);
    adjust -= sp_2048_sub_in_place_16(mid, lo);
    adjust += sp_2048_sub_in_place_16(r + 8, mid);
    zeros[0] = adjust;
    (void)sp_2048_add_8(r + 24, r + 24, zeros);
}

/* Sub b from a into r. (r = a - b)
 *
 * Subtracts two 16-word numbers with ARM32 inline assembly. The words are
 * processed four at a time and the borrow is carried between words in the
 * processor's carry flag.
 *
 * r  A single precision integer. Receives the 16-word difference.
 * a  A single precision integer. The 16-word value to subtract from.
 * b  A single precision integer. The 16-word value to subtract.
 * returns 0 when there is no borrow out of the top word and a value with all
 *         bits set when there is.
 */
static sp_digit sp_2048_sub_16(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Words 0-3: SUBS starts the borrow chain, SBCS continues it. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "subs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4-7: LDM/STM leave the flags alone, so the borrow from the
         * previous group is still available to SBCS. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8-11. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 12-15. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r6 - r6 - borrow: 0 without a borrow, all ones with one. The
         * register that held the r pointer is reused to pass the result out. */
        "sbc	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
    );
    /* r no longer holds an address here; it holds the borrow mask. */
    return (uint32_t)(size_t)r;
}

/* Square a and put result in r. (r = a * a)
 *
 * Splits the 32-word input into two 16-word halves and builds the 64-word
 * result from three 16-word squarings: low half, high half and the absolute
 * difference of the halves.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 */
SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
{
    sp_digit mid[32];
    sp_digit zeros[16];
    sp_digit* lo = r;
    sp_digit* hi = r + 32;
    sp_digit* diff = mid;
    sp_digit* minuend;
    sp_digit* subtrahend;
    sp_digit borrow;
    sp_digit adjust;
    sp_digit zerosAddr;
    sp_digit diffAddr;

    XMEMSET(zeros, 0, sizeof(zeros));

    /* diff = low half - high half; borrow is all ones when that went
     * negative and zero otherwise. */
    borrow = sp_2048_sub_16(diff, a, a + 16);
    /* Pick the operands of the second subtraction with masks rather than a
     * branch: no borrow gives (diff - 0), borrow gives (0 - diff). Either way
     * diff ends up holding the absolute difference. */
    zerosAddr = (sp_digit)zeros;
    diffAddr = (sp_digit)diff;
    minuend = (sp_digit*)((zerosAddr & borrow) | (diffAddr & (~borrow)));
    subtrahend = (sp_digit*)((zerosAddr & (~borrow)) | (diffAddr & borrow));
    (void)sp_2048_sub_16(diff, minuend, subtrahend);

    /* High half first, then low half, then the difference.
     * NOTE(review): the order is kept from the original; presumably it lets
     * r alias a without the low square overwriting a[16..31] - confirm. */
    sp_2048_sqr_16(hi, a + 16);
    sp_2048_sqr_16(lo, a);
    sp_2048_sqr_16(mid, diff);

    /* mid = mid - hi - lo, then r[16..] -= mid. The helper return values are
     * folded into adjust, which is then added into the top 16 words.
     * NOTE(review): assumes the sub_in_place helper returns 0 or all ones
     * like sp_2048_sub_16 - confirm against its definition. */
    adjust = 0;
    adjust -= sp_2048_sub_in_place_32(mid, hi);
    adjust -= sp_2048_sub_in_place_32(mid, lo);
    adjust += sp_2048_sub_in_place_32(r + 16, mid);
    zeros[0] = adjust;
    (void)sp_2048_add_16(r + 48, r + 48, zeros);
}

/* Sub b from a into r. (r = a - b)
 *
 * Subtracts two 32-word numbers with ARM32 inline assembly. The words are
 * processed four at a time and the borrow is carried between words in the
 * processor's carry flag.
 *
 * r  A single precision integer. Receives the 32-word difference.
 * a  A single precision integer. The 32-word value to subtract from.
 * b  A single precision integer. The 32-word value to subtract.
 * returns 0 when there is no borrow out of the top word and a value with all
 *         bits set when there is.
 */
static sp_digit sp_2048_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Words 0-3: SUBS starts the borrow chain, SBCS continues it. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "subs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4-7: LDM/STM leave the flags alone, so the borrow from the
         * previous group is still available to SBCS. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8-11. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 12-15. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 16-19. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 20-23. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 24-27. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 28-31. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r6 - r6 - borrow: 0 without a borrow, all ones with one. The
         * register that held the r pointer is reused to pass the result out. */
        "sbc	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
    );
    /* r no longer holds an address here; it holds the borrow mask. */
    return (uint32_t)(size_t)r;
}

/* Square a and put result in r. (r = a * a)
 *
 * Splits the 64-word input into two 32-word halves and builds the 128-word
 * result from three 32-word squarings: low half, high half and the absolute
 * difference of the halves.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 */
SP_NOINLINE static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
{
    sp_digit mid[64];
    sp_digit zeros[32];
    sp_digit* lo = r;
    sp_digit* hi = r + 64;
    sp_digit* diff = mid;
    sp_digit* minuend;
    sp_digit* subtrahend;
    sp_digit borrow;
    sp_digit adjust;
    sp_digit zerosAddr;
    sp_digit diffAddr;

    XMEMSET(zeros, 0, sizeof(zeros));

    /* diff = low half - high half; borrow is all ones when that went
     * negative and zero otherwise. */
    borrow = sp_2048_sub_32(diff, a, a + 32);
    /* Pick the operands of the second subtraction with masks rather than a
     * branch: no borrow gives (diff - 0), borrow gives (0 - diff). Either way
     * diff ends up holding the absolute difference. */
    zerosAddr = (sp_digit)zeros;
    diffAddr = (sp_digit)diff;
    minuend = (sp_digit*)((zerosAddr & borrow) | (diffAddr & (~borrow)));
    subtrahend = (sp_digit*)((zerosAddr & (~borrow)) | (diffAddr & borrow));
    (void)sp_2048_sub_32(diff, minuend, subtrahend);

    /* High half first, then low half, then the difference.
     * NOTE(review): the order is kept from the original; presumably it lets
     * r alias a without the low square overwriting a[32..63] - confirm. */
    sp_2048_sqr_32(hi, a + 32);
    sp_2048_sqr_32(lo, a);
    sp_2048_sqr_32(mid, diff);

    /* mid = mid - hi - lo, then r[32..] -= mid. The helper return values are
     * folded into adjust, which is then added into the top 32 words.
     * NOTE(review): assumes the sub_in_place helper returns 0 or all ones
     * like sp_2048_sub_32 - confirm against its definition. */
    adjust = 0;
    adjust -= sp_2048_sub_in_place_64(mid, hi);
    adjust -= sp_2048_sub_in_place_64(mid, lo);
    adjust += sp_2048_sub_in_place_64(r + 32, mid);
    zeros[0] = adjust;
    (void)sp_2048_add_32(r + 96, r + 96, zeros);
}

#endif /* !WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Add b to a into r. (r = a + b)
 *
 * Size-optimised loop: adds 0x100 bytes (64 words) of a and b, four words per
 * iteration, keeping the carry in a register between iterations.
 *
 * r  A single precision integer. Receives the 64-word sum.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns the carry out of the top word: 0 or 1.
 */
static sp_digit sp_2048_add_64(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r3 = carry between iterations (starts at 0); r12 = end of a. */
        "mov	r3, #0\n\t"
        "add	r12, %[a], #0x100\n\t"
        "\n"
    "L_sp_2048_add_64_word_%=: \n\t"
        /* Reload the carry flag from r3: adding -1 carries out only when r3
         * is 1. */
        "adds	r3, r3, #-1\n\t"
        "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
        "ldm	%[b]!, {r8, r9, r10, r11}\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "adcs	r7, r7, r11\n\t"
        "stm	%[r]!, {r4, r5, r6, r7}\n\t"
        /* Save the carry flag as 0 or 1 in r3 before CMP overwrites it. */
        "mov	r4, #0\n\t"
        "adc	r3, r4, #0\n\t"
        "cmp	%[a], r12\n\t"
        "bne	L_sp_2048_add_64_word_%=\n\t"
        /* The register that held the r pointer is reused to pass the carry
         * out. */
        "mov	%[r], r3\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12"
    );
    /* r no longer holds an address here; it holds the carry. */
    return (uint32_t)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into a. (a -= b)
 *
 * Size-optimised loop: subtracts 0x100 bytes (64 words) of b from a in place,
 * four words per iteration, keeping the borrow in a register between
 * iterations.
 *
 * a  A single precision integer. Updated with the 64-word difference.
 * b  A single precision integer.
 * returns 0 when there is no borrow out of the top word and a value with all
 *         bits set when there is.
 */
static sp_digit sp_2048_sub_in_place_64(sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r10 = constant 0; r12 = borrow mask between iterations (0 or all
         * ones, starts at 0); lr = end of a. */
        "mov	r10, #0\n\t"
        "mov	r12, #0\n\t"
        "add	lr, %[a], #0x100\n\t"
        "\n"
    "L_sp_2048_sub_in_pkace_64_word_%=: \n\t"
        /* Reload the carry flag from the mask: 0 - r12 borrows only when r12
         * is all ones. */
        "subs	r12, r10, r12\n\t"
        /* a is loaded without write-back so the STM below writes the same
         * four words back. */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Save the borrow as 0 or all ones before CMP overwrites the flags. */
        "sbc	r12, r10, r10\n\t"
        "cmp	%[a], lr\n\t"
        "bne	L_sp_2048_sub_in_pkace_64_word_%=\n\t"
        /* The register that held the a pointer is reused to pass the mask
         * out. */
        "mov	%[a], r12\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10"
    );
    /* a no longer holds an address here; it holds the borrow mask. */
    return (uint32_t)(size_t)a;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Size-optimised multiplication of two 64-word numbers. Each result word is
 * computed in turn by summing every a[i] * b[j] whose indices add up to that
 * word's position. The 128-word product is assembled in 0x200 bytes of stack
 * and copied to r only at the end.
 *
 * r  A single precision integer. Receives the 128-word product.
 * a  A single precision integer.
 * b  A single precision integer.
 */
static void sp_2048_mul_64(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Reserve 0x200 bytes of stack for the temporary product. */
        "sub	sp, sp, #0x200\n\t"
        /* r5 = byte offset of the result word being computed.
         * r8:r7:r6 = three-word accumulator for the current column. */
        "mov	r5, #0\n\t"
        "mov	r6, #0\n\t"
        "mov	r7, #0\n\t"
        "mov	r8, #0\n\t"
        "\n"
    "L_sp_2048_mul_64_outer_%=: \n\t"
        /* r3 = byte offset into a: r5 - 0xfc, or 0 when that would be
         * negative. r4 = byte offset into b so that r3 + r4 == r5. */
        "subs	r3, r5, #0xfc\n\t"
        "it	cc\n\t"
        "movcc	r3, #0\n\t"
        "sub	r4, r5, r3\n\t"
        "\n"
    "L_sp_2048_mul_64_inner_%=: \n\t"
        "ldr	lr, [%[a], r3]\n\t"
        "ldr	r11, [%[b], r4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        /* No UMULL: build the 64-bit product from four 16x16-bit multiplies
         * and add each partial product into the accumulator. */
        /* low(a) * low(b) */
        "lsl	r9, lr, #16\n\t"
        "lsl	r10, r11, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r6, r6, r10\n\t"
        "adcs	r7, r7, #0\n\t"
        "adc	r8, r8, #0\n\t"
        /* low(a) * high(b), added in shifted up by 16 bits */
        "lsr	r10, r11, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        /* high(a) * high(b), added in one word up */
        "lsr	r9, lr, #16\n\t"
        "lsr	r10, r11, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        /* high(a) * low(b), added in shifted up by 16 bits */
        "lsl	r10, r11, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#else
        /* r10:r9 = a word * b word, added into the accumulator. */
        "umull	r9, r10, lr, r11\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#endif
        /* Next pair for this column: one word up in a, one word down in b.
         * Stop when a is exhausted or the a offset passes the column. */
        "add	r3, r3, #4\n\t"
        "sub	r4, r4, #4\n\t"
        "cmp	r3, #0x100\n\t"
        "beq	L_sp_2048_mul_64_inner_done_%=\n\t"
        "cmp	r3, r5\n\t"
        "ble	L_sp_2048_mul_64_inner_%=\n\t"
        "\n"
    "L_sp_2048_mul_64_inner_done_%=: \n\t"
        /* Store the finished word and shift the accumulator down one word. */
        "str	r6, [sp, r5]\n\t"
        "mov	r6, r7\n\t"
        "mov	r7, r8\n\t"
        "mov	r8, #0\n\t"
        "add	r5, r5, #4\n\t"
        "cmp	r5, #0x1f8\n\t"
        "ble	L_sp_2048_mul_64_outer_%=\n\t"
        /* The last word (offset 0x1fc) is what remains in the accumulator. */
        "str	r6, [sp, r5]\n\t"
        "\n"
    "L_sp_2048_mul_64_store_%=: \n\t"
        /* Copy the product from the stack to r, 16 bytes per iteration. The
         * write-back on sp releases the reserved stack as it goes. */
        "ldm	sp!, {r6, r7, r8, r9}\n\t"
        "stm	%[r]!, {r6, r7, r8, r9}\n\t"
        "subs	r5, r5, #16\n\t"
        "bgt	L_sp_2048_mul_64_store_%=\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11"
    );
}

/* Square a and put result in r. (r = a * a)
 *
 * The result is computed one output word at a time into a 0x200-byte buffer
 * reserved on the stack and copied to r at the end, so r is not written until
 * every read of a has completed.
 *
 * r  A single precision integer. 0x200 bytes (128 words) are written.
 * a  A single precision integer. Words at byte offsets 0..0xfc are read.
 */
static void sp_2048_sqr_64(sp_digit* r, const sp_digit* a)
{
    __asm__ __volatile__ (
        /* Reserve the temporary result buffer on the stack. */
        "sub	sp, sp, #0x200\n\t"
        /* r12 = constant 0, r8:r7:r6 = three-word column accumulator,
         * r5 = byte offset of the result word currently being computed. */
        "mov	r12, #0\n\t"
        "mov	r6, #0\n\t"
        "mov	r7, #0\n\t"
        "mov	r8, #0\n\t"
        "mov	r5, #0\n\t"
        "\n"
    "L_sp_2048_sqr_64_outer_%=: \n\t"
        /* r3 = r5 - 0xfc, or 0 when that subtraction borrows (r5 < 0xfc).
         * r4 = r5 - r3, so the two offsets into a always sum to r5. */
        "subs	r3, r5, #0xfc\n\t"
        "it	cc\n\t"
        "movcc	r3, r12\n\t"
        "sub	r4, r5, r3\n\t"
        "\n"
    "L_sp_2048_sqr_64_inner_%=: \n\t"
        /* Equal offsets mean a word times itself: handled at op_sqr and
         * added once. Otherwise the product of the two words is added twice. */
        "cmp	r4, r3\n\t"
        "beq	L_sp_2048_sqr_64_op_sqr_%=\n\t"
        "ldr	lr, [%[a], r3]\n\t"
        "ldr	r11, [%[a], r4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        /* No umull on this path: the 64-bit product is assembled from four
         * 16x16-bit multiplies, each partial product being added twice. */
        /* low(lr) * low(r11) */
        "lsl	r9, lr, #16\n\t"
        "lsl	r10, r11, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r6, r6, r10\n\t"
        "adcs	r7, r7, #0\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r10\n\t"
        "adcs	r7, r7, #0\n\t"
        "adc	r8, r8, #0\n\t"
        /* low(lr) * high(r11), shifted up 16 bits */
        "lsr	r10, r11, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        /* high(lr) * high(r11), added from the second accumulator word */
        "lsr	r9, lr, #16\n\t"
        "lsr	r10, r11, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        /* high(lr) * low(r11), shifted up 16 bits */
        "lsl	r10, r11, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#else
        /* r10:r9 = lr * r11, accumulated twice into r8:r7:r6. */
        "umull	r9, r10, lr, r11\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#endif
        "bal	L_sp_2048_sqr_64_op_done_%=\n\t"
        "\n"
    "L_sp_2048_sqr_64_op_sqr_%=: \n\t"
        /* Square of a single word, added to the accumulator once. */
        "ldr	lr, [%[a], r3]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        /* r9 = low(lr)^2 and r10 = high(lr)^2 form the low and high words. */
        "lsl	r9, lr, #16\n\t"
        "lsr	r10, lr, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mov	r11, r9\n\t"
        "mul	r9, r11, r9\n\t"
        "mov	r11, r10\n\t"
        "mul	r10, r11, r10\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        /* Cross term high(lr) * low(lr): the 17/15-bit split shifts it up
         * 16 bits and doubles it in one step. */
        "lsr	r10, lr, #16\n\t"
        "lsl	r9, lr, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #15\n\t"
        "lsl	r9, r9, #17\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#else
        "umull	r9, r10, lr, lr\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#endif
        "\n"
    "L_sp_2048_sqr_64_op_done_%=: \n\t"
        /* Step r3 up and r4 down one word. The inner loop ends when r3
         * reaches 0x100, when r3 has moved past r4, or when r3 exceeds r5. */
        "add	r3, r3, #4\n\t"
        "sub	r4, r4, #4\n\t"
        "cmp	r3, #0x100\n\t"
        "beq	L_sp_2048_sqr_64_inner_done_%=\n\t"
        "cmp	r3, r4\n\t"
        "bgt	L_sp_2048_sqr_64_inner_done_%=\n\t"
        "cmp	r3, r5\n\t"
        "ble	L_sp_2048_sqr_64_inner_%=\n\t"
        "\n"
    "L_sp_2048_sqr_64_inner_done_%=: \n\t"
        /* Store the finished word in the stack buffer, shift the accumulator
         * down one word and move on to the next result word. */
        "str	r6, [sp, r5]\n\t"
        "mov	r6, r7\n\t"
        "mov	r7, r8\n\t"
        "mov	r8, #0\n\t"
        "add	r5, r5, #4\n\t"
        "cmp	r5, #0x1f8\n\t"
        "ble	L_sp_2048_sqr_64_outer_%=\n\t"
        /* r5 is now 0x1fc: store the remaining accumulator word there. */
        "str	r6, [sp, r5]\n\t"
        "\n"
    "L_sp_2048_sqr_64_store_%=: \n\t"
        /* Pop the buffer into r four words per pass. r5 starts at 0x1fc and
         * drops by 16 each pass, so 32 passes move 0x200 bytes and bring sp
         * back to its value on entry. */
        "ldm	sp!, {r6, r7, r8, r9}\n\t"
        "stm	%[r]!, {r6, r7, r8, r9}\n\t"
        "subs	r5, r5, #16\n\t"
        "bgt	L_sp_2048_sqr_64_store_%=\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "r12"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
#ifdef WOLFSSL_SP_SMALL
/* Mask every digit of a with m and write the results to r.
 *
 * Exactly 32 digits are processed, lowest index first.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_32(sp_digit* r, const sp_digit* a, sp_digit m)
{
    int remaining = 32;

    while (remaining-- > 0) {
        *r++ = *a++ & m;
    }
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Add b to a into r. (r = a + b)
 *
 * Processes 0x80 bytes (32 words) of a and b, four words per loop pass.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * Returns the carry out of the last word added (0 or 1).
 */
static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r3 carries the carry bit between passes; r12 = end of a. */
        "mov	r3, #0\n\t"
        "add	r12, %[a], #0x80\n\t"
        "\n"
    "L_sp_2048_add_32_word_%=: \n\t"
        /* Reload the carry flag from r3: adding 0xffffffff carries out only
         * when r3 is 1. */
        "adds	r3, r3, #-1\n\t"
        "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
        "ldm	%[b]!, {r8, r9, r10, r11}\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "adcs	r7, r7, r11\n\t"
        "stm	%[r]!, {r4, r5, r6, r7}\n\t"
        /* r3 = 0 + 0 + carry: save the carry out of these four words before
         * cmp overwrites the flags. */
        "mov	r4, #0\n\t"
        "adc	r3, r4, #0\n\t"
        "cmp	%[a], r12\n\t"
        "bne	L_sp_2048_add_32_word_%=\n\t"
        /* Hand the final carry back through the r operand. */
        "mov	%[r], r3\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12"
    );
    /* r now holds the carry value written by the asm, not a pointer. */
    return (uint32_t)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into a. (a -= b)
 *
 * Processes 0x80 bytes (32 words) of a and b, four words per loop pass.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * Returns 0 when the subtraction did not borrow out of the last word and
 * 0xffffffff when it did.
 */
static sp_digit sp_2048_sub_in_place_32(sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r10 = constant 0, r12 = borrow state between passes (0 or
         * 0xffffffff), lr = end of a. */
        "mov	r10, #0\n\t"
        "mov	r12, #0\n\t"
        "add	lr, %[a], #0x80\n\t"
        "\n"
    "L_sp_2048_sub_in_pkace_32_word_%=: \n\t"
        /* Reload the borrow: 0 - r12 leaves the carry flag set (no borrow)
         * when r12 is 0 and clear when r12 is 0xffffffff. */
        "subs	r12, r10, r12\n\t"
        /* a is loaded without writeback; the stm below advances it. */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* r12 = 0 - 0 - borrow: 0 if no borrow out, 0xffffffff otherwise.
         * Saved before cmp overwrites the flags. */
        "sbc	r12, r10, r10\n\t"
        "cmp	%[a], lr\n\t"
        "bne	L_sp_2048_sub_in_pkace_32_word_%=\n\t"
        /* Hand the final borrow state back through the a operand. */
        "mov	%[a], r12\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10"
    );
    /* a now holds the borrow value written by the asm, not a pointer. */
    return (uint32_t)(size_t)a;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * The result is computed one output word at a time into a 0x100-byte buffer
 * reserved on the stack and copied to r at the end, so r is not written until
 * every read of a and b has completed.
 *
 * r  A single precision integer. 0x100 bytes (64 words) are written.
 * a  A single precision integer. Words at byte offsets 0..0x7c are read.
 * b  A single precision integer. Words at byte offsets 0..0x7c are read.
 */
static void sp_2048_mul_32(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Reserve the temporary result buffer on the stack. */
        "sub	sp, sp, #0x100\n\t"
        /* r5 = byte offset of the result word being computed,
         * r8:r7:r6 = three-word column accumulator. */
        "mov	r5, #0\n\t"
        "mov	r6, #0\n\t"
        "mov	r7, #0\n\t"
        "mov	r8, #0\n\t"
        "\n"
    "L_sp_2048_mul_32_outer_%=: \n\t"
        /* r3 = r5 - 0x7c, or 0 when that subtraction borrows (r5 < 0x7c).
         * r4 = r5 - r3, so offset into a plus offset into b equals r5. */
        "subs	r3, r5, #0x7c\n\t"
        "it	cc\n\t"
        "movcc	r3, #0\n\t"
        "sub	r4, r5, r3\n\t"
        "\n"
    "L_sp_2048_mul_32_inner_%=: \n\t"
        "ldr	lr, [%[a], r3]\n\t"
        "ldr	r11, [%[b], r4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        /* No umull on this path: the 64-bit product is assembled from four
         * 16x16-bit multiplies and added into r8:r7:r6. */
        /* low(lr) * low(r11) */
        "lsl	r9, lr, #16\n\t"
        "lsl	r10, r11, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r6, r6, r10\n\t"
        "adcs	r7, r7, #0\n\t"
        "adc	r8, r8, #0\n\t"
        /* low(lr) * high(r11), shifted up 16 bits */
        "lsr	r10, r11, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        /* high(lr) * high(r11), added from the second accumulator word */
        "lsr	r9, lr, #16\n\t"
        "lsr	r10, r11, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        /* high(lr) * low(r11), shifted up 16 bits */
        "lsl	r10, r11, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#else
        /* r10:r9 = lr * r11, accumulated into r8:r7:r6. */
        "umull	r9, r10, lr, r11\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#endif
        /* Step the a offset up and the b offset down one word. The inner
         * loop ends when r3 reaches 0x80 or exceeds r5. */
        "add	r3, r3, #4\n\t"
        "sub	r4, r4, #4\n\t"
        "cmp	r3, #0x80\n\t"
        "beq	L_sp_2048_mul_32_inner_done_%=\n\t"
        "cmp	r3, r5\n\t"
        "ble	L_sp_2048_mul_32_inner_%=\n\t"
        "\n"
    "L_sp_2048_mul_32_inner_done_%=: \n\t"
        /* Store the finished word in the stack buffer, shift the accumulator
         * down one word and move on to the next result word. */
        "str	r6, [sp, r5]\n\t"
        "mov	r6, r7\n\t"
        "mov	r7, r8\n\t"
        "mov	r8, #0\n\t"
        "add	r5, r5, #4\n\t"
        "cmp	r5, #0xf8\n\t"
        "ble	L_sp_2048_mul_32_outer_%=\n\t"
        /* r5 is now 0xfc: store the remaining accumulator word there. */
        "str	r6, [sp, r5]\n\t"
        "\n"
    "L_sp_2048_mul_32_store_%=: \n\t"
        /* Pop the buffer into r four words per pass. r5 starts at 0xfc and
         * drops by 16 each pass, so 16 passes move 0x100 bytes and bring sp
         * back to its value on entry. */
        "ldm	sp!, {r6, r7, r8, r9}\n\t"
        "stm	%[r]!, {r6, r7, r8, r9}\n\t"
        "subs	r5, r5, #16\n\t"
        "bgt	L_sp_2048_mul_32_store_%=\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11"
    );
}

/* Square a and put result in r. (r = a * a)
 *
 * The result is computed one output word at a time into a 0x100-byte buffer
 * reserved on the stack and copied to r at the end, so r is not written until
 * every read of a has completed.
 *
 * r  A single precision integer. 0x100 bytes (64 words) are written.
 * a  A single precision integer. Words at byte offsets 0..0x7c are read.
 */
static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
{
    __asm__ __volatile__ (
        /* Reserve the temporary result buffer on the stack. */
        "sub	sp, sp, #0x100\n\t"
        /* r12 = constant 0, r8:r7:r6 = three-word column accumulator,
         * r5 = byte offset of the result word currently being computed. */
        "mov	r12, #0\n\t"
        "mov	r6, #0\n\t"
        "mov	r7, #0\n\t"
        "mov	r8, #0\n\t"
        "mov	r5, #0\n\t"
        "\n"
    "L_sp_2048_sqr_32_outer_%=: \n\t"
        /* r3 = r5 - 0x7c, or 0 when that subtraction borrows (r5 < 0x7c).
         * r4 = r5 - r3, so the two offsets into a always sum to r5. */
        "subs	r3, r5, #0x7c\n\t"
        "it	cc\n\t"
        "movcc	r3, r12\n\t"
        "sub	r4, r5, r3\n\t"
        "\n"
    "L_sp_2048_sqr_32_inner_%=: \n\t"
        /* Equal offsets mean a word times itself: handled at op_sqr and
         * added once. Otherwise the product of the two words is added twice. */
        "cmp	r4, r3\n\t"
        "beq	L_sp_2048_sqr_32_op_sqr_%=\n\t"
        "ldr	lr, [%[a], r3]\n\t"
        "ldr	r11, [%[a], r4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        /* No umull on this path: the 64-bit product is assembled from four
         * 16x16-bit multiplies, each partial product being added twice. */
        /* low(lr) * low(r11) */
        "lsl	r9, lr, #16\n\t"
        "lsl	r10, r11, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r6, r6, r10\n\t"
        "adcs	r7, r7, #0\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r10\n\t"
        "adcs	r7, r7, #0\n\t"
        "adc	r8, r8, #0\n\t"
        /* low(lr) * high(r11), shifted up 16 bits */
        "lsr	r10, r11, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        /* high(lr) * high(r11), added from the second accumulator word */
        "lsr	r9, lr, #16\n\t"
        "lsr	r10, r11, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        /* high(lr) * low(r11), shifted up 16 bits */
        "lsl	r10, r11, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#else
        /* r10:r9 = lr * r11, accumulated twice into r8:r7:r6. */
        "umull	r9, r10, lr, r11\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#endif
        "bal	L_sp_2048_sqr_32_op_done_%=\n\t"
        "\n"
    "L_sp_2048_sqr_32_op_sqr_%=: \n\t"
        /* Square of a single word, added to the accumulator once. */
        "ldr	lr, [%[a], r3]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        /* r9 = low(lr)^2 and r10 = high(lr)^2 form the low and high words. */
        "lsl	r9, lr, #16\n\t"
        "lsr	r10, lr, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mov	r11, r9\n\t"
        "mul	r9, r11, r9\n\t"
        "mov	r11, r10\n\t"
        "mul	r10, r11, r10\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        /* Cross term high(lr) * low(lr): the 17/15-bit split shifts it up
         * 16 bits and doubles it in one step. */
        "lsr	r10, lr, #16\n\t"
        "lsl	r9, lr, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #15\n\t"
        "lsl	r9, r9, #17\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#else
        "umull	r9, r10, lr, lr\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#endif
        "\n"
    "L_sp_2048_sqr_32_op_done_%=: \n\t"
        /* Step r3 up and r4 down one word. The inner loop ends when r3
         * reaches 0x80, when r3 has moved past r4, or when r3 exceeds r5. */
        "add	r3, r3, #4\n\t"
        "sub	r4, r4, #4\n\t"
        "cmp	r3, #0x80\n\t"
        "beq	L_sp_2048_sqr_32_inner_done_%=\n\t"
        "cmp	r3, r4\n\t"
        "bgt	L_sp_2048_sqr_32_inner_done_%=\n\t"
        "cmp	r3, r5\n\t"
        "ble	L_sp_2048_sqr_32_inner_%=\n\t"
        "\n"
    "L_sp_2048_sqr_32_inner_done_%=: \n\t"
        /* Store the finished word in the stack buffer, shift the accumulator
         * down one word and move on to the next result word. */
        "str	r6, [sp, r5]\n\t"
        "mov	r6, r7\n\t"
        "mov	r7, r8\n\t"
        "mov	r8, #0\n\t"
        "add	r5, r5, #4\n\t"
        "cmp	r5, #0xf8\n\t"
        "ble	L_sp_2048_sqr_32_outer_%=\n\t"
        /* r5 is now 0xfc: store the remaining accumulator word there. */
        "str	r6, [sp, r5]\n\t"
        "\n"
    "L_sp_2048_sqr_32_store_%=: \n\t"
        /* Pop the buffer into r four words per pass. r5 starts at 0xfc and
         * drops by 16 each pass, so 16 passes move 0x100 bytes and bring sp
         * back to its value on entry. */
        "ldm	sp!, {r6, r7, r8, r9}\n\t"
        "stm	%[r]!, {r6, r7, r8, r9}\n\t"
        "subs	r5, r5, #16\n\t"
        "bgt	L_sp_2048_sqr_32_store_%=\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "r12"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */

/* Calculate the bottom digit of -1/a mod 2^n.
 *
 * Only a[0] is read. An inverse of a[0] is seeded to 4 bits and then refined
 * by three identical multiply steps before being negated.
 *
 * a    A single precision number.
 * rho  Bottom word of inverse.
 */
static void sp_2048_mont_setup(const sp_digit* a, sp_digit* rho)
{
    sp_digit inv;
    sp_digit a0 = a[0];
    int step;

    /* Seed value: here inv*a0==1 mod 2**4 */
    inv = (((a0 + 2) & 4) << 1) + a0;

    /* Each step doubles the valid low bits: 2**8, then 2**16, then 2**32 */
    for (step = 0; step < 3; step++) {
        inv *= 2 - a0 * inv;
    }

    /* rho = -1/a[0], i.e. the negated inverse */
    *rho = (sp_digit)0 - inv;
}

#ifdef WOLFSSL_SP_SMALL
/* Mul a by digit b into r. (r = a * b)
 *
 * Loop-based variant: one word of a is multiplied per pass and the running
 * sum is kept in three registers.
 *
 * r  A single precision integer. Words at byte offsets 0..0x100 (65 words)
 *    are written.
 * a  A single precision integer. Words at byte offsets 0..0xfc are read.
 * b  A single precision digit.
 */
static void sp_2048_mul_d_64(sp_digit* r, const sp_digit* a, sp_digit b)
{
    __asm__ __volatile__ (
        /* NOTE(review): r10 is cleared here but no later instruction in this
         * function reads it. */
        "mov	r10, #0\n\t"
        /* A[0] * B */
        "ldr	r8, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        /* No umull on this path: r3:r5 = b * r8 is assembled from four
         * 16x16-bit multiplies (r5 = low word, r3 = high word). */
        "lsl	r6, %[b], #16\n\t"
        "lsl	r5, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r6, r5\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r3, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "mul	r7, r6, r7\n\t"
        "add	r3, r3, r7\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adc	r3, r3, r7\n\t"
#else
        /* r5 = low word, r3 = high word of b * a[0]. */
        "umull	r5, r3, %[b], r8\n\t"
#endif
        /* Store result word 0. From here r5:r4:r3 is the running sum with r3
         * the next word to store; r9 = byte offset of the current word. */
        "mov	r4, #0\n\t"
        "str	r5, [%[r]]\n\t"
        "mov	r5, #0\n\t"
        "mov	r9, #4\n\t"
        "\n"
    "L_sp_2048_mul_d_64_word_%=: \n\t"
        /* A[i] * B */
        "ldr	r8, [%[a], r9]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        /* Four 16x16-bit partial products of b * r8, each added into
         * r5:r4:r3 at its bit position. */
        /* low(b) * low(r8) */
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        /* low(b) * high(r8), shifted up 16 bits */
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        /* high(b) * high(r8), added from the second word */
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        /* high(b) * low(r8), shifted up 16 bits */
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        /* r7:r6 = b * r8, added into r5:r4:r3. */
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* Store the finished word, shift the running sum down one word and
         * advance; the loop runs while r9 < 0x100. */
        "str	r3, [%[r], r9]\n\t"
        "mov	r3, r4\n\t"
        "mov	r4, r5\n\t"
        "mov	r5, #0\n\t"
        "add	r9, r9, #4\n\t"
        "cmp	r9, #0x100\n\t"
        "blt	L_sp_2048_mul_d_64_word_%=\n\t"
        /* The remaining word of the running sum becomes result word 64. */
        "str	r3, [%[r], #256]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
    );
}

#else
/* Mul a by digit b into r. (r = a * b)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision digit.
 */
static void sp_2048_mul_d_64(sp_digit* r, const sp_digit* a, sp_digit b)
{
    __asm__ __volatile__ (
        "mov	r10, #0\n\t"
        /* A[0] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r3, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r3, r3, #16\n\t"
        "mul	r3, r6, r3\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r4, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "mul	r7, r6, r7\n\t"
        "add	r4, r4, r7\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adc	r4, r4, r7\n\t"
#else
        "umull	r3, r4, %[b], r8\n\t"
#endif
        "mov	r5, #0\n\t"
        "str	r3, [%[r]], #4\n\t"
        /* A[1] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[2] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[3] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[4] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[5] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[6] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[7] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[8] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[9] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[10] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[11] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[12] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[13] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[14] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[15] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[16] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[17] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[18] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[19] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[20] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[21] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[22] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[23] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[24] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[25] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[26] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[27] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[28] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[29] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[30] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[31] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[32] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[33] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[34] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[35] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[36] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[37] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[38] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[39] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[40] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[41] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[42] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[43] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[44] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[45] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[46] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[47] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[48] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[49] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[50] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[51] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[52] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[53] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[54] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[55] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[56] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[57] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[58] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[59] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[60] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[61] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[62] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[63] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adc	r4, r4, r7\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "add	r4, r4, r7\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adc	r4, r4, r7\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adc	r4, r4, r7\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        "str	r4, [%[r]]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r10"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* Calculate the Montgomery normalizer: r = 2^n mod m, where n is the number
 * of bits being reduced by.
 *
 * The modulus is assumed to occupy the full width of the 32-digit operand, so
 * a single subtraction of m from zero is all that is needed.
 * NOTE(review): 32 digits is 1024 bits if sp_digit is 32 bits wide - confirm
 * against the sp_digit definition.
 *
 * r  A single precision number (32 digits) that receives the result.
 * m  A single precision number (32 digits) that is the modulus.
 */
static void sp_2048_mont_norm_32(sp_digit* r, const sp_digit* m)
{
    /* Start from zero across all 32 digits of the result. */
    XMEMSET(r, 0, 32 * sizeof(*r));

    /* Subtract the modulus in place to leave r = 2^n mod m. */
    sp_2048_sub_in_place_32(r, m);
}

#ifdef WOLFSSL_SP_SMALL
/* Conditionally subtract b from a using the mask m.
 *
 * Each word of b is ANDed with m before it is subtracted, so a mask of -1
 * (all bits set) gives r = a - b and a mask of 0 gives r = a. All 32 words
 * are loaded, processed and stored whatever the value of m.
 *
 * r  A single precision number (32 words) that receives the result.
 * a  A single precision number (32 words) to subtract from.
 * b  A single precision number (32 words) to subtract.
 * m  Mask value to apply: -1 to subtract, 0 to leave a unchanged.
 * returns 0 when there is no borrow out of the top word and -1 (all bits set)
 * when there is.
 */
static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
{
    __asm__ __volatile__ (
        /* r6 = constant zero, r12 = running borrow (0 or -1),
         * lr = byte offset into a, b and r. */
        "mov	r6, #0\n\t"
        "mov	r12, #0\n\t"
        "mov	lr, #0\n\t"
        "\n"
    "L_sp_2048_cond_sub_32_words_%=: \n\t"
        /* Move the saved borrow back into the carry flag: 0 - r12. */
        "subs	r12, r6, r12\n\t"
        "ldr	r4, [%[a], lr]\n\t"
        "ldr	r5, [%[b], lr]\n\t"
        /* Mask the word of b: unchanged when m is -1, zero when m is 0. */
        "and	r5, r5, %[m]\n\t"
        "sbcs	r4, r4, r5\n\t"
        /* Save the borrow out as 0 or -1; the cmp below overwrites flags. */
        "sbc	r12, r6, r6\n\t"
        "str	r4, [%[r], lr]\n\t"
        "add	lr, lr, #4\n\t"
        /* 0x80 bytes = 32 words of 4 bytes. */
        "cmp	lr, #0x80\n\t"
        "blt	L_sp_2048_cond_sub_32_words_%=\n\t"
        /* The final borrow is handed back through the r operand. */
        "mov	%[r], r12\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        /* "cc" is required: subs/sbcs/cmp modify the condition flags. */
        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
    );
    return (uint32_t)(size_t)r;
}

#else
/* Conditionally subtract b from a using the mask m.
 *
 * Each word of b is ANDed with m before it is subtracted, so a mask of -1
 * (all bits set) gives r = a - b and a mask of 0 gives r = a. All 32 words
 * are loaded, processed and stored whatever the value of m.
 *
 * This is the fully unrolled variant: 16 steps of two words each, with the
 * borrow carried from step to step in the carry flag.
 *
 * r  A single precision number (32 words) that receives the result.
 * a  A single precision number (32 words) to subtract from.
 * b  A single precision number (32 words) to subtract.
 * m  Mask value to apply: -1 to subtract, 0 to leave a unchanged.
 * returns 0 when there is no borrow out of the top word and -1 (all bits set)
 * when there is.
 */
static sp_digit sp_2048_cond_sub_32(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
{
    __asm__ __volatile__ (
        /* lr = constant zero, used to turn the final borrow into 0 or -1. */
        "mov	lr, #0\n\t"
        /* Words 0-1: subs starts the borrow chain. */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "subs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 2-3: sbcs continues the borrow chain from here on. */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 4-5 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 6-7 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 8-9 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 10-11 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 12-13 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 14-15 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 16-17 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 18-19 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 20-21 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 22-23 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 24-25 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 26-27 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 28-29 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 30-31 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Final borrow as 0 or -1, handed back through the r operand. */
        "sbc	%[r], lr, lr\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        /* "cc" is required: subs/sbcs modify the condition flags. */
        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7"
    );
    return (uint32_t)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
/* Reduce the number back to 2048 bits using Montgomery reduction.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^32 (digit size).
 */
static SP_NOINLINE void sp_2048_mont_reduce_32(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    __asm__ __volatile__ (
#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4))
        "ldr	r11, [%[m]]\n\t"
#endif
        /* i = 0 */
        "mov	r9, #0\n\t"
        "mov	r3, #0\n\t"
        "ldr	r12, [%[a]]\n\t"
        "ldr	lr, [%[a], #4]\n\t"
        "\n"
    "L_sp_2048_mont_reduce_32_word_%=: \n\t"
        /* mu = a[i] * mp */
        "mul	r8, %[mp], r12\n\t"
        /* a[i+0] += m[0] * mu */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "ldr	r11, [%[m]]\n\t"
#endif
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r7, r11, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r7\n\t"
        "lsl	r7, r11, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r12, r12, r6\n\t"
        "adc	r5, r5, r7\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r11, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r12, r12, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r11, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r12, r12, r6\n\t"
        "adc	r5, r5, r7\n\t"
#else
        "umull	r6, r7, r8, r11\n\t"
        "adds	r12, r12, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        /* a[i+1] += m[1] * mu */
        "ldr	r7, [%[m], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r10, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r10\n\t"
        "lsl	r10, r7, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r6, r10, r6\n\t"
        "lsr	r10, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r4, r4, r10\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r10, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r10, r6, r10\n\t"
        "adds	lr, lr, r10\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r10, r7, #16\n\t"
        "mul	r6, r10, r6\n\t"
        "lsr	r10, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r4, r4, r10\n\t"
#else
        "umull	r6, r10, r8, r7\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r4, r10, #0\n\t"
#endif
        "mov	r12, lr\n\t"
        "adds	r12, r12, r5\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+2] += m[2] * mu */
        "ldr	r7, [%[m], #8]\n\t"
        "ldr	lr, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r10, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r10\n\t"
        "lsl	r10, r7, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r6, r10, r6\n\t"
        "lsr	r10, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r5, r5, r10\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r10, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r10, r6, r10\n\t"
        "adds	lr, lr, r10\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r10, r7, #16\n\t"
        "mul	r6, r10, r6\n\t"
        "lsr	r10, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r5, r5, r10\n\t"
#else
        "umull	r6, r10, r8, r7\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r5, r10, #0\n\t"
#endif
        "adds	lr, lr, r4\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+3] += m[3] * mu */
        "ldr	r7, [%[m], #12]\n\t"
        "ldr	r10, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #12]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+4] += m[4] * mu */
        "ldr	r7, [%[m], #16]\n\t"
        "ldr	r10, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #16]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+5] += m[5] * mu */
        "ldr	r7, [%[m], #20]\n\t"
        "ldr	r10, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #20]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+6] += m[6] * mu */
        "ldr	r7, [%[m], #24]\n\t"
        "ldr	r10, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #24]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+7] += m[7] * mu */
        "ldr	r7, [%[m], #28]\n\t"
        "ldr	r10, [%[a], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #28]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+8] += m[8] * mu */
        "ldr	r7, [%[m], #32]\n\t"
        "ldr	r10, [%[a], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #32]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+9] += m[9] * mu */
        "ldr	r7, [%[m], #36]\n\t"
        "ldr	r10, [%[a], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #36]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+10] += m[10] * mu */
        "ldr	r7, [%[m], #40]\n\t"
        "ldr	r10, [%[a], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #40]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+11] += m[11] * mu */
        "ldr	r7, [%[m], #44]\n\t"
        "ldr	r10, [%[a], #44]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #44]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+12] += m[12] * mu */
        "ldr	r7, [%[m], #48]\n\t"
        "ldr	r10, [%[a], #48]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #48]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+13] += m[13] * mu */
        "ldr	r7, [%[m], #52]\n\t"
        "ldr	r10, [%[a], #52]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #52]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+14] += m[14] * mu */
        "ldr	r7, [%[m], #56]\n\t"
        "ldr	r10, [%[a], #56]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #56]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+15] += m[15] * mu */
        "ldr	r7, [%[m], #60]\n\t"
        "ldr	r10, [%[a], #60]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #60]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+16] += m[16] * mu */
        "ldr	r7, [%[m], #64]\n\t"
        "ldr	r10, [%[a], #64]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #64]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+17] += m[17] * mu */
        "ldr	r7, [%[m], #68]\n\t"
        "ldr	r10, [%[a], #68]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #68]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+18] += m[18] * mu */
        "ldr	r7, [%[m], #72]\n\t"
        "ldr	r10, [%[a], #72]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #72]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+19] += m[19] * mu */
        "ldr	r7, [%[m], #76]\n\t"
        "ldr	r10, [%[a], #76]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #76]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+20] += m[20] * mu */
        "ldr	r7, [%[m], #80]\n\t"
        "ldr	r10, [%[a], #80]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #80]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+21] += m[21] * mu */
        "ldr	r7, [%[m], #84]\n\t"
        "ldr	r10, [%[a], #84]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #84]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+22] += m[22] * mu */
        "ldr	r7, [%[m], #88]\n\t"
        "ldr	r10, [%[a], #88]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #88]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+23] += m[23] * mu */
        "ldr	r7, [%[m], #92]\n\t"
        "ldr	r10, [%[a], #92]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #92]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+24] += m[24] * mu */
        "ldr	r7, [%[m], #96]\n\t"
        "ldr	r10, [%[a], #96]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #96]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+25] += m[25] * mu */
        "ldr	r7, [%[m], #100]\n\t"
        "ldr	r10, [%[a], #100]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #100]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+26] += m[26] * mu */
        "ldr	r7, [%[m], #104]\n\t"
        "ldr	r10, [%[a], #104]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #104]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+27] += m[27] * mu */
        "ldr	r7, [%[m], #108]\n\t"
        "ldr	r10, [%[a], #108]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #108]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+28] += m[28] * mu */
        "ldr	r7, [%[m], #112]\n\t"
        "ldr	r10, [%[a], #112]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #112]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+29] += m[29] * mu */
        "ldr	r7, [%[m], #116]\n\t"
        "ldr	r10, [%[a], #116]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #116]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+30] += m[30] * mu */
        "ldr	r7, [%[m], #120]\n\t"
        "ldr	r10, [%[a], #120]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #120]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+31] += m[31] * mu */
#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4))
        "ldr	r7, [%[m], #124]\n\t"
#else
        "ldr	r11, [%[m], #124]\n\t"
#endif
        "ldr	r10, [%[a], #124]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r11, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r4, r3, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, r3\n\t"
        "lsr	r7, r11, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "mov	r6, r8\n\t"
        "lsr	r7, r11, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "lsl	r7, r11, #16\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r4, r7, r3\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, r3\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #124]\n\t"
        "ldr	r10, [%[a], #128]\n\t"
        "adcs	r10, r10, r4\n\t"
        "str	r10, [%[a], #128]\n\t"
        "adc	r3, r3, #0\n\t"
        /* i += 1 */
        "add	r9, r9, #4\n\t"
        "add	%[a], %[a], #4\n\t"
        "cmp	r9, #0x80\n\t"
        "blt	L_sp_2048_mont_reduce_32_word_%=\n\t"
        "str	r12, [%[a]]\n\t"
        "str	lr, [%[a], #4]\n\t"
        "mov	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
    );
    sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - mp);
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The product is computed by sp_2048_mul_32() into r and then reduced in
 * place by sp_2048_mont_reduce_32().
 *
 * NOTE(review): r presumably needs room for the unreduced double-width
 * product (64 digits) - confirm against sp_2048_mul_32().
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_2048_mont_mul_32(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    /* Multiply first; the reduction then operates on r itself. */
    sp_2048_mul_32(r, a, b);
    sp_2048_mont_reduce_32(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The square is computed by sp_2048_sqr_32() into r and then reduced in
 * place by sp_2048_mont_reduce_32().
 *
 * NOTE(review): r presumably needs room for the unreduced double-width
 * square (64 digits) - confirm against sp_2048_sqr_32().
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_2048_mont_sqr_32(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    /* Square first; the reduction then operates on r itself. */
    sp_2048_sqr_32(r, a);
    sp_2048_mont_reduce_32(r, m, mp);
}

#ifdef WOLFSSL_SP_SMALL
/* Mul a by digit b into r. (r = a * b)
 *
 * Size-optimized variant: word 0 is multiplied ahead of the loop, words
 * 1..31 of a are multiplied by b inside the loop, and the carry left over
 * after the last word is stored as the 33rd word of the result.
 *
 * Register use inside the assembly:
 *   r3, r4, r5  three-word running accumulator (low, middle, high)
 *   r6, r7      scratch for partial products
 *   r8          current word of a
 *   r9          byte offset of the current word (4 .. 0x7c)
 *
 * The assembly updates the condition flags (adds/adcs/cmp), so "cc" is
 * listed as clobbered.
 *
 * r  A single precision integer. 33 words are written (r[0] .. r[32]).
 * a  A single precision integer. 32 words are read.
 * b  A single precision digit.
 */
static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b)
{
    __asm__ __volatile__ (
        /* A[0] * B */
        "ldr	r8, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        /* No umull available: build the 64-bit product r3:r5 from four
         * 16x16-bit multiplies. */
        "lsl	r6, %[b], #16\n\t"
        "lsl	r5, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r6, r5\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r3, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "mul	r7, r6, r7\n\t"
        "add	r3, r3, r7\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adc	r3, r3, r7\n\t"
#else
        "umull	r5, r3, %[b], r8\n\t"
#endif
        /* Low word goes straight to r[0]; the high word stays in r3 as the
         * carry into the next word. r4 and r5 start at zero. */
        "mov	r4, #0\n\t"
        "str	r5, [%[r]]\n\t"
        "mov	r5, #0\n\t"
        "mov	r9, #4\n\t"
        "\n"
    "L_sp_2048_mul_d_32_word_%=: \n\t"
        /* A[i] * B */
        "ldr	r8, [%[a], r9]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        /* Add the four 16x16-bit partial products into r3:r4:r5. */
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* Emit the finished word and shift the accumulator down one word. */
        "str	r3, [%[r], r9]\n\t"
        "mov	r3, r4\n\t"
        "mov	r4, r5\n\t"
        "mov	r5, #0\n\t"
        "add	r9, r9, #4\n\t"
        "cmp	r9, #0x80\n\t"
        "blt	L_sp_2048_mul_d_32_word_%=\n\t"
        /* Remaining carry becomes the top (33rd) word. */
        "str	r3, [%[r], #128]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
}

#else
/* Mul a by digit b into r. (r = a * b)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision digit.
 */
static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a, sp_digit b)
{
    __asm__ __volatile__ (
        "mov	r10, #0\n\t"
        /* A[0] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r3, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r3, r3, #16\n\t"
        "mul	r3, r6, r3\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r4, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "mul	r7, r6, r7\n\t"
        "add	r4, r4, r7\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adc	r4, r4, r7\n\t"
#else
        "umull	r3, r4, %[b], r8\n\t"
#endif
        "mov	r5, #0\n\t"
        "str	r3, [%[r]], #4\n\t"
        /* A[1] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[2] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[3] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[4] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[5] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[6] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[7] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[8] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[9] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[10] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[11] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[12] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[13] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[14] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[15] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[16] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[17] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[18] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[19] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[20] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[21] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[22] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[23] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[24] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[25] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[26] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[27] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[28] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[29] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[30] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[31] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adc	r5, r5, r7\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "add	r5, r5, r7\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adc	r5, r5, r7\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adc	r5, r5, r7\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        "str	r5, [%[r]]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r10"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_USE_UDIV
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * This variant is compiled when WOLFSSL_SP_USE_UDIV is defined and builds
 * the quotient from four udiv estimates. The first three divide by
 * (div >> 16) + 1 and each is followed by subtracting estimate * div from
 * the running remainder kept in d1|d0; the last one divides what is left
 * in d0 by div itself.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 *
 * Note that this is an approximate div. It may give an answer 1 larger.
 */
static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div)
{
    __asm__ __volatile__ (
        /* r6 = top 16 bits of div, lr = r6 + 1 (trial divisor). */
        "lsr	r6, %[div], #16\n\t"
        "add	lr, r6, #1\n\t"
        /* First estimate: r4 = (d1 / lr) << 16. r4 accumulates the quotient. */
        "udiv	r4, %[d1], lr\n\t"
        /* r5 = div << 16, used by the conditional correction below. */
        "lsl	r5, %[div], #16\n\t"
        "lsl	r4, r4, #16\n\t"
        /* (d1|d0) -= div * r4 (64-bit product in r12:r3). */
        "umull	r3, r12, %[div], r4\n\t"
        "subs	%[d0], %[d0], r3\n\t"
        "sbc	%[d1], %[d1], r12\n\t"
        /* Branch-free correction: r7 = 1 when d1 >= lr, else 0 (taken from
         * the carry flag of the subtraction); r8 = -r7 is the matching
         * all-ones/zero mask.
         */
        "subs	r3, %[d1], lr\n\t"
        "sbc	r7, r7, r7\n\t"
        "add	r7, r7, #1\n\t"
        "rsb	r8, r7, #0\n\t"
        /* r7 becomes 0x10000 or 0: the amount added to the quotient. */
        "lsl	r7, r7, #16\n\t"
        /* Masked 64-bit value of div << 16: low word r5, high word r6. */
        "and	r5, r5, r8\n\t"
        "and	r6, r6, r8\n\t"
        /* (d1|d0) -= masked (div << 16); quotient += masked 0x10000. */
        "subs	%[d0], %[d0], r5\n\t"
        "add	r4, r4, r7\n\t"
        "sbc	%[d1], %[d1], r6\n\t"
        /* Second estimate: r3 = ((d1 << 16) | (d0 >> 16)) / lr. */
        "lsl	r12, %[d1], #16\n\t"
        "lsr	r3, %[d0], #16\n\t"
        "orr	r3, r3, r12\n\t"
        "udiv	r3, r3, lr\n\t"
        "add	r4, r4, r3\n\t"
        /* (d1|d0) -= div * r3. */
        "umull	r3, r12, %[div], r3\n\t"
        "subs	%[d0], %[d0], r3\n\t"
        "sbc	%[d1], %[d1], r12\n\t"
        /* Third estimate, formed the same way as the second. */
        "lsl	r12, %[d1], #16\n\t"
        "lsr	r3, %[d0], #16\n\t"
        "orr	r3, r3, r12\n\t"
        "udiv	r3, r3, lr\n\t"
        "add	r4, r4, r3\n\t"
        /* Only the low word of the remainder is updated from here on:
         * d0 -= low 32 bits of div * r3.
         */
        "mul	r3, %[div], r3\n\t"
        "sub	%[d0], %[d0], r3\n\t"
        /* Final estimate divides the remaining low word by div itself. */
        "udiv	r3, %[d0], %[div]\n\t"
        /* The accumulated quotient is handed back through d1. */
        "add	%[d1], r4, r3\n\t"
        : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
        :
        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
    );
    return (uint32_t)(size_t)d1;
}

#else
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * This variant does not use the udiv instruction. It first builds a
 * quotient estimate one bit at a time by comparing the running high word
 * against (div >> 1) + 1 with branch-free masked subtraction, then refines
 * the estimate in three rounds of "multiply by div, subtract from d1|d0,
 * add the high word of the difference to the estimate", and finally adds
 * one more when the remaining low word is still greater than div.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 *
 * Note that this is an approximate div. It may give an answer 1 larger.
 */
static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div)
{
    __asm__ __volatile__ (
        /* lr = (div >> 1) + 1: trial divisor for the bit-by-bit phase. */
        "lsr	lr, %[div], #1\n\t"
        "add	lr, lr, #1\n\t"
        /* Working copy of the dividend: r5 = high word, r4 = low word. */
        "mov	r4, %[d0]\n\t"
        "mov	r5, %[d1]\n\t"
        /* Do top 32 */
        /* r6 = all ones when r5 > lr (lr - r5 borrowed), otherwise 0. */
        "subs	r6, lr, r5\n\t"
        "sbc	r6, r6, r6\n\t"
        /* r3 holds the quotient estimate; first bit is 1 when r5 > lr. */
        "mov	r3, #0\n\t"
        "sub	r3, r3, r6\n\t"
        /* Subtract lr from r5 only when the mask is set (no branch). */
        "and	r6, r6, lr\n\t"
        "subs	r5, r5, r6\n\t"
        /* Next 30 bits */
        /* r12 counts from 29 down; the loop ends when it goes negative. */
        "mov	r12, #29\n\t"
        "\n"
    "L_div_2048_word_32_bit_%=: \n\t"
        /* Shift the next bit of the low word into the running high word. */
        "lsls	r4, r4, #1\n\t"
        "adc	r5, r5, r5\n\t"
        /* Same masked compare/subtract as above; r3 = 2 * r3 + new bit. */
        "subs	r6, lr, r5\n\t"
        "sbc	r6, r6, r6\n\t"
        "add	r3, r3, r3\n\t"
        "sub	r3, r3, r6\n\t"
        "and	r6, r6, lr\n\t"
        "subs	r5, r5, r6\n\t"
        "subs	r12, r12, #1\n\t"
        "bpl	L_div_2048_word_32_bit_%=\n\t"
        /* r3 = 2 * r3 + 1 */
        "add	r3, r3, r3\n\t"
        "add	r3, r3, #1\n\t"
        /* Refinement round 1: r5:r4 = r3 * div. When WOLFSSL_SP_ARM_ARCH is
         * below 4 the 64-bit product is assembled from four 16x16-bit
         * partial products using mul; otherwise a single umull is used.
         */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r7, r3, #16\n\t"
        "lsl	r4, %[div], #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "lsr	r4, r4, #16\n\t"
        "mul	r4, r7, r4\n\t"
        "lsr	r8, %[div], #16\n\t"
        "mul	r7, r8, r7\n\t"
        "lsr	r5, r7, #16\n\t"
        "lsl	r7, r7, #16\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r3, #16\n\t"
        "mul	r8, r7, r8\n\t"
        "add	r5, r5, r8\n\t"
        "lsl	r8, %[div], #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r7, r8, r7\n\t"
        "lsr	r8, r7, #16\n\t"
        "lsl	r7, r7, #16\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, r8\n\t"
#else
        "umull	r4, r5, r3, %[div]\n\t"
#endif
        /* r8:r7 = (d1|d0) - r5:r4; add the high word to the estimate. */
        "subs	r7, %[d0], r4\n\t"
        "sbc	r8, %[d1], r5\n\t"
        "add	r3, r3, r8\n\t"
        /* Refinement round 2: same multiply / subtract / adjust sequence. */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r7, r3, #16\n\t"
        "lsl	r4, %[div], #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "lsr	r4, r4, #16\n\t"
        "mul	r4, r7, r4\n\t"
        "lsr	r8, %[div], #16\n\t"
        "mul	r7, r8, r7\n\t"
        "lsr	r5, r7, #16\n\t"
        "lsl	r7, r7, #16\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r3, #16\n\t"
        "mul	r8, r7, r8\n\t"
        "add	r5, r5, r8\n\t"
        "lsl	r8, %[div], #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r7, r8, r7\n\t"
        "lsr	r8, r7, #16\n\t"
        "lsl	r7, r7, #16\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, r8\n\t"
#else
        "umull	r4, r5, r3, %[div]\n\t"
#endif
        "subs	r7, %[d0], r4\n\t"
        "sbc	r8, %[d1], r5\n\t"
        "add	r3, r3, r8\n\t"
        /* Refinement round 3: same multiply / subtract / adjust sequence. */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r7, r3, #16\n\t"
        "lsl	r4, %[div], #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "lsr	r4, r4, #16\n\t"
        "mul	r4, r7, r4\n\t"
        "lsr	r8, %[div], #16\n\t"
        "mul	r7, r8, r7\n\t"
        "lsr	r5, r7, #16\n\t"
        "lsl	r7, r7, #16\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r3, #16\n\t"
        "mul	r8, r7, r8\n\t"
        "add	r5, r5, r8\n\t"
        "lsl	r8, %[div], #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r7, r8, r7\n\t"
        "lsr	r8, r7, #16\n\t"
        "lsl	r7, r7, #16\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, r8\n\t"
#else
        "umull	r4, r5, r3, %[div]\n\t"
#endif
        "subs	r7, %[d0], r4\n\t"
        "sbc	r8, %[d1], r5\n\t"
        "add	r3, r3, r8\n\t"
        /* Final adjust: r6 = all ones when the low word of the last
         * difference (r7) is greater than div, so the result handed back
         * through d1 is r3 + 1 in that case and r3 otherwise.
         */
        "subs	r6, %[div], r7\n\t"
        "sbc	r6, r6, r6\n\t"
        "sub	%[d1], r3, r6\n\t"
        : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
        :
        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
    );
    return (uint32_t)(size_t)d1;
}

#endif
/* Compare a with b in constant time.
 *
 * All 32 words of both numbers are always read, starting at byte offset
 * 124 (word 31) and finishing at offset 0. No branch depends on the data:
 * once a differing word has been seen a mask register is cleared, which
 * makes every later pair of words compare as equal so the recorded result
 * cannot change. With WOLFSSL_SP_SMALL the words are visited in a loop;
 * otherwise the same step is unrolled 32 times.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively. (The value produced by the assembly is -1, 0 or 1.)
 */
static sp_int32 sp_2048_cmp_32(const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r2 = result (starts at -1), r6 = constant 1, r5 = constant 0,
         * r3 = mask (all ones until the first differing word is found).
         */
        "mov	r2, #-1\n\t"
        "mov	r6, #1\n\t"
        "mov	r5, #0\n\t"
        "mov	r3, #-1\n\t"
#ifdef WOLFSSL_SP_SMALL
        /* r4 = byte offset of the current word, starting at word 31. */
        "mov	r4, #0x7c\n\t"
        "\n"
    "L_sp_2048_cmp_32_words_%=: \n\t"
        "ldr	r12, [%[a], r4]\n\t"
        "ldr	lr, [%[b], r4]\n\t"
        /* Mask both words: after a difference they are both forced to 0. */
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        /* a word higher: result = 1. */
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        /* a word lower: result = mask, which is still -1 at this point. */
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        /* Words differ: clear the mask so later words are ignored. */
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        /* Step down one word; carry stays set until r4 passes below 0. */
        "subs	r4, r4, #4\n\t"
        "bcs	L_sp_2048_cmp_32_words_%=\n\t"
        /* All words equal: r2 = -1 and r3 = -1 so the result becomes 0.
         * Otherwise r3 = 0 and r2 is left unchanged.
         */
        "eor	r2, r2, r3\n\t"
#else
        /* Unrolled form: one mask/compare/select step per word, offsets
         * 124 down to 0. Each step is the sequence annotated on the first.
         */
        "ldr	r12, [%[a], #124]\n\t"
        "ldr	lr, [%[b], #124]\n\t"
        /* Mask both words: after a difference they are both forced to 0. */
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        /* a word higher: result = 1. */
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        /* a word lower: result = mask, which is still -1 at this point. */
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        /* Words differ: clear the mask so later words are ignored. */
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #120]\n\t"
        "ldr	lr, [%[b], #120]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #116]\n\t"
        "ldr	lr, [%[b], #116]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #112]\n\t"
        "ldr	lr, [%[b], #112]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #108]\n\t"
        "ldr	lr, [%[b], #108]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #104]\n\t"
        "ldr	lr, [%[b], #104]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #100]\n\t"
        "ldr	lr, [%[b], #100]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #96]\n\t"
        "ldr	lr, [%[b], #96]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #92]\n\t"
        "ldr	lr, [%[b], #92]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #88]\n\t"
        "ldr	lr, [%[b], #88]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #84]\n\t"
        "ldr	lr, [%[b], #84]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #80]\n\t"
        "ldr	lr, [%[b], #80]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #76]\n\t"
        "ldr	lr, [%[b], #76]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #72]\n\t"
        "ldr	lr, [%[b], #72]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #68]\n\t"
        "ldr	lr, [%[b], #68]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #64]\n\t"
        "ldr	lr, [%[b], #64]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #60]\n\t"
        "ldr	lr, [%[b], #60]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #56]\n\t"
        "ldr	lr, [%[b], #56]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #52]\n\t"
        "ldr	lr, [%[b], #52]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #48]\n\t"
        "ldr	lr, [%[b], #48]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #44]\n\t"
        "ldr	lr, [%[b], #44]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #40]\n\t"
        "ldr	lr, [%[b], #40]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #36]\n\t"
        "ldr	lr, [%[b], #36]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #32]\n\t"
        "ldr	lr, [%[b], #32]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #28]\n\t"
        "ldr	lr, [%[b], #28]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #24]\n\t"
        "ldr	lr, [%[b], #24]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #20]\n\t"
        "ldr	lr, [%[b], #20]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #16]\n\t"
        "ldr	lr, [%[b], #16]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #12]\n\t"
        "ldr	lr, [%[b], #12]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #8]\n\t"
        "ldr	lr, [%[b], #8]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #4]\n\t"
        "ldr	lr, [%[b], #4]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a]]\n\t"
        "ldr	lr, [%[b]]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        /* All words equal: r2 = -1 and r3 = -1 so the result becomes 0.
         * Otherwise r3 = 0 and r2 is left unchanged.
         */
        "eor	r2, r2, r3\n\t"
#endif /*WOLFSSL_SP_SMALL */
        /* The result is passed back to C in the register that held a. */
        "mov	%[a], r2\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6"
    );
    return (uint32_t)(size_t)a;
}

/* Long division keeping only the remainder (m*d + r = a).
 * The quotient m is never produced as nothing needs it at this time.
 *
 * a  Number to be divided; 2 * 32 words are copied from it.
 * d  Number to divide with (32 words).
 * m  Multiplier result - unused.
 * r  Receives the 32 word remainder of the division.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_2048_div_32(const sp_digit* a, const sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    sp_digit rem[2 * 32];
    sp_digit prod[32 + 1];
    sp_digit* top;
    sp_digit dTop;
    sp_digit q;
    sp_digit sat;
    int pos;
    int pass;

    (void)m;

    /* Top word of the divisor drives every quotient word estimate. */
    dTop = d[32 - 1];
    XMEMCPY(rem, a, sizeof(rem));

    /* Bring the upper half below the divisor before the main loop. */
    q = sp_2048_cmp_32(rem + 32, d) >= 0;
    sp_2048_cond_sub_32(rem + 32, rem + 32, d, (sp_digit)0 - q);

    for (pos = 32; pos-- > 0; ) {
        /* Word sitting just above the 32 word window rem[pos..pos+31]. */
        top = &rem[32 + pos];

        /* When the top word equals the divisor's top word the estimate is
         * forced to all ones: the high word handed to the word divide is
         * wrapped down by one and the mask is OR'd into the result.
         */
        sat = 0 - (*top == dTop);
        q = div_2048_word_32(*top + sat, *(top - 1), dTop);
        q |= sat;

        /* Take q * d away from the window, folding the borrow and the
         * extra product word into the top word.
         */
        sp_2048_mul_d_32(prod, d, q);
        *top += sp_2048_sub_in_place_32(&rem[pos], prod);
        *top -= prod[32];

        /* Two add-back passes: d masked by the top word is added to the
         * window and the carry is folded into the top word.
         */
        for (pass = 0; pass < 2; pass++) {
            sp_2048_mask_32(prod, d, *top);
            *top += sp_2048_add_32(&rem[pos], &rem[pos], prod);
        }
    }

    /* One last conditional subtract; the result is written to r. */
    q = sp_2048_cmp_32(rem, d) >= 0;
    sp_2048_cond_sub_32(r, rem, d, (sp_digit)0 - q);

    return MP_OKAY;
}

/* Compute the remainder of a divided by m. (r = a mod m)
 *
 * r  Receives the reduced single precision result.
 * a  The single precision number to reduce.
 * m  The single precision modulus.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_2048_mod_32(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    int ret;

    /* Only the remainder is wanted, so no quotient buffer is supplied. */
    ret = sp_2048_div_32(a, m, NULL, r);

    return ret;
}

#ifdef WOLFSSL_SP_SMALL
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 4-bit window exponentiation over Montgomery-form values. A table
 * of 16 entries (t[0] = Montgomery normalizer, t[1..15] = powers of a) is
 * built first, then the exponent is consumed 4 bits at a time from the
 * most significant end.
 *
 * r        A single precision number that is the result of the operation.
 *          Must have room for 64 digits: the upper 32 digits are cleared
 *          before the final Montgomery reduction.
 * a        A single precision number being exponentiated. When reduceA is
 *          non-zero it is passed to sp_2048_mod_32, which reads 64 digits;
 *          otherwise 32 digits are copied.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  Non-zero to reduce a modulo m before converting it to
 *          Montgomery form.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e,
        int bits, const sp_digit* m, int reduceA)
{
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_digit* td = NULL;
#else
    sp_digit td[16 * 64];
#endif
    sp_digit* t[16];
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (16 * 64), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Table entries are 64 digits each; norm shares storage with t[0]. */
        norm = td;
        for (i=0; i<16; i++) {
            t[i] = td + i * 64;
        }

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_32(norm, m);

        /* Build t[1] as a 64-digit value with a in the upper half and zero
         * in the lower half, then reduce it modulo m. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 32U);
        if (reduceA != 0) {
            err = sp_2048_mod_32(t[1] + 32, a, m);
            if (err == MP_OKAY) {
                err = sp_2048_mod_32(t[1], t[1], m);
            }
        }
        else {
            XMEMCPY(t[1] + 32, a, sizeof(sp_digit) * 32);
            err = sp_2048_mod_32(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* t[2k] = t[k]^2 and t[2k+1] = t[k+1] * t[k], filling t[2..15]. */
        for (i = 1; i < 8; i++) {
            sp_2048_mont_sqr_32(t[2 * i], t[i], m, mp);
            sp_2048_mont_mul_32(t[2 * i + 1], t[i + 1], t[i], m, mp);
        }

        /* Take the first window: the top (bits % 4) bits of the exponent,
         * leaving c unread bits in the current word n. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 4;
        if (c == 32) {
            /* Full top word and no short window: take a whole window. */
            c = 28;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed.
             * Exponent words are 32 bits wide, so the missing bits come
             * from the top of the next word. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        XMEMCPY(r, t[y], sizeof(sp_digit) * 32);
        for (; i>=0 || c>=4; ) {
            if (c == 0) {
                /* Current word exhausted: start on the next one. */
                n = e[i--];
                y = (byte)(n >> 28);
                n <<= 4;
                c = 28;
            }
            else if (c < 4) {
                /* Window straddles two exponent words. */
                y = (byte)(n >> 28);
                n = e[i--];
                c = 4 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }

            /* r = r^16 * t[y] */
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);

            sp_2048_mont_mul_32(r, r, t[y], m, mp);
        }

        /* Convert out of Montgomery form and bring the result below m. */
        XMEMSET(&r[32], 0, sizeof(sp_digit) * 32U);
        sp_2048_mont_reduce_32(r, m, mp);

        mask = 0 - (sp_2048_cmp_32(r, m) >= 0);
        sp_2048_cond_sub_32(r, r, m, mask);
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#else
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 5-bit window exponentiation over Montgomery-form values. A table
 * of 32 entries (t[0] = Montgomery normalizer, t[1..31] = powers of a) is
 * built first, then the exponent is consumed 5 bits at a time from the
 * most significant end.
 *
 * r        A single precision number that is the result of the operation.
 *          Must have room for 64 digits: the upper 32 digits are cleared
 *          before the final Montgomery reduction.
 * a        A single precision number being exponentiated. When reduceA is
 *          non-zero it is passed to sp_2048_mod_32, which reads 64 digits;
 *          otherwise 32 digits are copied.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  Non-zero to reduce a modulo m before converting it to
 *          Montgomery form.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_2048_mod_exp_32(sp_digit* r, const sp_digit* a, const sp_digit* e,
        int bits, const sp_digit* m, int reduceA)
{
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_digit* td = NULL;
#else
    sp_digit td[32 * 64];
#endif
    sp_digit* t[32];
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (32 * 64), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Table entries are 64 digits each; norm shares storage with t[0]. */
        norm = td;
        for (i=0; i<32; i++) {
            t[i] = td + i * 64;
        }

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_32(norm, m);

        /* Build t[1] as a 64-digit value with a in the upper half and zero
         * in the lower half, then reduce it modulo m. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 32U);
        if (reduceA != 0) {
            err = sp_2048_mod_32(t[1] + 32, a, m);
            if (err == MP_OKAY) {
                err = sp_2048_mod_32(t[1], t[1], m);
            }
        }
        else {
            XMEMCPY(t[1] + 32, a, sizeof(sp_digit) * 32);
            err = sp_2048_mod_32(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* t[2k] = t[k]^2 and t[2k+1] = t[k+1] * t[k], filling t[2..31]. */
        for (i = 1; i < 16; i++) {
            sp_2048_mont_sqr_32(t[2 * i], t[i], m, mp);
            sp_2048_mont_mul_32(t[2 * i + 1], t[i + 1], t[i], m, mp);
        }

        /* Take the first window: the top (bits % 5) bits of the exponent,
         * leaving c unread bits in the current word n. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 5;
        if (c == 32) {
            /* Full top word and no short window: take a whole window. */
            c = 27;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed
             * (e.g. bits == 129). Exponent words are 32 bits wide, so the
             * missing bits come from the top of the next word and
             * 32 - c bits of that word remain afterwards. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        XMEMCPY(r, t[y], sizeof(sp_digit) * 32);
        for (; i>=0 || c>=5; ) {
            if (c == 0) {
                /* Current word exhausted: start on the next one. */
                n = e[i--];
                y = (byte)(n >> 27);
                n <<= 5;
                c = 27;
            }
            else if (c < 5) {
                /* Window straddles two exponent words. */
                y = (byte)(n >> 27);
                n = e[i--];
                c = 5 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c -= 5;
            }

            /* r = r^32 * t[y] */
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);
            sp_2048_mont_sqr_32(r, r, m, mp);

            sp_2048_mont_mul_32(r, r, t[y], m, mp);
        }

        /* Convert out of Montgomery form and bring the result below m. */
        XMEMSET(&r[32], 0, sizeof(sp_digit) * 32U);
        sp_2048_mont_reduce_32(r, m, mp);

        mask = 0 - (sp_2048_cmp_32(r, m) >= 0);
        sp_2048_cond_sub_32(r, r, m, mask);
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* WOLFSSL_SP_SMALL */

#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* Calculate the Montgomery normalizer: r = 2^n mod m, where n is the number
 * of bits being reduced by.
 * The modulus m must be a full 2048-bit value, so one subtraction from zero
 * is all that is needed.
 *
 * r  A single precision number that receives the normalizer.
 * m  A single precision number that is the modulus.
 */
static void sp_2048_mont_norm_64(sp_digit* r, const sp_digit* m)
{
    /* Clear all 64 words, then subtract the modulus in place:
     * r = 2^n mod m */
    XMEMSET(r, 0, sizeof(*r) * 64U);
    sp_2048_sub_in_place_64(r, m);
}

#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_SP_SMALL
/* Conditionally subtract b from a using the mask m. (r = a - (b & m))
 * m is -1 to subtract and 0 when not subtracting.
 *
 * Every word of b is ANDed with m before being subtracted, so the same
 * instruction sequence runs whichever mask value is given. Loop form:
 * one word per iteration over 64 words (0x100 bytes).
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 * returns  0 when there is no borrow out of the top word and all ones (-1)
 *          when there is a borrow.
 */
static sp_digit sp_2048_cond_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
{
    __asm__ __volatile__ (
        /* r6 = constant zero, r12 = running borrow (0 or -1),
         * lr = byte offset into a, b and r. */
        "mov	r6, #0\n\t"
        "mov	r12, #0\n\t"
        "mov	lr, #0\n\t"
        "\n"
    "L_sp_2048_cond_sub_64_words_%=: \n\t"
        /* 0 - r12 re-creates the carry flag from the saved borrow. */
        "subs	r12, r6, r12\n\t"
        "ldr	r4, [%[a], lr]\n\t"
        "ldr	r5, [%[b], lr]\n\t"
        "and	r5, r5, %[m]\n\t"
        "sbcs	r4, r4, r5\n\t"
        /* Save the borrow as 0 / -1; the cmp below overwrites the flags. */
        "sbc	r12, r6, r6\n\t"
        "str	r4, [%[r], lr]\n\t"
        "add	lr, lr, #4\n\t"
        "cmp	lr, #0x100\n\t"
        "blt	L_sp_2048_cond_sub_64_words_%=\n\t"
        /* The final borrow is handed back through the r operand. */
        "mov	%[r], r12\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        : "memory", "r12", "lr", "r4", "r5", "r6"
    );
    return (uint32_t)(size_t)r;
}

#else
/* Conditionally subtract b from a using the mask m. (r = a - (b & m))
 * m is -1 to subtract and 0 when not subtracting.
 *
 * Every word of b is ANDed with m before being subtracted, so the same
 * instruction sequence runs whichever mask value is given. Unrolled form:
 * 32 groups of two words, with the borrow carried in the flags from one
 * group to the next.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 * returns  0 when there is no borrow out of the top word and all ones (-1)
 *          when there is a borrow.
 */
static sp_digit sp_2048_cond_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
{
    __asm__ __volatile__ (
        /* lr = constant zero, used only to materialise the final borrow. */
        "mov	lr, #0\n\t"
        /* words 0-1: first group uses subs to start the borrow chain */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "subs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 2-3 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 4-5 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 6-7 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 8-9 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 10-11 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 12-13 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 14-15 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 16-17 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 18-19 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 20-21 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 22-23 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 24-25 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 26-27 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 28-29 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 30-31 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 32-33 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 34-35 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 36-37 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 38-39 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 40-41 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 42-43 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 44-45 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 46-47 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 48-49 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 50-51 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 52-53 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 54-55 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 56-57 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 58-59 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 60-61 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* words 62-63 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* 0 - 0 - borrow: the r operand becomes 0 or -1 and is returned. */
        "sbc	%[r], lr, lr\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        : "memory", "r12", "lr", "r4", "r5", "r6", "r7"
    );
    return (uint32_t)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
/* Reduce the number back to 2048 bits using Montgomery reduction.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static SP_NOINLINE void sp_2048_mont_reduce_64(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    __asm__ __volatile__ (
#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4))
        "ldr	r11, [%[m]]\n\t"
#endif
        /* i = 0 */
        "mov	r9, #0\n\t"
        "mov	r3, #0\n\t"
        "ldr	r12, [%[a]]\n\t"
        "ldr	lr, [%[a], #4]\n\t"
        "\n"
    "L_sp_2048_mont_reduce_64_word_%=: \n\t"
        /* mu = a[i] * mp */
        "mul	r8, %[mp], r12\n\t"
        /* a[i+0] += m[0] * mu */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "ldr	r11, [%[m]]\n\t"
#endif
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r7, r11, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r7\n\t"
        "lsl	r7, r11, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r12, r12, r6\n\t"
        "adc	r5, r5, r7\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r11, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r12, r12, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r11, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r12, r12, r6\n\t"
        "adc	r5, r5, r7\n\t"
#else
        "umull	r6, r7, r8, r11\n\t"
        "adds	r12, r12, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        /* a[i+1] += m[1] * mu */
        "ldr	r7, [%[m], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r10, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r10\n\t"
        "lsl	r10, r7, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r6, r10, r6\n\t"
        "lsr	r10, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r4, r4, r10\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r10, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r10, r6, r10\n\t"
        "adds	lr, lr, r10\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r10, r7, #16\n\t"
        "mul	r6, r10, r6\n\t"
        "lsr	r10, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r4, r4, r10\n\t"
#else
        "umull	r6, r10, r8, r7\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r4, r10, #0\n\t"
#endif
        "mov	r12, lr\n\t"
        "adds	r12, r12, r5\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+2] += m[2] * mu */
        "ldr	r7, [%[m], #8]\n\t"
        "ldr	lr, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r10, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r10\n\t"
        "lsl	r10, r7, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r6, r10, r6\n\t"
        "lsr	r10, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r5, r5, r10\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r10, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r10, r6, r10\n\t"
        "adds	lr, lr, r10\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r10, r7, #16\n\t"
        "mul	r6, r10, r6\n\t"
        "lsr	r10, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r5, r5, r10\n\t"
#else
        "umull	r6, r10, r8, r7\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r5, r10, #0\n\t"
#endif
        "adds	lr, lr, r4\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+3] += m[3] * mu */
        "ldr	r7, [%[m], #12]\n\t"
        "ldr	r10, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #12]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+4] += m[4] * mu */
        "ldr	r7, [%[m], #16]\n\t"
        "ldr	r10, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #16]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+5] += m[5] * mu */
        "ldr	r7, [%[m], #20]\n\t"
        "ldr	r10, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #20]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+6] += m[6] * mu */
        "ldr	r7, [%[m], #24]\n\t"
        "ldr	r10, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #24]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+7] += m[7] * mu */
        "ldr	r7, [%[m], #28]\n\t"
        "ldr	r10, [%[a], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #28]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+8] += m[8] * mu */
        "ldr	r7, [%[m], #32]\n\t"
        "ldr	r10, [%[a], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #32]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+9] += m[9] * mu */
        "ldr	r7, [%[m], #36]\n\t"
        "ldr	r10, [%[a], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #36]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+10] += m[10] * mu */
        "ldr	r7, [%[m], #40]\n\t"
        "ldr	r10, [%[a], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #40]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+11] += m[11] * mu */
        "ldr	r7, [%[m], #44]\n\t"
        "ldr	r10, [%[a], #44]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #44]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+12] += m[12] * mu */
        "ldr	r7, [%[m], #48]\n\t"
        "ldr	r10, [%[a], #48]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #48]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+13] += m[13] * mu */
        "ldr	r7, [%[m], #52]\n\t"
        "ldr	r10, [%[a], #52]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #52]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+14] += m[14] * mu */
        "ldr	r7, [%[m], #56]\n\t"
        "ldr	r10, [%[a], #56]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #56]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+15] += m[15] * mu */
        "ldr	r7, [%[m], #60]\n\t"
        "ldr	r10, [%[a], #60]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #60]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+16] += m[16] * mu */
        "ldr	r7, [%[m], #64]\n\t"
        "ldr	r10, [%[a], #64]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #64]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+17] += m[17] * mu */
        "ldr	r7, [%[m], #68]\n\t"
        "ldr	r10, [%[a], #68]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #68]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+18] += m[18] * mu */
        "ldr	r7, [%[m], #72]\n\t"
        "ldr	r10, [%[a], #72]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #72]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+19] += m[19] * mu */
        "ldr	r7, [%[m], #76]\n\t"
        "ldr	r10, [%[a], #76]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #76]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+20] += m[20] * mu */
        "ldr	r7, [%[m], #80]\n\t"
        "ldr	r10, [%[a], #80]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #80]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+21] += m[21] * mu */
        "ldr	r7, [%[m], #84]\n\t"
        "ldr	r10, [%[a], #84]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #84]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+22] += m[22] * mu */
        "ldr	r7, [%[m], #88]\n\t"
        "ldr	r10, [%[a], #88]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #88]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+23] += m[23] * mu */
        "ldr	r7, [%[m], #92]\n\t"
        "ldr	r10, [%[a], #92]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #92]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+24] += m[24] * mu */
        "ldr	r7, [%[m], #96]\n\t"
        "ldr	r10, [%[a], #96]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #96]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+25] += m[25] * mu */
        "ldr	r7, [%[m], #100]\n\t"
        "ldr	r10, [%[a], #100]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #100]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+26] += m[26] * mu */
        "ldr	r7, [%[m], #104]\n\t"
        "ldr	r10, [%[a], #104]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #104]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+27] += m[27] * mu */
        "ldr	r7, [%[m], #108]\n\t"
        "ldr	r10, [%[a], #108]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #108]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+28] += m[28] * mu */
        "ldr	r7, [%[m], #112]\n\t"
        "ldr	r10, [%[a], #112]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #112]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+29] += m[29] * mu */
        "ldr	r7, [%[m], #116]\n\t"
        "ldr	r10, [%[a], #116]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #116]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+30] += m[30] * mu */
        "ldr	r7, [%[m], #120]\n\t"
        "ldr	r10, [%[a], #120]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #120]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+31] += m[31] * mu */
        "ldr	r7, [%[m], #124]\n\t"
        "ldr	r10, [%[a], #124]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #124]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+32] += m[32] * mu */
        "ldr	r7, [%[m], #128]\n\t"
        "ldr	r10, [%[a], #128]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #128]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+33] += m[33] * mu */
        "ldr	r7, [%[m], #132]\n\t"
        "ldr	r10, [%[a], #132]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #132]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+34] += m[34] * mu */
        "ldr	r7, [%[m], #136]\n\t"
        "ldr	r10, [%[a], #136]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #136]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+35] += m[35] * mu */
        "ldr	r7, [%[m], #140]\n\t"
        "ldr	r10, [%[a], #140]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #140]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+36] += m[36] * mu */
        "ldr	r7, [%[m], #144]\n\t"
        "ldr	r10, [%[a], #144]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #144]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+37] += m[37] * mu */
        "ldr	r7, [%[m], #148]\n\t"
        "ldr	r10, [%[a], #148]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #148]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+38] += m[38] * mu */
        "ldr	r7, [%[m], #152]\n\t"
        "ldr	r10, [%[a], #152]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #152]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+39] += m[39] * mu */
        "ldr	r7, [%[m], #156]\n\t"
        "ldr	r10, [%[a], #156]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #156]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+40] += m[40] * mu */
        "ldr	r7, [%[m], #160]\n\t"
        "ldr	r10, [%[a], #160]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #160]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+41] += m[41] * mu */
        "ldr	r7, [%[m], #164]\n\t"
        "ldr	r10, [%[a], #164]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #164]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+42] += m[42] * mu */
        "ldr	r7, [%[m], #168]\n\t"
        "ldr	r10, [%[a], #168]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #168]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+43] += m[43] * mu */
        "ldr	r7, [%[m], #172]\n\t"
        "ldr	r10, [%[a], #172]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #172]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+44] += m[44] * mu */
        "ldr	r7, [%[m], #176]\n\t"
        "ldr	r10, [%[a], #176]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #176]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+45] += m[45] * mu */
        "ldr	r7, [%[m], #180]\n\t"
        "ldr	r10, [%[a], #180]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #180]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+46] += m[46] * mu */
        "ldr	r7, [%[m], #184]\n\t"
        "ldr	r10, [%[a], #184]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #184]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+47] += m[47] * mu */
        "ldr	r7, [%[m], #188]\n\t"
        "ldr	r10, [%[a], #188]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #188]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+48] += m[48] * mu */
        "ldr	r7, [%[m], #192]\n\t"
        "ldr	r10, [%[a], #192]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #192]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+49] += m[49] * mu */
        "ldr	r7, [%[m], #196]\n\t"
        "ldr	r10, [%[a], #196]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #196]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+50] += m[50] * mu */
        "ldr	r7, [%[m], #200]\n\t"
        "ldr	r10, [%[a], #200]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #200]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+51] += m[51] * mu */
        "ldr	r7, [%[m], #204]\n\t"
        "ldr	r10, [%[a], #204]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #204]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+52] += m[52] * mu */
        "ldr	r7, [%[m], #208]\n\t"
        "ldr	r10, [%[a], #208]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #208]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+53] += m[53] * mu */
        "ldr	r7, [%[m], #212]\n\t"
        "ldr	r10, [%[a], #212]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #212]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+54] += m[54] * mu */
        "ldr	r7, [%[m], #216]\n\t"
        "ldr	r10, [%[a], #216]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #216]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+55] += m[55] * mu */
        "ldr	r7, [%[m], #220]\n\t"
        "ldr	r10, [%[a], #220]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #220]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+56] += m[56] * mu */
        "ldr	r7, [%[m], #224]\n\t"
        "ldr	r10, [%[a], #224]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #224]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+57] += m[57] * mu */
        "ldr	r7, [%[m], #228]\n\t"
        "ldr	r10, [%[a], #228]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #228]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+58] += m[58] * mu */
        "ldr	r7, [%[m], #232]\n\t"
        "ldr	r10, [%[a], #232]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #232]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+59] += m[59] * mu */
        "ldr	r7, [%[m], #236]\n\t"
        "ldr	r10, [%[a], #236]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #236]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+60] += m[60] * mu */
        "ldr	r7, [%[m], #240]\n\t"
        "ldr	r10, [%[a], #240]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #240]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+61] += m[61] * mu */
        "ldr	r7, [%[m], #244]\n\t"
        "ldr	r10, [%[a], #244]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #244]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+62] += m[62] * mu */
        "ldr	r7, [%[m], #248]\n\t"
        "ldr	r10, [%[a], #248]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #248]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+63] += m[63] * mu */
#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4))
        "ldr	r7, [%[m], #252]\n\t"
#else
        "ldr	r11, [%[m], #252]\n\t"
#endif
        "ldr	r10, [%[a], #252]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r11, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r4, r3, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, r3\n\t"
        "lsr	r7, r11, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "mov	r6, r8\n\t"
        "lsr	r7, r11, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "lsl	r7, r11, #16\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r4, r7, r3\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, r3\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #252]\n\t"
        "ldr	r10, [%[a], #256]\n\t"
        "adcs	r10, r10, r4\n\t"
        "str	r10, [%[a], #256]\n\t"
        "adc	r3, r3, #0\n\t"
        /* i += 1 */
        "add	r9, r9, #4\n\t"
        "add	%[a], %[a], #4\n\t"
        "cmp	r9, #0x100\n\t"
        "blt	L_sp_2048_mont_reduce_64_word_%=\n\t"
        "str	r12, [%[a]]\n\t"
        "str	lr, [%[a], #4]\n\t"
        "mov	%[mp], r3\n\t"
        : [a] "+r" (a), [m] "+r" (m), [mp] "+r" (mp)
        :
        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11"
    );
    sp_2048_cond_sub_64(a - 64, a, m, (sp_digit)0 - mp);
}

/* Montgomery multiplication. (r = a * b mod m)
 *
 * The product is first written to r by sp_2048_mul_64() and r is then
 * reduced in place by sp_2048_mont_reduce_64().
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_2048_mont_mul_64(
        sp_digit* r,
        const sp_digit* a,
        const sp_digit* b,
        const sp_digit* m,
        sp_digit mp)
{
    /* Step 1: plain multiplication of a by b into r. */
    sp_2048_mul_64(r, a, b);

    /* Step 2: Montgomery reduction of r with modulus m and multiplier mp. */
    sp_2048_mont_reduce_64(r, m, mp);
}

/* Montgomery squaring. (r = a * a mod m)
 *
 * The square is first written to r by sp_2048_sqr_64() and r is then
 * reduced in place by sp_2048_mont_reduce_64().
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_2048_mont_sqr_64(
        sp_digit* r,
        const sp_digit* a,
        const sp_digit* m,
        sp_digit mp)
{
    /* Step 1: plain squaring of a into r. */
    sp_2048_sqr_64(r, a);

    /* Step 2: Montgomery reduction of r with modulus m and multiplier mp. */
    sp_2048_mont_reduce_64(r, m, mp);
}

#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into r. (r = a - b)
 *
 * Loop implementation: 0x100 bytes (64 32-bit words) of a and b are
 * processed four words per iteration. The borrow is kept in r12 between
 * iterations because the loop's compare overwrites the carry flag.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 when the subtraction did not borrow out of the top word and
 * 0xffffffff when it did.
 */
static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r12 = saved borrow (0 = none), lr = a + 0x100 (loop end address) */
        "mov	r12, #0\n\t"
        "add	lr, %[a], #0x100\n\t"
        "\n"
    "L_sp_2048_sub_64_word_%=: \n\t"
        /* Move the saved borrow back into the carry flag: 0 - r12 leaves
         * carry set (no borrow) for r12 == 0 and clear for r12 == -1. */
        "rsbs	r12, r12, #0\n\t"
        /* Load four words of each operand, advancing both pointers. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        /* Subtract with borrow across the four words. */
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        /* Store the four result words, advancing r. */
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Save the borrow: r3 - r3 - !carry gives 0 or -1. */
        "sbc	r12, r3, r3\n\t"
        /* Loop until a reaches the end address (this changes the flags). */
        "cmp	%[a], lr\n\t"
        "bne	L_sp_2048_sub_64_word_%=\n\t"
        /* The r operand is reused to carry the final borrow out. */
        "mov	%[r], r12\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        /* "cc": the condition flags are modified by rsbs/sbcs/cmp. */
        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "lr"
    );
    return (uint32_t)(size_t)r;
}

#else
/* Sub b from a into r. (r = a - b)
 *
 * Fully unrolled implementation: sixteen groups of four 32-bit words
 * (64 words in total) are subtracted with the borrow chained through the
 * carry flag from one group to the next.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 when the subtraction did not borrow out of the top word and
 * 0xffffffff when it did.
 */
static sp_digit sp_2048_sub_64(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Words 0..3: first subtract has no incoming borrow (subs). */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "subs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4..7 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8..11 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 12..15 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 16..19 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 20..23 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 24..27 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 28..31 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 32..35 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 36..39 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 40..43 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 44..47 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 48..51 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 52..55 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 56..59 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 60..63 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Final borrow: r6 - r6 - !carry gives 0 or -1. The r operand is
         * reused to carry it out of the asm block. */
        "sbc	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        /* "cc": the condition flags are modified by subs/sbcs. */
        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
    );
    return (uint32_t)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_USE_UDIV
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Variant built when WOLFSSL_SP_USE_UDIV is defined: it uses the udiv
 * instruction. The quotient is accumulated in r4 from successive estimates,
 * each obtained by dividing by ((div >> 16) + 1); after an estimate the
 * matching multiple of div is subtracted from the running value in d1|d0.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 *
 * Note that this is an approximate div. It may give an answer 1 larger.
 */
static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div)
{
    __asm__ __volatile__ (
        /* r6 = top 16 bits of div, lr = r6 + 1: divisor for the estimates. */
        "lsr	r6, %[div], #16\n\t"
        "add	lr, r6, #1\n\t"
        /* First estimate from d1 alone, placed in the top half of r4. */
        "udiv	r4, %[d1], lr\n\t"
        "lsl	r5, %[div], #16\n\t"
        "lsl	r4, r4, #16\n\t"
        /* d1|d0 -= div * r4 */
        "umull	r3, r12, %[div], r4\n\t"
        "subs	%[d0], %[d0], r3\n\t"
        "sbc	%[d1], %[d1], r12\n\t"
        /* r7 = (d1 >= lr) ? 1 : 0 and r8 = 0 - r7 (all ones or zero). */
        "subs	r3, %[d1], lr\n\t"
        "sbc	r7, r7, r7\n\t"
        "add	r7, r7, #1\n\t"
        "rsb	r8, r7, #0\n\t"
        "lsl	r7, r7, #16\n\t"
        /* Under mask r8, without branching: subtract (div << 16) from d1|d0
         * (low word r5, high word r6) and add (1 << 16) to the quotient. */
        "and	r5, r5, r8\n\t"
        "and	r6, r6, r8\n\t"
        "subs	%[d0], %[d0], r5\n\t"
        "add	r4, r4, r7\n\t"
        "sbc	%[d1], %[d1], r6\n\t"
        /* Second estimate: bits 16..47 of d1|d0 divided by lr; add it to
         * the quotient and subtract div * estimate from d1|d0. */
        "lsl	r12, %[d1], #16\n\t"
        "lsr	r3, %[d0], #16\n\t"
        "orr	r3, r3, r12\n\t"
        "udiv	r3, r3, lr\n\t"
        "add	r4, r4, r3\n\t"
        "umull	r3, r12, %[div], r3\n\t"
        "subs	%[d0], %[d0], r3\n\t"
        "sbc	%[d1], %[d1], r12\n\t"
        /* Third estimate, same form; only the low word d0 is updated. */
        "lsl	r12, %[d1], #16\n\t"
        "lsr	r3, %[d0], #16\n\t"
        "orr	r3, r3, r12\n\t"
        "udiv	r3, r3, lr\n\t"
        "add	r4, r4, r3\n\t"
        "mul	r3, %[div], r3\n\t"
        "sub	%[d0], %[d0], r3\n\t"
        /* Last step: divide what is left in d0 by the full divisor and
         * return the accumulated quotient in d1. */
        "udiv	r3, %[d0], %[div]\n\t"
        "add	%[d1], r4, r3\n\t"
        : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
        :
        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
    );
    return (uint32_t)(size_t)d1;
}

#else
/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Variant built when WOLFSSL_SP_USE_UDIV is not defined: no divide
 * instruction is used. A first quotient estimate is produced bit by bit
 * against ((div >> 1) + 1) and is then refined by three multiply/subtract
 * rounds against the full divisor.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result of the division.
 *
 * Note that this is an approximate div. It may give an answer 1 larger.
 */
static sp_digit div_2048_word_64(sp_digit d1, sp_digit d0, sp_digit div)
{
    __asm__ __volatile__ (
        /* lr = (div >> 1) + 1; r5|r4 = working copy of d1|d0. */
        "lsr	lr, %[div], #1\n\t"
        "add	lr, lr, #1\n\t"
        "mov	r4, %[d0]\n\t"
        "mov	r5, %[d1]\n\t"
        /* Do top 32 */
        /* r6 = all ones when r5 > lr, else zero; quotient bit r3 = 0 - r6,
         * and lr is subtracted from r5 only under that mask. */
        "subs	r6, lr, r5\n\t"
        "sbc	r6, r6, r6\n\t"
        "mov	r3, #0\n\t"
        "sub	r3, r3, r6\n\t"
        "and	r6, r6, lr\n\t"
        "subs	r5, r5, r6\n\t"
        /* Next 30 bits */
        /* r12 counts 29 down to 0. Each pass shifts r5|r4 left by one bit,
         * doubles the quotient and repeats the masked compare/subtract. */
        "mov	r12, #29\n\t"
        "\n"
    "L_div_2048_word_64_bit_%=: \n\t"
        "lsls	r4, r4, #1\n\t"
        "adc	r5, r5, r5\n\t"
        "subs	r6, lr, r5\n\t"
        "sbc	r6, r6, r6\n\t"
        "add	r3, r3, r3\n\t"
        "sub	r3, r3, r6\n\t"
        "and	r6, r6, lr\n\t"
        "subs	r5, r5, r6\n\t"
        "subs	r12, r12, #1\n\t"
        "bpl	L_div_2048_word_64_bit_%=\n\t"
        /* r3 = (r3 * 2) + 1 */
        "add	r3, r3, r3\n\t"
        "add	r3, r3, #1\n\t"
        /* Refinement round 1: r5|r4 = r3 * div. The first preprocessor
         * branch forms the same 64-bit product as the umull in the other
         * branch, from 16-bit halves using mul only. */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r7, r3, #16\n\t"
        "lsl	r4, %[div], #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "lsr	r4, r4, #16\n\t"
        "mul	r4, r7, r4\n\t"
        "lsr	r8, %[div], #16\n\t"
        "mul	r7, r8, r7\n\t"
        "lsr	r5, r7, #16\n\t"
        "lsl	r7, r7, #16\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r3, #16\n\t"
        "mul	r8, r7, r8\n\t"
        "add	r5, r5, r8\n\t"
        "lsl	r8, %[div], #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r7, r8, r7\n\t"
        "lsr	r8, r7, #16\n\t"
        "lsl	r7, r7, #16\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, r8\n\t"
#else
        "umull	r4, r5, r3, %[div]\n\t"
#endif
        /* r8|r7 = d1|d0 - r5|r4; the high word r8 is added to the estimate. */
        "subs	r7, %[d0], r4\n\t"
        "sbc	r8, %[d1], r5\n\t"
        "add	r3, r3, r8\n\t"
        /* Refinement round 2: same steps with the updated estimate. */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r7, r3, #16\n\t"
        "lsl	r4, %[div], #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "lsr	r4, r4, #16\n\t"
        "mul	r4, r7, r4\n\t"
        "lsr	r8, %[div], #16\n\t"
        "mul	r7, r8, r7\n\t"
        "lsr	r5, r7, #16\n\t"
        "lsl	r7, r7, #16\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r3, #16\n\t"
        "mul	r8, r7, r8\n\t"
        "add	r5, r5, r8\n\t"
        "lsl	r8, %[div], #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r7, r8, r7\n\t"
        "lsr	r8, r7, #16\n\t"
        "lsl	r7, r7, #16\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, r8\n\t"
#else
        "umull	r4, r5, r3, %[div]\n\t"
#endif
        "subs	r7, %[d0], r4\n\t"
        "sbc	r8, %[d1], r5\n\t"
        "add	r3, r3, r8\n\t"
        /* Refinement round 3: same steps again. */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r7, r3, #16\n\t"
        "lsl	r4, %[div], #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "lsr	r4, r4, #16\n\t"
        "mul	r4, r7, r4\n\t"
        "lsr	r8, %[div], #16\n\t"
        "mul	r7, r8, r7\n\t"
        "lsr	r5, r7, #16\n\t"
        "lsl	r7, r7, #16\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r3, #16\n\t"
        "mul	r8, r7, r8\n\t"
        "add	r5, r5, r8\n\t"
        "lsl	r8, %[div], #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r7, r8, r7\n\t"
        "lsr	r8, r7, #16\n\t"
        "lsl	r7, r7, #16\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, r8\n\t"
#else
        "umull	r4, r5, r3, %[div]\n\t"
#endif
        "subs	r7, %[d0], r4\n\t"
        "sbc	r8, %[d1], r5\n\t"
        "add	r3, r3, r8\n\t"
        /* Final adjust: r6 = all ones when the low word of the last
         * difference (r7) is greater than div; subtracting it adds one. */
        "subs	r6, %[div], r7\n\t"
        "sbc	r6, r6, r6\n\t"
        "sub	%[d1], r3, r6\n\t"
        : [d1] "+r" (d1), [d0] "+r" (d0), [div] "+r" (div)
        :
        : "memory", "r3", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
    );
    return (uint32_t)(size_t)d1;
}

#endif
/* Compute the remainder of a divided by d. (m*d + r = a)
 * The quotient m is not produced as it is not needed at this time.
 *
 * This variant takes data-dependent branches: early-exit digit scans and a
 * correction step that is only run when the top digit is non-zero.
 *
 * a  Number to be divided (128 digits are copied from it).
 * d  Number to divide with (64 digits).
 * m  Multiplier result - unused.
 * r  Remainder from the division.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_2048_div_64_cond(const sp_digit* a, const sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    sp_digit num[128];
    sp_digit prod[65];
    sp_digit top;
    sp_digit q;
    int k;
    int fix;

    (void)m;

    top = d[63];
    XMEMCPY(num, a, sizeof(*num) * 2 * 64);

    /* Locate the most significant digit where the upper half of the
     * working copy differs from d (stops at digit 0 when all are equal). */
    k = 63;
    while ((k > 0) && (num[k + 64] == d[k])) {
        k--;
    }
    /* Upper half not below d: take d off it once before the main loop. */
    if (num[k + 64] >= d[k]) {
        sp_2048_sub_in_place_64(&num[64], d);
    }

    for (k = 63; k >= 0; k--) {
        /* 65 digit window: win[0] .. win[64] is num[k] .. num[k + 64]. */
        sp_digit* win = &num[k];

        /* Quotient digit estimate - all ones when the top digit of the
         * window matches the top digit of the divisor. */
        if (win[64] != top) {
            q = div_2048_word_64(win[64], win[63], top);
        }
        else {
            q = SP_DIGIT_MAX;
        }

        /* Take q * d away from the window. */
        sp_2048_mul_d_64(prod, d, q);
        win[64] += sp_2048_sub_in_place_64(win, prod);
        win[64] -= prod[64];

        /* Add d back, at most twice, while the top digit is non-zero. */
        for (fix = 0; (fix < 2) && (win[64] != 0); fix++) {
            win[64] += sp_2048_add_64(win, win, d);
        }
    }

    /* Final comparison of the low 64 digits against d. */
    k = 63;
    while ((k > 0) && (num[k] == d[k])) {
        k--;
    }
    if (num[k] < d[k]) {
        XMEMCPY(r, num, sizeof(*num) * 64);
    }
    else {
        sp_2048_sub_64(r, num, d);
    }

    return MP_OKAY;
}

/* Reduce a modulo m into r. (r = a mod m)
 * Thin wrapper over sp_2048_div_64_cond that passes no quotient buffer.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_2048_mod_64_cond(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    int err;

    /* Only the remainder is wanted - the quotient argument is NULL. */
    err = sp_2048_div_64_cond(a, m, NULL, r);

    return err;
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
#if defined(WOLFSSL_HAVE_SP_DH) || !defined(WOLFSSL_RSA_PUBLIC_ONLY)
/* Apply a mask to every digit of a number. (r[i] = a[i] & m, 64 digits)
 *
 * r  A single precision integer that receives the masked digits.
 * a  A single precision integer supplying the digits.
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_64(sp_digit* r, const sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    const sp_digit* src = a;
    sp_digit* dst = r;
    int left;

    /* One digit per pass, lowest index first. */
    for (left = 64; left > 0; left--) {
        *dst++ = *src++ & m;
    }
#else
    int blk;

    /* Eight digits per pass, lowest index first. */
    for (blk = 0; blk < (64 / 8); blk++) {
        const sp_digit* src = a + (blk * 8);
        sp_digit* dst = r + (blk * 8);

        dst[0] = src[0] & m;
        dst[1] = src[1] & m;
        dst[2] = src[2] & m;
        dst[3] = src[3] & m;
        dst[4] = src[4] & m;
        dst[5] = src[5] & m;
        dst[6] = src[6] & m;
        dst[7] = src[7] & m;
    }
#endif
}

/* Compare a with b in constant time.
 *
 * All 64 digits are always read, most significant digit (byte offset 252)
 * first. Register roles inside the asm:
 *   r2  - running result, starts at -1
 *   r3  - mask, starts as all ones and is cleared at the first digit that
 *         differs, so every later digit pair is compared as 0 against 0
 *   r5  - constant 0, r6 - constant 1
 * The closing eor turns "no difference found" (r2 = -1, r3 = -1) into 0 and
 * leaves r2 untouched (r3 = 0) once a difference was seen.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively. (The values produced by the asm are -1, 0 and 1.)
 */
static sp_int32 sp_2048_cmp_64(const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        "mov	r2, #-1\n\t"
        "mov	r6, #1\n\t"
        "mov	r5, #0\n\t"
        "mov	r3, #-1\n\t"
#ifdef WOLFSSL_SP_SMALL
        /* Looped form: r4 is the byte offset, 0xfc down to 0 in steps of 4.
         * bcs repeats while the subtract of 4 does not borrow. */
        "mov	r4, #0xfc\n\t"
        "\n"
    "L_sp_2048_cmp_64_words_%=: \n\t"
        "ldr	r12, [%[a], r4]\n\t"
        "ldr	lr, [%[b], r4]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "subs	r4, r4, #4\n\t"
        "bcs	L_sp_2048_cmp_64_words_%=\n\t"
        "eor	r2, r2, r3\n\t"
#else
        /* Unrolled form: the same 11 instruction step for each digit.
         * Step: load both digits, mask them with r3, subtract; higher sets
         * r2 = 1, lower sets r2 = r3 (lower can only happen while r3 is
         * still all ones), any difference clears the mask r3. */
        "ldr	r12, [%[a], #252]\n\t"
        "ldr	lr, [%[b], #252]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #248]\n\t"
        "ldr	lr, [%[b], #248]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #244]\n\t"
        "ldr	lr, [%[b], #244]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #240]\n\t"
        "ldr	lr, [%[b], #240]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #236]\n\t"
        "ldr	lr, [%[b], #236]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #232]\n\t"
        "ldr	lr, [%[b], #232]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #228]\n\t"
        "ldr	lr, [%[b], #228]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #224]\n\t"
        "ldr	lr, [%[b], #224]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #220]\n\t"
        "ldr	lr, [%[b], #220]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #216]\n\t"
        "ldr	lr, [%[b], #216]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #212]\n\t"
        "ldr	lr, [%[b], #212]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #208]\n\t"
        "ldr	lr, [%[b], #208]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #204]\n\t"
        "ldr	lr, [%[b], #204]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #200]\n\t"
        "ldr	lr, [%[b], #200]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #196]\n\t"
        "ldr	lr, [%[b], #196]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #192]\n\t"
        "ldr	lr, [%[b], #192]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #188]\n\t"
        "ldr	lr, [%[b], #188]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #184]\n\t"
        "ldr	lr, [%[b], #184]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #180]\n\t"
        "ldr	lr, [%[b], #180]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #176]\n\t"
        "ldr	lr, [%[b], #176]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #172]\n\t"
        "ldr	lr, [%[b], #172]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #168]\n\t"
        "ldr	lr, [%[b], #168]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #164]\n\t"
        "ldr	lr, [%[b], #164]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #160]\n\t"
        "ldr	lr, [%[b], #160]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #156]\n\t"
        "ldr	lr, [%[b], #156]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #152]\n\t"
        "ldr	lr, [%[b], #152]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #148]\n\t"
        "ldr	lr, [%[b], #148]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #144]\n\t"
        "ldr	lr, [%[b], #144]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #140]\n\t"
        "ldr	lr, [%[b], #140]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #136]\n\t"
        "ldr	lr, [%[b], #136]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #132]\n\t"
        "ldr	lr, [%[b], #132]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #128]\n\t"
        "ldr	lr, [%[b], #128]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #124]\n\t"
        "ldr	lr, [%[b], #124]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #120]\n\t"
        "ldr	lr, [%[b], #120]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #116]\n\t"
        "ldr	lr, [%[b], #116]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #112]\n\t"
        "ldr	lr, [%[b], #112]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #108]\n\t"
        "ldr	lr, [%[b], #108]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #104]\n\t"
        "ldr	lr, [%[b], #104]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #100]\n\t"
        "ldr	lr, [%[b], #100]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #96]\n\t"
        "ldr	lr, [%[b], #96]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #92]\n\t"
        "ldr	lr, [%[b], #92]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #88]\n\t"
        "ldr	lr, [%[b], #88]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #84]\n\t"
        "ldr	lr, [%[b], #84]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #80]\n\t"
        "ldr	lr, [%[b], #80]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #76]\n\t"
        "ldr	lr, [%[b], #76]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #72]\n\t"
        "ldr	lr, [%[b], #72]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #68]\n\t"
        "ldr	lr, [%[b], #68]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #64]\n\t"
        "ldr	lr, [%[b], #64]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #60]\n\t"
        "ldr	lr, [%[b], #60]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #56]\n\t"
        "ldr	lr, [%[b], #56]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #52]\n\t"
        "ldr	lr, [%[b], #52]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #48]\n\t"
        "ldr	lr, [%[b], #48]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #44]\n\t"
        "ldr	lr, [%[b], #44]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #40]\n\t"
        "ldr	lr, [%[b], #40]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #36]\n\t"
        "ldr	lr, [%[b], #36]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #32]\n\t"
        "ldr	lr, [%[b], #32]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #28]\n\t"
        "ldr	lr, [%[b], #28]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #24]\n\t"
        "ldr	lr, [%[b], #24]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #20]\n\t"
        "ldr	lr, [%[b], #20]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #16]\n\t"
        "ldr	lr, [%[b], #16]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #12]\n\t"
        "ldr	lr, [%[b], #12]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #8]\n\t"
        "ldr	lr, [%[b], #8]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a], #4]\n\t"
        "ldr	lr, [%[b], #4]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        "ldr	r12, [%[a]]\n\t"
        "ldr	lr, [%[b]]\n\t"
        "and	r12, r12, r3\n\t"
        "and	lr, lr, r3\n\t"
        "subs	r12, r12, lr\n\t"
        "it	hi\n\t"
        "movhi	r2, r6\n\t"
        "it	lo\n\t"
        "movlo	r2, r3\n\t"
        "it	ne\n\t"
        "movne	r3, r5\n\t"
        /* All equal: -1 ^ -1 = 0. Otherwise r3 is 0 and r2 is kept. */
        "eor	r2, r2, r3\n\t"
#endif /*WOLFSSL_SP_SMALL */
        /* The result leaves the asm through the register that held a. */
        "mov	%[a], r2\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r12", "lr", "r4", "r5", "r6"
    );
    return (uint32_t)(size_t)a;
}

/* Compute the remainder of a divided by d. (m*d + r = a)
 * The quotient m is not produced as it is not needed at this time.
 *
 * Unlike the _cond variant, every correction here is applied through masks
 * and conditional-subtract helpers rather than through if statements.
 *
 * a  Number to be divided (128 digits are copied from it).
 * d  Number to divide with (64 digits).
 * m  Multiplier result - unused.
 * r  Remainder from the division.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_2048_div_64(const sp_digit* a, const sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    sp_digit num[128];
    sp_digit prod[65];
    sp_digit top;
    sp_digit q;
    sp_digit ge;
    int k;
    int fix;

    (void)m;

    top = d[63];
    XMEMCPY(num, a, sizeof(*num) * 2 * 64);

    /* Take d off the upper half when the upper half is not below d. */
    ge = sp_2048_cmp_64(&num[64], d) >= 0;
    sp_2048_cond_sub_64(&num[64], &num[64], d, (sp_digit)0 - ge);

    for (k = 63; k >= 0; k--) {
        /* 65 digit window: win[0] .. win[64] is num[k] .. num[k + 64]. */
        sp_digit* win = &num[k];
        /* All ones when the top digit of the window equals the top digit
         * of the divisor; then hi is one less than that digit and the
         * estimate is forced to all ones by the OR below. */
        sp_digit sat = 0 - (win[64] == top);
        sp_digit hi = win[64] + sat;

        q = div_2048_word_64(hi, win[63], top) | sat;

        /* Take q * d away from the window. */
        sp_2048_mul_d_64(prod, d, q);
        win[64] += sp_2048_sub_in_place_64(win, prod);
        win[64] -= prod[64];

        /* Two correction passes: add (d masked by the top digit) back. */
        for (fix = 0; fix < 2; fix++) {
            sp_2048_mask_64(prod, d, win[64]);
            win[64] += sp_2048_add_64(win, win, prod);
        }
    }

    /* Final conditional subtract of d from the low 64 digits into r. */
    ge = sp_2048_cmp_64(num, d) >= 0;
    sp_2048_cond_sub_64(r, num, d, (sp_digit)0 - ge);

    return MP_OKAY;
}

/* Reduce a modulo m into r. (r = a mod m)
 * Thin wrapper over sp_2048_div_64 that passes no quotient buffer.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static WC_INLINE int sp_2048_mod_64(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    int err;

    /* Only the remainder is wanted - the quotient argument is NULL. */
    err = sp_2048_div_64(a, m, NULL, r);

    return err;
}

#endif /* WOLFSSL_HAVE_SP_DH || !WOLFSSL_RSA_PUBLIC_ONLY */
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \
                                                     defined(WOLFSSL_HAVE_SP_DH)
#ifdef WOLFSSL_SP_SMALL
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Small-code variant: fixed 3-bit window over the exponent, working in
 * Montgomery form with a table t[0..7] of the powers a^0 .. a^7.
 *
 * r        A single precision number that is the result of the operation.
 *          Used as a 128 word work area; the result is in the low 64 words.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent (32-bit words).
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is reduced modulo m before being converted to
 *          Montgomery form. When zero, a is used as given.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e,
        int bits, const sp_digit* m, int reduceA)
{
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_digit* td = NULL;
#else
    sp_digit td[8 * 128];
#endif
    sp_digit* t[8];
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (8 * 128), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Eight table entries of 128 words each; t[0] doubles as norm. */
        norm = td;
        for (i=0; i<8; i++) {
            t[i] = td + i * 128;
        }

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_64(norm, m);

        /* t[1] = (a << 2048) mod m: a is placed in the upper 64 words above
         * a zeroed lower half and the 128 word value is reduced. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 64U);
        if (reduceA != 0) {
            err = sp_2048_mod_64(t[1] + 64, a, m);
            if (err == MP_OKAY) {
                err = sp_2048_mod_64(t[1], t[1], m);
            }
        }
        else {
            XMEMCPY(t[1] + 64, a, sizeof(sp_digit) * 64);
            err = sp_2048_mod_64(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill in the remaining powers: t[k] = a^k in Montgomery form. */
        sp_2048_mont_sqr_64(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_64(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_64(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_64(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_64(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_64(t[ 7], t[ 4], t[ 3], m, mp);

        /* The first window takes the top (bits % 3) bits so that every
         * later window is exactly 3 bits. n is the current exponent word,
         * left aligned; c is the number of unconsumed bits left in n. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 3;
        if (c == 32) {
            /* Full top word and bits % 3 == 0: start with a 3-bit window. */
            c = 29;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed.
             * Borrow the missing bits from the next 32-bit word. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        XMEMCPY(r, t[y], sizeof(sp_digit) * 64);
        for (; i>=0 || c>=3; ) {
            if (c == 0) {
                /* Current word exhausted: load the next one. */
                n = e[i--];
                y = (byte)(n >> 29);
                n <<= 3;
                c = 29;
            }
            else if (c < 3) {
                /* Window straddles two exponent words. */
                y = (byte)(n >> 29);
                n = e[i--];
                c = 3 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 29) & 0x7);
                n <<= 3;
                c -= 3;
            }

            /* r = r^8 * a^y */
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);

            sp_2048_mont_mul_64(r, r, t[y], m, mp);
        }

        /* Convert out of Montgomery form and bring the result below m. */
        XMEMSET(&r[64], 0, sizeof(sp_digit) * 64U);
        sp_2048_mont_reduce_64(r, m, mp);

        mask = 0 - (sp_2048_cmp_64(r, m) >= 0);
        sp_2048_cond_sub_64(r, r, m, mask);
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#else
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 4-bit window over the exponent, working in Montgomery form with a
 * table t[0..15] of the powers a^0 .. a^15.
 *
 * r        A single precision number that is the result of the operation.
 *          Used as a 128 word work area; the result is in the low 64 words.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent (32-bit words).
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is reduced modulo m before being converted to
 *          Montgomery form. When zero, a is used as given.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_2048_mod_exp_64(sp_digit* r, const sp_digit* a, const sp_digit* e,
        int bits, const sp_digit* m, int reduceA)
{
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_digit* td = NULL;
#else
    sp_digit td[16 * 128];
#endif
    sp_digit* t[16];
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * (16 * 128), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Sixteen table entries of 128 words each; t[0] doubles as norm. */
        norm = td;
        for (i=0; i<16; i++) {
            t[i] = td + i * 128;
        }

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_64(norm, m);

        /* t[1] = (a << 2048) mod m: a is placed in the upper 64 words above
         * a zeroed lower half and the 128 word value is reduced. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 64U);
        if (reduceA != 0) {
            err = sp_2048_mod_64(t[1] + 64, a, m);
            if (err == MP_OKAY) {
                err = sp_2048_mod_64(t[1], t[1], m);
            }
        }
        else {
            XMEMCPY(t[1] + 64, a, sizeof(sp_digit) * 64);
            err = sp_2048_mod_64(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill in the remaining powers: t[k] = a^k in Montgomery form. */
        sp_2048_mont_sqr_64(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_64(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_64(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_64(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_64(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_64(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_64(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_64(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_64(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_64(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_64(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_64(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_64(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_64(t[15], t[ 8], t[ 7], m, mp);

        /* The first window takes the top (bits % 4) bits so that every
         * later window is exactly 4 bits. n is the current exponent word,
         * left aligned; c is the number of unconsumed bits left in n. */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 4;
        if (c == 32) {
            /* Full top word and bits % 4 == 0: start with a 4-bit window. */
            c = 28;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed.
             * Not expected for a 4-bit window (4 divides the 32-bit word
             * size, so c is never negative for positive bits); kept in step
             * with the 3-bit variant and sized for 32-bit words. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        XMEMCPY(r, t[y], sizeof(sp_digit) * 64);
        for (; i>=0 || c>=4; ) {
            if (c == 0) {
                /* Current word exhausted: load the next one. */
                n = e[i--];
                y = (byte)(n >> 28);
                n <<= 4;
                c = 28;
            }
            else if (c < 4) {
                /* Window straddles two exponent words. */
                y = (byte)(n >> 28);
                n = e[i--];
                c = 4 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }

            /* r = r^16 * a^y */
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);

            sp_2048_mont_mul_64(r, r, t[y], m, mp);
        }

        /* Convert out of Montgomery form and bring the result below m. */
        XMEMSET(&r[64], 0, sizeof(sp_digit) * 64U);
        sp_2048_mont_reduce_64(r, m, mp);

        mask = 0 - (sp_2048_cmp_64(r, m) >= 0);
        sp_2048_cond_sub_64(r, r, m, mask);
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* WOLFSSL_SP_SMALL */
#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */

#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation.
 *
 * Computes in^em mod mm for a 2048-bit modulus and a public exponent of at
 * most 32 bits. Exponents 0x10001 and 0x3 have dedicated code paths; any
 * other exponent uses a bit-by-bit square-and-multiply in Montgomery form.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * em      Public exponent.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 256 bytes long.
 * outLen  Number of bytes in result.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long or the modulus is not 2048 bits, MP_VAL when the
 * modulus is even, MP_EXPTMOD_E when the exponent is zero and MEMORY_E when
 * dynamic memory allocation fails.
 */
int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em,
    const mp_int* mm, byte* out, word32* outLen)
{
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_digit* a = NULL;
#else
    sp_digit a[64 * 5];
#endif
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    sp_digit *ah = NULL;
    sp_digit e[1] = {0};
    int err = MP_OKAY;

    /* Validate sizes before any work is done. */
    if (*outLen < 256) {
        err = MP_TO_E;
    }
    else if (mp_count_bits(em) > 32 || inLen > 256 ||
                                                     mp_count_bits(mm) != 2048) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mm)) {
        err = MP_VAL;
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 64 * 5, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Carve up the 5 * 64 word buffer:
         *   a  [  0.. 63]  low half of the 128 word value a:ah
         *   ah [ 64..127]  the base read from in
         *   r  [128..255]  128 word result / work area
         *   m  [256..319]  the modulus
         */
        ah = a + 64;
        r = a + 64 * 2;
        m = r + 64 * 2;

        sp_2048_from_bin(ah, 64, in, inLen);
        /* Gather the (at most 32-bit) exponent into a single word. */
#if DIGIT_BIT >= 32
        e[0] = em->dp[0];
#else
        e[0] = em->dp[0];
        if (em->used > 1) {
            e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
        }
#endif
        if (e[0] == 0) {
            err = MP_EXPTMOD_E;
        }
    }
    if (err == MP_OKAY) {
        sp_2048_from_mp(m, 64, mm);

        if (e[0] == 0x10001) {
            int i;
            sp_digit mp;

            sp_2048_mont_setup(m, &mp);

            /* Convert to Montgomery form. */
            /* Zeroing the low half makes a:ah the base shifted up 64 words. */
            XMEMSET(a, 0, sizeof(sp_digit) * 64);
            err = sp_2048_mod_64_cond(r, a, m);
            /* Montgomery form: r = a.R mod m */

            if (err == MP_OKAY) {
                /* r = a ^ 0x10000 => r = a squared 16 times */
                for (i = 15; i >= 0; i--) {
                    sp_2048_mont_sqr_64(r, r, m, mp);
                }
                /* mont_red(r.R.R) = (r.R.R / R) mod m = r.R mod m
                 * mont_red(r.R * a) = (r.R.a / R) mod m = r.a mod m
                 */
                /* Multiplying by the plain (non-Montgomery) base both applies
                 * the final factor of a and removes R from the result. */
                sp_2048_mont_mul_64(r, r, ah, m, mp);

                /* Find the most significant word where r and m differ and
                 * subtract m once when r >= m. */
                for (i = 63; i > 0; i--) {
                    if (r[i] != m[i]) {
                        break;
                    }
                }
                if (r[i] >= m[i]) {
                    sp_2048_sub_in_place_64(r, m);
                }
            }
        }
        else if (e[0] == 0x3) {
            /* r = ah^2 mod m, then r = ah * r mod m = ah^3 mod m, using
             * plain multiplication and reduction. */
            if (err == MP_OKAY) {
                sp_2048_sqr_64(r, ah);
                err = sp_2048_mod_64_cond(r, r, m);
            }
            if (err == MP_OKAY) {
                sp_2048_mul_64(r, ah, r);
                err = sp_2048_mod_64_cond(r, r, m);
            }
        }
        else {
            int i;
            sp_digit mp;

            sp_2048_mont_setup(m, &mp);

            /* Convert to Montgomery form. */
            /* The reduced value is written over the low 64 words of a. */
            XMEMSET(a, 0, sizeof(sp_digit) * 64);
            err = sp_2048_mod_64_cond(a, a, m);

            if (err == MP_OKAY) {
                /* Locate the most significant set bit of the exponent. */
                for (i = 31; i >= 0; i--) {
                    if (e[0] >> i) {
                        break;
                    }
                }

                /* Left-to-right square-and-multiply over the remaining
                 * exponent bits; the top bit is covered by starting at a. */
                XMEMCPY(r, a, sizeof(sp_digit) * 64);
                for (i--; i >= 0; i--) {
                    sp_2048_mont_sqr_64(r, r, m, mp);
                    if (((e[0] >> i) & 1) == 1) {
                        sp_2048_mont_mul_64(r, r, a, m, mp);
                    }
                }
                /* Leave Montgomery form. */
                XMEMSET(&r[64], 0, sizeof(sp_digit) * 64);
                sp_2048_mont_reduce_64(r, m, mp);

                /* Subtract m once when r >= m. */
                for (i = 63; i > 0; i--) {
                    if (r[i] != m[i]) {
                        break;
                    }
                }
                if (r[i] >= m[i]) {
                    sp_2048_sub_in_place_64(r, m);
                }
            }
        }
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin_64(r, out);
        *outLen = 256;
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (a != NULL)
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
#endif

    return err;
}

#ifndef WOLFSSL_RSA_PUBLIC_ONLY
#ifdef WOLFSSL_SP_SMALL
/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Looped implementation over 32 words (0x80 bytes). Each word of b is ANDed
 * with the mask before being added, so the same instructions run for either
 * mask value.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 * returns the carry out of the most significant word (0 or 1).
 */
static sp_digit sp_2048_cond_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
{
    __asm__ __volatile__ (
        /* lr: carry between iterations, r6: constant zero, r12: byte offset */
        "mov	lr, #0\n\t"
        "mov	r6, #0\n\t"
        "mov	r12, #0\n\t"
        "\n"
    "L_sp_2048_cond_add_32_words_%=: \n\t"
        /* Adding -1 to lr (0 or 1) restores the saved carry into the flags */
        "adds	lr, lr, #-1\n\t"
        "ldr	r4, [%[a], r12]\n\t"
        "ldr	r5, [%[b], r12]\n\t"
        "and	r5, r5, %[m]\n\t"
        "adcs	r4, r4, r5\n\t"
        /* Save the carry: the cmp below overwrites the flags */
        "adc	lr, r6, r6\n\t"
        "str	r4, [%[r], r12]\n\t"
        "add	r12, r12, #4\n\t"
        "cmp	r12, #0x80\n\t"
        "blt	L_sp_2048_cond_add_32_words_%=\n\t"
        /* Return the final carry through the r operand */
        "mov	%[r], lr\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        /* "cc": adds/adcs/cmp modify the condition flags */
        : "memory", "cc", "r12", "lr", "r4", "r5", "r6"
    );
    return (uint32_t)(size_t)r;
}

#else
/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Fully unrolled implementation: 16 groups of two words (32 words in all).
 * Each word of b is ANDed with the mask before being added, so the same
 * instructions run for either mask value. The carry is chained through the
 * flags from the first adds to the final adc.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 * returns the carry out of the most significant word (0 or 1).
 */
static sp_digit sp_2048_cond_add_32(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
{
    __asm__ __volatile__ (
        /* r8: constant zero used to materialise the final carry */
        "mov	r8, #0\n\t"
        /* Words 0-1: start the carry chain with adds */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 2-31: continue the carry chain with adcs */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "adcs	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Return the final carry (0 + 0 + C) through the r operand */
        "adc	%[r], r8, r8\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        /* "cc": adds/adcs modify the condition flags */
        : "memory", "cc", "r12", "lr", "r4", "r5", "r6", "r7", "r8"
    );
    return (uint32_t)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
/* RSA private key operation.
 *
 * When SP_RSA_PRIVATE_EXP_D or RSA_LOW_MEM is defined the result is computed
 * directly as in^dm mod mm. Otherwise the Chinese Remainder Theorem form is
 * used with pm, qm, dpm, dqm and qim.
 *
 * All temporary buffers that held key material or intermediate values are
 * zeroized before returning.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 256 bytes long.
 * outLen  Number of bytes in result.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long or the modulus is not 2048 bits, MP_VAL when the
 * modulus (or, in the CRT form, a prime) is even and MEMORY_E when dynamic
 * memory allocation fails.
 */
int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm,
    const mp_int* pm, const mp_int* qm, const mp_int* dpm, const mp_int* dqm,
    const mp_int* qim, const mp_int* mm, byte* out, word32* outLen)
{
#if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM)
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_digit* d = NULL;
#else
    sp_digit  d[64 * 4];
#endif
    sp_digit* a = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    /* The CRT parameters are not used in this form. */
    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 256U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (mp_count_bits(dm) > 2048) {
           err = MP_READ_E;
        }
        else if (inLen > 256) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 2048) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 64 * 4, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (d == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Carve up the 4 * 64 word buffer:
         *   d [  0.. 63]  private exponent
         *   a [ 64..191]  base on input, 128 word result area (r) on output
         *   m [192..255]  modulus
         */
        a = d + 64;
        m = a + 128;
        r = a;

        sp_2048_from_bin(a, 64, in, inLen);
        sp_2048_from_mp(d, 64, dm);
        sp_2048_from_mp(m, 64, mm);
        err = sp_2048_mod_exp_64(r, a, d, 2048, m, 0);
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin_64(r, out);
        *outLen = 256;
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (d != NULL)
#endif
    {
        /* The buffer holds a copy of the private exponent (d) as well as
         * the input/result (a and r share storage), so wipe all of it once
         * it has been populated. */
        if (a != NULL) {
            ForceZero(d, sizeof(sp_digit) * 64 * 4);
        }
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
        XFREE(d, NULL, DYNAMIC_TYPE_RSA);
#endif
    }
#else
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_digit* a = NULL;
#else
    sp_digit a[32 * 11];
#endif
    sp_digit* p = NULL;
    sp_digit* q = NULL;
    sp_digit* dp = NULL;
    sp_digit* tmpa = NULL;
    sp_digit* tmpb = NULL;
    sp_digit* r = NULL;
    sp_digit* qi = NULL;
    sp_digit* dq = NULL;
    sp_digit c;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 256) {
        err = MP_TO_E;
    }
    else if (inLen > 256 || mp_count_bits(mm) != 2048) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mm)) {
        err = MP_VAL;
    }
    else if (mp_iseven(pm)) {
        err = MP_VAL;
    }
    else if (mp_iseven(qm)) {
        err = MP_VAL;
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 11, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Carve up the 11 * 32 word buffer:
         *   a/r      [  0..127]  base on input, 128 word result on output
         *   p        [128..159]
         *   q        [160..191]
         *   dp/dq/qi [192..223]  one slot reused for each value in turn
         *   tmpa     [224..287]
         *   tmpb     [288..351]
         */
        p = a + 64 * 2;
        q = p + 32;
        qi = dq = dp = q + 32;
        tmpa = qi + 32;
        tmpb = tmpa + 64;
        r = a;

        sp_2048_from_bin(a, 64, in, inLen);
        sp_2048_from_mp(p, 32, pm);
        sp_2048_from_mp(q, 32, qm);
        sp_2048_from_mp(dp, 32, dpm);

        /* tmpa = a^dp mod p */
        err = sp_2048_mod_exp_32(tmpa, a, dp, 1024, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpb = a^dq mod q */
        sp_2048_from_mp(dq, 32, dqm);
        err = sp_2048_mod_exp_32(tmpb, a, dq, 1024, q, 1);
    }

    if (err == MP_OKAY) {
        /* tmpa = tmpa - tmpb, adding p back up to twice under control of
         * the value returned by the subtraction, which is used directly as
         * the add mask; no data-dependent branch is taken. */
        c = sp_2048_sub_in_place_32(tmpa, tmpb);
        c += sp_2048_cond_add_32(tmpa, tmpa, p, c);
        sp_2048_cond_add_32(tmpa, tmpa, p, c);

        /* tmpa = (tmpa * qi) mod p */
        sp_2048_from_mp(qi, 32, qim);
        sp_2048_mul_32(tmpa, tmpa, qi);
        err = sp_2048_mod_32(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
        sp_2048_mul_32(tmpa, q, tmpa);
        XMEMSET(&tmpb[32], 0, sizeof(sp_digit) * 32);
        sp_2048_add_64(r, tmpb, tmpa);

        sp_2048_to_bin_64(r, out);
        *outLen = 256;
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (a != NULL)
#endif
    {
        /* Every region of the buffer held key material or intermediates. */
        ForceZero(a, sizeof(sp_digit) * 32 * 11);
    #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
    #endif
    }
#endif /* SP_RSA_PRIVATE_EXP_D || RSA_LOW_MEM */
    return err;
}
#endif /* WOLFSSL_RSA_PUBLIC_ONLY */
#endif /* WOLFSSL_HAVE_SP_RSA */
#if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \
                                              !defined(WOLFSSL_RSA_PUBLIC_ONLY))
/* Convert an array of sp_digit to an mp_int.
 *
 * The 64 words of a are repacked into DIGIT_BIT-bit digits of r, starting
 * with a[0] in the lowest bits of r->dp[0]. r is grown to hold 2048 bits
 * before the copy and clamped afterwards.
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns MP_OKAY on success, otherwise the error returned by mp_grow().
 */
static int sp_2048_to_mp(const sp_digit* a, mp_int* r)
{
    int err;

    err = mp_grow(r, (2048 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/
#if DIGIT_BIT == 32
        /* Digits and words are the same width: straight copy. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 64);
        r->used = 64;
        mp_clamp(r);
#elif DIGIT_BIT < 32
        /* Digits are narrower than words: each word is spread over two or
         * more digits. j indexes the output digit; s tracks the bit
         * position, alternating between an offset into the current digit
         * and a count of bits already taken from a[i]. */
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 64; i++) {
            /* Finish the partially filled digit with the low bits of a[i]. */
            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
            r->dp[++j] = (mp_digit)(a[i] >> s);
            /* Emit further whole digits while a[i] still has enough bits. */
            while (s + DIGIT_BIT <= 32) {
                s += DIGIT_BIT;
                r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1;
                if (s == SP_WORD_SIZE) {
                    r->dp[j] = 0;
                }
                else {
                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 32 - s;
        }
        r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        /* Digits are wider than words: several words are merged into each
         * digit. s is the bit offset within the current digit r->dp[j]. */
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 64; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 32 >= DIGIT_BIT) {
                /* a[i] reaches the end of this digit: mask the digit and
                 * carry the remaining high bits into the next one. */
                /* NOTE(review): if sp_digit is 32 bits wide, the mask
                 * expression and the a[i] >> s below use shift counts that
                 * can reach or exceed that width (e.g. s == 32 when
                 * DIGIT_BIT is 64) - confirm these configurations are
                 * handled as intended. */
    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
    #endif
                s = DIGIT_BIT - s;
                r->dp[++j] = a[i] >> s;
                s = 32 - s;
            }
            else {
                s += 32;
            }
        }
        r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * Computes res = base^exp mod mod for a modulus of exactly 2048 bits. The
 * local copy of the exponent is zeroized before returning.
 *
 * base  Base. MP integer.
 * exp   Exponent. MP integer.
 * mod   Modulus. MP integer.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if base or exponent has more than 2048
 * bits or the modulus is not 2048 bits, MP_VAL if the modulus is even or the
 * exponent is zero, and MEMORY_E if memory allocation fails.
 */
int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
    int err = MP_OKAY;
    sp_digit b[128];
    sp_digit e[64];
    sp_digit m[64];
    /* The result is written over the base's buffer. */
    sp_digit* r = b;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 2048) {
        err = MP_READ_E;
    }
    else if (expBits > 2048) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mod) != 2048) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        sp_2048_from_mp(b, 64, base);
        sp_2048_from_mp(e, 64, exp);
        sp_2048_from_mp(m, 64, mod);

        err = sp_2048_mod_exp_64(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_2048_to_mp(r, res);
    }

    /* The exponent may be a private key. A plain memset of a local that is
     * never read again can be removed by the optimizer, so use ForceZero. */
    ForceZero(e, sizeof(e));

    return err;
}

#ifdef WOLFSSL_HAVE_SP_DH

#ifdef HAVE_FFDHE_2048
/* Shift the 64-word number a left by n bits and store the result in r.
 *
 * Words are handled from the most significant (a[63]) down to a[0]. The bits
 * shifted out of a[63] are stored in r[64], so 65 words of r are written.
 * Every source word is loaded before the result word with the same index is
 * stored, which is why callers in this file can pass the same buffer for r
 * and a.
 *
 * r  Result of the shift. Words 0..64 are written.
 * a  Number to shift. Words 0..63 are read.
 * n  Number of bits to shift left by.
 *    NOTE(review): the code computes 31 - n as a shift amount, which suggests
 *    n is expected to be in the range 0..31 - confirm against callers.
 */
static void sp_2048_lshift_64(sp_digit* r, const sp_digit* a, byte n)
{
    __asm__ __volatile__ (
        /* r12 = 31 - n. The bits carried into the next higher word are
         * computed as (word >> 1) >> (31 - n), i.e. word >> (32 - n) split
         * into two shifts. */
        "rsb	r12, %[n], #31\n\t"
        /* Top word: its carried-out bits become r[64]. */
        "ldr	r5, [%[a], #252]\n\t"
        "lsr	r6, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r6, r6, r12\n\t"
        "ldr	r4, [%[a], #248]\n\t"
        "str	r6, [%[r], #256]\n\t"
        /* Repeating step for each lower word: r3 = bits carried out of the
         * newly loaded word, OR them into the already shifted word above it,
         * then store that word. r4, r5 and r6 rotate as the working set. */
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #244]\n\t"
        "str	r5, [%[r], #252]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #240]\n\t"
        "str	r4, [%[r], #248]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #236]\n\t"
        "str	r6, [%[r], #244]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #232]\n\t"
        "str	r5, [%[r], #240]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #228]\n\t"
        "str	r4, [%[r], #236]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #224]\n\t"
        "str	r6, [%[r], #232]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #220]\n\t"
        "str	r5, [%[r], #228]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #216]\n\t"
        "str	r4, [%[r], #224]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #212]\n\t"
        "str	r6, [%[r], #220]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #208]\n\t"
        "str	r5, [%[r], #216]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #204]\n\t"
        "str	r4, [%[r], #212]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #200]\n\t"
        "str	r6, [%[r], #208]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #196]\n\t"
        "str	r5, [%[r], #204]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #192]\n\t"
        "str	r4, [%[r], #200]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #188]\n\t"
        "str	r6, [%[r], #196]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #184]\n\t"
        "str	r5, [%[r], #192]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #180]\n\t"
        "str	r4, [%[r], #188]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #176]\n\t"
        "str	r6, [%[r], #184]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #172]\n\t"
        "str	r5, [%[r], #180]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #168]\n\t"
        "str	r4, [%[r], #176]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #164]\n\t"
        "str	r6, [%[r], #172]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #160]\n\t"
        "str	r5, [%[r], #168]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #156]\n\t"
        "str	r4, [%[r], #164]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #152]\n\t"
        "str	r6, [%[r], #160]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #148]\n\t"
        "str	r5, [%[r], #156]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #144]\n\t"
        "str	r4, [%[r], #152]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #140]\n\t"
        "str	r6, [%[r], #148]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #136]\n\t"
        "str	r5, [%[r], #144]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #132]\n\t"
        "str	r4, [%[r], #140]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #128]\n\t"
        "str	r6, [%[r], #136]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #124]\n\t"
        "str	r5, [%[r], #132]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #120]\n\t"
        "str	r4, [%[r], #128]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #116]\n\t"
        "str	r6, [%[r], #124]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #112]\n\t"
        "str	r5, [%[r], #120]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #108]\n\t"
        "str	r4, [%[r], #116]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #104]\n\t"
        "str	r6, [%[r], #112]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #100]\n\t"
        "str	r5, [%[r], #108]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #96]\n\t"
        "str	r4, [%[r], #104]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #92]\n\t"
        "str	r6, [%[r], #100]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #88]\n\t"
        "str	r5, [%[r], #96]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #84]\n\t"
        "str	r4, [%[r], #92]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #80]\n\t"
        "str	r6, [%[r], #88]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #76]\n\t"
        "str	r5, [%[r], #84]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #72]\n\t"
        "str	r4, [%[r], #80]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #68]\n\t"
        "str	r6, [%[r], #76]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #64]\n\t"
        "str	r5, [%[r], #72]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #60]\n\t"
        "str	r4, [%[r], #68]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #56]\n\t"
        "str	r6, [%[r], #64]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #52]\n\t"
        "str	r5, [%[r], #60]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #48]\n\t"
        "str	r4, [%[r], #56]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #44]\n\t"
        "str	r6, [%[r], #52]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #40]\n\t"
        "str	r5, [%[r], #48]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #36]\n\t"
        "str	r4, [%[r], #44]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #32]\n\t"
        "str	r6, [%[r], #40]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #28]\n\t"
        "str	r5, [%[r], #36]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #24]\n\t"
        "str	r4, [%[r], #32]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #20]\n\t"
        "str	r6, [%[r], #28]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #16]\n\t"
        "str	r5, [%[r], #24]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a], #12]\n\t"
        "str	r4, [%[r], #20]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        "ldr	r4, [%[a], #8]\n\t"
        "str	r6, [%[r], #16]\n\t"
        "lsr	r3, r4, #1\n\t"
        "lsl	r4, r4, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r5, r5, r3\n\t"
        "ldr	r6, [%[a], #4]\n\t"
        "str	r5, [%[r], #12]\n\t"
        "lsr	r3, r6, #1\n\t"
        "lsl	r6, r6, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r4, r4, r3\n\t"
        "ldr	r5, [%[a]]\n\t"
        "str	r4, [%[r], #8]\n\t"
        "lsr	r3, r5, #1\n\t"
        "lsl	r5, r5, %[n]\n\t"
        "lsr	r3, r3, r12\n\t"
        "orr	r6, r6, r3\n\t"
        /* Lowest word has nothing carried in from below. */
        "str	r5, [%[r]]\n\t"
        "str	r6, [%[r], #4]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [n] "+r" (n)
        :
        : "memory", "r4", "r5", "r6", "r3", "r12"
    );
}

/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m)
 *
 * The exponent is consumed from its most significant bit downwards in
 * windows of up to 5 bits. For every window the accumulator is squared five
 * times with sp_2048_mont_sqr_64() and then shifted left by the window value.
 * The word shifted out into r[64] is folded back in by adding r[64] times the
 * value produced by sp_2048_mont_norm_64(), followed by a conditional
 * subtraction of the modulus.
 *
 * r     A single precision number that is the result of the operation.
 *       Also used as working space: words 0..127 are written.
 * e     A single precision number that is the exponent.
 * bits  The number of bits in the exponent.
 * m     A single precision number that is the modulus.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_2048_mod_exp_2_64(sp_digit* r, const sp_digit* e, int bits,
        const sp_digit* m)
{
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_digit* td = NULL;
#else
    sp_digit td[193];
#endif
    sp_digit* norm = NULL;
    sp_digit* tmp = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit o;
    sp_digit mask;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 193, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* td layout: norm at words 0..127, tmp at words 128..192. */
        norm = td;
        tmp = td + 128;

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_64(norm, m);

        /* Start at the word holding the top exponent bit. c becomes the
         * number of bits of that word left over after the first window,
         * whose width is bits % 5 (or 5 when that is 0). */
        i = (bits - 1) / 32;
        n = e[i--];
        c = bits & 31;
        if (c == 0) {
            c = 32;
        }
        c -= bits % 5;
        if (c == 32) {
            c = 27;
        }
        if (c < 0) {
            /* Number of bits in top word is less than number needed. */
            c = -c;
            y = (byte)(n << c);
            n = e[i--];
            /* Digits are 32 bits wide: take the top c bits of the next word
             * and leave 32 - c bits still to be processed in n. */
            y |= (byte)(n >> (32 - c));
            n <<= c;
            c = 32 - c;
        }
        else if (c == 0) {
            /* All bits in top word used. */
            y = (byte)n;
        }
        else {
            y = (byte)(n >> c);
            n <<= 32 - c;
        }
        /* First window: start from the normalizer shifted left by y bits. */
        sp_2048_lshift_64(r, norm, y);
        for (; i>=0 || c>=5; ) {
            if (c == 0) {
                /* Current word exhausted: window is the top 5 bits of the
                 * next word. */
                n = e[i--];
                y = (byte)(n >> 27);
                n <<= 5;
                c = 27;
            }
            else if (c < 5) {
                /* Window straddles two words. */
                y = (byte)(n >> 27);
                n = e[i--];
                c = 5 - c;
                y |= (byte)(n >> (32 - c));
                n <<= c;
                c = 32 - c;
            }
            else {
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c -= 5;
            }

            /* Five squarings for the five bits of the window. */
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);
            sp_2048_mont_sqr_64(r, r, m, mp);

            /* Shift in the window value and fold the overflow word r[64]
             * back into the low 64 words. */
            sp_2048_lshift_64(r, r, y);
            sp_2048_mul_d_64(tmp, norm, r[64]);
            r[64] = 0;
            o = sp_2048_add_64(r, r, tmp);
            sp_2048_cond_sub_64(r, r, m, (sp_digit)0 - o);
        }

        /* Clear the upper half before the final Montgomery reduction. */
        XMEMSET(&r[64], 0, sizeof(sp_digit) * 64U);
        sp_2048_mont_reduce_64(r, m, mp);

        /* Subtract the modulus once more when the result is not below it. */
        mask = 0 - (sp_2048_cmp_64(r, m) >= 0);
        sp_2048_cond_sub_64(r, r, m, mask);
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* HAVE_FFDHE_2048 */

/* Diffie-Hellman modular exponentiation with a 2048-bit modulus.
 *
 * The exponent is supplied as a big-endian byte array and the result is
 * written to out as big-endian bytes with leading zero bytes removed.
 * The local copy of the exponent is zeroed before returning.
 *
 * base     Base. At most 2048 bits.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent. At most 256.
 * mod      Modulus. Exactly 2048 bits and odd.
 * out      Buffer to hold big-endian bytes of exponentiation result.
 *          Must be at least 256 bytes long.
 * outLen   Length, in bytes, of exponentiation result.
 * returns 0 on success, MP_READ_E when an input has an unsupported size,
 * MP_VAL when the modulus is even, otherwise the error returned by the
 * exponentiation.
 */
int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen,
    const mp_int* mod, byte* out, word32* outLen)
{
    sp_digit b[128];
    sp_digit e[64];
    sp_digit m[64];
    /* The result is written over the base's working buffer. */
    sp_digit* r = b;
    word32 lead;
    int err = MP_OKAY;

    /* Size checks first, in the order base, exponent, modulus. */
    if ((mp_count_bits(base) > 2048) || (expLen > 256) ||
            (mp_count_bits(mod) != 2048)) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        sp_2048_from_mp(b, 64, base);
        sp_2048_from_bin(e, 64, exp, expLen);
        sp_2048_from_mp(m, 64, mod);

    #ifdef HAVE_FFDHE_2048
        /* Base of 2 and a modulus whose top word is all ones: use the
         * dedicated base-2 exponentiation. */
        if ((base->used == 1) && (base->dp[0] == 2) &&
                (m[63] == (sp_digit)-1)) {
            err = sp_2048_mod_exp_2_64(r, e, expLen * 8, m);
        }
        else
    #endif
        {
            err = sp_2048_mod_exp_64(r, b, e, expLen * 8, m, 0);
        }
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin_64(r, out);
        *outLen = 256;

        /* Count the leading zero bytes and move the rest to the front. */
        lead = 0;
        while ((lead < 256) && (out[lead] == 0)) {
            lead++;
        }
        *outLen -= lead;
        XMEMMOVE(out, out + lead, *outLen);
    }

    /* Do not leave a copy of the exponent on the stack. */
    XMEMSET(e, 0, sizeof(e));

    return err;
}
#endif /* WOLFSSL_HAVE_SP_DH */

/* Compute res = base ^ exp mod mod for a 1024-bit modulus.
 *
 * The operands are converted to 32-word single precision numbers with the
 * 2048-bit helpers and the exponentiation is done by sp_2048_mod_exp_32().
 * The local copy of the exponent is zeroed before returning.
 *
 * base  Base. MP integer. At most 1024 bits.
 * exp   Exponent. MP integer. At most 1024 bits.
 * mod   Modulus. MP integer. Exactly 1024 bits and odd.
 * res   Result. MP integer. Only modified further once the conversion of the
 *       result has succeeded.
 * returns 0 on success, MP_READ_E when an operand has an unsupported number
 * of bits, MP_VAL when the modulus is even, otherwise the error returned by
 * the exponentiation or the conversion of the result.
 */
int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
    int err = MP_OKAY;
    sp_digit b[64];
    sp_digit e[32];
    sp_digit m[32];
    /* The result is written over the base's working buffer. */
    sp_digit* r = b;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 1024) {
        err = MP_READ_E;
    }
    else if (expBits > 1024) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mod) != 1024) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        sp_2048_from_mp(b, 32, base);
        sp_2048_from_mp(e, 32, exp);
        sp_2048_from_mp(m, 32, mod);

        err = sp_2048_mod_exp_32(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        /* sp_2048_to_mp() converts 64 words: clear the unused upper half. */
        XMEMSET(r + 32, 0, sizeof(*r) * 32U);
        err = sp_2048_to_mp(r, res);
    }

    if (err == MP_OKAY) {
        /* Only touch res once the conversion has succeeded; on failure its
         * storage may not have been prepared for mod->used digits. */
        res->used = mod->used;
        mp_clamp(res);
    }

    /* Do not leave a copy of the exponent on the stack. */
    XMEMSET(e, 0, sizeof(e));

    return err;
}

#endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */

#endif /* !WOLFSSL_SP_NO_2048 */

#ifndef WOLFSSL_SP_NO_3072
/* Read big endian unsigned byte array into r.
 *
 * Bytes are consumed from the end of the array (least significant) four at a
 * time, each group forming one 32-bit word. Any 1 to 3 bytes left at the
 * start of the array form the final, most significant, word. All words are
 * assembled with shifts so the result does not depend on the byte order of
 * the target. Remaining words of r up to size are set to zero.
 *
 * r  A single precision integer.
 * size  Number of words in r. No more than this many words are written.
 * a  Byte array.
 * n  Number of bytes in array to read.
 */
static void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n)
{
    int i;
    int j = 0;

    /* Whole words, least significant first. */
    for (i = n - 1; (i >= 3) && (j < size); i -= 4) {
        r[j]  = ((sp_digit)a[i - 0] <<  0) |
                ((sp_digit)a[i - 1] <<  8) |
                ((sp_digit)a[i - 2] << 16) |
                ((sp_digit)a[i - 3] << 24);
        j++;
    }

    /* Partial top word: a[i] is its least significant byte, a[0] its most
     * significant. */
    if ((i >= 0) && (j < size)) {
        sp_digit top = 0;
        int shift = 0;

        for (; i >= 0; i--) {
            top |= ((sp_digit)a[i]) << shift;
            shift += 8;
        }
        r[j] = top;
        j++;
    }

    /* Zero the words that were not filled from the array. */
    for (; j < size; j++) {
        r[j] = 0;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * The digits of a are repacked into 32-bit words, least significant first.
 * At most size words are written and any words of r not filled from a are
 * set to zero.
 *
 * r  A single precision integer.
 * size  Number of words in r. No more than this many words are written.
 * a  A multi-precision integer.
 */
static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a)
{
#if DIGIT_BIT == 32
    /* Same digit width: plain copy, limited to the space available in r as
     * in the other digit-size variants below. */
    int j;
    int used = a->used;

    if (used > size) {
        used = size;
    }

    XMEMCPY(r, a->dp, sizeof(sp_digit) * used);

    for (j = used; j < size; j++) {
        r[j] = 0;
    }
#elif DIGIT_BIT > 32
    /* Each mp digit is wider than a word: split it across several words.
     * s tracks the bit offset into the current mp digit / output word. */
    int i;
    int j = 0;
    word32 s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i] << s);
        r[j] &= 0xffffffff;
        s = 32U - s;
        if (j + 1 >= size) {
            break;
        }
        /* lint allow cast of mismatch word32 and mp_digit */
        r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
        while ((s + 32U) <= (word32)DIGIT_BIT) {
            s += 32U;
            r[j] &= 0xffffffff;
            if (j + 1 >= size) {
                break;
            }
            if (s < (word32)DIGIT_BIT) {
                /* lint allow cast of mismatch word32 and mp_digit */
                r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
            }
            else {
                r[++j] = (sp_digit)0;
            }
        }
        s = (word32)DIGIT_BIT - s;
    }

    for (j++; j < size; j++) {
        r[j] = 0;
    }
#else
    /* Each mp digit is narrower than a word: accumulate digits into the
     * current word and spill the remainder into the next one. */
    int i;
    int j = 0;
    int s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 32) {
            r[j] &= 0xffffffff;
            if (j + 1 >= size) {
                break;
            }
            s = 32 - s;
            if (s == DIGIT_BIT) {
                r[++j] = 0;
                s = 0;
            }
            else {
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else {
            s += DIGIT_BIT;
        }
    }

    for (j++; j < size; j++) {
        r[j] = 0;
    }
#endif
}

/* Serialize r into a byte array, most significant byte first.
 * Exactly 384 bytes are always written (96 words of 4 bytes each).
 *
 * r  A single precision integer.
 * a  Byte array.
 */
static void sp_3072_to_bin_96(sp_digit* r, byte* a)
{
    byte* out = a;
    sp_digit word;
    int idx;

    /* Walk the words from the top down, emitting each high byte first. */
    for (idx = 95; idx >= 0; idx--) {
        word = r[idx];
        *out++ = (byte)(word >> 24);
        *out++ = (byte)(word >> 16);
        *out++ = (byte)(word >> 8);
        *out++ = (byte)(word >> 0);
    }
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH)
/* Normalize the values in each word to 32 bits.
 *
 * The macro expands to nothing in this implementation.
 *
 * a  Array of sp_digit to normalize.
 */
#define sp_3072_norm_96(a)

#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */
/* Normalize the values in each word to 32 bits.
 *
 * The macro expands to nothing in this implementation.
 * NOTE(review): this unconditional definition repeats the conditional one
 * directly above with an identical (empty) replacement - confirm whether both
 * are needed.
 *
 * a  Array of sp_digit to normalize.
 */
#define sp_3072_norm_96(a)

#ifndef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 */
static void sp_3072_mul_12(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        "sub	sp, sp, #48\n\t"
        "mov	r10, #0\n\t"
        /* A[0] * B[0] */
        "ldr	r11, [%[a]]\n\t"
        "ldr	r12, [%[b]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r3, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r3, r3, #16\n\t"
        "mul	r3, r6, r3\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r4, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "add	r4, r4, r7\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adc	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
#else
        "umull	r3, r4, r11, r12\n\t"
        "mov	r5, #0\n\t"
#endif
        "str	r3, [sp]\n\t"
        /* A[0] * B[1] */
        "ldr	r9, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[1] * B[0] */
        "ldr	r8, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [sp, #4]\n\t"
        /* A[2] * B[0] */
        "ldr	r8, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[1] * B[1] */
        "ldr	r11, [%[a], #4]\n\t"
        "ldr	r12, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[0] * B[2] */
        "ldr	r8, [%[a]]\n\t"
        "ldr	r9, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [sp, #8]\n\t"
        /* A[0] * B[3] */
        "ldr	r9, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[1] * B[2] */
        "ldr	r9, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[2] * B[1] */
        "ldr	r8, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[3] * B[0] */
        "ldr	r8, [%[a], #12]\n\t"
        "ldr	r9, [%[b]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [sp, #12]\n\t"
        /* A[4] * B[0] */
        "ldr	r8, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[3] * B[1] */
        "ldr	r8, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[2] * B[2] */
        "ldr	r11, [%[a], #8]\n\t"
        "ldr	r12, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[1] * B[3] */
        "ldr	r8, [%[a], #4]\n\t"
        "ldr	r9, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[0] * B[4] */
        "ldr	r8, [%[a]]\n\t"
        "ldr	r9, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [sp, #16]\n\t"
        /* A[0] * B[5] */
        "ldr	r9, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[1] * B[4] */
        "ldr	r8, [%[a], #4]\n\t"
        "ldr	r9, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[2] * B[3] */
        "ldr	r9, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[3] * B[2] */
        "ldr	r8, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[4] * B[1] */
        "ldr	r8, [%[a], #16]\n\t"
        "ldr	r9, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[5] * B[0] */
        "ldr	r8, [%[a], #20]\n\t"
        "ldr	r9, [%[b]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [sp, #20]\n\t"
        /* A[6] * B[0] */
        "ldr	r8, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[5] * B[1] */
        "ldr	r8, [%[a], #20]\n\t"
        "ldr	r9, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[4] * B[2] */
        "ldr	r8, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[3] * B[3] */
        "ldr	r11, [%[a], #12]\n\t"
        "ldr	r12, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[2] * B[4] */
        "ldr	r8, [%[a], #8]\n\t"
        "ldr	r9, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[1] * B[5] */
        "ldr	r8, [%[a], #4]\n\t"
        "ldr	r9, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[0] * B[6] */
        "ldr	r8, [%[a]]\n\t"
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [sp, #24]\n\t"
        /* A[0] * B[7] */
        "ldr	r9, [%[b], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[1] * B[6] */
        "ldr	r8, [%[a], #4]\n\t"
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[2] * B[5] */
        "ldr	r8, [%[a], #8]\n\t"
        "ldr	r9, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[3] * B[4] */
        "ldr	r9, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[4] * B[3] */
        "ldr	r8, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[5] * B[2] */
        "ldr	r8, [%[a], #20]\n\t"
        "ldr	r9, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[6] * B[1] */
        "ldr	r8, [%[a], #24]\n\t"
        "ldr	r9, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[7] * B[0] */
        "ldr	r8, [%[a], #28]\n\t"
        "ldr	r9, [%[b]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [sp, #28]\n\t"
        /* A[8] * B[0] */
        "ldr	r8, [%[a], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[7] * B[1] */
        "ldr	r8, [%[a], #28]\n\t"
        "ldr	r9, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[6] * B[2] */
        "ldr	r8, [%[a], #24]\n\t"
        "ldr	r9, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[5] * B[3] */
        "ldr	r8, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[4] * B[4] */
        "ldr	r11, [%[a], #16]\n\t"
        "ldr	r12, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[3] * B[5] */
        "ldr	r8, [%[a], #12]\n\t"
        "ldr	r9, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[2] * B[6] */
        "ldr	r8, [%[a], #8]\n\t"
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[1] * B[7] */
        "ldr	r8, [%[a], #4]\n\t"
        "ldr	r9, [%[b], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[0] * B[8] */
        "ldr	r8, [%[a]]\n\t"
        "ldr	r9, [%[b], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [sp, #32]\n\t"
        /* A[0] * B[9] */
        "ldr	r9, [%[b], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[1] * B[8] */
        "ldr	r8, [%[a], #4]\n\t"
        "ldr	r9, [%[b], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[2] * B[7] */
        "ldr	r8, [%[a], #8]\n\t"
        "ldr	r9, [%[b], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[3] * B[6] */
        "ldr	r8, [%[a], #12]\n\t"
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[4] * B[5] */
        "ldr	r9, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[5] * B[4] */
        "ldr	r8, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[6] * B[3] */
        "ldr	r8, [%[a], #24]\n\t"
        "ldr	r9, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[7] * B[2] */
        "ldr	r8, [%[a], #28]\n\t"
        "ldr	r9, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[8] * B[1] */
        "ldr	r8, [%[a], #32]\n\t"
        "ldr	r9, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[9] * B[0] */
        "ldr	r8, [%[a], #36]\n\t"
        "ldr	r9, [%[b]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [sp, #36]\n\t"
        /* A[10] * B[0] */
        "ldr	r8, [%[a], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[9] * B[1] */
        "ldr	r8, [%[a], #36]\n\t"
        "ldr	r9, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[8] * B[2] */
        "ldr	r8, [%[a], #32]\n\t"
        "ldr	r9, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[7] * B[3] */
        "ldr	r8, [%[a], #28]\n\t"
        "ldr	r9, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[6] * B[4] */
        "ldr	r8, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[5] * B[5] */
        "ldr	r11, [%[a], #20]\n\t"
        "ldr	r12, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[4] * B[6] */
        "ldr	r8, [%[a], #16]\n\t"
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[3] * B[7] */
        "ldr	r8, [%[a], #12]\n\t"
        "ldr	r9, [%[b], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[2] * B[8] */
        "ldr	r8, [%[a], #8]\n\t"
        "ldr	r9, [%[b], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[1] * B[9] */
        "ldr	r8, [%[a], #4]\n\t"
        "ldr	r9, [%[b], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[0] * B[10] */
        "ldr	r8, [%[a]]\n\t"
        "ldr	r9, [%[b], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [sp, #40]\n\t"
        /* A[0] * B[11] */
        "ldr	r9, [%[b], #44]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[1] * B[10] */
        "ldr	r8, [%[a], #4]\n\t"
        "ldr	r9, [%[b], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[2] * B[9] */
        "ldr	r8, [%[a], #8]\n\t"
        "ldr	r9, [%[b], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[3] * B[8] */
        "ldr	r8, [%[a], #12]\n\t"
        "ldr	r9, [%[b], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[4] * B[7] */
        "ldr	r8, [%[a], #16]\n\t"
        "ldr	r9, [%[b], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[5] * B[6] */
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[6] * B[5] */
        "ldr	r8, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[7] * B[4] */
        "ldr	r8, [%[a], #28]\n\t"
        "ldr	r9, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[8] * B[3] */
        "ldr	r8, [%[a], #32]\n\t"
        "ldr	r9, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[9] * B[2] */
        "ldr	r8, [%[a], #36]\n\t"
        "ldr	r9, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[10] * B[1] */
        "ldr	r8, [%[a], #40]\n\t"
        "ldr	r9, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[11] * B[0] */
        "ldr	r8, [%[a], #44]\n\t"
        "ldr	r9, [%[b]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [sp, #44]\n\t"
        /* A[11] * B[1] */
        "ldr	r9, [%[b], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[10] * B[2] */
        "ldr	r8, [%[a], #40]\n\t"
        "ldr	r9, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[9] * B[3] */
        "ldr	r8, [%[a], #36]\n\t"
        "ldr	r9, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[8] * B[4] */
        "ldr	r8, [%[a], #32]\n\t"
        "ldr	r9, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[7] * B[5] */
        "ldr	r8, [%[a], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[6] * B[6] */
        "ldr	r11, [%[a], #24]\n\t"
        "ldr	r12, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[5] * B[7] */
        "ldr	r8, [%[a], #20]\n\t"
        "ldr	r9, [%[b], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[4] * B[8] */
        "ldr	r8, [%[a], #16]\n\t"
        "ldr	r9, [%[b], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[3] * B[9] */
        "ldr	r8, [%[a], #12]\n\t"
        "ldr	r9, [%[b], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[2] * B[10] */
        "ldr	r8, [%[a], #8]\n\t"
        "ldr	r9, [%[b], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[1] * B[11] */
        "ldr	r8, [%[a], #4]\n\t"
        "ldr	r9, [%[b], #44]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r], #48]\n\t"
        /* A[2] * B[11] */
        "ldr	r8, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[3] * B[10] */
        "ldr	r8, [%[a], #12]\n\t"
        "ldr	r9, [%[b], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[4] * B[9] */
        "ldr	r8, [%[a], #16]\n\t"
        "ldr	r9, [%[b], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[5] * B[8] */
        "ldr	r8, [%[a], #20]\n\t"
        "ldr	r9, [%[b], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[6] * B[7] */
        "ldr	r9, [%[b], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[7] * B[6] */
        "ldr	r8, [%[a], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[8] * B[5] */
        "ldr	r8, [%[a], #32]\n\t"
        "ldr	r9, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[9] * B[4] */
        "ldr	r8, [%[a], #36]\n\t"
        "ldr	r9, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[10] * B[3] */
        "ldr	r8, [%[a], #40]\n\t"
        "ldr	r9, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[11] * B[2] */
        "ldr	r8, [%[a], #44]\n\t"
        "ldr	r9, [%[b], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r], #52]\n\t"
        /* A[11] * B[3] */
        "ldr	r9, [%[b], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[10] * B[4] */
        "ldr	r8, [%[a], #40]\n\t"
        "ldr	r9, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[9] * B[5] */
        "ldr	r8, [%[a], #36]\n\t"
        "ldr	r9, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[8] * B[6] */
        "ldr	r8, [%[a], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[7] * B[7] */
        "ldr	r11, [%[a], #28]\n\t"
        "ldr	r12, [%[b], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[6] * B[8] */
        "ldr	r8, [%[a], #24]\n\t"
        "ldr	r9, [%[b], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[5] * B[9] */
        "ldr	r8, [%[a], #20]\n\t"
        "ldr	r9, [%[b], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[4] * B[10] */
        "ldr	r8, [%[a], #16]\n\t"
        "ldr	r9, [%[b], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[3] * B[11] */
        "ldr	r8, [%[a], #12]\n\t"
        "ldr	r9, [%[b], #44]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r], #56]\n\t"
        /* A[4] * B[11] */
        "ldr	r8, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[5] * B[10] */
        "ldr	r8, [%[a], #20]\n\t"
        "ldr	r9, [%[b], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[6] * B[9] */
        "ldr	r8, [%[a], #24]\n\t"
        "ldr	r9, [%[b], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[7] * B[8] */
        "ldr	r9, [%[b], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[8] * B[7] */
        "ldr	r8, [%[a], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[9] * B[6] */
        "ldr	r8, [%[a], #36]\n\t"
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[10] * B[5] */
        "ldr	r8, [%[a], #40]\n\t"
        "ldr	r9, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[11] * B[4] */
        "ldr	r8, [%[a], #44]\n\t"
        "ldr	r9, [%[b], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r], #60]\n\t"
        /* A[11] * B[5] */
        "ldr	r9, [%[b], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[10] * B[6] */
        "ldr	r8, [%[a], #40]\n\t"
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[9] * B[7] */
        "ldr	r8, [%[a], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[8] * B[8] */
        "ldr	r11, [%[a], #32]\n\t"
        "ldr	r12, [%[b], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[7] * B[9] */
        "ldr	r8, [%[a], #28]\n\t"
        "ldr	r9, [%[b], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[6] * B[10] */
        "ldr	r8, [%[a], #24]\n\t"
        "ldr	r9, [%[b], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[5] * B[11] */
        "ldr	r8, [%[a], #20]\n\t"
        "ldr	r9, [%[b], #44]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r], #64]\n\t"
        /* A[6] * B[11] */
        "ldr	r8, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[7] * B[10] */
        "ldr	r8, [%[a], #28]\n\t"
        "ldr	r9, [%[b], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[8] * B[9] */
        "ldr	r9, [%[b], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[9] * B[8] */
        "ldr	r8, [%[a], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[10] * B[7] */
        "ldr	r8, [%[a], #40]\n\t"
        "ldr	r9, [%[b], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[11] * B[6] */
        "ldr	r8, [%[a], #44]\n\t"
        "ldr	r9, [%[b], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r], #68]\n\t"
        /* A[11] * B[7] */
        "ldr	r9, [%[b], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[10] * B[8] */
        "ldr	r8, [%[a], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[9] * B[9] */
        "ldr	r11, [%[a], #36]\n\t"
        "ldr	r12, [%[b], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[8] * B[10] */
        "ldr	r8, [%[a], #32]\n\t"
        "ldr	r9, [%[b], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[7] * B[11] */
        "ldr	r8, [%[a], #28]\n\t"
        "ldr	r9, [%[b], #44]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r], #72]\n\t"
        /* A[8] * B[11] */
        "ldr	r8, [%[a], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[9] * B[10] */
        "ldr	r9, [%[b], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[10] * B[9] */
        "ldr	r8, [%[a], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[11] * B[8] */
        "ldr	r8, [%[a], #44]\n\t"
        "ldr	r9, [%[b], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r], #76]\n\t"
        /* A[11] * B[9] */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[10] * B[10] */
        "ldr	r11, [%[a], #40]\n\t"
        "ldr	r12, [%[b], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r11, r12\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[9] * B[11] */
        "ldr	r8, [%[a], #36]\n\t"
        "ldr	r9, [%[b], #44]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r], #80]\n\t"
        /* A[10] * B[11] */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r11, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r11, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r11, r9\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* A[11] * B[10] */
        "ldr	r8, [%[a], #44]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r12, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r12, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, r8, r12\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r], #84]\n\t"
        /* A[11] * B[11] */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adc	r5, r5, r7\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsr	r7, r9, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "add	r5, r5, r7\n\t"
        "lsl	r7, r9, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adc	r5, r5, r7\n\t"
#else
        "umull	r6, r7, r8, r9\n\t"
        "adds	r4, r4, r6\n\t"
        "adc	r5, r5, r7\n\t"
#endif
        "str	r4, [%[r], #88]\n\t"
        "str	r5, [%[r], #92]\n\t"
        "ldm	sp!, {r3, r4, r5, r6}\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        "ldm	sp!, {r3, r4, r5, r6}\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        "ldm	sp!, {r3, r4, r5, r6}\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12"
    );
}

/* Add b to a into r. (r = a + b)
 *
 * Adds two numbers of 12 32-bit words using one carry chain, four words at a
 * time. r may be the same array as a: each group of four words is loaded from
 * a and b before the matching group of r is stored.
 *
 * r  Result of the addition: 12 words.
 * a  First number to add: 12 words.
 * b  Second number to add: 12 words.
 * returns the carry out of the most significant word: 0 or 1.
 */
static sp_digit sp_3072_add_12(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r12 = 0: used by the final adc to turn the carry flag into a value.
         * mov (without the s suffix) does not change the flags. */
        "mov	r12, #0\n\t"
        /* Words 0..3: adds starts the carry chain. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4..7: ldm/stm leave the flags alone, so adcs carries on from
         * the previous group. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8..11. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r = 0 + 0 + carry. The result pointer is finished with, so its
         * register is reused to hand back the carry. */
        "adc	%[r], r12, r12\n\t"
        /* All three pointers are advanced by the write-back loads/stores,
         * hence they are read-write operands. */
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12"
    );
    /* r holds the carry (0 or 1) at this point, not an address. */
    return (uint32_t)(size_t)r;
}

/* Sub b from a into a. (a -= b)
 *
 * Subtracts 24 32-bit words using one borrow chain, four words at a time.
 *
 * a  A single precision integer and result: 24 words.
 * b  Number to subtract: 24 words.
 * returns 0 when there is no borrow out of the most significant word and
 *         0xffffffff (all bits set) when there is.
 */
static sp_digit sp_3072_sub_in_place_24(sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Words 0..3: a is loaded without write-back and only advanced by the
         * store, so the result lands on the words just read. subs starts the
         * borrow chain. */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "subs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 4..7: ldm/stm leave the flags alone, so sbcs carries on from
         * the previous group. */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 8..11. */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 12..15. */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 16..19. */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Words 20..23. */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* a = r9 - r9 - borrow: 0 with no borrow, all ones with a borrow.
         * The pointer is finished with, so its register carries the result. */
        "sbc	%[a], r9, r9\n\t"
        /* Both pointers are advanced by the write-back loads/stores, hence
         * they are read-write operands. */
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9"
    );
    /* a holds the borrow mask (0 or 0xffffffff) at this point, not an
     * address. */
    return (uint32_t)(size_t)a;
}

/* Add b to a into r. (r = a + b)
 *
 * Adds two numbers of 24 32-bit words using one carry chain, four words at a
 * time. r may be the same array as a: each group of four words is loaded from
 * a and b before the matching group of r is stored.
 *
 * r  Result of the addition: 24 words.
 * a  First number to add: 24 words.
 * b  Second number to add: 24 words.
 * returns the carry out of the most significant word: 0 or 1.
 */
static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r12 = 0: used by the final adc to turn the carry flag into a value.
         * mov (without the s suffix) does not change the flags. */
        "mov	r12, #0\n\t"
        /* Words 0..3: adds starts the carry chain. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4..7: ldm/stm leave the flags alone, so adcs carries on from
         * the previous group. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8..11. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 12..15. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 16..19. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 20..23. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r = 0 + 0 + carry. The result pointer is finished with, so its
         * register is reused to hand back the carry. */
        "adc	%[r], r12, r12\n\t"
        /* All three pointers are advanced by the write-back loads/stores,
         * hence they are read-write operands. */
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12"
    );
    /* r holds the carry (0 or 1) at this point, not an address. */
    return (uint32_t)(size_t)r;
}

/* Mask every word of a 12 word number. (r[i] = a[i] & m)
 *
 * Words are processed from least to most significant. r may be the same
 * array as a; sp_3072_mul_24 calls it that way with a mask of 0 or all ones.
 *
 * r  Result of the masking: 12 words.
 * a  Number to mask: 12 words.
 * m  Mask that is ANDed with each word of a.
 */
static void sp_3072_mask_12(sp_digit* r, const sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    /* Small code size: one word per iteration. */
    while (idx < 12) {
        r[idx] = m & a[idx];
        idx++;
    }
#else
    /* Fully unrolled: no loop overhead. */
    r[ 0] = m & a[ 0];
    r[ 1] = m & a[ 1];
    r[ 2] = m & a[ 2];
    r[ 3] = m & a[ 3];

    r[ 4] = m & a[ 4];
    r[ 5] = m & a[ 5];
    r[ 6] = m & a[ 6];
    r[ 7] = m & a[ 7];

    r[ 8] = m & a[ 8];
    r[ 9] = m & a[ 9];
    r[10] = m & a[10];
    r[11] = m & a[11];
#endif
}

/* Multiply a and b into r. (r = a * b)
 *
 * One level of Karatsuba: three 12 word multiplications instead of four.
 * With W = 2^(12 * 32), a = aH.W + aL and b = bH.W + bL:
 *   a.b = aH.bH.W^2 + ((aL + aH).(bL + bH) - aL.bL - aH.bH).W + aL.bL
 *
 * r must not overlap a or b: parts of r are written while a and b are still
 * being read.
 *
 * r  Result of the multiplication: 48 words.
 * a  First number to multiply: 24 words.
 * b  Second number to multiply: 24 words.
 */
SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit  mid[24];      /* Middle term: (aL + aH).(bL + bH) - lo - hi. */
    sp_digit  sumA[12];     /* aL + aH without its carry. */
    sp_digit  sumB[12];     /* bL + bH without its carry. */
    sp_digit* lo = r;       /* aL.bL is built directly in the result. */
    sp_digit* hi = r + 24;  /* aH.bH is built directly in the result. */
    sp_digit  carryA;
    sp_digit  carryB;
    sp_digit  top;          /* Running word 24 of the middle term. */

    /* Half sums; the carries are the 13th bit-word of each sum. */
    carryA = sp_3072_add_12(sumA, a, a + 12);
    carryB = sp_3072_add_12(sumB, b, b + 12);
    /* carryA.carryB is the contribution at W^2 of the middle product. */
    top = carryA & carryB;

    /* The three half-size products. */
    sp_3072_mul_12(hi, a + 12, b + 12);
    sp_3072_mul_12(lo, a, b);
    sp_3072_mul_12(mid, sumA, sumB);

    /* mid -= lo + hi. A borrow comes back as all ones, i.e. subtracts 1. */
    top += sp_3072_sub_in_place_24(mid, lo);
    top += sp_3072_sub_in_place_24(mid, hi);
    /* Cross terms of the dropped carries: carryB.sumA.W and carryA.sumB.W.
     * Masking with 0 - carry keeps or zeroes the sum without branching. */
    sp_3072_mask_12(sumA, sumA, 0 - carryB);
    top += sp_3072_add_12(&mid[12], &mid[12], sumA);
    sp_3072_mask_12(sumB, sumB, 0 - carryA);
    top += sp_3072_add_12(&mid[12], &mid[12], sumB);

    /* Add the middle term into the result at word 12. */
    top += sp_3072_add_24(&r[12], &r[12], mid);
    /* Propagate what is left into the top 12 words; sumA is reused as a
     * 12 word number holding just that value. */
    sumA[0] = top;
    XMEMSET(&sumA[1], 0, (12 - 1) * sizeof(sp_digit));
    (void)sp_3072_add_12(&r[36], &r[36], sumA);
}

/* Sub b from a into a. (a -= b)
 *
 * Subtracts 48 digits, four at a time, passing the borrow between groups in
 * the processor carry flag.
 *
 * a  A single precision integer and result.
 * b  A single precision integer.
 * returns 0 when there is no borrow out of the last digit and a digit with
 * all bits set when there is.
 */
static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Digits 0..3 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "subs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 4..7 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 8..11 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 12..15 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 16..19 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 20..23 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 24..27 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 28..31 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 32..35 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 36..39 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 40..43 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 44..47 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Borrow out: r9 - r9 - (1 - carry) is 0 or all ones. */
        "sbc	%[a], r9, r9\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        /* subs/sbcs update the flags, so the condition codes are declared as
         * clobbered along with the scratch registers. */
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
    /* The asm left the borrow result in the register that held a. */
    return (uint32_t)(size_t)a;
}

/* Add b to a into r. (r = a + b)
 *
 * Adds 48 digits, four at a time, passing the carry between groups in the
 * processor carry flag.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns the carry out of the last digit: 0 or 1.
 */
static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Zero used to turn the final carry flag into a value. */
        "mov	r12, #0\n\t"
        /* Digits 0..3 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 4..7 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 8..11 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 12..15 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 16..19 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 20..23 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 24..27 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 28..31 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 32..35 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 36..39 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 40..43 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 44..47 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Carry out: 0 + 0 + carry flag. */
        "adc	%[r], r12, r12\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        /* adds/adcs update the flags, so the condition codes are declared as
         * clobbered along with the scratch registers. */
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12",
          "cc"
    );
    /* The asm left the carry in the register that held r. */
    return (uint32_t)(size_t)r;
}

/* AND m into each word of a and store in r.
 *
 * All 24 digits are processed. r and a may refer to the same digits.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * m  Mask to AND against each digit.
 */
static void sp_3072_mask_24(sp_digit* r, const sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    /* Smallest code: one digit per iteration. */
    while (idx < 24) {
        r[idx] = a[idx] & m;
        idx++;
    }
#else
    int base;

    /* Eight digits per iteration. */
    for (base = 0; base != 24; base += 8) {
        const sp_digit* src = &a[base];
        sp_digit* dst = &r[base];

        dst[0] = src[0] & m;
        dst[1] = src[1] & m;
        dst[2] = src[2] & m;
        dst[3] = src[3] & m;
        dst[4] = src[4] & m;
        dst[5] = src[5] & m;
        dst[6] = src[6] & m;
        dst[7] = src[7] & m;
    }
#endif
}

/* Multiply a and b into r. (r = a * b)
 *
 * Karatsuba-style: three 24 digit multiplications are combined into the
 * product of two 48 digit numbers. Digits r[0] to r[95] are written.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 */
SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit mid[48];
    sp_digit sum_a[24];
    sp_digit sum_b[24];
    sp_digit carry_a;
    sp_digit carry_b;
    sp_digit top;

    /* Add the low and high halves of each operand, keeping the carry out. */
    carry_a = sp_3072_add_24(sum_a, &a[0], &a[24]);
    carry_b = sp_3072_add_24(sum_b, &b[0], &b[24]);
    top = carry_a & carry_b;

    /* High halves into the top of r, low halves into the bottom of r and
     * the product of the sums into the temporary. */
    sp_3072_mul_24(&r[48], &a[24], &b[24]);
    sp_3072_mul_24(&r[0], a, b);
    sp_3072_mul_24(mid, sum_a, sum_b);

    /* Take the low and high products away from the middle term. */
    top += sp_3072_sub_in_place_48(mid, &r[0]);
    top += sp_3072_sub_in_place_48(mid, &r[48]);

    /* Account for the carries of the half sums: each sum is added to the
     * upper half of the middle term only when the other sum carried. */
    sp_3072_mask_24(sum_a, sum_a, 0 - carry_b);
    top += sp_3072_add_24(&mid[24], &mid[24], sum_a);
    sp_3072_mask_24(sum_b, sum_b, 0 - carry_a);
    top += sp_3072_add_24(&mid[24], &mid[24], sum_b);

    /* Fold the middle term into the result, offset by 24 digits. */
    top += sp_3072_add_48(&r[24], &r[24], mid);

    /* Propagate the accumulated carry through the top 24 digits. */
    sum_a[0] = top;
    XMEMSET(&sum_a[1], 0, sizeof(sum_a) - sizeof(sum_a[0]));
    (void)sp_3072_add_24(&r[72], &r[72], sum_a);
}

/* Sub b from a into a. (a -= b)
 *
 * Subtracts 96 digits, four at a time, passing the borrow between groups in
 * the processor carry flag.
 *
 * a  A single precision integer and result.
 * b  A single precision integer.
 * returns 0 when there is no borrow out of the last digit and a digit with
 * all bits set when there is.
 */
static sp_digit sp_3072_sub_in_place_96(sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Digits 0..3 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "subs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 4..7 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 8..11 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 12..15 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 16..19 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 20..23 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 24..27 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 28..31 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 32..35 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 36..39 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 40..43 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 44..47 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 48..51 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 52..55 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 56..59 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 60..63 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 64..67 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 68..71 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 72..75 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 76..79 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 80..83 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 84..87 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 88..91 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Digits 92..95 */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Borrow out: r9 - r9 - (1 - carry) is 0 or all ones. */
        "sbc	%[a], r9, r9\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        /* subs/sbcs update the flags, so the condition codes are declared as
         * clobbered along with the scratch registers. */
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
    );
    /* The asm left the borrow result in the register that held a. */
    return (uint32_t)(size_t)a;
}

/* Add b to a into r. (r = a + b)
 *
 * Adds 96 digits, four at a time, passing the carry between groups in the
 * processor carry flag.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns the carry out of the last digit: 0 or 1.
 */
static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Zero used to turn the final carry flag into a value. */
        "mov	r12, #0\n\t"
        /* Digits 0..3 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 4..7 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 8..11 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 12..15 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 16..19 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 20..23 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 24..27 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 28..31 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 32..35 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 36..39 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 40..43 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 44..47 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 48..51 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 52..55 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 56..59 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 60..63 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 64..67 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 68..71 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 72..75 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 76..79 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 80..83 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 84..87 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 88..91 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Digits 92..95 */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "adcs	r3, r3, r7\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Carry out: 0 + 0 + carry flag. */
        "adc	%[r], r12, r12\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        /* adds/adcs update the flags, so the condition codes are declared as
         * clobbered along with the scratch registers. */
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12",
          "cc"
    );
    /* The asm left the carry in the register that held r. */
    return (uint32_t)(size_t)r;
}

/* AND m into each word of a and store in r.
 *
 * All 48 digits are processed. r and a may refer to the same digits.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * m  Mask to AND against each digit.
 */
static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    /* Smallest code: one digit per iteration. */
    while (idx < 48) {
        r[idx] = a[idx] & m;
        idx++;
    }
#else
    int base;

    /* Eight digits per iteration. */
    for (base = 0; base != 48; base += 8) {
        const sp_digit* src = &a[base];
        sp_digit* dst = &r[base];

        dst[0] = src[0] & m;
        dst[1] = src[1] & m;
        dst[2] = src[2] & m;
        dst[3] = src[3] & m;
        dst[4] = src[4] & m;
        dst[5] = src[5] & m;
        dst[6] = src[6] & m;
        dst[7] = src[7] & m;
    }
#endif
}

/* Multiply a and b into r. (r = a * b)
 *
 * Karatsuba-style: three 48 digit multiplications are combined into the
 * product of two 96 digit numbers. Digits r[0] to r[191] are written.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 */
SP_NOINLINE static void sp_3072_mul_96(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit mid[96];
    sp_digit sum_a[48];
    sp_digit sum_b[48];
    sp_digit carry_a;
    sp_digit carry_b;
    sp_digit top;

    /* Add the low and high halves of each operand, keeping the carry out. */
    carry_a = sp_3072_add_48(sum_a, &a[0], &a[48]);
    carry_b = sp_3072_add_48(sum_b, &b[0], &b[48]);
    top = carry_a & carry_b;

    /* High halves into the top of r, low halves into the bottom of r and
     * the product of the sums into the temporary. */
    sp_3072_mul_48(&r[96], &a[48], &b[48]);
    sp_3072_mul_48(&r[0], a, b);
    sp_3072_mul_48(mid, sum_a, sum_b);

    /* Take the low and high products away from the middle term. */
    top += sp_3072_sub_in_place_96(mid, &r[0]);
    top += sp_3072_sub_in_place_96(mid, &r[96]);

    /* Account for the carries of the half sums: each sum is added to the
     * upper half of the middle term only when the other sum carried. */
    sp_3072_mask_48(sum_a, sum_a, 0 - carry_b);
    top += sp_3072_add_48(&mid[48], &mid[48], sum_a);
    sp_3072_mask_48(sum_b, sum_b, 0 - carry_a);
    top += sp_3072_add_48(&mid[48], &mid[48], sum_b);

    /* Fold the middle term into the result, offset by 48 digits. */
    top += sp_3072_add_96(&r[48], &r[48], mid);

    /* Propagate the accumulated carry through the top 48 digits. */
    sum_a[0] = top;
    XMEMSET(&sum_a[1], 0, sizeof(sum_a) - sizeof(sum_a[0]));
    (void)sp_3072_add_48(&r[144], &r[144], sum_a);
}

/* Square a and put result in r. (r = a * a)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 */
static void sp_3072_sqr_12(sp_digit* r, const sp_digit* a)
{
    __asm__ __volatile__ (
        "sub	sp, sp, #48\n\t"
        /* A[0] * A[0] */
        "ldr	r10, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r9, r10, #16\n\t"
        "lsl	r2, r10, #16\n\t"
        "lsr	r2, r2, #16\n\t"
        "mul	r8, r2, r2\n\t"
        "mul	r3, r9, r9\n\t"
        "mul	r2, r9, r2\n\t"
        "lsr	r9, r2, #15\n\t"
        "lsl	r2, r2, #17\n\t"
        "adds	r8, r8, r2\n\t"
        "adc	r3, r3, r9\n\t"
#else
        "umull	r8, r3, r10, r10\n\t"
#endif
        "mov	r4, #0\n\t"
        "str	r8, [sp]\n\t"
        /* A[0] * A[1] */
        "ldr	r10, [%[a], #4]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        "str	r3, [sp, #4]\n\t"
        /* A[0] * A[2] */
        "ldr	r10, [%[a], #8]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adcs	r2, r2, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adcs	r2, r2, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[1] * A[1] */
        "ldr	r10, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [sp, #8]\n\t"
        /* A[0] * A[3] */
        "ldr	r10, [%[a], #12]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[1] * A[2] */
        "ldr	r10, [%[a], #8]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r2, [sp, #12]\n\t"
        /* A[0] * A[4] */
        "ldr	r10, [%[a], #16]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        /* A[1] * A[3] */
        "ldr	r10, [%[a], #12]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        /* A[2] * A[2] */
        "ldr	r10, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        "str	r3, [sp, #16]\n\t"
        /* A[0] * A[5] */
        "ldr	r10, [%[a], #20]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r3, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[1] * A[4] */
        "ldr	r10, [%[a], #16]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[2] * A[3] */
        "ldr	r10, [%[a], #12]\n\t"
        "ldr	r12, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r4, r4, r5\n\t"
        "adcs	r2, r2, r6\n\t"
        "adc	r3, r3, r7\n\t"
        "str	r4, [sp, #20]\n\t"
        /* A[0] * A[6] */
        "ldr	r10, [%[a], #24]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r4, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[1] * A[5] */
        "ldr	r10, [%[a], #20]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[2] * A[4] */
        "ldr	r10, [%[a], #16]\n\t"
        "ldr	r12, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[3] * A[3] */
        "ldr	r10, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "adds	r2, r2, r5\n\t"
        "adcs	r3, r3, r6\n\t"
        "adc	r4, r4, r7\n\t"
        "str	r2, [sp, #24]\n\t"
        /* A[0] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r2, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[1] * A[6] */
        "ldr	r10, [%[a], #24]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[2] * A[5] */
        "ldr	r10, [%[a], #20]\n\t"
        "ldr	r12, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[3] * A[4] */
        "ldr	r10, [%[a], #16]\n\t"
        "ldr	r12, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r3, r3, r5\n\t"
        "adcs	r4, r4, r6\n\t"
        "adc	r2, r2, r7\n\t"
        "str	r3, [sp, #28]\n\t"
        /* A[0] * A[8] */
        "ldr	r10, [%[a], #32]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r3, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[1] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[2] * A[6] */
        "ldr	r10, [%[a], #24]\n\t"
        "ldr	r12, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[3] * A[5] */
        "ldr	r10, [%[a], #20]\n\t"
        "ldr	r12, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[4] * A[4] */
        "ldr	r10, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "adds	r4, r4, r5\n\t"
        "adcs	r2, r2, r6\n\t"
        "adc	r3, r3, r7\n\t"
        "str	r4, [sp, #32]\n\t"
        /* A[0] * A[9] */
        "ldr	r10, [%[a], #36]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r4, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[1] * A[8] */
        "ldr	r10, [%[a], #32]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[2] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
        "ldr	r12, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[3] * A[6] */
        "ldr	r10, [%[a], #24]\n\t"
        "ldr	r12, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[4] * A[5] */
        "ldr	r10, [%[a], #20]\n\t"
        "ldr	r12, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r2, r2, r5\n\t"
        "adcs	r3, r3, r6\n\t"
        "adc	r4, r4, r7\n\t"
        "str	r2, [sp, #36]\n\t"
        /* A[0] * A[10] */
        "ldr	r10, [%[a], #40]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r2, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[1] * A[9] */
        "ldr	r10, [%[a], #36]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[2] * A[8] */
        "ldr	r10, [%[a], #32]\n\t"
        "ldr	r12, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[3] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
        "ldr	r12, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[4] * A[6] */
        "ldr	r10, [%[a], #24]\n\t"
        "ldr	r12, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[5] * A[5] */
        "ldr	r10, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        "adds	r3, r3, r5\n\t"
        "adcs	r4, r4, r6\n\t"
        "adc	r2, r2, r7\n\t"
        "str	r3, [sp, #40]\n\t"
        /* A[0] * A[11] */
        "ldr	r10, [%[a], #44]\n\t"
        "ldr	r12, [%[a]]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r3, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[1] * A[10] */
        "ldr	r10, [%[a], #40]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[2] * A[9] */
        "ldr	r10, [%[a], #36]\n\t"
        "ldr	r12, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[3] * A[8] */
        "ldr	r10, [%[a], #32]\n\t"
        "ldr	r12, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[4] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
        "ldr	r12, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[5] * A[6] */
        "ldr	r10, [%[a], #24]\n\t"
        "ldr	r12, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r4, r4, r5\n\t"
        "adcs	r2, r2, r6\n\t"
        "adc	r3, r3, r7\n\t"
        "str	r4, [sp, #44]\n\t"
        /* A[1] * A[11] */
        "ldr	r10, [%[a], #44]\n\t"
        "ldr	r12, [%[a], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r4, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[2] * A[10] */
        "ldr	r10, [%[a], #40]\n\t"
        "ldr	r12, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[3] * A[9] */
        "ldr	r10, [%[a], #36]\n\t"
        "ldr	r12, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[4] * A[8] */
        "ldr	r10, [%[a], #32]\n\t"
        "ldr	r12, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[5] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
        "ldr	r12, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[6] * A[6] */
        "ldr	r10, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "adds	r2, r2, r5\n\t"
        "adcs	r3, r3, r6\n\t"
        "adc	r4, r4, r7\n\t"
        "str	r2, [%[r], #48]\n\t"
        /* A[2] * A[11] */
        "ldr	r10, [%[a], #44]\n\t"
        "ldr	r12, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r2, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[3] * A[10] */
        "ldr	r10, [%[a], #40]\n\t"
        "ldr	r12, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[4] * A[9] */
        "ldr	r10, [%[a], #36]\n\t"
        "ldr	r12, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[5] * A[8] */
        "ldr	r10, [%[a], #32]\n\t"
        "ldr	r12, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[6] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
        "ldr	r12, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r3, r3, r5\n\t"
        "adcs	r4, r4, r6\n\t"
        "adc	r2, r2, r7\n\t"
        "str	r3, [%[r], #52]\n\t"
        /* A[3] * A[11] */
        "ldr	r10, [%[a], #44]\n\t"
        "ldr	r12, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r3, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[4] * A[10] */
        "ldr	r10, [%[a], #40]\n\t"
        "ldr	r12, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[5] * A[9] */
        "ldr	r10, [%[a], #36]\n\t"
        "ldr	r12, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[6] * A[8] */
        "ldr	r10, [%[a], #32]\n\t"
        "ldr	r12, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[7] * A[7] */
        "ldr	r10, [%[a], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "adds	r4, r4, r5\n\t"
        "adcs	r2, r2, r6\n\t"
        "adc	r3, r3, r7\n\t"
        "str	r4, [%[r], #56]\n\t"
        /* A[4] * A[11] */
        "ldr	r10, [%[a], #44]\n\t"
        "ldr	r12, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r4, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[5] * A[10] */
        "ldr	r10, [%[a], #40]\n\t"
        "ldr	r12, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[6] * A[9] */
        "ldr	r10, [%[a], #36]\n\t"
        "ldr	r12, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[7] * A[8] */
        "ldr	r10, [%[a], #32]\n\t"
        "ldr	r12, [%[a], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r2, r2, r5\n\t"
        "adcs	r3, r3, r6\n\t"
        "adc	r4, r4, r7\n\t"
        "str	r2, [%[r], #60]\n\t"
        /* A[5] * A[11] */
        "ldr	r10, [%[a], #44]\n\t"
        "ldr	r12, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r2, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[6] * A[10] */
        "ldr	r10, [%[a], #40]\n\t"
        "ldr	r12, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[7] * A[9] */
        "ldr	r10, [%[a], #36]\n\t"
        "ldr	r12, [%[a], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[8] * A[8] */
        "ldr	r10, [%[a], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        "adds	r3, r3, r5\n\t"
        "adcs	r4, r4, r6\n\t"
        "adc	r2, r2, r7\n\t"
        "str	r3, [%[r], #64]\n\t"
        /* A[6] * A[11] */
        "ldr	r10, [%[a], #44]\n\t"
        "ldr	r12, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r5, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r8, r5\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r6, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "add	r6, r6, r9\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adc	r6, r6, r9\n\t"
#else
        "umull	r5, r6, r10, r12\n\t"
#endif
        "mov	r3, #0\n\t"
        "mov	r7, #0\n\t"
        /* A[7] * A[10] */
        "ldr	r10, [%[a], #40]\n\t"
        "ldr	r12, [%[a], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        /* A[8] * A[9] */
        "ldr	r10, [%[a], #36]\n\t"
        "ldr	r12, [%[a], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r5, r5, r9\n\t"
        "adcs	r6, r6, #0\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r5, r5, r8\n\t"
        "adcs	r6, r6, r9\n\t"
        "adc	r7, r7, #0\n\t"
#endif
        "adds	r5, r5, r5\n\t"
        "adcs	r6, r6, r6\n\t"
        "adc	r7, r7, r7\n\t"
        "adds	r4, r4, r5\n\t"
        "adcs	r2, r2, r6\n\t"
        "adc	r3, r3, r7\n\t"
        "str	r4, [%[r], #68]\n\t"
        /* A[7] * A[11] */
        "ldr	r10, [%[a], #44]\n\t"
        "ldr	r12, [%[a], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[8] * A[10] */
        "ldr	r10, [%[a], #40]\n\t"
        "ldr	r12, [%[a], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        /* A[9] * A[9] */
        "ldr	r10, [%[a], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r2, [%[r], #72]\n\t"
        /* A[8] * A[11] */
        "ldr	r10, [%[a], #44]\n\t"
        "ldr	r12, [%[a], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "mov	r2, #0\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        /* A[9] * A[10] */
        "ldr	r10, [%[a], #40]\n\t"
        "ldr	r12, [%[a], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
        "adds	r3, r3, r8\n\t"
        "adcs	r4, r4, r9\n\t"
        "adc	r2, r2, #0\n\t"
#endif
        "str	r3, [%[r], #76]\n\t"
        /* A[9] * A[11] */
        "ldr	r10, [%[a], #44]\n\t"
        "ldr	r12, [%[a], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r4, r4, r9\n\t"
        "adcs	r2, r2, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r9\n\t"
        "adcs	r2, r2, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        /* A[10] * A[10] */
        "ldr	r10, [%[a], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r4, r4, r8\n\t"
        "adcs	r2, r2, r9\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r], #80]\n\t"
        /* A[10] * A[11] */
        "ldr	r10, [%[a], #44]\n\t"
        "ldr	r12, [%[a], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r9\n\t"
        "adcs	r3, r3, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r8, r10, #16\n\t"
        "lsr	r9, r12, #16\n\t"
        "mul	r9, r8, r9\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r9, r12, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #16\n\t"
        "lsl	r8, r8, #16\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r8, r9, r10, r12\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "adds	r2, r2, r8\n\t"
        "adcs	r3, r3, r9\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r2, [%[r], #84]\n\t"
        /* A[11] * A[11] */
        "ldr	r10, [%[a], #44]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r8, r10, #16\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mov	r12, r8\n\t"
        "mul	r8, r12, r8\n\t"
        "mov	r12, r9\n\t"
        "mul	r9, r12, r9\n\t"
        "adds	r3, r3, r8\n\t"
        "adc	r4, r4, r9\n\t"
        "lsr	r9, r10, #16\n\t"
        "lsl	r8, r10, #16\n\t"
        "lsr	r8, r8, #16\n\t"
        "mul	r8, r9, r8\n\t"
        "lsr	r9, r8, #15\n\t"
        "lsl	r8, r8, #17\n\t"
        "adds	r3, r3, r8\n\t"
        "adc	r4, r4, r9\n\t"
#else
        "umull	r8, r9, r10, r10\n\t"
        "adds	r3, r3, r8\n\t"
        "adc	r4, r4, r9\n\t"
#endif
        "str	r3, [%[r], #88]\n\t"
        "str	r4, [%[r], #92]\n\t"
        "ldm	sp!, {r2, r3, r4, r8}\n\t"
        "stm	%[r]!, {r2, r3, r4, r8}\n\t"
        "ldm	sp!, {r2, r3, r4, r8}\n\t"
        "stm	%[r]!, {r2, r3, r4, r8}\n\t"
        "ldm	sp!, {r2, r3, r4, r8}\n\t"
        "stm	%[r]!, {r2, r3, r4, r8}\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12"
    );
}

/* Sub b from a into r. (r = a - b)
 *
 * Fully unrolled: 12 words are processed in three groups of four, with the
 * borrow passed from one group to the next in the carry flag.
 *
 * r  A single precision integer. Receives the 12 word difference.
 * a  A single precision integer. The value to subtract from.
 * b  A single precision integer. The value to subtract.
 *
 * Returns 0 when the subtraction did not borrow out of the top word and a
 * word with all bits set when it did.
 */
static sp_digit sp_3072_sub_12(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Words 0..3: SUBS starts the borrow chain. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "subs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4..7: SBCS continues the chain; LDM/STM leave flags alone. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8..11. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r6 - r6 cancels, so only the borrow remains: 0 or all ones. The
         * result pointer register is reused to carry this value out. */
        "sbc	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
    );
    /* r no longer holds an address here - it holds the borrow mask. */
    return (uint32_t)(size_t)r;
}

/* Square a and put result in r. (r = a * a)
 *
 * The input is handled as a low and a high half of 12 words each. Three
 * 12 word squarings are performed - high half, low half and the absolute
 * difference of the halves - and then combined into the result.
 * The helper routines used are defined elsewhere in this file.
 *
 * r  A single precision integer. Receives the result.
 * a  A single precision integer. The value to square.
 */
SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a)
{
    sp_digit  mid[24];
    sp_digit  pad[12];
    sp_digit* diff = mid;
    sp_digit* lo = r;
    sp_digit* hi = r + 24;
    sp_digit* minuend;
    sp_digit* subtrahend;
    sp_digit  neg;
    sp_digit  carry = 0;
    sp_digit  pad_addr;
    sp_digit  diff_addr;

    XMEMSET(pad, 0, sizeof(pad));

    /* diff = low half - high half; neg is all ones when that went negative. */
    neg = sp_3072_sub_12(diff, a, a + 12);

    /* Branch-free operand selection so that the second subtraction yields
     * the absolute value: (diff - 0) when not negative, (0 - diff) when
     * negative. */
    pad_addr = (sp_digit)pad;
    diff_addr = (sp_digit)diff;
    minuend = (sp_digit*)(diff_addr ^ ((diff_addr ^ pad_addr) & neg));
    subtrahend = (sp_digit*)(pad_addr ^ ((pad_addr ^ diff_addr) & neg));
    (void)sp_3072_sub_12(diff, minuend, subtrahend);

    /* High half is squared before the low half: writing the low part of r
     * first would be wrong if r overlaps a. */
    sp_3072_sqr_12(hi, a + 12);
    sp_3072_sqr_12(lo, a);
    sp_3072_sqr_12(mid, diff);

    /* mid = diff^2 - hi^2 - lo^2, then take it away from the middle of r.
     * The borrows of the three steps are netted into one carry word. */
    carry -= sp_3072_sub_in_place_24(mid, hi);
    carry -= sp_3072_sub_in_place_24(mid, lo);
    carry += sp_3072_sub_in_place_24(r + 12, mid);

    /* Propagate the net carry into the top 12 words of the result. */
    pad[0] = carry;
    (void)sp_3072_add_12(r + 36, r + 36, pad);
}

/* Sub b from a into r. (r = a - b)
 *
 * Fully unrolled: 24 words are processed in six groups of four, with the
 * borrow passed from one group to the next in the carry flag.
 *
 * r  A single precision integer. Receives the 24 word difference.
 * a  A single precision integer. The value to subtract from.
 * b  A single precision integer. The value to subtract.
 *
 * Returns 0 when the subtraction did not borrow out of the top word and a
 * word with all bits set when it did.
 */
static sp_digit sp_3072_sub_24(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Words 0..3: SUBS starts the borrow chain. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "subs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4..7: SBCS continues the chain; LDM/STM leave flags alone. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8..11. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 12..15. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 16..19. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 20..23. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r6 - r6 cancels, so only the borrow remains: 0 or all ones. The
         * result pointer register is reused to carry this value out. */
        "sbc	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
    );
    /* r no longer holds an address here - it holds the borrow mask. */
    return (uint32_t)(size_t)r;
}

/* Square a and put result in r. (r = a * a)
 *
 * The input is handled as a low and a high half of 24 words each. Three
 * 24 word squarings are performed - high half, low half and the absolute
 * difference of the halves - and then combined into the result.
 * The helper routines used are defined elsewhere in this file.
 *
 * r  A single precision integer. Receives the result.
 * a  A single precision integer. The value to square.
 */
SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
{
    sp_digit  mid[48];
    sp_digit  pad[24];
    sp_digit* diff = mid;
    sp_digit* lo = r;
    sp_digit* hi = r + 48;
    sp_digit* minuend;
    sp_digit* subtrahend;
    sp_digit  neg;
    sp_digit  carry = 0;
    sp_digit  pad_addr;
    sp_digit  diff_addr;

    XMEMSET(pad, 0, sizeof(pad));

    /* diff = low half - high half; neg is all ones when that went negative. */
    neg = sp_3072_sub_24(diff, a, a + 24);

    /* Branch-free operand selection so that the second subtraction yields
     * the absolute value: (diff - 0) when not negative, (0 - diff) when
     * negative. */
    pad_addr = (sp_digit)pad;
    diff_addr = (sp_digit)diff;
    minuend = (sp_digit*)(diff_addr ^ ((diff_addr ^ pad_addr) & neg));
    subtrahend = (sp_digit*)(pad_addr ^ ((pad_addr ^ diff_addr) & neg));
    (void)sp_3072_sub_24(diff, minuend, subtrahend);

    /* High half is squared before the low half: writing the low part of r
     * first would be wrong if r overlaps a. */
    sp_3072_sqr_24(hi, a + 24);
    sp_3072_sqr_24(lo, a);
    sp_3072_sqr_24(mid, diff);

    /* mid = diff^2 - hi^2 - lo^2, then take it away from the middle of r.
     * The borrows of the three steps are netted into one carry word. */
    carry -= sp_3072_sub_in_place_48(mid, hi);
    carry -= sp_3072_sub_in_place_48(mid, lo);
    carry += sp_3072_sub_in_place_48(r + 24, mid);

    /* Propagate the net carry into the top 24 words of the result. */
    pad[0] = carry;
    (void)sp_3072_add_24(r + 72, r + 72, pad);
}

/* Sub b from a into r. (r = a - b)
 *
 * Fully unrolled: 48 words are processed in twelve groups of four, with the
 * borrow passed from one group to the next in the carry flag.
 *
 * r  A single precision integer. Receives the 48 word difference.
 * a  A single precision integer. The value to subtract from.
 * b  A single precision integer. The value to subtract.
 *
 * Returns 0 when the subtraction did not borrow out of the top word and a
 * word with all bits set when it did.
 */
static sp_digit sp_3072_sub_48(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Words 0..3: SUBS starts the borrow chain. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "subs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 4..7: SBCS continues the chain; LDM/STM leave flags alone. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 8..11. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 12..15. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 16..19. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 20..23. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 24..27. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 28..31. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 32..35. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 36..39. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 40..43. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* Words 44..47. */
        "ldm	%[a]!, {r3, r4, r5, r6}\n\t"
        "ldm	%[b]!, {r7, r8, r9, r10}\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "sbcs	r6, r6, r10\n\t"
        "stm	%[r]!, {r3, r4, r5, r6}\n\t"
        /* r6 - r6 cancels, so only the borrow remains: 0 or all ones. The
         * result pointer register is reused to carry this value out. */
        "sbc	%[r], r6, r6\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
    );
    /* r no longer holds an address here - it holds the borrow mask. */
    return (uint32_t)(size_t)r;
}

/* Square a and put result in r. (r = a * a)
 *
 * The input is handled as a low and a high half of 48 words each. Three
 * 48 word squarings are performed - high half, low half and the absolute
 * difference of the halves - and then combined into the result.
 * The helper routines used are defined elsewhere in this file.
 *
 * r  A single precision integer. Receives the result.
 * a  A single precision integer. The value to square.
 */
SP_NOINLINE static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
{
    sp_digit  mid[96];
    sp_digit  pad[48];
    sp_digit* diff = mid;
    sp_digit* lo = r;
    sp_digit* hi = r + 96;
    sp_digit* minuend;
    sp_digit* subtrahend;
    sp_digit  neg;
    sp_digit  carry = 0;
    sp_digit  pad_addr;
    sp_digit  diff_addr;

    XMEMSET(pad, 0, sizeof(pad));

    /* diff = low half - high half; neg is all ones when that went negative. */
    neg = sp_3072_sub_48(diff, a, a + 48);

    /* Branch-free operand selection so that the second subtraction yields
     * the absolute value: (diff - 0) when not negative, (0 - diff) when
     * negative. */
    pad_addr = (sp_digit)pad;
    diff_addr = (sp_digit)diff;
    minuend = (sp_digit*)(diff_addr ^ ((diff_addr ^ pad_addr) & neg));
    subtrahend = (sp_digit*)(pad_addr ^ ((pad_addr ^ diff_addr) & neg));
    (void)sp_3072_sub_48(diff, minuend, subtrahend);

    /* High half is squared before the low half: writing the low part of r
     * first would be wrong if r overlaps a. */
    sp_3072_sqr_48(hi, a + 48);
    sp_3072_sqr_48(lo, a);
    sp_3072_sqr_48(mid, diff);

    /* mid = diff^2 - hi^2 - lo^2, then take it away from the middle of r.
     * The borrows of the three steps are netted into one carry word. */
    carry -= sp_3072_sub_in_place_96(mid, hi);
    carry -= sp_3072_sub_in_place_96(mid, lo);
    carry += sp_3072_sub_in_place_96(r + 48, mid);

    /* Propagate the net carry into the top 48 words of the result. */
    pad[0] = carry;
    (void)sp_3072_add_48(r + 144, r + 144, pad);
}

#endif /* !WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Add b to a into r. (r = a + b)
 *
 * Size-optimised variant: a loop that adds four words per iteration until
 * 0x180 bytes (96 words) of a have been consumed. The carry is kept in a
 * register between iterations because the loop's compare overwrites the
 * flags.
 *
 * r  A single precision integer. Receives the 96 word sum.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * Returns the carry out of the top word: 0 or 1.
 */
static sp_digit sp_3072_add_96(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r3 = saved carry (0 or 1), r12 = address one past the end of a. */
        "mov	r3, #0\n\t"
        "add	r12, %[a], #0x180\n\t"
        "\n"
    "L_sp_3072_add_96_word_%=: \n\t"
        /* Adding -1 to r3 sets the carry flag exactly when r3 was 1, which
         * puts the saved carry back into the flag. */
        "adds	r3, r3, #-1\n\t"
        "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
        "ldm	%[b]!, {r8, r9, r10, r11}\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "adcs	r7, r7, r11\n\t"
        "stm	%[r]!, {r4, r5, r6, r7}\n\t"
        /* Save the carry flag as 0/1 in r3 before CMP changes the flags. */
        "mov	r4, #0\n\t"
        "adc	r3, r4, #0\n\t"
        "cmp	%[a], r12\n\t"
        "bne	L_sp_3072_add_96_word_%=\n\t"
        /* The result pointer register is reused to pass the carry out. */
        "mov	%[r], r3\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12"
    );
    /* r no longer holds an address here - it holds the carry. */
    return (uint32_t)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into a. (a -= b)
 *
 * Size-optimised variant: a loop that subtracts four words per iteration
 * until 0x180 bytes (96 words) of a have been processed. The borrow is kept
 * in a register between iterations because the loop's compare overwrites
 * the flags.
 *
 * a  A single precision integer. Updated in place with the difference.
 * b  A single precision integer. The value to subtract.
 *
 * Returns 0 when the subtraction did not borrow out of the top word and a
 * word with all bits set when it did.
 */
static sp_digit sp_3072_sub_in_place_96(sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r10 = constant zero, r12 = saved borrow (0 or all ones),
         * lr = address one past the end of a. */
        "mov	r10, #0\n\t"
        "mov	r12, #0\n\t"
        "add	lr, %[a], #0x180\n\t"
        "\n"
    "L_sp_3072_sub_in_pkace_96_word_%=: \n\t"
        /* 0 - r12 leaves the carry flag set when r12 was 0 (no borrow) and
         * clear when r12 was all ones, restoring the saved borrow. */
        "subs	r12, r10, r12\n\t"
        /* a is read without write-back; the STM below advances it. */
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* Save the borrow as 0 or all ones before CMP changes the flags. */
        "sbc	r12, r10, r10\n\t"
        "cmp	%[a], lr\n\t"
        "bne	L_sp_3072_sub_in_pkace_96_word_%=\n\t"
        /* The pointer register is reused to pass the borrow out. */
        "mov	%[a], r12\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10"
    );
    /* a no longer holds an address here - it holds the borrow mask. */
    return (uint32_t)(size_t)a;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Size-optimised variant that builds the product one result word (column)
 * at a time: for each column every a[i] * b[j] with i + j equal to the
 * column index is added into a three word accumulator, then the low word
 * is stored and the accumulator is shifted down by one word.
 *
 * The product is accumulated in a 0x300 byte (192 word) buffer taken from
 * the stack and is only copied to r once all columns are done.
 *
 * r  A single precision integer. Receives the 192 word product.
 * a  A single precision integer. 96 words are read.
 * b  A single precision integer. 96 words are read.
 */
static void sp_3072_mul_96(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Reserve the temporary result buffer. sp is restored by the
         * write-back loads in the copy loop at the end. */
        "sub	sp, sp, #0x300\n\t"
        /* r5 = byte offset of the current column,
         * r6:r7:r8 = accumulator (low, middle, high). */
        "mov	r5, #0\n\t"
        "mov	r6, #0\n\t"
        "mov	r7, #0\n\t"
        "mov	r8, #0\n\t"
        "\n"
    "L_sp_3072_mul_96_outer_%=: \n\t"
        /* r3 = byte offset into a: column - 0x17c, or 0 when that would be
         * negative. r4 = byte offset into b so that r3 + r4 = column. */
        "subs	r3, r5, #0x17c\n\t"
        "it	cc\n\t"
        "movcc	r3, #0\n\t"
        "sub	r4, r5, r3\n\t"
        "\n"
    "L_sp_3072_mul_96_inner_%=: \n\t"
        "ldr	lr, [%[a], r3]\n\t"
        "ldr	r11, [%[b], r4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        /* Path without UMULL: the 64-bit product is formed from four
         * 16x16-bit partial products, each added into the accumulator. */
        /* low(a) * low(b) */
        "lsl	r9, lr, #16\n\t"
        "lsl	r10, r11, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r6, r6, r10\n\t"
        "adcs	r7, r7, #0\n\t"
        "adc	r8, r8, #0\n\t"
        /* low(a) * high(b), added in shifted left by 16 bits */
        "lsr	r10, r11, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        /* high(a) * high(b), added in one word up */
        "lsr	r9, lr, #16\n\t"
        "lsr	r10, r11, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        /* high(a) * low(b), added in shifted left by 16 bits */
        "lsl	r10, r11, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#else
        /* r10:r9 = a word * b word, added into the accumulator. */
        "umull	r9, r10, lr, r11\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#endif
        /* Next pair in this column: a offset up, b offset down. Stop when
         * a is exhausted (0x180) or its offset passes the column offset. */
        "add	r3, r3, #4\n\t"
        "sub	r4, r4, #4\n\t"
        "cmp	r3, #0x180\n\t"
        "beq	L_sp_3072_mul_96_inner_done_%=\n\t"
        "cmp	r3, r5\n\t"
        "ble	L_sp_3072_mul_96_inner_%=\n\t"
        "\n"
    "L_sp_3072_mul_96_inner_done_%=: \n\t"
        /* Store the finished column and shift the accumulator down a word. */
        "str	r6, [sp, r5]\n\t"
        "mov	r6, r7\n\t"
        "mov	r7, r8\n\t"
        "mov	r8, #0\n\t"
        "add	r5, r5, #4\n\t"
        "cmp	r5, #0x2f8\n\t"
        "ble	L_sp_3072_mul_96_outer_%=\n\t"
        /* Top word (offset 0x2fc) is whatever is left in the accumulator. */
        "str	r6, [sp, r5]\n\t"
        "\n"
    "L_sp_3072_mul_96_store_%=: \n\t"
        /* Copy the buffer to r four words at a time; the write-back on sp
         * releases the buffer as it is read. r5 counts down from 0x2fc. */
        "ldm	sp!, {r6, r7, r8, r9}\n\t"
        "stm	%[r]!, {r6, r7, r8, r9}\n\t"
        "subs	r5, r5, #16\n\t"
        "bgt	L_sp_3072_mul_96_store_%=\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11"
    );
}

/* Square a and put result in r. (r = a * a)
 *
 * Column-by-column squaring written in ARM32 inline assembly. For each
 * result word the code sums every product a[i] * a[j] whose byte offsets add
 * up to the column offset: off-diagonal products (i != j) are added twice,
 * the diagonal product (i == j) once. The result is built in a 0x300 byte
 * temporary area carved from the stack and is copied to r only after all
 * columns have been computed.
 *
 * r  A single precision integer. 192 words (0x300 bytes) are written.
 * a  A single precision integer. 96 words (byte offsets 0..0x17c) are read.
 */
static void sp_3072_sqr_96(sp_digit* r, const sp_digit* a)
{
    __asm__ __volatile__ (
        /* Reserve 0x300 bytes of stack for the temporary result. */
        "sub	sp, sp, #0x300\n\t"
        /* r12 = constant 0, r6:r7:r8 = three word column accumulator,
         * r5 = byte offset of the current result column. */
        "mov	r12, #0\n\t"
        "mov	r6, #0\n\t"
        "mov	r7, #0\n\t"
        "mov	r8, #0\n\t"
        "mov	r5, #0\n\t"
        "\n"
    "L_sp_3072_sqr_96_outer_%=: \n\t"
        /* r3 = max(r5 - 0x17c, 0): lowest operand offset for this column.
         * r4 = r5 - r3: the matching offset so that r3 + r4 == r5. */
        "subs	r3, r5, #0x17c\n\t"
        "it	cc\n\t"
        "movcc	r3, r12\n\t"
        "sub	r4, r5, r3\n\t"
        "\n"
    "L_sp_3072_sqr_96_inner_%=: \n\t"
        /* Equal offsets mean this is the diagonal term a[i] * a[i]. */
        "cmp	r4, r3\n\t"
        "beq	L_sp_3072_sqr_96_op_sqr_%=\n\t"
        /* Off-diagonal term: accumulate a[r3] * a[r4] twice. */
        "ldr	lr, [%[a], r3]\n\t"
        "ldr	r11, [%[a], r4]\n\t"
        /* When WOLFSSL_SP_ARM_ARCH < 4 the 32x32 product is assembled from
         * four 16x16 partial products using mul; otherwise umull is used. */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r9, lr, #16\n\t"
        "lsl	r10, r11, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r6, r6, r10\n\t"
        "adcs	r7, r7, #0\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r10\n\t"
        "adcs	r7, r7, #0\n\t"
        "adc	r8, r8, #0\n\t"
        "lsr	r10, r11, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "lsr	r9, lr, #16\n\t"
        "lsr	r10, r11, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "lsl	r10, r11, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#else
        "umull	r9, r10, lr, r11\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#endif
        "bal	L_sp_3072_sqr_96_op_done_%=\n\t"
        "\n"
    "L_sp_3072_sqr_96_op_sqr_%=: \n\t"
        /* Diagonal term: accumulate a[r3] * a[r3] once. */
        "ldr	lr, [%[a], r3]\n\t"
        /* Pre-v4 path: lo*lo and hi*hi are added directly; the cross product
         * lo*hi is shifted by 17 rather than 16, which doubles it. */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r9, lr, #16\n\t"
        "lsr	r10, lr, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mov	r11, r9\n\t"
        "mul	r9, r11, r9\n\t"
        "mov	r11, r10\n\t"
        "mul	r10, r11, r10\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "lsr	r10, lr, #16\n\t"
        "lsl	r9, lr, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #15\n\t"
        "lsl	r9, r9, #17\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#else
        "umull	r9, r10, lr, lr\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#endif
        "\n"
    "L_sp_3072_sqr_96_op_done_%=: \n\t"
        /* Step to the next pair. The column is finished when r3 reaches
         * 0x180 (one past the last word of a), when r3 has moved above r4
         * (the remaining pairs are mirrors of ones already added twice), or
         * when r3 exceeds r5. */
        "add	r3, r3, #4\n\t"
        "sub	r4, r4, #4\n\t"
        "cmp	r3, #0x180\n\t"
        "beq	L_sp_3072_sqr_96_inner_done_%=\n\t"
        "cmp	r3, r4\n\t"
        "bgt	L_sp_3072_sqr_96_inner_done_%=\n\t"
        "cmp	r3, r5\n\t"
        "ble	L_sp_3072_sqr_96_inner_%=\n\t"
        "\n"
    "L_sp_3072_sqr_96_inner_done_%=: \n\t"
        /* Store the finished column word in the stack area and shift the
         * accumulator down by one word for the next column. */
        "str	r6, [sp, r5]\n\t"
        "mov	r6, r7\n\t"
        "mov	r7, r8\n\t"
        "mov	r8, #0\n\t"
        "add	r5, r5, #4\n\t"
        "cmp	r5, #0x2f8\n\t"
        "ble	L_sp_3072_sqr_96_outer_%=\n\t"
        /* r5 is now 0x2fc: store the final (top) result word. */
        "str	r6, [sp, r5]\n\t"
        "\n"
    "L_sp_3072_sqr_96_store_%=: \n\t"
        /* Copy the stack area to r four words at a time. r5 starts at 0x2fc
         * and drops by 16 per pass, giving 48 passes (0x300 bytes), so the
         * post-incrementing ldm also returns sp to its value on entry. */
        "ldm	sp!, {r6, r7, r8, r9}\n\t"
        "stm	%[r]!, {r6, r7, r8, r9}\n\t"
        "subs	r5, r5, #16\n\t"
        "bgt	L_sp_3072_sqr_96_store_%=\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "r12"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
#ifdef WOLFSSL_SP_SMALL
/* Apply the mask m to every one of the 48 words of a, writing the masked
 * words to r. (r[i] = a[i] & m)
 *
 * r  A single precision integer. Receives the 48 masked words.
 * a  A single precision integer. Source of the 48 words.
 * m  Mask to AND against each digit.
 */
static void sp_3072_mask_48(sp_digit* r, const sp_digit* a, sp_digit m)
{
    int idx = 0;

    /* Every word is processed unconditionally; the mask alone decides
     * whether the word is kept or cleared. */
    while (idx < 48) {
        r[idx] = a[idx] & m;
        idx++;
    }
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Add b to a into r. (r = a + b)
 *
 * Adds 48 words (0xc0 bytes) four words per loop pass, carrying between
 * passes through r3 because the loop's cmp overwrites the carry flag.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * Returns the carry out of the top word: 0 or 1.
 */
static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r3 = saved carry (0 or 1), r12 = a + 0xc0, the end-of-input
         * pointer. */
        "mov	r3, #0\n\t"
        "add	r12, %[a], #0xc0\n\t"
        "\n"
    "L_sp_3072_add_48_word_%=: \n\t"
        /* r3 + 0xffffffff sets the carry flag exactly when r3 is 1, which
         * restores the carry saved by the previous pass. */
        "adds	r3, r3, #-1\n\t"
        "ldm	%[a]!, {r4, r5, r6, r7}\n\t"
        "ldm	%[b]!, {r8, r9, r10, r11}\n\t"
        "adcs	r4, r4, r8\n\t"
        "adcs	r5, r5, r9\n\t"
        "adcs	r6, r6, r10\n\t"
        "adcs	r7, r7, r11\n\t"
        "stm	%[r]!, {r4, r5, r6, r7}\n\t"
        /* Capture the carry flag in r3 before cmp overwrites it. */
        "mov	r4, #0\n\t"
        "adc	r3, r4, #0\n\t"
        "cmp	%[a], r12\n\t"
        "bne	L_sp_3072_add_48_word_%=\n\t"
        /* Hand the final carry back through the r operand. */
        "mov	%[r], r3\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r3", "r12"
    );
    /* r no longer holds a pointer here; it holds the carry value. */
    return (uint32_t)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into a. (a -= b)
 *
 * Subtracts 48 words (0xc0 bytes) four words per loop pass, carrying the
 * borrow between passes through r12 because the loop's cmp overwrites the
 * carry flag.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * Returns 0 when there is no borrow out of the top word, or all ones
 * (0xffffffff) when there is.
 */
static sp_digit sp_3072_sub_in_place_48(sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* r10 = constant 0, r12 = saved borrow (0 or all ones),
         * lr = a + 0xc0, the end-of-input pointer. */
        "mov	r10, #0\n\t"
        "mov	r12, #0\n\t"
        "add	lr, %[a], #0xc0\n\t"
        "\n"
    "L_sp_3072_sub_in_pkace_48_word_%=: \n\t"
        /* 0 - r12 leaves the carry flag set (no borrow) when r12 is 0 and
         * clear (borrow) when r12 is all ones, restoring the saved borrow. */
        "subs	r12, r10, r12\n\t"
        "ldm	%[a], {r2, r3, r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7, r8, r9}\n\t"
        "sbcs	r2, r2, r6\n\t"
        "sbcs	r3, r3, r7\n\t"
        "sbcs	r4, r4, r8\n\t"
        "sbcs	r5, r5, r9\n\t"
        "stm	%[a]!, {r2, r3, r4, r5}\n\t"
        /* 0 - 0 - borrow: r12 becomes 0 or all ones before cmp overwrites
         * the flags. */
        "sbc	r12, r10, r10\n\t"
        "cmp	%[a], lr\n\t"
        "bne	L_sp_3072_sub_in_pkace_48_word_%=\n\t"
        /* Hand the final borrow mask back through the a operand. */
        "mov	%[a], r12\n\t"
        : [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r12", "lr", "r10"
    );
    /* a no longer holds a pointer here; it holds the borrow mask. */
    return (uint32_t)(size_t)a;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Column-by-column multiplication written in ARM32 inline assembly. For each
 * result word the code sums every product a[i] * b[j] whose byte offsets add
 * up to the column offset. The result is built in a 0x180 byte temporary
 * area carved from the stack and is copied to r only after all columns have
 * been computed.
 *
 * r  A single precision integer. 96 words (0x180 bytes) are written.
 * a  A single precision integer. 48 words (byte offsets 0..0xbc) are read.
 * b  A single precision integer. 48 words (byte offsets 0..0xbc) are read.
 */
static void sp_3072_mul_48(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    __asm__ __volatile__ (
        /* Reserve 0x180 bytes of stack for the temporary result. */
        "sub	sp, sp, #0x180\n\t"
        /* r5 = byte offset of the current result column,
         * r6:r7:r8 = three word column accumulator. */
        "mov	r5, #0\n\t"
        "mov	r6, #0\n\t"
        "mov	r7, #0\n\t"
        "mov	r8, #0\n\t"
        "\n"
    "L_sp_3072_mul_48_outer_%=: \n\t"
        /* r3 = max(r5 - 0xbc, 0): lowest offset into a for this column.
         * r4 = r5 - r3: the matching offset into b so that r3 + r4 == r5. */
        "subs	r3, r5, #0xbc\n\t"
        "it	cc\n\t"
        "movcc	r3, #0\n\t"
        "sub	r4, r5, r3\n\t"
        "\n"
    "L_sp_3072_mul_48_inner_%=: \n\t"
        /* Accumulate a[r3] * b[r4] into r6:r7:r8. */
        "ldr	lr, [%[a], r3]\n\t"
        "ldr	r11, [%[b], r4]\n\t"
        /* When WOLFSSL_SP_ARM_ARCH < 4 the 32x32 product is assembled from
         * four 16x16 partial products using mul; otherwise umull is used. */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r9, lr, #16\n\t"
        "lsl	r10, r11, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r6, r6, r10\n\t"
        "adcs	r7, r7, #0\n\t"
        "adc	r8, r8, #0\n\t"
        "lsr	r10, r11, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "lsr	r9, lr, #16\n\t"
        "lsr	r10, r11, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "lsl	r10, r11, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#else
        "umull	r9, r10, lr, r11\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#endif
        /* Step to the next pair. The column is finished when r3 reaches
         * 0xc0 (one past the last word of a) or when r3 exceeds r5. */
        "add	r3, r3, #4\n\t"
        "sub	r4, r4, #4\n\t"
        "cmp	r3, #0xc0\n\t"
        "beq	L_sp_3072_mul_48_inner_done_%=\n\t"
        "cmp	r3, r5\n\t"
        "ble	L_sp_3072_mul_48_inner_%=\n\t"
        "\n"
    "L_sp_3072_mul_48_inner_done_%=: \n\t"
        /* Store the finished column word in the stack area and shift the
         * accumulator down by one word for the next column. */
        "str	r6, [sp, r5]\n\t"
        "mov	r6, r7\n\t"
        "mov	r7, r8\n\t"
        "mov	r8, #0\n\t"
        "add	r5, r5, #4\n\t"
        "cmp	r5, #0x178\n\t"
        "ble	L_sp_3072_mul_48_outer_%=\n\t"
        /* r5 is now 0x17c: store the final (top) result word. */
        "str	r6, [sp, r5]\n\t"
        "\n"
    "L_sp_3072_mul_48_store_%=: \n\t"
        /* Copy the stack area to r four words at a time. r5 starts at 0x17c
         * and drops by 16 per pass, giving 24 passes (0x180 bytes), so the
         * post-incrementing ldm also returns sp to its value on entry. */
        "ldm	sp!, {r6, r7, r8, r9}\n\t"
        "stm	%[r]!, {r6, r7, r8, r9}\n\t"
        "subs	r5, r5, #16\n\t"
        "bgt	L_sp_3072_mul_48_store_%=\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11"
    );
}

/* Square a and put result in r. (r = a * a)
 *
 * Column-by-column squaring written in ARM32 inline assembly. For each
 * result word the code sums every product a[i] * a[j] whose byte offsets add
 * up to the column offset: off-diagonal products (i != j) are added twice,
 * the diagonal product (i == j) once. The result is built in a 0x180 byte
 * temporary area carved from the stack and is copied to r only after all
 * columns have been computed.
 *
 * r  A single precision integer. 96 words (0x180 bytes) are written.
 * a  A single precision integer. 48 words (byte offsets 0..0xbc) are read.
 */
static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
{
    __asm__ __volatile__ (
        /* Reserve 0x180 bytes of stack for the temporary result. */
        "sub	sp, sp, #0x180\n\t"
        /* r12 = constant 0, r6:r7:r8 = three word column accumulator,
         * r5 = byte offset of the current result column. */
        "mov	r12, #0\n\t"
        "mov	r6, #0\n\t"
        "mov	r7, #0\n\t"
        "mov	r8, #0\n\t"
        "mov	r5, #0\n\t"
        "\n"
    "L_sp_3072_sqr_48_outer_%=: \n\t"
        /* r3 = max(r5 - 0xbc, 0): lowest operand offset for this column.
         * r4 = r5 - r3: the matching offset so that r3 + r4 == r5. */
        "subs	r3, r5, #0xbc\n\t"
        "it	cc\n\t"
        "movcc	r3, r12\n\t"
        "sub	r4, r5, r3\n\t"
        "\n"
    "L_sp_3072_sqr_48_inner_%=: \n\t"
        /* Equal offsets mean this is the diagonal term a[i] * a[i]. */
        "cmp	r4, r3\n\t"
        "beq	L_sp_3072_sqr_48_op_sqr_%=\n\t"
        /* Off-diagonal term: accumulate a[r3] * a[r4] twice. */
        "ldr	lr, [%[a], r3]\n\t"
        "ldr	r11, [%[a], r4]\n\t"
        /* When WOLFSSL_SP_ARM_ARCH < 4 the 32x32 product is assembled from
         * four 16x16 partial products using mul; otherwise umull is used. */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r9, lr, #16\n\t"
        "lsl	r10, r11, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r6, r6, r10\n\t"
        "adcs	r7, r7, #0\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r10\n\t"
        "adcs	r7, r7, #0\n\t"
        "adc	r8, r8, #0\n\t"
        "lsr	r10, r11, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "lsr	r9, lr, #16\n\t"
        "lsr	r10, r11, #16\n\t"
        "mul	r10, r9, r10\n\t"
        "adds	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "lsl	r10, r11, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #16\n\t"
        "lsl	r9, r9, #16\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#else
        "umull	r9, r10, lr, r11\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#endif
        "bal	L_sp_3072_sqr_48_op_done_%=\n\t"
        "\n"
    "L_sp_3072_sqr_48_op_sqr_%=: \n\t"
        /* Diagonal term: accumulate a[r3] * a[r3] once. */
        "ldr	lr, [%[a], r3]\n\t"
        /* Pre-v4 path: lo*lo and hi*hi are added directly; the cross product
         * lo*hi is shifted by 17 rather than 16, which doubles it. */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r9, lr, #16\n\t"
        "lsr	r10, lr, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mov	r11, r9\n\t"
        "mul	r9, r11, r9\n\t"
        "mov	r11, r10\n\t"
        "mul	r10, r11, r10\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
        "lsr	r10, lr, #16\n\t"
        "lsl	r9, lr, #16\n\t"
        "lsr	r9, r9, #16\n\t"
        "mul	r9, r10, r9\n\t"
        "lsr	r10, r9, #15\n\t"
        "lsl	r9, r9, #17\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#else
        "umull	r9, r10, lr, lr\n\t"
        "adds	r6, r6, r9\n\t"
        "adcs	r7, r7, r10\n\t"
        "adc	r8, r8, #0\n\t"
#endif
        "\n"
    "L_sp_3072_sqr_48_op_done_%=: \n\t"
        /* Step to the next pair. The column is finished when r3 reaches
         * 0xc0 (one past the last word of a), when r3 has moved above r4
         * (the remaining pairs are mirrors of ones already added twice), or
         * when r3 exceeds r5. */
        "add	r3, r3, #4\n\t"
        "sub	r4, r4, #4\n\t"
        "cmp	r3, #0xc0\n\t"
        "beq	L_sp_3072_sqr_48_inner_done_%=\n\t"
        "cmp	r3, r4\n\t"
        "bgt	L_sp_3072_sqr_48_inner_done_%=\n\t"
        "cmp	r3, r5\n\t"
        "ble	L_sp_3072_sqr_48_inner_%=\n\t"
        "\n"
    "L_sp_3072_sqr_48_inner_done_%=: \n\t"
        /* Store the finished column word in the stack area and shift the
         * accumulator down by one word for the next column. */
        "str	r6, [sp, r5]\n\t"
        "mov	r6, r7\n\t"
        "mov	r7, r8\n\t"
        "mov	r8, #0\n\t"
        "add	r5, r5, #4\n\t"
        "cmp	r5, #0x178\n\t"
        "ble	L_sp_3072_sqr_48_outer_%=\n\t"
        /* r5 is now 0x17c: store the final (top) result word. */
        "str	r6, [sp, r5]\n\t"
        "\n"
    "L_sp_3072_sqr_48_store_%=: \n\t"
        /* Copy the stack area to r four words at a time. r5 starts at 0x17c
         * and drops by 16 per pass, giving 24 passes (0x180 bytes), so the
         * post-incrementing ldm also returns sp to its value on entry. */
        "ldm	sp!, {r6, r7, r8, r9}\n\t"
        "stm	%[r]!, {r6, r7, r8, r9}\n\t"
        "subs	r5, r5, #16\n\t"
        "bgt	L_sp_3072_sqr_48_store_%=\n\t"
        : [r] "+r" (r), [a] "+r" (a)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "lr", "r11", "r12"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */

/* Calculate the bottom digit of -1/a mod 2^n.
 *
 * Only the lowest word of a is used. The inverse of that word is found by
 * starting from a value correct to 4 bits and refining it, then negating.
 *
 * a    A single precision number.
 * rho  Bottom word of inverse.
 */
static void sp_3072_mont_setup(const sp_digit* a, sp_digit* rho)
{
    const sp_digit low = a[0];
    sp_digit inv;
    int step;

    /* Seed value: inv*low == 1 mod 2**4. */
    inv = low + (((low + 2) & 4) << 1);

    /* Each pass doubles the number of correct low bits:
     * 2**4 -> 2**8 -> 2**16 -> 2**32. */
    for (step = 0; step < 3; step++) {
        inv *= 2 - low * inv;
    }

    /* Negate the inverse to give -1/a[0]. */
    *rho = (sp_digit)0 - inv;
}

#ifdef WOLFSSL_SP_SMALL
/* Mul a by digit b into r. (r = a * b)
 *
 * Looping (WOLFSSL_SP_SMALL) variant. The first word is handled before the
 * loop; the loop then multiplies one word of a by b per pass, adds it into a
 * running three word accumulator, stores the low word and shifts the
 * accumulator down.
 *
 * r  A single precision integer. 97 words are written (byte offsets
 *    0..0x180).
 * a  A single precision integer. 96 words are read (byte offsets 0..0x17c).
 * b  A single precision digit.
 */
static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a, sp_digit b)
{
    __asm__ __volatile__ (
        /* r10 is cleared here but is not read by any instruction below. */
        "mov	r10, #0\n\t"
        /* A[0] * B */
        "ldr	r8, [%[a]]\n\t"
        /* Product goes to r5 (low word) and r3 (high word). When
         * WOLFSSL_SP_ARM_ARCH < 4 it is assembled from 16x16 partial products
         * using mul; otherwise umull is used. */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r5, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r5, r5, #16\n\t"
        "mul	r5, r6, r5\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r3, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "mul	r7, r6, r7\n\t"
        "add	r3, r3, r7\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adc	r3, r3, r7\n\t"
#else
        "umull	r5, r3, %[b], r8\n\t"
#endif
        /* Store r[0]; r3:r4:r5 becomes the running accumulator (r3 already
         * holds the high word of the first product) and r9 the byte offset of
         * the next word. */
        "mov	r4, #0\n\t"
        "str	r5, [%[r]]\n\t"
        "mov	r5, #0\n\t"
        "mov	r9, #4\n\t"
        "\n"
    "L_sp_3072_mul_d_96_word_%=: \n\t"
        /* A[i] * B */
        "ldr	r8, [%[a], r9]\n\t"
        /* Add the product of this word and b into r3:r4:r5. */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        /* Store the finished word at offset r9 and shift the accumulator
         * down by one word. */
        "str	r3, [%[r], r9]\n\t"
        "mov	r3, r4\n\t"
        "mov	r4, r5\n\t"
        "mov	r5, #0\n\t"
        "add	r9, r9, #4\n\t"
        "cmp	r9, #0x180\n\t"
        "blt	L_sp_3072_mul_d_96_word_%=\n\t"
        /* Remaining accumulator word is the top word of the result, r[96]. */
        "str	r3, [%[r], #384]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
    );
}

#else
/* Mul a by digit b into r. (r = a * b)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision digit.
 */
static void sp_3072_mul_d_96(sp_digit* r, const sp_digit* a, sp_digit b)
{
    __asm__ __volatile__ (
        "mov	r10, #0\n\t"
        /* A[0] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r3, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r3, r3, #16\n\t"
        "mul	r3, r6, r3\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r4, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "mul	r7, r6, r7\n\t"
        "add	r4, r4, r7\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adc	r4, r4, r7\n\t"
#else
        "umull	r3, r4, %[b], r8\n\t"
#endif
        "mov	r5, #0\n\t"
        "str	r3, [%[r]], #4\n\t"
        /* A[1] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[2] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[3] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[4] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[5] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[6] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[7] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[8] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[9] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[10] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[11] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[12] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[13] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[14] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[15] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[16] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[17] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[18] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[19] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[20] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[21] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[22] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[23] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[24] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[25] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[26] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[27] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[28] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[29] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[30] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[31] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[32] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[33] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[34] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[35] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[36] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[37] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[38] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[39] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[40] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[41] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[42] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[43] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[44] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[45] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[46] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[47] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[48] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[49] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[50] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[51] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[52] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[53] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[54] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[55] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[56] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[57] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[58] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[59] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[60] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[61] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[62] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[63] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[64] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[65] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[66] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[67] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[68] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[69] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[70] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[71] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[72] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[73] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[74] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[75] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[76] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[77] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[78] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[79] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[80] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[81] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[82] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[83] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[84] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[85] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[86] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[87] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[88] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[89] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[90] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[91] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[92] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adcs	r3, r3, #0\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "adc	r4, r4, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adcs	r3, r3, r7\n\t"
        "mov	r4, #0\n\t"
        "adc	r4, r4, #0\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        /* A[93] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r3, r3, r7\n\t"
        "adcs	r4, r4, #0\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "adc	r5, r5, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r3, r3, r6\n\t"
        "adcs	r4, r4, r7\n\t"
        "mov	r5, #0\n\t"
        "adc	r5, r5, #0\n\t"
#endif
        "str	r3, [%[r]], #4\n\t"
        /* A[94] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r4, r4, r7\n\t"
        "adcs	r5, r5, #0\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r4, r4, r6\n\t"
        "adcs	r5, r5, r7\n\t"
        "mov	r3, #0\n\t"
        "adc	r3, r3, #0\n\t"
#endif
        "str	r4, [%[r]], #4\n\t"
        /* A[95] * B */
        "ldr	r8, [%[a]], #4\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsl	r6, %[b], #16\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r5, r5, r7\n\t"
        "adc	r3, r3, #0\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adc	r3, r3, r7\n\t"
        "lsr	r6, %[b], #16\n\t"
        "lsr	r7, r8, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "add	r3, r3, r7\n\t"
        "lsl	r7, r8, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r5, r5, r6\n\t"
        "adc	r3, r3, r7\n\t"
#else
        "umull	r6, r7, %[b], r8\n\t"
        "adds	r5, r5, r6\n\t"
        "adc	r3, r3, r7\n\t"
#endif
        "str	r5, [%[r]], #4\n\t"
        "str	r3, [%[r]]\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b)
        :
        : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r10"
    );
}

#endif /* WOLFSSL_SP_SMALL */
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* Compute the Montgomery normalizer for a 3072-bit modulus: r = 2^3072 mod m.
 *
 * The value is obtained with one subtraction from zero rather than a real
 * reduction, which is why m is required to be a full 3072-bit modulus.
 *
 * r  Output: single precision number (48 words) receiving the normalizer.
 * m  Input: single precision number (48 words) holding the modulus.
 */
static void sp_3072_mont_norm_48(sp_digit* r, const sp_digit* m)
{
    /* Start with every one of the 48 result words cleared. */
    XMEMSET(r, 0, 48 * sizeof(*r));

    /* r -= m. NOTE(review): presumably the helper wraps modulo 2^3072, so
     * 0 - m leaves 2^3072 - m in r; its result is deliberately discarded. */
    (void)sp_3072_sub_in_place_48(r, m);
}

#ifdef WOLFSSL_SP_SMALL
/* Conditionally subtract b from a using the mask m.
 * m is -1 (all bits set) to subtract and 0 to leave a unchanged.
 *
 * Every word of b is ANDed with m before being subtracted, so with m == 0
 * the routine stores a - 0 (a copy of a) into r. The loop count is a fixed
 * 48 words and no branch depends on a, b or m.
 *
 * r  A single precision number representing the conditional subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 *
 * returns 0 when the subtraction did not borrow out of the top word and
 * all bits set (-1) when it did.
 */
static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
{
    __asm__ __volatile__ (
        /* r6 = constant zero, r12 = saved borrow (0 or -1), lr = byte offset */
        "mov	r6, #0\n\t"
        "mov	r12, #0\n\t"
        "mov	lr, #0\n\t"
        "\n"
    "L_sp_3072_cond_sub_48_words_%=: \n\t"
        /* The cmp at the bottom of the loop overwrites the flags, so rebuild
         * the carry from the saved borrow: 0 - r12 leaves C set when r12 is 0
         * (no borrow) and C clear when r12 is -1 (borrow pending). */
        "subs	r12, r6, r12\n\t"
        "ldr	r4, [%[a], lr]\n\t"
        "ldr	r5, [%[b], lr]\n\t"
        /* Mask the subtrahend: unchanged for m == -1, zero for m == 0. */
        "and	r5, r5, %[m]\n\t"
        "sbcs	r4, r4, r5\n\t"
        /* Save the new borrow: 0 - 0 - !C is 0 (no borrow) or -1 (borrow). */
        "sbc	r12, r6, r6\n\t"
        "str	r4, [%[r], lr]\n\t"
        "add	lr, lr, #4\n\t"
        /* 0xc0 = 48 words * 4 bytes */
        "cmp	lr, #0xc0\n\t"
        "blt	L_sp_3072_cond_sub_48_words_%=\n\t"
        /* Hand the final borrow mask back through the r operand. */
        "mov	%[r], r12\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        /* "cc": subs/sbcs/cmp modify the condition flags. */
        : "memory", "r12", "lr", "r4", "r5", "r6", "cc"
    );
    /* r no longer holds a pointer here; it carries the borrow mask. */
    return (uint32_t)(size_t)r;
}

#else
/* Conditionally subtract b from a using the mask m.
 * m is -1 (all bits set) to subtract and 0 to leave a unchanged.
 *
 * Fully unrolled: 24 groups, each handling two 32-bit words. Every word of b
 * is ANDed with m before being subtracted, so with m == 0 the routine stores
 * a - 0 (a copy of a) into r. The borrow travels from group to group in the
 * carry flag; only the first group uses subs, all later ones use sbcs, and
 * none of the ldm/and/stm instructions in between modify the flags.
 *
 * r  A single precision number representing the conditional subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 *
 * returns 0 when the subtraction did not borrow out of the top word and
 * all bits set (-1) when it did.
 */
static sp_digit sp_3072_cond_sub_48(sp_digit* r, const sp_digit* a, const sp_digit* b, sp_digit m)
{
    __asm__ __volatile__ (
        /* lr = constant zero, used only to materialize the final borrow. */
        "mov	lr, #0\n\t"
        /* Words 0-1: start of the borrow chain (subs, no incoming borrow).
         * The "!" write-back forms advance a, b and r past each pair. */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "subs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 2-3 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 4-5 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 6-7 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 8-9 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 10-11 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 12-13 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 14-15 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 16-17 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 18-19 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 20-21 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 22-23 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 24-25 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 26-27 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 28-29 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 30-31 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 32-33 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 34-35 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 36-37 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 38-39 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 40-41 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 42-43 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 44-45 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Words 46-47 */
        "ldm	%[a]!, {r4, r5}\n\t"
        "ldm	%[b]!, {r6, r7}\n\t"
        "and	r6, r6, %[m]\n\t"
        "and	r7, r7, %[m]\n\t"
        "sbcs	r4, r4, r6\n\t"
        "sbcs	r5, r5, r7\n\t"
        "stm	%[r]!, {r4, r5}\n\t"
        /* Final borrow as a mask: 0 - 0 - !C is 0 (no borrow) or -1 (borrow).
         * It is handed back through the r operand. */
        "sbc	%[r], lr, lr\n\t"
        : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), [m] "+r" (m)
        :
        /* "cc": the subs/sbcs chain modifies the condition flags. */
        : "memory", "r12", "lr", "r4", "r5", "r6", "r7", "cc"
    );
    /* r no longer holds a pointer here; it carries the borrow mask. */
    return (uint32_t)(size_t)r;
}

#endif /* WOLFSSL_SP_SMALL */
/* Reduce the number back to 3072 bits using Montgomery reduction.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static SP_NOINLINE void sp_3072_mont_reduce_48(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    __asm__ __volatile__ (
#if !(defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4))
        "ldr	r11, [%[m]]\n\t"
#endif
        /* i = 0 */
        "mov	r9, #0\n\t"
        "mov	r3, #0\n\t"
        "ldr	r12, [%[a]]\n\t"
        "ldr	lr, [%[a], #4]\n\t"
        "\n"
    "L_sp_3072_mont_reduce_48_word_%=: \n\t"
        /* mu = a[i] * mp */
        "mul	r8, %[mp], r12\n\t"
        /* a[i+0] += m[0] * mu */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "ldr	r11, [%[m]]\n\t"
#endif
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r7, r11, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r7\n\t"
        "lsl	r7, r11, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r12, r12, r6\n\t"
        "adc	r5, r5, r7\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r7, r11, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r7, r7, #16\n\t"
        "mul	r7, r6, r7\n\t"
        "adds	r12, r12, r7\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r7, r11, #16\n\t"
        "mul	r6, r7, r6\n\t"
        "lsr	r7, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r12, r12, r6\n\t"
        "adc	r5, r5, r7\n\t"
#else
        "umull	r6, r7, r8, r11\n\t"
        "adds	r12, r12, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        /* a[i+1] += m[1] * mu */
        "ldr	r7, [%[m], #4]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r10, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r10\n\t"
        "lsl	r10, r7, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r6, r10, r6\n\t"
        "lsr	r10, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r4, r4, r10\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r10, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r10, r6, r10\n\t"
        "adds	lr, lr, r10\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r10, r7, #16\n\t"
        "mul	r6, r10, r6\n\t"
        "lsr	r10, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r4, r4, r10\n\t"
#else
        "umull	r6, r10, r8, r7\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r4, r10, #0\n\t"
#endif
        "mov	r12, lr\n\t"
        "adds	r12, r12, r5\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+2] += m[2] * mu */
        "ldr	r7, [%[m], #8]\n\t"
        "ldr	lr, [%[a], #8]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r10, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r10\n\t"
        "lsl	r10, r7, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r6, r10, r6\n\t"
        "lsr	r10, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r5, r5, r10\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r10, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r10, r10, #16\n\t"
        "mul	r10, r6, r10\n\t"
        "adds	lr, lr, r10\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r10, r7, #16\n\t"
        "mul	r6, r10, r6\n\t"
        "lsr	r10, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r5, r5, r10\n\t"
#else
        "umull	r6, r10, r8, r7\n\t"
        "adds	lr, lr, r6\n\t"
        "adc	r5, r10, #0\n\t"
#endif
        "adds	lr, lr, r4\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+3] += m[3] * mu */
        "ldr	r7, [%[m], #12]\n\t"
        "ldr	r10, [%[a], #12]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #12]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+4] += m[4] * mu */
        "ldr	r7, [%[m], #16]\n\t"
        "ldr	r10, [%[a], #16]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #16]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+5] += m[5] * mu */
        "ldr	r7, [%[m], #20]\n\t"
        "ldr	r10, [%[a], #20]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #20]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+6] += m[6] * mu */
        "ldr	r7, [%[m], #24]\n\t"
        "ldr	r10, [%[a], #24]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #24]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+7] += m[7] * mu */
        "ldr	r7, [%[m], #28]\n\t"
        "ldr	r10, [%[a], #28]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #28]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+8] += m[8] * mu */
        "ldr	r7, [%[m], #32]\n\t"
        "ldr	r10, [%[a], #32]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #32]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+9] += m[9] * mu */
        "ldr	r7, [%[m], #36]\n\t"
        "ldr	r10, [%[a], #36]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #36]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+10] += m[10] * mu */
        "ldr	r7, [%[m], #40]\n\t"
        "ldr	r10, [%[a], #40]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #40]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+11] += m[11] * mu */
        "ldr	r7, [%[m], #44]\n\t"
        "ldr	r10, [%[a], #44]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #44]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+12] += m[12] * mu */
        "ldr	r7, [%[m], #48]\n\t"
        "ldr	r10, [%[a], #48]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #48]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+13] += m[13] * mu */
        "ldr	r7, [%[m], #52]\n\t"
        "ldr	r10, [%[a], #52]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #52]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+14] += m[14] * mu */
        "ldr	r7, [%[m], #56]\n\t"
        "ldr	r10, [%[a], #56]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #56]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+15] += m[15] * mu */
        "ldr	r7, [%[m], #60]\n\t"
        "ldr	r10, [%[a], #60]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #60]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+16] += m[16] * mu */
        "ldr	r7, [%[m], #64]\n\t"
        "ldr	r10, [%[a], #64]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #64]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+17] += m[17] * mu */
        "ldr	r7, [%[m], #68]\n\t"
        "ldr	r10, [%[a], #68]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #68]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+18] += m[18] * mu */
        "ldr	r7, [%[m], #72]\n\t"
        "ldr	r10, [%[a], #72]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #72]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+19] += m[19] * mu */
        "ldr	r7, [%[m], #76]\n\t"
        "ldr	r10, [%[a], #76]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #76]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+20] += m[20] * mu */
        "ldr	r7, [%[m], #80]\n\t"
        "ldr	r10, [%[a], #80]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #80]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+21] += m[21] * mu */
        "ldr	r7, [%[m], #84]\n\t"
        "ldr	r10, [%[a], #84]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #84]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+22] += m[22] * mu */
        "ldr	r7, [%[m], #88]\n\t"
        "ldr	r10, [%[a], #88]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #88]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+23] += m[23] * mu */
        "ldr	r7, [%[m], #92]\n\t"
        "ldr	r10, [%[a], #92]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #92]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+24] += m[24] * mu */
        "ldr	r7, [%[m], #96]\n\t"
        "ldr	r10, [%[a], #96]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #96]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+25] += m[25] * mu */
        "ldr	r7, [%[m], #100]\n\t"
        "ldr	r10, [%[a], #100]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #100]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+26] += m[26] * mu */
        "ldr	r7, [%[m], #104]\n\t"
        "ldr	r10, [%[a], #104]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #104]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+27] += m[27] * mu */
        "ldr	r7, [%[m], #108]\n\t"
        "ldr	r10, [%[a], #108]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #108]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+28] += m[28] * mu */
        "ldr	r7, [%[m], #112]\n\t"
        "ldr	r10, [%[a], #112]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #112]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+29] += m[29] * mu */
        "ldr	r7, [%[m], #116]\n\t"
        "ldr	r10, [%[a], #116]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #116]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+30] += m[30] * mu */
        "ldr	r7, [%[m], #120]\n\t"
        "ldr	r10, [%[a], #120]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #120]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+31] += m[31] * mu */
        "ldr	r7, [%[m], #124]\n\t"
        "ldr	r10, [%[a], #124]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #124]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+32] += m[32] * mu */
        "ldr	r7, [%[m], #128]\n\t"
        "ldr	r10, [%[a], #128]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r7, #0\n\t"
#endif
        "adds	r10, r10, r4\n\t"
        "str	r10, [%[a], #128]\n\t"
        "adc	r5, r5, #0\n\t"
        /* a[i+33] += m[33] * mu */
        "ldr	r7, [%[m], #132]\n\t"
        "ldr	r10, [%[a], #132]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r4, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r4, r4, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r4, r11\n\t"
#else
        "umull	r6, r7, r8, r7\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r4, r7, #0\n\t"
#endif
        "adds	r10, r10, r5\n\t"
        "str	r10, [%[a], #132]\n\t"
        "adc	r4, r4, #0\n\t"
        /* a[i+34] += m[34] * mu */
        "ldr	r7, [%[m], #136]\n\t"
        "ldr	r10, [%[a], #136]\n\t"
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 4)
        "lsr	r11, r7, #16\n\t"
        "lsr	r6, r8, #16\n\t"
        "mul	r5, r6, r11\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
        "adc	r5, r5, r11\n\t"
        "lsl	r6, r8, #16\n\t"
        "lsl	r11, r7, #16\n\t"
        "lsr	r6, r6, #16\n\t"
        "lsr	r11, r11, #16\n\t"
        "mul	r11, r6, r11\n\t"
        "adds	r10, r10, r11\n\t"
        "adc	r5, r5, #0\n\t"
        "lsr	r11, r7, #16\n\t"
        "mul	r6, r11, r6\n\t"
        "lsr	r11, r6, #16\n\t"
        "lsl	r6, r6, #16\n\t"
        "adds	r10, r10, r6\n\t"
       