Support sm3 and md5 multi-buffer calculation using SVE instructions.
Weili Qian (4):
  uadk/wd_alg: check whether the platform supports SVE
  uadk/sched: fix async mode ctx id
  uadk: initialize ctx resources in SVE mode
  uadk/hash_mb: support multi-buffer calculation for sm3 and md5
 Makefile.am                   |  15 +-
 drv/hash_mb/hash_mb.c         | 860 ++++++++++++++++++++++++++++++++++
 drv/hash_mb/hash_mb.h         |  60 +++
 drv/hash_mb/md5_mb_asimd_x1.S | 248 ++++++++++
 drv/hash_mb/md5_mb_asimd_x4.S | 526 +++++++++++++++++++++
 drv/hash_mb/md5_mb_sve.S      | 158 +++++++
 drv/hash_mb/md5_sve_common.S  | 478 +++++++++++++++++++
 drv/hash_mb/sm3_mb_asimd_x1.S | 387 +++++++++++++++
 drv/hash_mb/sm3_mb_asimd_x4.S | 576 +++++++++++++++++++++++
 drv/hash_mb/sm3_mb_sve.S      | 161 +++++++
 drv/hash_mb/sm3_sve_common.S  | 505 ++++++++++++++++++++
 include/wd_alg_common.h       |   4 +
 wd_alg.c                      |  14 +
 wd_sched.c                    |   4 +-
 wd_util.c                     |  95 +++-
 15 files changed, 4074 insertions(+), 17 deletions(-)
 create mode 100644 drv/hash_mb/hash_mb.c
 create mode 100644 drv/hash_mb/hash_mb.h
 create mode 100644 drv/hash_mb/md5_mb_asimd_x1.S
 create mode 100644 drv/hash_mb/md5_mb_asimd_x4.S
 create mode 100644 drv/hash_mb/md5_mb_sve.S
 create mode 100644 drv/hash_mb/md5_sve_common.S
 create mode 100644 drv/hash_mb/sm3_mb_asimd_x1.S
 create mode 100644 drv/hash_mb/sm3_mb_asimd_x4.S
 create mode 100644 drv/hash_mb/sm3_mb_sve.S
 create mode 100644 drv/hash_mb/sm3_sve_common.S
If an algorithm driver uses SVE instructions, check whether the platform supports SVE before registering the driver. If the platform does not support SVE, do not register the algorithm.
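For reference, the check added below relies on the Linux hwcaps exported through the auxiliary vector. A minimal standalone sketch of the same probe (the main() wrapper is illustrative only, not part of this patch):

#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
#if defined(__aarch64__)
	/* The kernel sets HWCAP_SVE when SVE is usable from userspace */
	if (getauxval(AT_HWCAP) & HWCAP_SVE) {
		printf("SVE supported, SVE drivers can be registered\n");
		return 0;
	}
#endif
	printf("SVE not supported, skip SVE driver registration\n");
	return 0;
}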
Signed-off-by: Weili Qian <qianweili@huawei.com>
---
 wd_alg.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
diff --git a/wd_alg.c b/wd_alg.c
index f34a407..de352d7 100644
--- a/wd_alg.c
+++ b/wd_alg.c
@@ -109,6 +109,19 @@ static bool wd_check_ce_support(const char *dev_name)
 	return false;
 }
 
+static bool wd_check_sve_support(void)
+{
+	unsigned long hwcaps = 0;
+
+#if defined(__aarch64__)
+	hwcaps = getauxval(AT_HWCAP);
+#endif
+	if (hwcaps & HWCAP_SVE)
+		return true;
+
+	return false;
+}
+
 static bool wd_alg_check_available(int calc_type, const char *dev_name)
 {
 	bool ret = false;
@@ -122,6 +135,7 @@ static bool wd_alg_check_available(int calc_type, const char *dev_name)
 		break;
 	/* Should find the CPU if not support SVE */
 	case UADK_ALG_SVE_INSTR:
+		ret = wd_check_sve_support();
 		break;
 	/* Check if the current driver has device support */
 	case UADK_ALG_HW:
In the single scheduler scenario, ctx 1 is the asynchronous ctx, but sched_single_poll_policy() polls ctx 0. As a result, packets fail to be received. Change the ctx id to 1.
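The single scheduler lays out its soft ctxs with slot 0 for synchronous and slot 1 for asynchronous requests (see the SVE ctx init in the next patch), so the poll policy must target slot 1. A reduced sketch of that convention (simplified types, not the real uadk structures):

#include <stdio.h>

enum ctx_mode { MODE_SYNC = 0, MODE_ASYNC = 1 };

struct soft_ctx_slot {
	enum ctx_mode mode;
};

int main(void)
{
	/* slot 0 serves sync requests, slot 1 serves async requests */
	struct soft_ctx_slot slots[2] = { { MODE_SYNC }, { MODE_ASYNC } };
	int poll_idx = 1;

	/* Polling slot 0 would never see async completions, which is
	 * exactly the failure this patch fixes. */
	printf("polling ctx %d (%s)\n", poll_idx,
	       slots[poll_idx].mode == MODE_ASYNC ? "async" : "sync");
	return 0;
}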
Signed-off-by: Weili Qian <qianweili@huawei.com>
---
 wd_sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/wd_sched.c b/wd_sched.c
index b43834d..6766872 100644
--- a/wd_sched.c
+++ b/wd_sched.c
@@ -428,9 +428,9 @@ static int sched_single_poll_policy(handle_t h_sched_ctx,
 	}
 
 	while (loop_times > 0) {
-		/* Default async mode use ctx 0 */
+		/* Default async mode use ctx 1 */
 		loop_times--;
-		ret = sched_ctx->poll_func(0, 1, &poll_num);
+		ret = sched_ctx->poll_func(1, 1, &poll_num);
 		if ((ret < 0) && (ret != -EAGAIN))
 			return ret;
 		else if (ret == -EAGAIN)
Initialize ctx resources in SVE mode. In addition, when the driver is released, the config resources need to be released in all modes, not only in UADK_ALG_HW mode.
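The init path below allocates one sync and one async soft ctx and unwinds on failure; the uninit path frees the same pair. A reduced sketch of the pattern (simplified types, not the exact uadk structures):

#include <stdlib.h>

struct soft_ctx {
	void *priv;
};

struct ctx_slot {
	struct soft_ctx *ctx;
	int ctx_mode; /* 0 = sync, 1 = async */
};

/* Allocate the sync/async pair; on any failure, free what was
 * already allocated so the caller never sees a half-built table. */
static int sve_ctx_pair_init(struct ctx_slot slots[2])
{
	int i;

	for (i = 0; i < 2; i++) {
		slots[i].ctx = calloc(1, sizeof(struct soft_ctx));
		if (!slots[i].ctx) {
			while (i-- > 0)
				free(slots[i].ctx);
			return -1;
		}
		slots[i].ctx_mode = i;
	}

	return 0;
}

/* Release must be symmetric no matter which mode created the ctxs */
static void sve_ctx_pair_uninit(struct ctx_slot slots[2])
{
	free(slots[1].ctx);
	free(slots[0].ctx);
}

int main(void)
{
	struct ctx_slot slots[2];

	if (sve_ctx_pair_init(slots))
		return 1;
	sve_ctx_pair_uninit(slots);
	return 0;
}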
Signed-off-by: Weili Qian <qianweili@huawei.com>
---
 include/wd_alg_common.h |  4 ++
 wd_util.c               | 95 +++++++++++++++++++++++++++++++++++------
 2 files changed, 85 insertions(+), 14 deletions(-)
diff --git a/include/wd_alg_common.h b/include/wd_alg_common.h
index 32b8630..5fee085 100644
--- a/include/wd_alg_common.h
+++ b/include/wd_alg_common.h
@@ -127,6 +127,10 @@ struct wd_ctx_params {
 	struct wd_cap_config *cap;
 };
 
+struct wd_soft_ctx {
+	void *priv;
+};
+
 struct wd_ctx_internal {
 	handle_t ctx;
 	__u8 op_type;
diff --git a/wd_util.c b/wd_util.c
index fb58167..1e2b190 100644
--- a/wd_util.c
+++ b/wd_util.c
@@ -28,6 +28,10 @@
 #define US2S(us) ((us) >> 20)
 #define WD_INIT_RETRY_TIMEOUT 3
 
+#define WD_SOFT_CTX_NUM 2
+#define WD_SOFT_SYNC_CTX 0
+#define WD_SOFT_ASYNC_CTX 1
+
 #define WD_DRV_LIB_DIR "uadk"
 
 struct msg_pool {
@@ -1968,8 +1972,7 @@ void wd_alg_uninit_driver(struct wd_ctx_config_internal *config,
 
 	driver->exit(driver);
 	/* Ctx config just need clear once */
-	if (driver->calc_type == UADK_ALG_HW)
-		wd_clear_ctx_config(config);
+	wd_clear_ctx_config(config);
 
 	if (driver->fallback)
 		wd_alg_uninit_fallback((struct wd_alg_driver *)driver->fallback);
@@ -2660,6 +2663,47 @@ static void wd_alg_ctx_uninit(struct wd_ctx_config *ctx_config)
 	free(ctx_config->ctxs);
 }
 
+static int wd_alg_init_sve_ctx(struct wd_ctx_config *ctx_config)
+{
+	struct wd_soft_ctx *ctx_sync, *ctx_async;
+
+	ctx_config->ctx_num = WD_SOFT_CTX_NUM;
+	ctx_config->ctxs = calloc(ctx_config->ctx_num, sizeof(struct wd_ctx));
+	if (!ctx_config->ctxs)
+		return -WD_ENOMEM;
+
+	ctx_sync = calloc(1, sizeof(struct wd_soft_ctx));
+	if (!ctx_sync)
+		goto free_ctxs;
+
+	ctx_config->ctxs[WD_SOFT_SYNC_CTX].op_type = 0;
+	ctx_config->ctxs[WD_SOFT_SYNC_CTX].ctx_mode = CTX_MODE_SYNC;
+	ctx_config->ctxs[WD_SOFT_SYNC_CTX].ctx = (handle_t)ctx_sync;
+
+	ctx_async = calloc(1, sizeof(struct wd_soft_ctx));
+	if (!ctx_async)
+		goto free_ctx_sync;
+
+	ctx_config->ctxs[WD_SOFT_ASYNC_CTX].op_type = 0;
+	ctx_config->ctxs[WD_SOFT_ASYNC_CTX].ctx_mode = CTX_MODE_ASYNC;
+	ctx_config->ctxs[WD_SOFT_ASYNC_CTX].ctx = (handle_t)ctx_async;
+
+	return 0;
+
+free_ctx_sync:
+	free(ctx_sync);
+free_ctxs:
+	free(ctx_config->ctxs);
+	return -WD_ENOMEM;
+}
+
+static void wd_alg_uninit_sve_ctx(struct wd_ctx_config *ctx_config)
+{
+	free((struct wd_soft_ctx *)ctx_config->ctxs[WD_SOFT_ASYNC_CTX].ctx);
+	free((struct wd_soft_ctx *)ctx_config->ctxs[WD_SOFT_SYNC_CTX].ctx);
+	free(ctx_config->ctxs);
+}
+
 int wd_alg_attrs_init(struct wd_init_attrs *attrs)
 {
 	wd_alg_poll_ctx alg_poll_func = attrs->alg_poll_ctx;
@@ -2717,9 +2761,23 @@ int wd_alg_attrs_init(struct wd_init_attrs *attrs)
 	}
 	attrs->sched = alg_sched;
 
-	ret = wd_sched_rr_instance(alg_sched, NULL);
+	ctx_config = calloc(1, sizeof(*ctx_config));
+	if (!ctx_config) {
+		WD_ERR("fail to alloc ctx config\n");
+		goto out_freesched;
+	}
+	attrs->ctx_config = ctx_config;
+
+	ret = wd_alg_init_sve_ctx(ctx_config);
 	if (ret) {
-		WD_ERR("fail to instance scheduler\n");
+		WD_ERR("fail to init sve ctx!\n");
+		goto out_freesched;
+	}
+
+	ctx_config->cap = attrs->ctx_params->cap;
+	ret = alg_init_func(ctx_config, alg_sched);
+	if (ret) {
+		wd_alg_uninit_sve_ctx(ctx_config);
 		goto out_freesched;
 	}
 	break;
@@ -2780,17 +2838,26 @@ void wd_alg_attrs_uninit(struct wd_init_attrs *attrs)
 	struct wd_sched *alg_sched = attrs->sched;
 	int driver_type = attrs->driver->calc_type;
 
-	if (driver_type == UADK_ALG_CE_INSTR || driver_type == UADK_ALG_SOFT) {
-		if (ctx_config) {
-			wd_alg_ce_ctx_uninit(ctx_config);
-			free(ctx_config);
-		}
-	} else {
-		if (ctx_config) {
-			wd_alg_ctx_uninit(ctx_config);
-			free(ctx_config);
-		}
+	if (!ctx_config) {
+		wd_sched_rr_release(alg_sched);
+		return;
+	}
+
+	switch (driver_type) {
+	case UADK_ALG_SOFT:
+	case UADK_ALG_CE_INSTR:
+		wd_alg_ce_ctx_uninit(ctx_config);
+		break;
+	case UADK_ALG_SVE_INSTR:
+		wd_alg_uninit_sve_ctx(ctx_config);
+		break;
+	case UADK_ALG_HW:
+		wd_alg_ctx_uninit(ctx_config);
+		break;
+	default:
+		break;
 	}
 
+	free(ctx_config);
 	wd_sched_rr_release(alg_sched);
 }
Support sm3 and md5 multi-buffer calculation using SVE instructions. If the platform supports SVE instructions, users can choose SVE instructions to perform sm3 and md5 calculations.
The assembly implementation is from isa-l_crypto: https://github.com/intel/isa-l_crypto.git
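For users, the multi-buffer path is reached through the ordinary wd_digest interface once the SVE-capable driver is registered. A usage sketch, assuming the v2 init2 API with the instruction task type and the single scheduler (enum names such as TASK_INSTR and SCHED_POLICY_SINGLE are assumptions and may differ in your tree):

#include <stdio.h>
#include <string.h>
#include "wd_digest.h"

int main(void)
{
	struct wd_digest_sess_setup setup = {
		.alg = WD_DIGEST_SM3,
		.mode = WD_DIGEST_NORMAL,
	};
	struct wd_digest_req req = {0};
	__u8 out[32] = {0};
	char in[] = "abc";
	handle_t sess;
	int ret;

	/* Ask for instruction-based drivers; hash_mb is picked on
	 * SVE-capable platforms. */
	ret = wd_digest_init2("sm3", SCHED_POLICY_SINGLE, TASK_INSTR);
	if (ret)
		return ret;

	sess = wd_digest_alloc_sess(&setup);
	if (!sess) {
		wd_digest_uninit2();
		return -1;
	}

	req.in = (void *)in;
	req.in_bytes = strlen(in);
	req.out = out;
	req.out_bytes = sizeof(out);
	req.data_fmt = WD_FLAT_BUF; /* sgl is not supported by hash_mb */

	ret = wd_do_digest_sync(sess, &req);
	if (!ret)
		printf("sm3(\"abc\") first byte: 0x%02x\n", out[0]);

	wd_digest_free_sess(sess);
	wd_digest_uninit2();
	return ret;
}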
Signed-off-by: Weili Qian <qianweili@huawei.com>
---
 Makefile.am                   |  15 +-
 drv/hash_mb/hash_mb.c         | 860 ++++++++++++++++++++++++++++++++++
 drv/hash_mb/hash_mb.h         |  60 +++
 drv/hash_mb/md5_mb_asimd_x1.S | 248 ++++++++++
 drv/hash_mb/md5_mb_asimd_x4.S | 526 +++++++++++++++++++++
 drv/hash_mb/md5_mb_sve.S      | 158 +++++++
 drv/hash_mb/md5_sve_common.S  | 478 +++++++++++++++++++
 drv/hash_mb/sm3_mb_asimd_x1.S | 387 +++++++++++++++
 drv/hash_mb/sm3_mb_asimd_x4.S | 576 +++++++++++++++++++++++
 drv/hash_mb/sm3_mb_sve.S      | 161 +++++++
 drv/hash_mb/sm3_sve_common.S  | 505 ++++++++++++++++++++
 11 files changed, 3973 insertions(+), 1 deletion(-)
 create mode 100644 drv/hash_mb/hash_mb.c
 create mode 100644 drv/hash_mb/hash_mb.h
 create mode 100644 drv/hash_mb/md5_mb_asimd_x1.S
 create mode 100644 drv/hash_mb/md5_mb_asimd_x4.S
 create mode 100644 drv/hash_mb/md5_mb_sve.S
 create mode 100644 drv/hash_mb/md5_sve_common.S
 create mode 100644 drv/hash_mb/sm3_mb_asimd_x1.S
 create mode 100644 drv/hash_mb/sm3_mb_asimd_x4.S
 create mode 100644 drv/hash_mb/sm3_mb_sve.S
 create mode 100644 drv/hash_mb/sm3_sve_common.S
diff --git a/Makefile.am b/Makefile.am
index cd3d7e5..27eb785 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -45,7 +45,7 @@ lib_LTLIBRARIES=libwd.la libwd_comp.la libwd_crypto.la
 
 uadk_driversdir=$(libdir)/uadk
 uadk_drivers_LTLIBRARIES=libhisi_sec.la libhisi_hpre.la libhisi_zip.la \
-		libisa_ce.la
+		libisa_ce.la libisa_sve.la
 
 libwd_la_SOURCES=wd.c wd_mempool.c wd.h wd_alg.c wd_alg.h \
 		v1/wd.c v1/wd.h v1/wd_adapter.c v1/wd_adapter.h \
@@ -93,6 +93,12 @@ libhisi_hpre_la_SOURCES=drv/hisi_hpre.c drv/hisi_qm_udrv.c \
 libisa_ce_la_SOURCES=drv/isa_ce_sm3.c drv/isa_ce_sm3_armv8.S arm_arch_ce.h \
 		drv/isa_ce_sm3.h
 
+libisa_sve_la_SOURCES=drv/hash_mb/hash_mb.c wd_digest_drv.h drv/hash_mb/hash_mb.h \
+		drv/hash_mb/sm3_sve_common.S drv/hash_mb/sm3_mb_asimd_x1.S \
+		drv/hash_mb/sm3_mb_asimd_x4.S drv/hash_mb/sm3_mb_sve.S \
+		drv/hash_mb/md5_sve_common.S drv/hash_mb/md5_mb_asimd_x1.S \
+		drv/hash_mb/md5_mb_asimd_x4.S drv/hash_mb/md5_mb_sve.S
+
 if WD_STATIC_DRV
 AM_CFLAGS += -DWD_STATIC_DRV -fPIC
 AM_CFLAGS += -DWD_NO_LOG
@@ -116,6 +122,9 @@ libhisi_hpre_la_DEPENDENCIES = libwd.la libwd_crypto.la
 libisa_ce_la_LIBADD = $(libwd_la_OBJECTS) $(libwd_crypto_la_OBJECTS)
 libisa_ce_la_DEPENDENCIES = libwd.la libwd_crypto.la
 
+libisa_sve_la_LIBADD = $(libwd_la_OBJECTS) $(libwd_crypto_la_OBJECTS)
+libisa_sve_la_DEPENDENCIES = libwd.la libwd_crypto.la
+
 else
 UADK_WD_SYMBOL= -Wl,--version-script,$(top_srcdir)/libwd.map
 UADK_CRYPTO_SYMBOL= -Wl,--version-script,$(top_srcdir)/libwd_crypto.map
@@ -148,6 +157,10 @@ libhisi_hpre_la_DEPENDENCIES= libwd.la libwd_crypto.la
 libisa_ce_la_LIBADD= -lwd -lwd_crypto
 libisa_ce_la_LDFLAGS=$(UADK_VERSION)
 libisa_ce_la_DEPENDENCIES= libwd.la libwd_crypto.la
+
+libisa_sve_la_LIBADD= -lwd -lwd_crypto
+libisa_sve_la_LDFLAGS=$(UADK_VERSION)
+libisa_sve_la_DEPENDENCIES= libwd.la libwd_crypto.la
 endif	# WD_STATIC_DRV
pkgconfigdir = $(libdir)/pkgconfig diff --git a/drv/hash_mb/hash_mb.c b/drv/hash_mb/hash_mb.c new file mode 100644 index 0000000..463e983 --- /dev/null +++ b/drv/hash_mb/hash_mb.c @@ -0,0 +1,860 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright 2023 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ +/* + * Copyright 2023 Huawei Technologies Co.,Ltd. All rights reserved. + */ + +#include <sys/auxv.h> +#include <pthread.h> +#include <stdlib.h> +#include <string.h> +#include "hash_mb.h" + +#define MIN(a, b) (((a) > (b)) ? (b) : (a)) +#define IPAD_VALUE 0x36 +#define OPAD_VALUE 0x5C +#define HASH_KEY_LEN 64 +#define HASH_BLOCK_OFFSET 6 +#define HASH_BLOCK_SIZE 64 +#define HASH_PADLENGTHFIELD_SIZE 56 +#define HASH_PADDING_SIZE 120 +#define HASH_HIGH_32BITS 32 +#define HASH_PADDING_BLOCKS 2 +#define HASH_NENO_PROCESS_JOBS 4 +#define HASH_TRY_PROCESS_COUNT 16 +#define BYTES_TO_BITS_OFFSET 3 + +#define MD5_DIGEST_DATA_SIZE 16 +#define SM3_DIGEST_DATA_SIZE 32 +#define HASH_MAX_LANES 32 +#define SM3_MAX_LANES 16 + +#define PUTU32(p, V) \ + ((p)[0] = (uint8_t)((V) >> 24), \ + (p)[1] = (uint8_t)((V) >> 16), \ + (p)[2] = (uint8_t)((V) >> 8), \ + (p)[3] = (uint8_t)(V)) + +struct hash_mb_ops { + int (*max_lanes)(void); + void (*asimd_x4)(struct hash_job *job1, struct hash_job *job2, + struct hash_job *job3, struct hash_job *job4, int len); + void (*asimd_x1)(struct hash_job *job, int len); + void (*sve)(int blocks, int total_lanes, struct hash_job **job_vec); + __u8 *iv_data; + int iv_bytes; + int max_jobs; +}; + +struct hash_mb_poll_queue { + struct hash_job *head; + struct hash_job *tail; + pthread_spinlock_t s_lock; + const struct hash_mb_ops *ops; + __u32 job_num; +}; + +struct hash_mb_queue { + struct hash_mb_poll_queue sm3_poll_queue; + struct hash_mb_poll_queue md5_poll_queue; + pthread_spinlock_t r_lock; + struct hash_job *recv_head; + struct hash_job *recv_tail; + __u32 complete_cnt; + __u8 ctx_mode; +}; + +struct hash_mb_ctx { + struct wd_ctx_config_internal config; +}; + +static __u8 sm3_iv_data[SM3_DIGEST_DATA_SIZE] = { + 0x73, 0x80, 0x16, 0x6f, 0x49, 0x14, 0xb2, 0xb9, + 0x17, 0x24, 0x42, 0xd7, 0xda, 0x8a, 0x06, 0x00, + 0xa9, 0x6f, 0x30, 0xbc, 0x16, 0x31, 0x38, 0xaa, + 0xe3, 0x8d, 0xee, 0x4d, 0xb0, 0xfb, 0x0e, 0x4e, +}; + +static __u8 md5_iv_data[MD5_DIGEST_DATA_SIZE] = { + 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, + 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, +}; + +static struct hash_mb_ops md5_ops = { + .max_lanes = md5_mb_sve_max_lanes, + .asimd_x4 = md5_mb_asimd_x4, + .asimd_x1 = md5_mb_asimd_x1, + .sve = md5_mb_sve, + .iv_data = md5_iv_data, + .iv_bytes = MD5_DIGEST_DATA_SIZE, + .max_jobs = HASH_MAX_LANES, +}; + +static struct hash_mb_ops sm3_ops = { + .max_lanes = sm3_mb_sve_max_lanes, + .asimd_x4 = sm3_mb_asimd_x4, + .asimd_x1 = sm3_mb_asimd_x1, + .sve = sm3_mb_sve, + .iv_data = sm3_iv_data, + .iv_bytes = SM3_DIGEST_DATA_SIZE, + .max_jobs = SM3_MAX_LANES, +}; + +static void hash_mb_uninit_poll_queue(struct hash_mb_poll_queue *poll_queue) +{ + pthread_spin_destroy(&poll_queue->s_lock); +} + +static void hash_mb_queue_uninit(struct wd_ctx_config_internal *config, int ctx_num) +{ + struct hash_mb_queue *mb_queue; + struct wd_soft_ctx *ctx; + int i; + + for (i = 0; i < ctx_num; i++) { + ctx = (struct 
wd_soft_ctx *)config->ctxs[i].ctx; + mb_queue = ctx->priv; + pthread_spin_destroy(&mb_queue->r_lock); + hash_mb_uninit_poll_queue(&mb_queue->sm3_poll_queue); + hash_mb_uninit_poll_queue(&mb_queue->md5_poll_queue); + free(mb_queue); + } +} + +static int hash_mb_init_poll_queue(struct hash_mb_poll_queue *poll_queue) +{ + int ret; + + ret = pthread_spin_init(&poll_queue->s_lock, PTHREAD_PROCESS_SHARED); + if (ret) { + WD_ERR("failed to init s_lock!\n"); + return ret; + } + + poll_queue->head = NULL; + poll_queue->tail = NULL; + poll_queue->job_num = 0; + + return WD_SUCCESS; +} + +static int hash_mb_queue_init(struct wd_ctx_config_internal *config) +{ + struct hash_mb_queue *mb_queue; + int ctx_num = config->ctx_num; + struct wd_soft_ctx *ctx; + int i, ret; + + for (i = 0; i < ctx_num; i++) { + mb_queue = calloc(1, sizeof(struct hash_mb_queue)); + if (!mb_queue) { + ret = -WD_ENOMEM; + goto free_mb_queue; + } + + mb_queue->ctx_mode = config->ctxs[i].ctx_mode; + ctx = (struct wd_soft_ctx *)config->ctxs[i].ctx; + ctx->priv = mb_queue; + ret = hash_mb_init_poll_queue(&mb_queue->sm3_poll_queue); + if (ret) + goto free_mem; + + ret = hash_mb_init_poll_queue(&mb_queue->md5_poll_queue); + if (ret) + goto uninit_sm3_poll; + + ret = pthread_spin_init(&mb_queue->r_lock, PTHREAD_PROCESS_SHARED); + if (ret) { + WD_ERR("failed to init r_lock!\n"); + goto uninit_md5_poll; + } + + mb_queue->sm3_poll_queue.ops = &sm3_ops; + mb_queue->md5_poll_queue.ops = &md5_ops; + mb_queue->recv_head = NULL; + mb_queue->recv_tail = NULL; + mb_queue->complete_cnt = 0; + } + + return WD_SUCCESS; + +uninit_md5_poll: + hash_mb_uninit_poll_queue(&mb_queue->md5_poll_queue); +uninit_sm3_poll: + hash_mb_uninit_poll_queue(&mb_queue->sm3_poll_queue); +free_mem: + free(mb_queue); +free_mb_queue: + hash_mb_queue_uninit(config, i); + return ret; +} + +static int hash_mb_init(struct wd_alg_driver *drv, void *conf) +{ + struct wd_ctx_config_internal *config = conf; + struct hash_mb_ctx *priv; + int ret; + + priv = malloc(sizeof(struct hash_mb_ctx)); + if (!priv) + return -WD_ENOMEM; + + /* multibuff does not use epoll. 
*/ + config->epoll_en = 0; + memcpy(&priv->config, config, sizeof(struct wd_ctx_config_internal)); + + ret = hash_mb_queue_init(config); + if (ret) { + free(priv); + return ret; + } + + drv->priv = priv; + + return WD_SUCCESS; +} + +static void hash_mb_exit(struct wd_alg_driver *drv) +{ + struct hash_mb_ctx *priv = (struct hash_mb_ctx *)drv->priv; + + if (!priv) + return; + + hash_mb_queue_uninit(&priv->config, priv->config.ctx_num); + free(priv); + drv->priv = NULL; +} + +static void hash_mb_pad_data(struct hash_pad *hash_pad, __u8 *in, __u32 partial, + __u64 total_len, bool transfer) +{ + __u64 size = total_len << BYTES_TO_BITS_OFFSET; + __u8 *buffer = hash_pad->pad; + + if (partial) + memcpy(buffer, in, partial); + + buffer[partial++] = 0x80; + if (partial <= HASH_PADLENGTHFIELD_SIZE) { + memset(buffer + partial, 0, HASH_PADLENGTHFIELD_SIZE - partial); + if (transfer) { + PUTU32(buffer + HASH_PADLENGTHFIELD_SIZE, size >> HASH_HIGH_32BITS); + PUTU32(buffer + HASH_PADLENGTHFIELD_SIZE + sizeof(__u32), size); + } else { + memcpy(buffer + HASH_PADLENGTHFIELD_SIZE, &size, sizeof(__u64)); + } + hash_pad->pad_len = 1; + } else { + memset(buffer + partial, 0, HASH_PADDING_SIZE - partial); + if (transfer) { + PUTU32(buffer + HASH_PADDING_SIZE, size >> HASH_HIGH_32BITS); + PUTU32(buffer + HASH_PADDING_SIZE + sizeof(__u32), size); + } else { + memcpy(buffer + HASH_PADDING_SIZE, &size, sizeof(__u64)); + } + hash_pad->pad_len = HASH_PADDING_BLOCKS; + } +} + +static inline void hash_xor(__u8 *key_out, __u8 *key_in, __u32 key_len, __u8 xor_value) +{ + __u32 i; + + for (i = 0; i < HASH_KEY_LEN; i++) { + if (i < key_len) + key_out[i] = key_in[i] ^ xor_value; + else + key_out[i] = xor_value; + } +} + +static int hash_middle_block_process(struct hash_mb_poll_queue *poll_queue, + struct wd_digest_msg *d_msg, + struct hash_job *job) +{ + __u8 *buffer = d_msg->partial_block + d_msg->partial_bytes; + __u64 length = (__u64)d_msg->partial_bytes + d_msg->in_bytes; + + if (length < HASH_BLOCK_SIZE) { + memcpy(buffer, d_msg->in, d_msg->in_bytes); + d_msg->partial_bytes = length; + return -WD_EAGAIN; + } + + if (d_msg->partial_bytes) { + memcpy(buffer, d_msg->in, HASH_BLOCK_SIZE - d_msg->partial_bytes); + job->buffer = d_msg->partial_block; + poll_queue->ops->asimd_x1(job, 1); + length = d_msg->in_bytes - (HASH_BLOCK_SIZE - d_msg->partial_bytes); + buffer = d_msg->in + (HASH_BLOCK_SIZE - d_msg->partial_bytes); + } else { + buffer = d_msg->in; + } + + job->len = length >> HASH_BLOCK_OFFSET; + d_msg->partial_bytes = length & (HASH_BLOCK_SIZE - 1); + if (d_msg->partial_bytes) + memcpy(d_msg->partial_block, buffer + (job->len << HASH_BLOCK_OFFSET), + d_msg->partial_bytes); + + if (!job->len) { + memcpy(d_msg->out, job->result_digest, poll_queue->ops->iv_bytes); + return -WD_EAGAIN; + } + + job->buffer = buffer; + job->pad.pad_len = 0; + + return WD_SUCCESS; +} + +static void hash_signle_block_process(struct wd_digest_msg *d_msg, + struct hash_job *job, __u64 total_len) +{ + __u32 hash_partial = d_msg->in_bytes & (HASH_BLOCK_SIZE - 1); + __u8 *buffer; + + job->len = d_msg->in_bytes >> HASH_BLOCK_OFFSET; + buffer = d_msg->in + (job->len << HASH_BLOCK_OFFSET); + hash_mb_pad_data(&job->pad, buffer, hash_partial, total_len, job->is_transfer); + if (!job->len) { + job->buffer = job->pad.pad; + job->len = job->pad.pad_len; + job->pad.pad_len = 0; + return; + } + + job->buffer = d_msg->in; +} + +static void hash_final_block_process(struct hash_mb_poll_queue *poll_queue, + struct wd_digest_msg *d_msg, + struct hash_job *job) +{ 
+ __u8 *buffer = d_msg->partial_block + d_msg->partial_bytes; + __u64 length = (__u64)d_msg->partial_bytes + d_msg->in_bytes; + __u32 hash_partial = length & (HASH_BLOCK_SIZE - 1); + __u64 total_len = d_msg->long_data_len; + + if (job->opad.opad_size) + total_len += HASH_BLOCK_SIZE; + + if (!d_msg->partial_bytes) { + hash_signle_block_process(d_msg, job, total_len); + return; + } + + if (length <= HASH_BLOCK_SIZE) { + memcpy(buffer, d_msg->in, d_msg->in_bytes); + job->len = length >> HASH_BLOCK_OFFSET; + buffer = d_msg->partial_block + (job->len << HASH_BLOCK_OFFSET); + hash_mb_pad_data(&job->pad, buffer, hash_partial, total_len, job->is_transfer); + if (!job->len) { + job->buffer = job->pad.pad; + job->len = job->pad.pad_len; + job->pad.pad_len = 0; + return; + } + + job->buffer = d_msg->partial_block; + return; + } + + memcpy(buffer, d_msg->in, (HASH_BLOCK_SIZE - d_msg->partial_bytes)); + job->buffer = d_msg->partial_block; + poll_queue->ops->asimd_x1(job, 1); + job->buffer = d_msg->in + (HASH_BLOCK_SIZE - d_msg->partial_bytes); + length = d_msg->in_bytes - (HASH_BLOCK_SIZE - d_msg->partial_bytes); + job->len = length >> HASH_BLOCK_OFFSET; + buffer = job->buffer + (job->len << HASH_BLOCK_OFFSET); + hash_partial = length & (HASH_BLOCK_SIZE - 1); + hash_mb_pad_data(&job->pad, buffer, hash_partial, total_len, job->is_transfer); + if (!job->len) { + job->buffer = job->pad.pad; + job->len = job->pad.pad_len; + job->pad.pad_len = 0; + } +} + +static int hash_first_block_process(struct wd_digest_msg *d_msg, + struct hash_job *job, + __u32 iv_bytes) +{ + __u8 *buffer; + + job->len = d_msg->in_bytes >> HASH_BLOCK_OFFSET; + d_msg->partial_bytes = d_msg->in_bytes & (HASH_BLOCK_SIZE - 1); + if (d_msg->partial_bytes) { + buffer = d_msg->in + (job->len << HASH_BLOCK_OFFSET); + memcpy(d_msg->partial_block, buffer, d_msg->partial_bytes); + } + + /* + * Long hash mode, if first block is less than HASH_BLOCK_SIZE, + * copy ikey hash result to out. 
+ */ + if (!job->len) { + memcpy(d_msg->out, job->result_digest, iv_bytes); + return -WD_EAGAIN; + } + job->buffer = d_msg->in; + job->pad.pad_len = 0; + + return WD_SUCCESS; +} + +static int hash_do_partial(struct hash_mb_poll_queue *poll_queue, + struct wd_digest_msg *d_msg, struct hash_job *job) +{ + enum hash_block_type bd_type = get_hash_block_type(d_msg); + __u64 total_len = d_msg->in_bytes; + int ret = WD_SUCCESS; + + switch (bd_type) { + case HASH_FIRST_BLOCK: + ret = hash_first_block_process(d_msg, job, poll_queue->ops->iv_bytes); + break; + case HASH_MIDDLE_BLOCK: + ret = hash_middle_block_process(poll_queue, d_msg, job); + break; + case HASH_END_BLOCK: + hash_final_block_process(poll_queue, d_msg, job); + break; + case HASH_SINGLE_BLOCK: + if (job->opad.opad_size) + total_len += HASH_BLOCK_SIZE; + hash_signle_block_process(d_msg, job, total_len); + break; + } + + return ret; +} + +static void hash_mb_init_iv(struct hash_mb_poll_queue *poll_queue, + struct wd_digest_msg *d_msg, struct hash_job *job) +{ + enum hash_block_type bd_type = get_hash_block_type(d_msg); + __u8 key_ipad[HASH_KEY_LEN]; + __u8 key_opad[HASH_KEY_LEN]; + + job->opad.opad_size = 0; + switch (bd_type) { + case HASH_FIRST_BLOCK: + memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes); + if (d_msg->mode != WD_DIGEST_HMAC) + return; + + hash_xor(key_ipad, d_msg->key, d_msg->key_bytes, IPAD_VALUE); + job->buffer = key_ipad; + poll_queue->ops->asimd_x1(job, 1); + break; + case HASH_MIDDLE_BLOCK: + memcpy(job->result_digest, d_msg->out, poll_queue->ops->iv_bytes); + break; + case HASH_END_BLOCK: + if (d_msg->mode != WD_DIGEST_HMAC) { + memcpy(job->result_digest, d_msg->out, poll_queue->ops->iv_bytes); + return; + } + memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes); + hash_xor(key_opad, d_msg->key, d_msg->key_bytes, OPAD_VALUE); + job->buffer = key_opad; + poll_queue->ops->asimd_x1(job, 1); + memcpy(job->opad.opad, job->result_digest, poll_queue->ops->iv_bytes); + job->opad.opad_size = poll_queue->ops->iv_bytes; + memcpy(job->result_digest, d_msg->out, poll_queue->ops->iv_bytes); + break; + case HASH_SINGLE_BLOCK: + memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes); + if (d_msg->mode != WD_DIGEST_HMAC) + return; + + hash_xor(key_ipad, d_msg->key, d_msg->key_bytes, IPAD_VALUE); + hash_xor(key_opad, d_msg->key, d_msg->key_bytes, OPAD_VALUE); + job->buffer = key_opad; + poll_queue->ops->asimd_x1(job, 1); + memcpy(job->opad.opad, job->result_digest, poll_queue->ops->iv_bytes); + job->opad.opad_size = poll_queue->ops->iv_bytes; + job->buffer = key_ipad; + memcpy(job->result_digest, poll_queue->ops->iv_data, poll_queue->ops->iv_bytes); + poll_queue->ops->asimd_x1(job, 1); + break; + } +} + +static void hash_do_sync(struct hash_mb_poll_queue *poll_queue, struct hash_job *job) +{ + __u32 iv_bytes = poll_queue->ops->iv_bytes; + __u32 length; + + poll_queue->ops->asimd_x1(job, job->len); + + if (job->pad.pad_len) { + job->buffer = job->pad.pad; + poll_queue->ops->asimd_x1(job, job->pad.pad_len); + } + + if (job->opad.opad_size) { + job->buffer = job->opad.opad + job->opad.opad_size; + memcpy(job->buffer, job->result_digest, iv_bytes); + memcpy(job->result_digest, job->opad.opad, iv_bytes); + length = HASH_BLOCK_SIZE + iv_bytes; + hash_mb_pad_data(&job->pad, job->buffer, iv_bytes, length, job->is_transfer); + job->buffer = job->pad.pad; + poll_queue->ops->asimd_x1(job, job->pad.pad_len); + } +} + +static void hash_mb_add_job_tail(struct 
hash_mb_poll_queue *poll_queue, struct hash_job *job) +{ + pthread_spin_lock(&poll_queue->s_lock); + if (poll_queue->job_num) { + poll_queue->tail->next = job; + poll_queue->tail = job; + } else { + poll_queue->head = job; + poll_queue->tail = job; + } + poll_queue->job_num++; + pthread_spin_unlock(&poll_queue->s_lock); +} + +static void hash_mb_add_job_head(struct hash_mb_poll_queue *poll_queue, struct hash_job *job) +{ + pthread_spin_lock(&poll_queue->s_lock); + if (poll_queue->job_num) { + job->next = poll_queue->head; + poll_queue->head = job; + } else { + poll_queue->head = job; + poll_queue->tail = job; + } + poll_queue->job_num++; + pthread_spin_unlock(&poll_queue->s_lock); +} + +static int hash_mb_check_param(struct hash_mb_queue *mb_queue, struct wd_digest_msg *d_msg) +{ + if (unlikely(mb_queue->ctx_mode == CTX_MODE_ASYNC && d_msg->has_next)) { + WD_ERR("invalid: async mode not supports long hash!\n"); + return -WD_EINVAL; + } + + if (unlikely(d_msg->data_fmt != WD_FLAT_BUF)) { + WD_ERR("invalid: hash multibuffer not supports sgl mode!\n"); + return -WD_EINVAL; + } + + return WD_SUCCESS; +} + +static int hash_mb_send(struct wd_alg_driver *drv, handle_t ctx, void *drv_msg) +{ + struct wd_soft_ctx *s_ctx = (struct wd_soft_ctx *)ctx; + struct hash_mb_queue *mb_queue = s_ctx->priv; + struct wd_digest_msg *d_msg = drv_msg; + struct hash_mb_poll_queue *poll_queue; + struct hash_job hash_sync_job; + struct hash_job *hash_job; + int ret; + + ret = hash_mb_check_param(mb_queue, d_msg); + if (ret) + return ret; + + if (mb_queue->ctx_mode == CTX_MODE_ASYNC) { + hash_job = malloc(sizeof(struct hash_job)); + if (unlikely(!hash_job)) + return -WD_ENOMEM; + } else { + hash_job = &hash_sync_job; + } + + switch (d_msg->alg) { + case WD_DIGEST_SM3: + poll_queue = &mb_queue->sm3_poll_queue; + hash_job->is_transfer = true; + break; + case WD_DIGEST_MD5: + poll_queue = &mb_queue->md5_poll_queue; + hash_job->is_transfer = false; + break; + default: + WD_ERR("invalid: alg type %u not support!\n", d_msg->alg); + if (mb_queue->ctx_mode == CTX_MODE_ASYNC) + free(hash_job); + return -WD_EINVAL; + } + + hash_mb_init_iv(poll_queue, d_msg, hash_job); + /* If block not need process, return directly. 
*/ + ret = hash_do_partial(poll_queue, d_msg, hash_job); + if (ret == -WD_EAGAIN) { + if (mb_queue->ctx_mode == CTX_MODE_ASYNC) + free(hash_job); + + d_msg->result = WD_SUCCESS; + return WD_SUCCESS; + } + + if (mb_queue->ctx_mode == CTX_MODE_SYNC) { + hash_do_sync(poll_queue, hash_job); + memcpy(d_msg->out, hash_job->result_digest, d_msg->out_bytes); + d_msg->result = WD_SUCCESS; + return WD_SUCCESS; + } + + hash_job->msg = d_msg; + hash_mb_add_job_tail(poll_queue, hash_job); + + return WD_SUCCESS; +} + +static struct hash_job *hash_mb_find_complete_job(struct hash_mb_queue *mb_queue) +{ + struct hash_job *job; + + pthread_spin_lock(&mb_queue->r_lock); + if (!mb_queue->complete_cnt) { + pthread_spin_unlock(&mb_queue->r_lock); + return NULL; + } + + job = mb_queue->recv_head; + mb_queue->recv_head = job->next; + mb_queue->complete_cnt--; + pthread_spin_unlock(&mb_queue->r_lock); + + return job; +} + +static int hash_recv_complete_job(struct hash_mb_queue *mb_queue, struct wd_digest_msg *msg) +{ + struct hash_mb_poll_queue *poll_queue; + struct hash_job *hash_job; + __u32 total_len; + + hash_job = hash_mb_find_complete_job(mb_queue); + if (!hash_job) + return -WD_EAGAIN; + + if (!hash_job->opad.opad_size) { + msg->tag = hash_job->msg->tag; + memcpy(hash_job->msg->out, hash_job->result_digest, hash_job->msg->out_bytes); + free(hash_job); + msg->result = WD_SUCCESS; + return WD_SUCCESS; + } + + if (hash_job->msg->alg == WD_DIGEST_SM3) + poll_queue = &mb_queue->sm3_poll_queue; + else + poll_queue = &mb_queue->md5_poll_queue; + hash_job->buffer = hash_job->opad.opad + poll_queue->ops->iv_bytes; + memcpy(hash_job->buffer, hash_job->result_digest, poll_queue->ops->iv_bytes); + total_len = poll_queue->ops->iv_bytes + HASH_BLOCK_SIZE; + hash_mb_pad_data(&hash_job->pad, hash_job->buffer, poll_queue->ops->iv_bytes, + total_len, hash_job->is_transfer); + memcpy(hash_job->result_digest, hash_job->opad.opad, poll_queue->ops->iv_bytes); + hash_job->opad.opad_size = 0; + hash_job->buffer = hash_job->pad.pad; + hash_job->len = hash_job->pad.pad_len; + hash_job->pad.pad_len = 0; + + hash_mb_add_job_head(poll_queue, hash_job); + + return -WD_EAGAIN; +} + +static struct hash_job *hash_mb_get_job(struct hash_mb_poll_queue *poll_queue) +{ + struct hash_job *job; + + pthread_spin_lock(&poll_queue->s_lock); + if (!poll_queue->job_num) { + pthread_spin_unlock(&poll_queue->s_lock); + return NULL; + } + + job = poll_queue->head; + poll_queue->head = job->next; + poll_queue->job_num--; + pthread_spin_unlock(&poll_queue->s_lock); + + return job; +} + +static void hash_mb_add_finish_job(struct hash_mb_queue *mb_queue, struct hash_job *job) +{ + pthread_spin_lock(&mb_queue->r_lock); + if (mb_queue->complete_cnt) { + mb_queue->recv_tail->next = job; + mb_queue->recv_tail = job; + } else { + mb_queue->recv_head = job; + mb_queue->recv_tail = job; + } + mb_queue->complete_cnt++; + pthread_spin_unlock(&mb_queue->r_lock); +} + +static struct hash_mb_poll_queue *hash_get_poll_queue(struct hash_mb_queue *mb_queue) +{ + if (!mb_queue->sm3_poll_queue.job_num && + !mb_queue->md5_poll_queue.job_num) + return NULL; + + if (mb_queue->md5_poll_queue.job_num >= mb_queue->sm3_poll_queue.job_num) + return &mb_queue->md5_poll_queue; + + return &mb_queue->sm3_poll_queue; +} + +static int hash_mb_do_jobs(struct hash_mb_queue *mb_queue) +{ + struct hash_mb_poll_queue *poll_queue = hash_get_poll_queue(mb_queue); + struct hash_job *job_vecs[HASH_MAX_LANES]; + __u64 len = 0; + int maxjobs; + int j = 0; + int i = 0; + + if (!poll_queue) + return 
-WD_EAGAIN; + + maxjobs = poll_queue->ops->max_lanes(); + maxjobs = MIN(maxjobs, poll_queue->ops->max_jobs); + while (j < maxjobs) { + job_vecs[j] = hash_mb_get_job(poll_queue); + if (!job_vecs[j]) + break; + + if (!j) + len = job_vecs[j]->len; + else + len = MIN(job_vecs[j]->len, len); + j++; + } + + if (!j) + return -WD_EAGAIN; + + if (j > HASH_NENO_PROCESS_JOBS) { + poll_queue->ops->sve(len, j, job_vecs); + } else if (j == HASH_NENO_PROCESS_JOBS) { + poll_queue->ops->asimd_x4(job_vecs[0], job_vecs[1], + job_vecs[2], job_vecs[3], len); + } else { + while (i < j) + poll_queue->ops->asimd_x1(job_vecs[i++], len); + } + + for (i = 0; i < j; i++) { + if (job_vecs[i]->len == len) { + if (!job_vecs[i]->pad.pad_len) { + hash_mb_add_finish_job(mb_queue, job_vecs[i]); + } else { + job_vecs[i]->buffer = job_vecs[i]->pad.pad; + job_vecs[i]->len = job_vecs[i]->pad.pad_len; + job_vecs[i]->pad.pad_len = 0; + hash_mb_add_job_head(poll_queue, job_vecs[i]); + } + } else { + job_vecs[i]->len -= len; + job_vecs[i]->buffer += len << HASH_BLOCK_OFFSET; + hash_mb_add_job_head(poll_queue, job_vecs[i]); + } + } + + return WD_SUCCESS; +} + +static int hash_mb_recv(struct wd_alg_driver *drv, handle_t ctx, void *drv_msg) +{ + struct wd_soft_ctx *s_ctx = (struct wd_soft_ctx *)ctx; + struct hash_mb_queue *mb_queue = s_ctx->priv; + struct wd_digest_msg *msg = drv_msg; + int ret, i = 0; + + if (mb_queue->ctx_mode == CTX_MODE_SYNC) + return WD_SUCCESS; + + while (i++ < HASH_TRY_PROCESS_COUNT) { + ret = hash_recv_complete_job(mb_queue, msg); + if (!ret) + return WD_SUCCESS; + + ret = hash_mb_do_jobs(mb_queue); + if (ret) + return ret; + } + + return -WD_EAGAIN; +} + +static int hash_mb_get_usage(void *param) +{ + return 0; +} + +#define GEN_HASH_ALG_DRIVER(hash_alg_name) \ +{\ + .drv_name = "hash_mb",\ + .alg_name = (hash_alg_name),\ + .calc_type = UADK_ALG_SVE_INSTR,\ + .priority = 100,\ + .queue_num = 1,\ + .op_type_num = 1,\ + .fallback = 0,\ + .init = hash_mb_init,\ + .exit = hash_mb_exit,\ + .send = hash_mb_send,\ + .recv = hash_mb_recv,\ + .get_usage = hash_mb_get_usage,\ +} + +static struct wd_alg_driver hash_mb_driver[] = { + GEN_HASH_ALG_DRIVER("sm3"), + GEN_HASH_ALG_DRIVER("md5"), +}; + +static void __attribute__((constructor)) hash_mb_probe(void) +{ + unsigned long auxval = getauxval(AT_HWCAP); + size_t alg_num = ARRAY_SIZE(hash_mb_driver); + size_t i; + int ret; + + if (!(auxval & HWCAP_SVE)) + return; + + WD_INFO("Info: register hash_mb alg drivers!\n"); + for (i = 0; i < alg_num; i++) { + ret = wd_alg_driver_register(&hash_mb_driver[i]); + if (ret) + WD_ERR("Error: register hash multibuff %s failed!\n", + hash_mb_driver[i].alg_name); + } +} + +static void __attribute__((destructor)) hash_mb_remove(void) +{ + unsigned long auxval = getauxval(AT_HWCAP); + size_t alg_num = ARRAY_SIZE(hash_mb_driver); + size_t i; + + if (!(auxval & HWCAP_SVE)) + return; + + WD_INFO("Info: unregister hash_mb alg drivers!\n"); + for (i = 0; i < alg_num; i++) + wd_alg_driver_unregister(&hash_mb_driver[i]); +} diff --git a/drv/hash_mb/hash_mb.h b/drv/hash_mb/hash_mb.h new file mode 100644 index 0000000..8b740e4 --- /dev/null +++ b/drv/hash_mb/hash_mb.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: Apache-2.0 */ +/* Copyright 2023 Huawei Technologies Co.,Ltd. All rights reserved. 
*/ +#ifndef __HASH_MB_H +#define __HASH_MB_H + +#include <stdbool.h> +#include <stdint.h> +#include "drv/wd_digest_drv.h" +#include "wd_digest.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define HASH_BLOCK_SIZE 64 +#define HASH_DIGEST_NWORDS 32 + +#if __STDC_VERSION__ >= 201112L +# define __ALIGN_END __attribute__((aligned(64))) +#else +# define __ALIGN_END __aligned(64) +#endif + +struct hash_pad { + __u8 pad[HASH_BLOCK_SIZE * 2]; + __u32 pad_len; +}; + +struct hash_opad { + __u8 opad[HASH_BLOCK_SIZE]; + __u32 opad_size; +}; + +struct hash_job { + void *buffer; + __u64 len; + __u8 result_digest[HASH_DIGEST_NWORDS] __ALIGN_END; + struct hash_pad pad; + struct hash_opad opad; + struct hash_job *next; + struct wd_digest_msg *msg; + bool is_transfer; +}; + +void sm3_mb_sve(int blocks, int total_lanes, struct hash_job **job_vec); +void sm3_mb_asimd_x4(struct hash_job *job1, struct hash_job *job2, + struct hash_job *job3, struct hash_job *job4, int len); +void sm3_mb_asimd_x1(struct hash_job *job, int len); +int sm3_mb_sve_max_lanes(void); +void md5_mb_sve(int blocks, int total_lanes, struct hash_job **job_vec); +void md5_mb_asimd_x4(struct hash_job *job1, struct hash_job *job2, + struct hash_job *job3, struct hash_job *job4, int len); +void md5_mb_asimd_x1(struct hash_job *job, int len); +int md5_mb_sve_max_lanes(void); + +#ifdef __cplusplus +} +#endif + +#endif /* __HASH_MB_H */ diff --git a/drv/hash_mb/md5_mb_asimd_x1.S b/drv/hash_mb/md5_mb_asimd_x1.S new file mode 100644 index 0000000..27d1124 --- /dev/null +++ b/drv/hash_mb/md5_mb_asimd_x1.S @@ -0,0 +1,248 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8-a + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + q_\name .req q\reg + v_\name .req v\reg + s_\name .req s\reg +.endm + + +.macro round_0_15 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req + eor tmp0,\d_c,\d_d + mov k,\kl + and tmp0,tmp0,\d_b + movk k,\kh,lsl 16 + eor tmp0,tmp0,\d_d + add tmp1,k,\w + add tmp0,tmp1,tmp0 + add tmp0,\d_a,tmp0 + ror tmp0,tmp0,32 - \r + add \d_a,\d_b,tmp0 +.endm + +.macro round_16_31 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req + eor tmp0,\d_b,\d_c + mov k,\kl + and tmp0,tmp0,\d_d + movk k,\kh,lsl 16 + eor tmp0,tmp0,\d_c + add tmp1,k,\w + add tmp0,tmp1,tmp0 + add tmp0,\d_a,tmp0 + ror tmp0,tmp0,32 - \r + add \d_a,\d_b,tmp0 +.endm + +.macro round_32_47 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req + eor tmp0,\d_b,\d_c + mov k,\kl + eor tmp0,tmp0,\d_d + movk k,\kh,lsl 16 + add tmp1,k,\w + add tmp0,tmp1,tmp0 + add tmp0,\d_a,tmp0 + ror tmp0,tmp0,32 - \r + add \d_a,\d_b,tmp0 +.endm + +.macro round_48_63 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req + orn tmp0,\d_b,\d_d + mov k,\kl + eor tmp0,tmp0,\d_c + movk k,\kh,lsl 16 + add tmp1,k,\w + add tmp0,tmp1,tmp0 + add tmp0,\d_a,tmp0 + ror tmp0,tmp0,32 - \r + add \d_a,\d_b,tmp0 +.endm +/* + variables +*/ + job0 .req x0 + digest_addr .req x0 + len .req w1 + end .req x1 + + buf_adr .req x2 + d_a .req w3 + d_b .req w4 + d_c .req w5 + d_d .req w6 + k .req w7 + m0 .req w8 + m1 .req w9 + m2 .req w10 + m3 .req w11 + m4 .req w12 + m5 .req w13 + m6 .req w14 + m7 .req w15 + m8 .req w19 + m9 .req w20 + m10 .req w21 + m11 .req w22 + m12 .req w23 + m13 .req w24 + m14 .req w25 + m15 .req w26 + + tmp0 .req w27 + tmp1 .req w28 + + d_a1 .req w8 + d_b1 .req w9 + d_c1 .req w15 + d_d1 .req w19 + +/* + void md5_mb_asimd_x1(MD5_JOB * job0,int len) +*/ + .global md5_mb_asimd_x1 + .type md5_mb_asimd_x1, %function +md5_mb_asimd_x1: + cmp len,0 + stp x29, x30, [sp,-96]! 
+ ldr buf_adr,[job0],64 + stp x19, x20, [sp, 16] + add end,buf_adr,end,lsl 6 + stp x21, x22, [sp, 32] + ldp d_a,d_b,[digest_addr] + stp x23, x24, [sp, 48] + ldp d_c,d_d,[digest_addr,8] + stp x25, x26, [sp, 64] + stp x27, x28, [sp, 80] + ble .exit + +.loop_start: + ldp m0,m1,[buf_adr],8 + ldp m2,m3,[buf_adr],8 + round_0_15 d_a,d_b,d_c,d_d,0xd76a,0xa478,m0,7 + + ldp m4,m5,[buf_adr],8 + round_0_15 d_d,d_a,d_b,d_c,0xe8c7,0xb756,m1,12 + ldp m6,m7,[buf_adr],8 + round_0_15 d_c,d_d,d_a,d_b,0x2420,0x70db,m2,17 + ldp m8,m9,[buf_adr],8 + round_0_15 d_b,d_c,d_d,d_a,0xc1bd,0xceee,m3,22 + ldp m10,m11,[buf_adr],8 + round_0_15 d_a,d_b,d_c,d_d,0xf57c,0xfaf,m4,7 + ldp m12,m13,[buf_adr],8 + round_0_15 d_d,d_a,d_b,d_c,0x4787,0xc62a,m5,12 + ldp m14,m15,[buf_adr],8 + round_0_15 d_c,d_d,d_a,d_b,0xa830,0x4613,m6,17 + round_0_15 d_b,d_c,d_d,d_a,0xfd46,0x9501,m7,22 + round_0_15 d_a,d_b,d_c,d_d,0x6980,0x98d8,m8,7 + round_0_15 d_d,d_a,d_b,d_c,0x8b44,0xf7af,m9,12 + round_0_15 d_c,d_d,d_a,d_b,0xffff,0x5bb1,m10,17 + round_0_15 d_b,d_c,d_d,d_a,0x895c,0xd7be,m11,22 + round_0_15 d_a,d_b,d_c,d_d,0x6b90,0x1122,m12,7 + round_0_15 d_d,d_a,d_b,d_c,0xfd98,0x7193,m13,12 + round_0_15 d_c,d_d,d_a,d_b,0xa679,0x438e,m14,17 + round_0_15 d_b,d_c,d_d,d_a,0x49b4,0x821,m15,22 + + round_16_31 d_a,d_b,d_c,d_d,0xf61e,0x2562,m1,5 + round_16_31 d_d,d_a,d_b,d_c,0xc040,0xb340,m6,9 + round_16_31 d_c,d_d,d_a,d_b,0x265e,0x5a51,m11,14 + round_16_31 d_b,d_c,d_d,d_a,0xe9b6,0xc7aa,m0,20 + round_16_31 d_a,d_b,d_c,d_d,0xd62f,0x105d,m5,5 + round_16_31 d_d,d_a,d_b,d_c,0x244,0x1453,m10,9 + round_16_31 d_c,d_d,d_a,d_b,0xd8a1,0xe681,m15,14 + round_16_31 d_b,d_c,d_d,d_a,0xe7d3,0xfbc8,m4,20 + round_16_31 d_a,d_b,d_c,d_d,0x21e1,0xcde6,m9,5 + round_16_31 d_d,d_a,d_b,d_c,0xc337,0x7d6,m14,9 + round_16_31 d_c,d_d,d_a,d_b,0xf4d5,0xd87,m3,14 + round_16_31 d_b,d_c,d_d,d_a,0x455a,0x14ed,m8,20 + round_16_31 d_a,d_b,d_c,d_d,0xa9e3,0xe905,m13,5 + round_16_31 d_d,d_a,d_b,d_c,0xfcef,0xa3f8,m2,9 + round_16_31 d_c,d_d,d_a,d_b,0x676f,0x2d9,m7,14 + round_16_31 d_b,d_c,d_d,d_a,0x8d2a,0x4c8a,m12,20 + + round_32_47 d_a,d_b,d_c,d_d,0xfffa,0x3942,m5,4 + round_32_47 d_d,d_a,d_b,d_c,0x8771,0xf681,m8,11 + round_32_47 d_c,d_d,d_a,d_b,0x6d9d,0x6122,m11,16 + round_32_47 d_b,d_c,d_d,d_a,0xfde5,0x380c,m14,23 + round_32_47 d_a,d_b,d_c,d_d,0xa4be,0xea44,m1,4 + round_32_47 d_d,d_a,d_b,d_c,0x4bde,0xcfa9,m4,11 + round_32_47 d_c,d_d,d_a,d_b,0xf6bb,0x4b60,m7,16 + round_32_47 d_b,d_c,d_d,d_a,0xbebf,0xbc70,m10,23 + round_32_47 d_a,d_b,d_c,d_d,0x289b,0x7ec6,m13,4 + round_32_47 d_d,d_a,d_b,d_c,0xeaa1,0x27fa,m0,11 + round_32_47 d_c,d_d,d_a,d_b,0xd4ef,0x3085,m3,16 + round_32_47 d_b,d_c,d_d,d_a,0x488,0x1d05,m6,23 + round_32_47 d_a,d_b,d_c,d_d,0xd9d4,0xd039,m9,4 + round_32_47 d_d,d_a,d_b,d_c,0xe6db,0x99e5,m12,11 + round_32_47 d_c,d_d,d_a,d_b,0x1fa2,0x7cf8,m15,16 + round_32_47 d_b,d_c,d_d,d_a,0xc4ac,0x5665,m2,23 + + round_48_63 d_a,d_b,d_c,d_d,0xf429,0x2244,m0,6 + round_48_63 d_d,d_a,d_b,d_c,0x432a,0xff97,m7,10 + round_48_63 d_c,d_d,d_a,d_b,0xab94,0x23a7,m14,15 + round_48_63 d_b,d_c,d_d,d_a,0xfc93,0xa039,m5,21 + round_48_63 d_a,d_b,d_c,d_d,0x655b,0x59c3,m12,6 + round_48_63 d_d,d_a,d_b,d_c,0x8f0c,0xcc92,m3,10 + round_48_63 d_c,d_d,d_a,d_b,0xffef,0xf47d,m10,15 + round_48_63 d_b,d_c,d_d,d_a,0x8584,0x5dd1,m1,21 + round_48_63 d_a,d_b,d_c,d_d,0x6fa8,0x7e4f,m8,6 + round_48_63 d_d,d_a,d_b,d_c,0xfe2c,0xe6e0,m15,10 + round_48_63 d_c,d_d,d_a,d_b,0xa301,0x4314,m6,15 + round_48_63 d_b,d_c,d_d,d_a,0x4e08,0x11a1,m13,21 + round_48_63 d_a,d_b,d_c,d_d,0xf753,0x7e82,m4,6 + ldp d_a1,d_b1,[digest_addr] + round_48_63 
d_d,d_a,d_b,d_c,0xbd3a,0xf235,m11,10 + ldp d_c1,d_d1,[digest_addr,8] + round_48_63 d_c,d_d,d_a,d_b,0x2ad7,0xd2bb,m2,15 + round_48_63 d_b,d_c,d_d,d_a,0xeb86,0xd391,m9,21 + + cmp buf_adr,end + add d_a,d_a1 ,d_a + str d_a,[digest_addr] + add d_b,d_b1 ,d_b + str d_b,[digest_addr,4] + add d_c,d_c1 ,d_c + str d_c,[digest_addr,8] + add d_d,d_d1 ,d_d + str d_d,[digest_addr,12] + bne .loop_start + +.exit: + ldp x19, x20, [sp, 16] + ldp x21, x22, [sp, 32] + ldp x23, x24, [sp, 48] + ldp x25, x26, [sp, 64] + ldp x27, x28, [sp, 80] + ldp x29, x30, [sp], 96 + ret + .size md5_mb_asimd_x1, .-md5_mb_asimd_x1 diff --git a/drv/hash_mb/md5_mb_asimd_x4.S b/drv/hash_mb/md5_mb_asimd_x4.S new file mode 100644 index 0000000..5397913 --- /dev/null +++ b/drv/hash_mb/md5_mb_asimd_x4.S @@ -0,0 +1,526 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + .arch armv8-a + +/* +Macros +*/ + +.macro declare_var_vector_reg name:req,reg:req + q_\name .req q\reg + v_\name .req v\reg + s_\name .req s\reg +.endm + +.macro add_key_rol a:req,b:req,k:req,w:req,r:req + add v_tmp0.4s,v_\k().4s,v_\w().4s + add v_tmp1.4s,v_tmp1.4s,v_\a().4s + add v_tmp1.4s,v_tmp1.4s,v_tmp0.4s + shl v_tmp0.4s,v_tmp1.4s,\r + ushr v_tmp1.4s,v_tmp1.4s,32-\r + orr v_tmp0.16b,v_tmp1.16b,v_tmp0.16b + + add v_\a().4s,v_\b().4s,v_tmp0.4s +.endm +.macro round_0_15 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req + mov v_tmp1.16b, v_\b().16b + bsl v_tmp1.16b, v_\c().16b, v_\d().16b + ldr q_\k1,[key_adr],16 + add_key_rol \a,\b,\k,\w,\r +.endm + +.macro round_16_31 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req + mov v_tmp1.16b, v_\d().16b + bsl v_tmp1.16b, v_\b().16b, v_\c().16b + ldr q_\k1,[key_adr],16 + add_key_rol \a,\b,\k,\w,\r +.endm + +.macro round_32_47 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req + eor v_tmp1.16b,v_\b().16b,v_\c().16b + eor v_tmp1.16b,v_tmp1.16b,v_\d().16b + ldr q_\k1,[key_adr],16 + add_key_rol \a,\b,\k,\w,\r +.endm + +.macro round_48_63 a:req,b:req,c:req,d:req,k:req,k1,w:req,r:req + orn v_tmp1.16b,v_\b().16b,v_\d().16b + eor v_tmp1.16b,v_tmp1.16b,v_\c().16b + .ifnb \k1 + ldr q_\k1,[key_adr],16 + .endif + add_key_rol \a,\b,\k,\w,\r +.endm +/* + variables +*/ + declare_var_vector_reg tmp0, 0 + declare_var_vector_reg tmp1, 1 + declare_var_vector_reg k, 2 + declare_var_vector_reg k1, 3 + declare_var_vector_reg a, 4 + declare_var_vector_reg b, 5 + declare_var_vector_reg c, 6 + declare_var_vector_reg d, 7 + declare_var_vector_reg a1, 8 + declare_var_vector_reg b1, 9 + declare_var_vector_reg c1, 10 + declare_var_vector_reg d1, 11 + + declare_var_vector_reg w0, 16 + declare_var_vector_reg w1, 17 + declare_var_vector_reg w2, 18 + declare_var_vector_reg w3, 19 + declare_var_vector_reg w4, 20 + declare_var_vector_reg w5, 21 + declare_var_vector_reg w6, 22 + declare_var_vector_reg w7, 23 + declare_var_vector_reg w8, 24 + declare_var_vector_reg w9, 25 + declare_var_vector_reg w10, 26 + declare_var_vector_reg w11, 27 + declare_var_vector_reg w12, 28 + declare_var_vector_reg w13, 29 + declare_var_vector_reg w14, 30 + declare_var_vector_reg w15, 31 + + len .req w4 + len_x .req x4 + lane0 .req x5 + lane1 .req x6 + lane2 .req x7 + lane3 .req x9 + end .req x4 + job0 .req x0 + job1 .req x1 + job2 .req x2 + job3 .req x3 + key_adr .req x10 + +/* + void md5_mb_asimd_x4(MD5_JOB * job0, MD5_JOB * job1, + MD5_JOB * job2, MD5_JOB * job3, int len) +*/ + .global md5_mb_asimd_x4 + .type md5_mb_asimd_x4, %function +md5_mb_asimd_x4: + stp x29,x30,[sp,-48]! 
+ ldr lane0,[job0],64 + stp d8,d9,[sp,16] + ldr lane1,[job1],64 + stp d10,d11,[sp,32] + ldr lane2,[job2],64 + cmp len,0 + ldr lane3,[job3],64 + ble .exit + + //load digests + ld4 {v_a.s-v_d.s}[0],[job0] + add end,lane0,len_x,lsl 6 + ld4 {v_a.s-v_d.s}[1],[job1] + ld4 {v_a.s-v_d.s}[2],[job2] + ld4 {v_a.s-v_d.s}[3],[job3] +.loop_start: + ld1 {v_w0.s}[0],[lane0],4 + mov v_a1.16b,v_a.16b + ld1 {v_w0.s}[1],[lane1],4 + mov v_b1.16b,v_b.16b + ld1 {v_w0.s}[2],[lane2],4 + mov v_c1.16b,v_c.16b + ld1 {v_w0.s}[3],[lane3],4 + mov v_d1.16b,v_d.16b + + ld3 {v_w1.s-v_w3.s}[0],[lane0],12 + adrp key_adr,.key_consts + ld3 {v_w1.s-v_w3.s}[1],[lane1],12 + add key_adr,key_adr,#:lo12:.key_consts + ld3 {v_w1.s-v_w3.s}[2],[lane2],12 + ldr q_k,[key_adr],16 + ld3 {v_w1.s-v_w3.s}[3],[lane3],12 + + + ld4 {v_w4.s-v_w7.s}[0], [lane0],16 + + round_0_15 a,b,c,d,k,k1,w0,7 + + ld4 {v_w4.s-v_w7.s}[1], [lane1],16 + round_0_15 d,a,b,c,k1,k,w1,12 + ld4 {v_w4.s-v_w7.s}[2], [lane2],16 + round_0_15 c,d,a,b,k,k1,w2,17 + ld4 {v_w4.s-v_w7.s}[3], [lane3],16 + round_0_15 b,c,d,a,k1,k,w3,22 + ld4 {v_w8.s-v_w11.s}[0],[lane0],16 + round_0_15 a,b,c,d,k,k1,w4,7 + ld4 {v_w8.s-v_w11.s}[1],[lane1],16 + round_0_15 d,a,b,c,k1,k,w5,12 + ld4 {v_w8.s-v_w11.s}[2],[lane2],16 + round_0_15 c,d,a,b,k,k1,w6,17 + ld4 {v_w8.s-v_w11.s}[3],[lane3],16 + round_0_15 b,c,d,a,k1,k,w7,22 + ld4 {v_w12.s-v_w15.s}[0],[lane0],16 + round_0_15 a,b,c,d,k,k1,w8,7 + ld4 {v_w12.s-v_w15.s}[1],[lane1],16 + round_0_15 d,a,b,c,k1,k,w9,12 + ld4 {v_w12.s-v_w15.s}[2],[lane2],16 + round_0_15 c,d,a,b,k,k1,w10,17 + ld4 {v_w12.s-v_w15.s}[3],[lane3],16 + round_0_15 b,c,d,a,k1,k,w11,22 + round_0_15 a,b,c,d,k,k1,w12,7 + round_0_15 d,a,b,c,k1,k,w13,12 + round_0_15 c,d,a,b,k,k1,w14,17 + round_0_15 b,c,d,a,k1,k,w15,22 + + round_16_31 a,b,c,d,k,k1,w1,5 + round_16_31 d,a,b,c,k1,k,w6,9 + round_16_31 c,d,a,b,k,k1,w11,14 + round_16_31 b,c,d,a,k1,k,w0,20 + round_16_31 a,b,c,d,k,k1,w5,5 + round_16_31 d,a,b,c,k1,k,w10,9 + round_16_31 c,d,a,b,k,k1,w15,14 + round_16_31 b,c,d,a,k1,k,w4,20 + round_16_31 a,b,c,d,k,k1,w9,5 + round_16_31 d,a,b,c,k1,k,w14,9 + round_16_31 c,d,a,b,k,k1,w3,14 + round_16_31 b,c,d,a,k1,k,w8,20 + round_16_31 a,b,c,d,k,k1,w13,5 + round_16_31 d,a,b,c,k1,k,w2,9 + round_16_31 c,d,a,b,k,k1,w7,14 + round_16_31 b,c,d,a,k1,k,w12,20 + + round_32_47 a,b,c,d,k,k1,w5,4 + round_32_47 d,a,b,c,k1,k,w8,11 + round_32_47 c,d,a,b,k,k1,w11,16 + round_32_47 b,c,d,a,k1,k,w14,23 + round_32_47 a,b,c,d,k,k1,w1,4 + round_32_47 d,a,b,c,k1,k,w4,11 + round_32_47 c,d,a,b,k,k1,w7,16 + round_32_47 b,c,d,a,k1,k,w10,23 + round_32_47 a,b,c,d,k,k1,w13,4 + round_32_47 d,a,b,c,k1,k,w0,11 + round_32_47 c,d,a,b,k,k1,w3,16 + round_32_47 b,c,d,a,k1,k,w6,23 + round_32_47 a,b,c,d,k,k1,w9,4 + round_32_47 d,a,b,c,k1,k,w12,11 + round_32_47 c,d,a,b,k,k1,w15,16 + round_32_47 b,c,d,a,k1,k,w2,23 + + round_48_63 a,b,c,d,k,k1,w0,6 + round_48_63 d,a,b,c,k1,k,w7,10 + round_48_63 c,d,a,b,k,k1,w14,15 + round_48_63 b,c,d,a,k1,k,w5,21 + round_48_63 a,b,c,d,k,k1,w12,6 + round_48_63 d,a,b,c,k1,k,w3,10 + round_48_63 c,d,a,b,k,k1,w10,15 + round_48_63 b,c,d,a,k1,k,w1,21 + round_48_63 a,b,c,d,k,k1,w8,6 + round_48_63 d,a,b,c,k1,k,w15,10 + round_48_63 c,d,a,b,k,k1,w6,15 + round_48_63 b,c,d,a,k1,k,w13,21 + round_48_63 a,b,c,d,k,k1,w4,6 + round_48_63 d,a,b,c,k1,k,w11,10 + round_48_63 c,d,a,b,k,k1,w2,15 + round_48_63 b,c,d,a,k1, ,w9,21 + + + + + cmp lane0,end + add v_a.4s,v_a1.4s,v_a.4s + add v_b.4s,v_b1.4s,v_b.4s + add v_c.4s,v_c1.4s,v_c.4s + add v_d.4s,v_d1.4s,v_d.4s + bne .loop_start + + st4 {v_a.s-v_d.s}[0],[job0] + st4 {v_a.s-v_d.s}[1],[job1] 
+ st4 {v_a.s-v_d.s}[2],[job2] + st4 {v_a.s-v_d.s}[3],[job3] +.exit: + ldp d8,d9,[sp,16] + ldp d10,d11,[sp,32] + ldp x29,x30,[sp],48 + ret +.key_consts: + .word 0xd76aa478 + .word 0xd76aa478 + .word 0xd76aa478 + .word 0xd76aa478 + .word 0xe8c7b756 + .word 0xe8c7b756 + .word 0xe8c7b756 + .word 0xe8c7b756 + .word 0x242070db + .word 0x242070db + .word 0x242070db + .word 0x242070db + .word 0xc1bdceee + .word 0xc1bdceee + .word 0xc1bdceee + .word 0xc1bdceee + .word 0xf57c0faf + .word 0xf57c0faf + .word 0xf57c0faf + .word 0xf57c0faf + .word 0x4787c62a + .word 0x4787c62a + .word 0x4787c62a + .word 0x4787c62a + .word 0xa8304613 + .word 0xa8304613 + .word 0xa8304613 + .word 0xa8304613 + .word 0xfd469501 + .word 0xfd469501 + .word 0xfd469501 + .word 0xfd469501 + .word 0x698098d8 + .word 0x698098d8 + .word 0x698098d8 + .word 0x698098d8 + .word 0x8b44f7af + .word 0x8b44f7af + .word 0x8b44f7af + .word 0x8b44f7af + .word 0xffff5bb1 + .word 0xffff5bb1 + .word 0xffff5bb1 + .word 0xffff5bb1 + .word 0x895cd7be + .word 0x895cd7be + .word 0x895cd7be + .word 0x895cd7be + .word 0x6b901122 + .word 0x6b901122 + .word 0x6b901122 + .word 0x6b901122 + .word 0xfd987193 + .word 0xfd987193 + .word 0xfd987193 + .word 0xfd987193 + .word 0xa679438e + .word 0xa679438e + .word 0xa679438e + .word 0xa679438e + .word 0x49b40821 + .word 0x49b40821 + .word 0x49b40821 + .word 0x49b40821 + .word 0xf61e2562 + .word 0xf61e2562 + .word 0xf61e2562 + .word 0xf61e2562 + .word 0xc040b340 + .word 0xc040b340 + .word 0xc040b340 + .word 0xc040b340 + .word 0x265e5a51 + .word 0x265e5a51 + .word 0x265e5a51 + .word 0x265e5a51 + .word 0xe9b6c7aa + .word 0xe9b6c7aa + .word 0xe9b6c7aa + .word 0xe9b6c7aa + .word 0xd62f105d + .word 0xd62f105d + .word 0xd62f105d + .word 0xd62f105d + .word 0x02441453 + .word 0x02441453 + .word 0x02441453 + .word 0x02441453 + .word 0xd8a1e681 + .word 0xd8a1e681 + .word 0xd8a1e681 + .word 0xd8a1e681 + .word 0xe7d3fbc8 + .word 0xe7d3fbc8 + .word 0xe7d3fbc8 + .word 0xe7d3fbc8 + .word 0x21e1cde6 + .word 0x21e1cde6 + .word 0x21e1cde6 + .word 0x21e1cde6 + .word 0xc33707d6 + .word 0xc33707d6 + .word 0xc33707d6 + .word 0xc33707d6 + .word 0xf4d50d87 + .word 0xf4d50d87 + .word 0xf4d50d87 + .word 0xf4d50d87 + .word 0x455a14ed + .word 0x455a14ed + .word 0x455a14ed + .word 0x455a14ed + .word 0xa9e3e905 + .word 0xa9e3e905 + .word 0xa9e3e905 + .word 0xa9e3e905 + .word 0xfcefa3f8 + .word 0xfcefa3f8 + .word 0xfcefa3f8 + .word 0xfcefa3f8 + .word 0x676f02d9 + .word 0x676f02d9 + .word 0x676f02d9 + .word 0x676f02d9 + .word 0x8d2a4c8a + .word 0x8d2a4c8a + .word 0x8d2a4c8a + .word 0x8d2a4c8a + .word 0xfffa3942 + .word 0xfffa3942 + .word 0xfffa3942 + .word 0xfffa3942 + .word 0x8771f681 + .word 0x8771f681 + .word 0x8771f681 + .word 0x8771f681 + .word 0x6d9d6122 + .word 0x6d9d6122 + .word 0x6d9d6122 + .word 0x6d9d6122 + .word 0xfde5380c + .word 0xfde5380c + .word 0xfde5380c + .word 0xfde5380c + .word 0xa4beea44 + .word 0xa4beea44 + .word 0xa4beea44 + .word 0xa4beea44 + .word 0x4bdecfa9 + .word 0x4bdecfa9 + .word 0x4bdecfa9 + .word 0x4bdecfa9 + .word 0xf6bb4b60 + .word 0xf6bb4b60 + .word 0xf6bb4b60 + .word 0xf6bb4b60 + .word 0xbebfbc70 + .word 0xbebfbc70 + .word 0xbebfbc70 + .word 0xbebfbc70 + .word 0x289b7ec6 + .word 0x289b7ec6 + .word 0x289b7ec6 + .word 0x289b7ec6 + .word 0xeaa127fa + .word 0xeaa127fa + .word 0xeaa127fa + .word 0xeaa127fa + .word 0xd4ef3085 + .word 0xd4ef3085 + .word 0xd4ef3085 + .word 0xd4ef3085 + .word 0x04881d05 + .word 0x04881d05 + .word 0x04881d05 + .word 0x04881d05 + .word 0xd9d4d039 + .word 0xd9d4d039 + .word 0xd9d4d039 + 
.word 0xd9d4d039 + .word 0xe6db99e5 + .word 0xe6db99e5 + .word 0xe6db99e5 + .word 0xe6db99e5 + .word 0x1fa27cf8 + .word 0x1fa27cf8 + .word 0x1fa27cf8 + .word 0x1fa27cf8 + .word 0xc4ac5665 + .word 0xc4ac5665 + .word 0xc4ac5665 + .word 0xc4ac5665 + .word 0xf4292244 + .word 0xf4292244 + .word 0xf4292244 + .word 0xf4292244 + .word 0x432aff97 + .word 0x432aff97 + .word 0x432aff97 + .word 0x432aff97 + .word 0xab9423a7 + .word 0xab9423a7 + .word 0xab9423a7 + .word 0xab9423a7 + .word 0xfc93a039 + .word 0xfc93a039 + .word 0xfc93a039 + .word 0xfc93a039 + .word 0x655b59c3 + .word 0x655b59c3 + .word 0x655b59c3 + .word 0x655b59c3 + .word 0x8f0ccc92 + .word 0x8f0ccc92 + .word 0x8f0ccc92 + .word 0x8f0ccc92 + .word 0xffeff47d + .word 0xffeff47d + .word 0xffeff47d + .word 0xffeff47d + .word 0x85845dd1 + .word 0x85845dd1 + .word 0x85845dd1 + .word 0x85845dd1 + .word 0x6fa87e4f + .word 0x6fa87e4f + .word 0x6fa87e4f + .word 0x6fa87e4f + .word 0xfe2ce6e0 + .word 0xfe2ce6e0 + .word 0xfe2ce6e0 + .word 0xfe2ce6e0 + .word 0xa3014314 + .word 0xa3014314 + .word 0xa3014314 + .word 0xa3014314 + .word 0x4e0811a1 + .word 0x4e0811a1 + .word 0x4e0811a1 + .word 0x4e0811a1 + .word 0xf7537e82 + .word 0xf7537e82 + .word 0xf7537e82 + .word 0xf7537e82 + .word 0xbd3af235 + .word 0xbd3af235 + .word 0xbd3af235 + .word 0xbd3af235 + .word 0x2ad7d2bb + .word 0x2ad7d2bb + .word 0x2ad7d2bb + .word 0x2ad7d2bb + .word 0xeb86d391 + .word 0xeb86d391 + .word 0xeb86d391 + .word 0xeb86d391 + .size md5_mb_asimd_x4, .-md5_mb_asimd_x4 diff --git a/drv/hash_mb/md5_mb_sve.S b/drv/hash_mb/md5_mb_sve.S new file mode 100644 index 0000000..8d8ecc1 --- /dev/null +++ b/drv/hash_mb/md5_mb_sve.S @@ -0,0 +1,158 @@ +/********************************************************************** + Copyright(c) 2022 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+
+ .arch armv8.2-a+sve
+
+// copying data from sparse memory onto contiguous stack space
+// in order to gather-load it into SVE registers
+.macro copy_mb_16words vecs:req,dest:req
+ mov src,\vecs
+ mov dst,\dest
+ mov counter,total_lanes
+10:
+ ldr tmp,[src],8
+ ldr tmp,[tmp]
+ add tmp,tmp,block_ctr,lsl 6
+ ld1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [tmp]
+ st1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [dst],64
+ subs counter,counter,1
+ b.ne 10b
+.endm
+
+.macro load_init
+ mov tmpw,16
+ index VOFFS.s,0,tmpw
+ copy_mb_16words job_vec,databuf
+.endm
+
+.macro load_word pipelines:req,windex:req,zreg0:req,zreg1
+ add tmp,databuf,\windex * 4
+ ld1w { \zreg0().s}, p0/z, [tmp, VOFFS.s, UXTW 2]
+ .if \pipelines > 1
+ add tmp,tmp,veclen,lsl #6
+ ld1w {\zreg1().s}, p1/z, [tmp, VOFFS.s, UXTW 2]
+ .endif
+.endm
+
+#include "md5_sve_common.S"
+
+/* int md5_mb_sve_max_lanes()
+ */
+ .global md5_mb_sve_max_lanes
+ .type md5_mb_sve_max_lanes, %function
+md5_mb_sve_max_lanes:
+ cntw x0
+ add x0,x0,x0
+ ret
+ .size md5_mb_sve_max_lanes, .-md5_mb_sve_max_lanes
+
+/*
+ * void md5_mb_sve(int blocks, int total_lanes, MD5_JOB **job_vec)
+ */
+ num_blocks .req w0
+ total_lanes .req w1
+ job_vec .req x2
+ src .req x5
+ dst .req x6
+ tmp .req x8
+ tmpw .req w8
+ block_ctr .req x9
+ block_ctr_w .req w9
+ savedsp .req x10
+ databuf .req x11
+ counter .req w12
+ veclen .req x13
+ veclen_w .req w13
+ abcd_buf .req x14
+ md5key_adr .req x15
+
+ .global md5_mb_sve
+ .type md5_mb_sve, %function
+md5_mb_sve:
+ cbz num_blocks,.return
+ md5_sve_save_stack
+ mov savedsp,sp
+ // reserve (16 * lanes) for abcd buf
+ mov tmpw,total_lanes,lsl 4
+ sub abcd_buf,sp,tmp
+ // reserve (64 * lanes) for data buf
+ mov tmpw,total_lanes,lsl 6
+ sub databuf,abcd_buf,tmp
+ mov sp,databuf
+ adr md5key_adr,MD5_CONST_KEYS
+ whilelo p0.s,wzr,total_lanes
+ mov src,job_vec
+ mov dst,abcd_buf
+ mov counter,total_lanes
+.ldr_hash:
+ ldr tmp,[src],8
+ add tmp,tmp,64
+ ld1 {v0.16b},[tmp]
+ st1 {v0.16b},[dst],16
+ subs counter,counter,1
+ bne .ldr_hash
+ ld4w {VA_0.s,VB_0.s,VC_0.s,VD_0.s},p0/z,[abcd_buf]
+ mov block_ctr,0
+ cntp veclen,p0,p0.s
+ cmp veclen_w,total_lanes
+ b.eq .loop_1x
+ whilelo p1.s,veclen_w,total_lanes
+ add tmp,abcd_buf,veclen,lsl #4
+ ld4w {VA_1.s,VB_1.s,VC_1.s,VD_1.s},p1/z,[tmp]
+ b .loop_2x
+.loop_1x:
+ md5_single 1
+ add block_ctr, block_ctr, 1
+ cmp block_ctr_w,num_blocks
+ bne .loop_1x
+ st4w {VA_0.s,VB_0.s,VC_0.s,VD_0.s},p0,[abcd_buf]
+ b 1f
+.loop_2x:
+ md5_single 2
+ add block_ctr, block_ctr, 1
+ cmp block_ctr_w,num_blocks
+ bne .loop_2x
+ st4w {VA_0.s,VB_0.s,VC_0.s,VD_0.s},p0,[abcd_buf]
+ add tmp,abcd_buf,veclen,lsl #4
+ st4w {VA_1.s,VB_1.s,VC_1.s,VD_1.s},p1,[tmp]
+1:
+ mov dst,job_vec
+ mov src,abcd_buf
+.str_hash:
+ ld1 {v0.16b},[src],16
+ ldr tmp,[dst],8
+ add tmp,tmp,64
+ st1 {v0.16b},[tmp]
+ subs total_lanes,total_lanes,1
+ bne .str_hash
+ mov sp,savedsp
+ md5_sve_restore_stack
+.return:
+ ret
+ .size md5_mb_sve, .-md5_mb_sve
diff --git a/drv/hash_mb/md5_sve_common.S b/drv/hash_mb/md5_sve_common.S
new file mode 100644
index 0000000..ed81482
--- /dev/null
+++ b/drv/hash_mb/md5_sve_common.S
@@ -0,0 +1,478 @@
+/**********************************************************************
+ Copyright(c) 2022 Arm Corporation All rights reserved.
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + VK .req z0 + VOFFS .req z1 + VA_0 .req z2 + VB_0 .req z3 + VC_0 .req z4 + VD_0 .req z5 + VF_0 .req z6 + VF_1 .req z7 + VA_1 .req z16 + VB_1 .req z17 + VC_1 .req z18 + VD_1 .req z19 + MD5WORD0_0 .req z20 + MD5WORD1_0 .req z21 + MD5WORD0_1 .req z22 + MD5WORD1_1 .req z23 + TMPV0 .req v20 + TMPV1 .req v21 + TMPV2 .req v22 + TMPV3 .req v23 + VTMP_0 .req z24 + VAA_0 .req z25 + VBB_0 .req z26 + VCC_0 .req z27 + VDD_0 .req z28 + VTMP_1 .req z29 + VAA_1 .req z30 + VBB_1 .req z31 + VCC_1 .req z8 + VDD_1 .req z9 + TT .req z0 + +.macro rotate_left_x1 out:req,in:req,tmp:req,bits + .if \bits == 16 + revh \out().s,p0/m,\in().s + .else + .if have_sve2 == 0 + lsl \tmp().s, \in().s,\bits + lsr \out().s,\in().s,32-\bits + orr \out().d,\out().d,\tmp().d + .else + movprfx \out().d,\in().d + xar \out().s,\out().s,VZERO.s,32-\bits + .endif + .endif +.endm + +.macro rotate_left_x2 out:req,in:req,tmp:req,bits,out1:req,in1:req,tmp1:req,bits1 + + .if \bits == 16 + revh \out().s,p0/m,\in().s + revh \out1().s,p0/m,\in1().s + .else + .if have_sve2 == 0 + lsl \tmp().s, \in().s,\bits + lsl \tmp1().s, \in1().s,\bits1 + lsr \out().s,\in().s,32-\bits + lsr \out1().s,\in1().s,32-\bits1 + orr \out().d,\out().d,\tmp().d + orr \out1().d,\out1().d,\tmp1().d + .else + movprfx \out().d,\in().d + xar \out().s,\out().s,VZERO.s,32-\bits + movprfx \out1().d,\in1().d + xar \out1().s,\out1().s,VZERO.s,32-\bits1 + .endif + .endif +.endm + +.macro bsl_x1 ret:req,x:req,y:req,z:req,tmp:req + .if have_sve2 == 0 + bic \ret().d,\z().d,\x().d + and \tmp().d,\x().d,\y().d + orr \ret().d,\ret().d,\tmp().d + .else + movprfx \ret().d,\x().d + bsl \ret().d,\ret().d,\y().d,\z().d + .endif +.endm + +.macro bsl_x2 ret:req,x:req,y:req,z:req,tmp:req,ret1:req,x1:req,y1:req,z1:req,tmp1:req + .if have_sve2 == 0 + bic \ret().d,\z().d,\x().d + bic \ret1().d,\z1().d,\x1().d + and \tmp().d,\x().d,\y().d + and \tmp1().d,\x1().d,\y1().d + orr \ret().d,\ret().d,\tmp().d + orr \ret1().d,\ret1().d,\tmp1().d + .else + movprfx \ret().d,\x().d + bsl 
\ret().d,\ret().d,\y().d,\z().d + movprfx \ret1().d,\x1().d + bsl \ret1().d,\ret1().d,\y1().d,\z1().d + .endif +.endm + + +// F = D ^ (B and (C xor D)) +// that is (B and C) or ((not B) and D) +.macro FUNC_F0_x1 + bsl_x1 VF_0,VB_0,VC_0,VD_0,VTMP_0 +.endm + +.macro FUNC_F0_x2 + bsl_x2 VF_0,VB_0,VC_0,VD_0,VTMP_0,VF_1,VB_1,VC_1,VD_1,VTMP_1 +.endm + +// F = C xor (D and (B xor C)) +// that is (D and B) or ((not D) and C) +.macro FUNC_F1_x1 + bsl_x1 VF_0,VD_0,VB_0,VC_0,VTMP_0 +.endm + +.macro FUNC_F1_x2 + bsl_x2 VF_0,VD_0,VB_0,VC_0,VTMP_0,VF_1,VD_1,VB_1,VC_1,VTMP_1 +.endm + +// F := B xor C xor D +.macro FUNC_F2_x1 + .if have_sve2 == 0 + eor VF_0.d,VB_0.d,VC_0.d + eor VF_0.d,VF_0.d,VD_0.d + .else + movprfx VF_0.d,VB_0.d + eor3 VF_0.d,VF_0.d,VC_0.d,VD_0.d + .endif +.endm + +.macro FUNC_F2_x2 + .if have_sve2 == 0 + eor VF_0.d,VB_0.d,VC_0.d + eor VF_1.d,VB_1.d,VC_1.d + eor VF_0.d,VF_0.d,VD_0.d + eor VF_1.d,VF_1.d,VD_1.d + .else + movprfx VF_0.d,VB_0.d + eor3 VF_0.d,VF_0.d,VC_0.d,VD_0.d + movprfx VF_1.d,VB_1.d + eor3 VF_1.d,VF_1.d,VC_1.d,VD_1.d + .endif +.endm + +// F := C xor (B or (not D)) +.macro FUNC_F3_x1 + not VF_0.s,p0/m,VD_0.s + orr VF_0.d,VF_0.d,VB_0.d + eor VF_0.d,VF_0.d,VC_0.d +.endm + +.macro FUNC_F3_x2 + not VF_0.s,p0/m,VD_0.s + not VF_1.s,p0/m,VD_1.s + orr VF_0.d,VF_0.d,VB_0.d + orr VF_1.d,VF_1.d,VB_1.d + eor VF_0.d,VF_0.d,VC_0.d + eor VF_1.d,VF_1.d,VC_1.d +.endm + +.macro SWAP_STATES + .unreq TT + TT .req VA_0 + .unreq VA_0 + VA_0 .req VD_0 + .unreq VD_0 + VD_0 .req VC_0 + .unreq VC_0 + VC_0 .req VB_0 + .unreq VB_0 + VB_0 .req TT + + .unreq TT + TT .req VA_1 + .unreq VA_1 + VA_1 .req VD_1 + .unreq VD_1 + VD_1 .req VC_1 + .unreq VC_1 + VC_1 .req VB_1 + .unreq VB_1 + VB_1 .req TT +.endm + +.macro MD5_STEP_x1 windex:req,mg:req,func_f:req,bits:req + ld1rw {VK.s},p0/z,[md5key_adr,windex * 4] + \func_f()_x1 + add VTMP_0.s,VA_0.s,\mg()_0.s + add VF_0.s,VF_0.s,VK.s + add VF_0.s,VF_0.s,VTMP_0.s + rotate_left_x1 VA_0,VF_0,VTMP_0,\bits + add VA_0.s,VA_0.s,VB_0.s +.endm + +.macro MD5_STEP_x2 windex:req,mg:req,func_f:req,bits:req + ld1rw {VK.s},p0/z,[md5key_adr,windex * 4] + \func_f()_x2 + add VTMP_0.s,VA_0.s,\mg()_0.s + add VTMP_1.s,VA_1.s,\mg()_1.s + add VF_0.s,VF_0.s,VK.s + add VF_1.s,VF_1.s,VK.s + add VF_0.s,VF_0.s,VTMP_0.s + add VF_1.s,VF_1.s,VTMP_1.s + rotate_left_x2 VA_0,VF_0,VTMP_0,\bits,VA_1,VF_1,VTMP_1,\bits + add VA_0.s,VA_0.s,VB_0.s + add VA_1.s,VA_1.s,VB_1.s +.endm + +.altmacro +.macro load_words index:req,mg:req + load_word %num_pipelines,\index,MD5WORD\mg()_0,MD5WORD\mg()_1 +.endm + +.macro MD5_STEP_WRAPPER pipelines:req,windex:req,gindex:req,mg:req,\ + func_f:req,bits:req,gindex_next,mg_next + .ifnb \gindex_next + load_words \gindex_next,\mg_next + .endif + MD5_STEP_x\pipelines() \windex,MD5WORD\mg(),\func_f,\bits +.endm + +.macro exec_step windex:req,gindex:req,bits:req,gindex_next + .if \windex % 2 == 0 + mg=0 + mg_next=1 + .else + mg=1 + mg_next=0 + .endif + + .if \windex <= 15 + MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\ + FUNC_F0,\bits,\gindex_next,%mg_next + .endif + .if \windex >= 16 && \windex <= 31 + MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\ + FUNC_F1,\bits,\gindex_next,%mg_next + .endif + .if \windex >= 32 && \windex <= 47 + MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\ + FUNC_F2,\bits,\gindex_next,%mg_next + .endif + .if \windex >= 48 && \windex < 63 + MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,\ + FUNC_F3,\bits,\gindex_next,%mg_next + .endif + .if \windex == 63 + MD5_STEP_WRAPPER %num_pipelines,\windex,\gindex,%mg,FUNC_F3,\bits + 
.endif + SWAP_STATES +.endm + +.macro exec_steps + exec_step 0,0,7,1 + exec_step 1,1,12,2 + exec_step 2,2,17,3 + exec_step 3,3,22,4 + exec_step 4,4,7,5 + exec_step 5,5,12,6 + exec_step 6,6,17,7 + exec_step 7,7,22,8 + exec_step 8,8,7,9 + exec_step 9,9,12,10 + exec_step 10,10,17,11 + exec_step 11,11,22,12 + exec_step 12,12,7,13 + exec_step 13,13,12,14 + exec_step 14,14,17,15 + exec_step 15,15,22,1 + exec_step 16,1,5,6 + exec_step 17,6,9,11 + exec_step 18,11,14,0 + exec_step 19,0,20,5 + exec_step 20,5,5,10 + exec_step 21,10,9,15 + exec_step 22,15,14,4 + exec_step 23,4,20,9 + exec_step 24,9,5,14 + exec_step 25,14,9,3 + exec_step 26,3,14,8 + exec_step 27,8,20,13 + exec_step 28,13,5,2 + exec_step 29,2,9,7 + exec_step 30,7,14,12 + exec_step 31,12,20,5 + exec_step 32,5,4,8 + exec_step 33,8,11,11 + exec_step 34,11,16,14 + exec_step 35,14,23,1 + exec_step 36,1,4,4 + exec_step 37,4,11,7 + exec_step 38,7,16,10 + exec_step 39,10,23,13 + exec_step 40,13,4,0 + exec_step 41,0,11,3 + exec_step 42,3,16,6 + exec_step 43,6,23,9 + exec_step 44,9,4,12 + exec_step 45,12,11,15 + exec_step 46,15,16,2 + exec_step 47,2,23,0 + exec_step 48,0,6,7 + exec_step 49,7,10,14 + exec_step 50,14,15,5 + exec_step 51,5,21,12 + exec_step 52,12,6,3 + exec_step 53,3,10,10 + exec_step 54,10,15,1 + exec_step 55,1,21,8 + exec_step 56,8,6,15 + exec_step 57,15,10,6 + exec_step 58,6,15,13 + exec_step 59,13,21,4 + exec_step 60,4,6,11 + exec_step 61,11,10,2 + exec_step 62,2,15,9 + exec_step 63,9,21 +.endm + +.macro prepare_x1 + load_words 0,0 + orr VAA_0.d,VA_0.d,VA_0.d + orr VBB_0.d,VB_0.d,VB_0.d + orr VCC_0.d,VC_0.d,VC_0.d + orr VDD_0.d,VD_0.d,VD_0.d +.endm + +.macro prepare_x2 + load_words 0,0 + orr VAA_0.d,VA_0.d,VA_0.d + orr VAA_1.d,VA_1.d,VA_1.d + orr VBB_0.d,VB_0.d,VB_0.d + orr VBB_1.d,VB_1.d,VB_1.d + orr VCC_0.d,VC_0.d,VC_0.d + orr VCC_1.d,VC_1.d,VC_1.d + orr VDD_0.d,VD_0.d,VD_0.d + orr VDD_1.d,VD_1.d,VD_1.d +.endm + +.macro finish_x1 + add VA_0.s,VA_0.s,VAA_0.s + add VB_0.s,VB_0.s,VBB_0.s + add VC_0.s,VC_0.s,VCC_0.s + add VD_0.s,VD_0.s,VDD_0.s +.endm + +.macro finish_x2 + add VA_0.s,VA_0.s,VAA_0.s + add VA_1.s,VA_1.s,VAA_1.s + add VB_0.s,VB_0.s,VBB_0.s + add VB_1.s,VB_1.s,VBB_1.s + add VC_0.s,VC_0.s,VCC_0.s + add VC_1.s,VC_1.s,VCC_1.s + add VD_0.s,VD_0.s,VDD_0.s + add VD_1.s,VD_1.s,VDD_1.s +.endm + +.macro md5_single pipelines:req,sve2 + .ifnb \sve2 + have_sve2=1 + eor VZERO.d,VZERO.d,VZERO.d + .else + have_sve2=0 + .endif + num_pipelines=\pipelines + load_init + + prepare_x\pipelines() + exec_steps + finish_x\pipelines() +.endm + +.macro md5_sve_save_stack + stp d8,d9,[sp, -48]! 
+ stp d10,d11,[sp, 16]
+ stp d12,d13,[sp, 32]
+.endm
+
+.macro md5_sve_restore_stack
+ ldp d10,d11,[sp, 16]
+ ldp d12,d13,[sp, 32]
+ ldp d8,d9,[sp],48
+.endm
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+
+MD5_CONST_KEYS:
+ .word 0xd76aa478
+ .word 0xe8c7b756
+ .word 0x242070db
+ .word 0xc1bdceee
+ .word 0xf57c0faf
+ .word 0x4787c62a
+ .word 0xa8304613
+ .word 0xfd469501
+ .word 0x698098d8
+ .word 0x8b44f7af
+ .word 0xffff5bb1
+ .word 0x895cd7be
+ .word 0x6b901122
+ .word 0xfd987193
+ .word 0xa679438e
+ .word 0x49b40821
+ .word 0xf61e2562
+ .word 0xc040b340
+ .word 0x265e5a51
+ .word 0xe9b6c7aa
+ .word 0xd62f105d
+ .word 0x02441453
+ .word 0xd8a1e681
+ .word 0xe7d3fbc8
+ .word 0x21e1cde6
+ .word 0xc33707d6
+ .word 0xf4d50d87
+ .word 0x455a14ed
+ .word 0xa9e3e905
+ .word 0xfcefa3f8
+ .word 0x676f02d9
+ .word 0x8d2a4c8a
+ .word 0xfffa3942
+ .word 0x8771f681
+ .word 0x6d9d6122
+ .word 0xfde5380c
+ .word 0xa4beea44
+ .word 0x4bdecfa9
+ .word 0xf6bb4b60
+ .word 0xbebfbc70
+ .word 0x289b7ec6
+ .word 0xeaa127fa
+ .word 0xd4ef3085
+ .word 0x04881d05
+ .word 0xd9d4d039
+ .word 0xe6db99e5
+ .word 0x1fa27cf8
+ .word 0xc4ac5665
+ .word 0xf4292244
+ .word 0x432aff97
+ .word 0xab9423a7
+ .word 0xfc93a039
+ .word 0x655b59c3
+ .word 0x8f0ccc92
+ .word 0xffeff47d
+ .word 0x85845dd1
+ .word 0x6fa87e4f
+ .word 0xfe2ce6e0
+ .word 0xa3014314
+ .word 0x4e0811a1
+ .word 0xf7537e82
+ .word 0xbd3af235
+ .word 0x2ad7d2bb
+ .word 0xeb86d391
diff --git a/drv/hash_mb/sm3_mb_asimd_x1.S b/drv/hash_mb/sm3_mb_asimd_x1.S
new file mode 100644
index 0000000..c7362de
--- /dev/null
+++ b/drv/hash_mb/sm3_mb_asimd_x1.S
@@ -0,0 +1,387 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
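Both MD5 constant tables in this patch hold the same 64 round keys, K[i] = floor(|sin(i + 1)| * 2^32): the ASIMD x4 table repeats each key four times so one plain 128-bit load fills every lane, while MD5_CONST_KEYS keeps a single copy for ld1rw to broadcast. A throwaway generator to cross-check the values (the first line printed is 0xd76aa478):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int i;

	/* MD5 round keys per RFC 1321: K[i] = floor(|sin(i + 1)| * 2^32) */
	for (i = 0; i < 64; i++)
		printf("\t.word 0x%08x\n",
		       (uint32_t)(uint64_t)(fabs(sin(i + 1.0)) * 4294967296.0));
	return 0;
}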
+**********************************************************************/ + .arch armv8.2-a + .text + .align 2 + .p2align 3,,7 + +.macro declare_var_vector_reg name:req,reg:req + q\name() .req q\reg + v\name() .req v\reg + s\name() .req s\reg +.endm + + job .req x0 + len .req x1 + data .req x2 + digest .req x0 + + msg0 .req w3 + msg1 .req w4 + msg2 .req w5 + msg3 .req w6 + msg4 .req w7 + + msg .req w9 + msgP .req w10 + SS1 .req w11 + SS2 .req w12 + TT1 .req w13 + TT2 .req w14 + Tj .req w15 + tmp0 .req w19 + tmp1 .req w20 + dig_A .req w21 + dig_B .req w22 + dig_C .req w23 + dig_D .req w24 + dig_E .req w25 + dig_F .req w26 + dig_G .req w27 + dig_H .req w28 + + declare_var_vector_reg dig0,0 + declare_var_vector_reg dig1,1 + declare_var_vector_reg dig0_bak,2 + declare_var_vector_reg dig1_bak,3 + declare_var_vector_reg vect_msg0,4 + declare_var_vector_reg vect_msg1,5 + declare_var_vector_reg vect_msg2,6 + declare_var_vector_reg vect_msg3,7 + + declare_var_vector_reg vect_msgP0,16 + declare_var_vector_reg vect_msgP1,17 + declare_var_vector_reg vect_msgP2,18 + + + + + + +// round 0-11 +.macro sm3_round_0 round:req + ldr msg, [sp,msg_off+4*\round()] + ldr msgP,[sp,wp_off +4*\round()] + add SS1,dig_E,Tj + ror TT1,dig_A,32-12 + add SS1,SS1,TT1 + ror SS1,SS1,32-7 //SS1 done + eor SS2,SS1,TT1 //SS2 done + eor TT1,dig_A,dig_B + eor TT2,dig_E,dig_F + add SS2,SS2,msgP + eor TT2,TT2,dig_G + add SS1,SS1,msg + eor TT1,TT1,dig_C + add SS2,SS2,dig_D + add SS1,SS1,dig_H + add TT1,TT1,SS2 + add TT2,TT2,SS1 + mov dig_D,dig_C + ror dig_C,dig_B,32-9 + mov dig_B,dig_A + mov dig_A,TT1 + eor TT1,TT2,TT2,ror (32-17) + mov dig_H,dig_G + ror dig_G,dig_F,32-19 + mov dig_F,dig_E + eor dig_E,TT1,TT2,ror(32-9) + ror Tj,Tj,(32-1) +.endm + +//round 12-15 +.macro sm3_round_12 round:req + ldr msg, [sp,msg_off+4*((\round())%17)] + ldr msg0,[sp,msg_off+4*((\round()+4 - 16)%17)] + ldr msg1,[sp,msg_off+4*((\round()+4 - 9)%17)] + add SS1,dig_E,Tj + ror TT1,dig_A,32-12 + add SS1,SS1,TT1 + ror SS1,SS1,32-7 //SS1 done + eor SS2,SS1,TT1 //SS2 done + + eor msg0,msg0,msg1 + ldr msg2,[sp,msg_off+4*((\round()+4 - 3)%17)] + eor TT1,dig_A,dig_B + eor TT2,dig_E,dig_F + add SS2,SS2,dig_D + eor TT2,TT2,dig_G + add SS1,SS1,msg + eor msg0,msg0,msg2,ror (32-15) + ldr msg3,[sp,msg_off+4*((\round()+4 - 13)%17)] + ldr msg4,[sp,msg_off+4*((\round()+4 - 6)%17)] + eor msg1,msg0,msg0,ror (32 -15) + eor TT1,TT1,dig_C + add TT1,TT1,SS2 + eor msg4,msg4,msg3, ror (32-7) + eor msg0,msg1,msg0, ror (32-23) + add SS1,SS1,dig_H + eor msg0,msg0,msg4 + add TT2,TT2,SS1 + mov dig_D,dig_C + str msg0,[sp,msg_off+4*((\round()+4)%17)] + eor msgP,msg,msg0 + add TT1,TT1,msgP + ror dig_C,dig_B,32-9 + mov dig_B,dig_A + mov dig_A,TT1 + eor TT1,TT2,TT2,ror (32-17) + mov dig_H,dig_G + ror dig_G,dig_F,32-19 + mov dig_F,dig_E + eor dig_E,TT1,TT2,ror(32-9) + ror Tj,Tj,32-1 +.endm + +// round 16-62 +.macro sm3_round_16 round:req + ldr msg, [sp,msg_off+4*((\round())%17)] + ldr msg0,[sp,msg_off+4*((\round()+4 - 16)%17)] + ldr msg1,[sp,msg_off+4*((\round()+4 - 9)%17)] + add SS1,dig_E,Tj + ror TT1,dig_A,32-12 + add SS1,SS1,TT1 + ror SS1,SS1,32-7 //SS1 done + eor SS2,SS1,TT1 //SS2 done + + eor msg0,msg0,msg1 + ldr msg2,[sp,msg_off+4*((\round()+4 - 3)%17)] + orr TT1,dig_B,dig_C + and tmp0,dig_B,dig_C + + eor TT2,dig_F,dig_G + and TT1,TT1,dig_A + add SS2,SS2,dig_D + orr TT1,TT1,tmp0 + and TT2,TT2,dig_E + add SS1,SS1,msg + eor TT2,TT2,dig_G + + eor msg0,msg0,msg2,ror (32-15) + ldr msg3,[sp,msg_off+4*((\round()+4 - 13)%17)] + ldr msg4,[sp,msg_off+4*((\round()+4 - 6)%17)] + eor 
msg1,msg0,msg0,ror (32 -15) + add TT1,TT1,SS2 + eor msg4,msg4,msg3, ror (32-7) + eor msg0,msg1,msg0, ror (32-23) + add SS1,SS1,dig_H + eor msg0,msg0,msg4 + add TT2,TT2,SS1 + mov dig_D,dig_C + str msg0,[sp,msg_off+4*((\round()+4)%17)] + eor msgP,msg,msg0 + add TT1,TT1,msgP + ror dig_C,dig_B,32-9 + mov dig_B,dig_A + mov dig_A,TT1 + eor TT1,TT2,TT2,ror (32-17) + mov dig_H,dig_G + ror dig_G,dig_F,32-19 + mov dig_F,dig_E + eor dig_E,TT1,TT2,ror(32-9) + ror Tj,Tj,32-1 +.endm + +//round 63 +.macro sm3_round_63 round:req + ldr msg, [sp,msg_off+4*((\round())%17)] + ldr msg0,[sp,msg_off+4*((\round()+4 - 16)%17)] + ldr msg1,[sp,msg_off+4*((\round()+4 - 9)%17)] + add SS1,dig_E,Tj + ror TT1,dig_A,32-12 + add SS1,SS1,TT1 + ror SS1,SS1,32-7 //SS1 done + eor SS2,SS1,TT1 //SS2 done + eor msg0,msg0,msg1 + ldr msg2,[sp,msg_off+4*((\round()+4 - 3)%17)] + orr TT1,dig_B,dig_C + and tmp0,dig_B,dig_C + eor TT2,dig_F,dig_G + and TT1,TT1,dig_A + add SS2,SS2,dig_D + orr TT1,TT1,tmp0 + and TT2,TT2,dig_E + add SS1,SS1,msg + eor TT2,TT2,dig_G + eor msg0,msg0,msg2,ror (32-15) + ldr msg3,[sp,msg_off+4*((\round()+4 - 13)%17)] + ldr msg4,[sp,msg_off+4*((\round()+4 - 6)%17)] + eor msg1,msg0,msg0,ror (32 -15) + add TT1,TT1,SS2 + eor msg4,msg4,msg3, ror (32-7) + eor msg0,msg1,msg0, ror (32-23) + add SS1,SS1,dig_H + eor msg0,msg0,msg4 + add TT2,TT2,SS1 + str msg0,[sp,msg_off+4*((\round()+4)%17)] + eor msgP,msg,msg0 + add TT1,TT1,msgP + ins vdig0_bak.s[3],dig_C + ror dig_C,dig_B,32-9 + ins vdig0_bak.s[1],dig_A + ins vdig0_bak.s[0],TT1 + ins vdig0_bak.s[2],dig_C + eor TT1,TT2,TT2,ror (32-17) + ins vdig1_bak.s[3],dig_G + ror dig_G,dig_F,32-19 + ins vdig1_bak.s[1],dig_E + ins vdig1_bak.s[2],dig_G + eor dig_E,TT1,TT2,ror(32-9) + ins vdig1_bak.s[0],dig_E +.endm + + .set wp_off , 96 + .set msg_off, 96 + 12*4 +#define STACK_SIZE 224 + .global sm3_mb_asimd_x1 + .type sm3_mb_asimd_x1, %function +sm3_mb_asimd_x1: + stp x29,x30, [sp,-STACK_SIZE]! 
+ cmp len,0 + ldr data,[job],64 + ldp qdig0,qdig1,[digest] + stp x19, x20, [sp, 16] + stp x21, x22, [sp, 32] + rev32 vdig0.16b,vdig0.16b + stp x23, x24, [sp, 48] + rev32 vdig1.16b,vdig1.16b + stp x25, x26, [sp, 64] + stp x27, x28, [sp, 80] + ble .exit_func + +.start_loop: + + /** prepare first 12 round data **/ + ld1 {vvect_msg0.16b-vvect_msg3.16b},[data],64 + mov Tj, 17689 + umov dig_A,vdig0.s[0] + movk Tj, 0x79cc, lsl 16 + rev32 vvect_msg0.16b,vvect_msg0.16b + umov dig_B,vdig0.s[1] + rev32 vvect_msg1.16b,vvect_msg1.16b + umov dig_C,vdig0.s[2] + rev32 vvect_msg2.16b,vvect_msg2.16b + umov dig_D,vdig0.s[3] + rev32 vvect_msg3.16b,vvect_msg3.16b + umov dig_E,vdig1.s[0] + stp qvect_msg0,qvect_msg1,[sp,msg_off] + umov dig_F,vdig1.s[1] + stp qvect_msg2,qvect_msg3,[sp,msg_off+32] + umov dig_G,vdig1.s[2] + eor vvect_msgP0.16b,vvect_msg0.16b,vvect_msg1.16b + eor vvect_msgP1.16b,vvect_msg1.16b,vvect_msg2.16b + umov dig_H,vdig1.s[3] + stp qvect_msgP0,qvect_msgP1,[sp,wp_off] + eor vvect_msgP2.16b,vvect_msg2.16b,vvect_msg3.16b + str qvect_msgP2,[sp,wp_off+32] + + sm3_round_0 0 + sm3_round_0 1 + sm3_round_0 2 + sm3_round_0 3 + sm3_round_0 4 + sm3_round_0 5 + sm3_round_0 6 + sm3_round_0 7 + sm3_round_0 8 + sm3_round_0 9 + sm3_round_0 10 + sm3_round_0 11 + + sm3_round_12 12 + sm3_round_12 13 + sm3_round_12 14 + sm3_round_12 15 + mov Tj, 0x7a87 + movk Tj, 0x9d8a, lsl 16 + sm3_round_16 16 + sm3_round_16 17 + sm3_round_16 18 + sm3_round_16 19 + sm3_round_16 20 + sm3_round_16 21 + sm3_round_16 22 + sm3_round_16 23 + sm3_round_16 24 + sm3_round_16 25 + sm3_round_16 26 + sm3_round_16 27 + sm3_round_16 28 + sm3_round_16 29 + sm3_round_16 30 + sm3_round_16 31 + sm3_round_16 32 + sm3_round_16 33 + sm3_round_16 34 + sm3_round_16 35 + sm3_round_16 36 + sm3_round_16 37 + sm3_round_16 38 + sm3_round_16 39 + sm3_round_16 40 + sm3_round_16 41 + sm3_round_16 42 + sm3_round_16 43 + sm3_round_16 44 + sm3_round_16 45 + sm3_round_16 46 + sm3_round_16 47 + sm3_round_16 48 + sm3_round_16 49 + sm3_round_16 50 + sm3_round_16 51 + sm3_round_16 52 + sm3_round_16 53 + sm3_round_16 54 + sm3_round_16 55 + sm3_round_16 56 + sm3_round_16 57 + sm3_round_16 58 + sm3_round_16 59 + sm3_round_16 60 + sm3_round_16 61 + sm3_round_16 62 + sm3_round_63 63 + subs len,len,1 + eor vdig0.16b,vdig0.16b,vdig0_bak.16b + eor vdig1.16b,vdig1.16b,vdig1_bak.16b + bne .start_loop +.exit_func: + ldp x19, x20, [sp, 16] + rev32 vdig0.16b,vdig0.16b + ldp x21, x22, [sp, 32] + rev32 vdig1.16b,vdig1.16b + ldp x23, x24, [sp, 48] + stp qdig0,qdig1,[digest] + ldp x25, x26, [sp, 64] + ldp x27, x28, [sp, 80] + ldp x29, x30, [sp], STACK_SIZE + ret + .size sm3_mb_asimd_x1, .-sm3_mb_asimd_x1 + diff --git a/drv/hash_mb/sm3_mb_asimd_x4.S b/drv/hash_mb/sm3_mb_asimd_x4.S new file mode 100644 index 0000000..975a07c --- /dev/null +++ b/drv/hash_mb/sm3_mb_asimd_x4.S @@ -0,0 +1,576 @@ +/********************************************************************** + Copyright(c) 2020 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name() .req q\reg
+ v\name() .req v\reg
+ s\name() .req s\reg
+.endm
+
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ job3 .req x3
+ len .req x4
+
+ job0_data .req x5
+ job1_data .req x6
+ job2_data .req x7
+ job3_data .req x9
+
+ job0_digest .req x0
+ job1_digest .req x1
+ job2_digest .req x2
+ job3_digest .req x3
+ job0_tmp .req x10
+ job1_tmp .req x11
+ job2_tmp .req x12
+ job3_tmp .req x13
+ const_adr .req x14
+
+
+ declare_var_vector_reg msg0,0
+ declare_var_vector_reg msg1,1
+ declare_var_vector_reg msg2,2
+ declare_var_vector_reg msg3,3
+ declare_var_vector_reg msg4,4
+ declare_var_vector_reg msg5,5
+ declare_var_vector_reg msg6,6
+ declare_var_vector_reg msg7,7
+ declare_var_vector_reg msg8,8
+ declare_var_vector_reg msg9,9
+ declare_var_vector_reg msg10,10
+ declare_var_vector_reg msg11,11
+ declare_var_vector_reg msg12,12
+ declare_var_vector_reg msg13,13
+ declare_var_vector_reg msg14,14
+ declare_var_vector_reg msg15,15
+ declare_var_vector_reg msg16,16
+
+
+ declare_var_vector_reg dig_A,24
+ declare_var_vector_reg dig_B,25
+ declare_var_vector_reg dig_C,26
+ declare_var_vector_reg dig_D,27
+ declare_var_vector_reg dig_E,28
+ declare_var_vector_reg dig_F,29
+ declare_var_vector_reg dig_G,30
+ declare_var_vector_reg dig_H,31
+
+ declare_var_vector_reg TT1,17
+ declare_var_vector_reg TT2,18
+ declare_var_vector_reg SS1,19
+ declare_var_vector_reg SS2,20
+ declare_var_vector_reg tmp0,21
+ declare_var_vector_reg word_pair,23
+ declare_var_vector_reg Tj,22
+
+.macro rol32 target:req,reg:req,bit:req
+ ushr v\target().4s,v\reg().4s,32 - \bit
+ sli v\target().4s,v\reg().4s,\bit
+.endm
+
+// round 0-11
+.macro sm3_round_0 round:req,wp:req
+
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ sli vtmp0.4s,vdig_A.4s,12
+ rev32 vmsg\round().16b,vmsg\round().16b
+ rev32 vmsg\wp().16b,vmsg\wp().16b
+ add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,TT1,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round().16b,vmsg\wp().16b
+
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
+
+ add vSS1.4s,vSS1.4s,vmsg\round().4s
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + mov vTj.16b,vtmp0.16b + //D=C + mov vdig_D.16b,vdig_C.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli vdig_C.4s,vdig_B.4s,9 + //B=A + mov vdig_B.16b,vdig_A.16b + //A=TT1 + mov vdig_A.16b,vTT1.16b + // H=G + mov vdig_H.16b,vdig_G.16b + //G = ROTL32(F,19) + rol32 dig_G,dig_F,19 + //F = E + mov vdig_F.16b,vdig_E.16b + // E=Target, TT2=src, TT1,SS1,SS2 is free + // E = P0(TT2); + ushr vSS2.4s, vTT2.4s, 32 - 9 + ushr vSS1.4s, vTT2.4s, 32 - 17 + sli vSS2.4s, vTT2.4s, 9 + sli vSS1.4s, vTT2.4s, 17 + eor vdig_E.16b, vTT2.16b, vSS1.16b + eor vdig_E.16b, vdig_E.16b, vSS2.16b + +.endm + + +.macro sm3_round_4 round:req,wp:req + + ushr vtmp0.4s,vdig_A.4s,32 - 12 + add vSS1.4s,vdig_E.4s,vTj.4s + sli vtmp0.4s,vdig_A.4s,12 + rev32 vmsg\wp().16b,vmsg\wp().16b + add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done + rol32 SS1,TT1,7 + eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done + eor vword_pair.16b,vmsg\round().16b,vmsg\wp().16b + eor vTT1.16b,vdig_A.16b,vdig_B.16b + eor vTT2.16b,vdig_E.16b,vdig_F.16b + eor vTT1.16b,vTT1.16b,vdig_C.16b + eor vTT2.16b,vTT2.16b,vdig_G.16b + add vSS1.4s,vSS1.4s,vmsg\round().4s + add vSS2.4s,vSS2.4s,vword_pair.4s + add vTT1.4s,vTT1.4s,vdig_D.4s + add vTT2.4s,vTT2.4s,vdig_H.4s + ushr vtmp0.4s,vTj.4s,32-1 + add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done + sli vtmp0.4s,vTj.4s,1 + add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + mov vTj.16b,vtmp0.16b + //D=C + mov vdig_D.16b,vdig_C.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli vdig_C.4s,vdig_B.4s,9 + //B=A + mov vdig_B.16b,vdig_A.16b + //A=TT1 + mov vdig_A.16b,vTT1.16b + // H=G + mov vdig_H.16b,vdig_G.16b + //G = ROTL32(F,19) + rol32 dig_G,dig_F,19 + //F = E + mov vdig_F.16b,vdig_E.16b + // E=Target, TT2=src, TT1,SS1,SS2 is free + // E = P0(TT2); + ushr vSS2.4s, vTT2.4s, 32 - 9 + ushr vSS1.4s, vTT2.4s, 32 - 17 + sli vSS2.4s, vTT2.4s, 9 + sli vSS1.4s, vTT2.4s, 17 + eor vdig_E.16b, vTT2.16b, vSS1.16b + eor vdig_E.16b, vdig_E.16b, vSS2.16b + +.endm + +//round 12-15 +.macro sm3_round_12 round:req,plus_4:req,m0,m1,m2,m3,m4 + rol32 msg\plus_4,msg\m2,15 + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vmsg\m0().16b + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vmsg\m1().16b + rol32 tmp0,msg\plus_4,15 + rol32 word_pair,msg\plus_4,23 + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vtmp0.16b + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vword_pair.16b + rol32 tmp0,msg\m3,7 + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vmsg\m4().16b + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vtmp0.16b + ushr vtmp0.4s,vdig_A.4s,32 - 12 + sli vtmp0.4s,vdig_A.4s,12 + add vSS1.4s,vdig_E.4s,vTj.4s + add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done + rol32 SS1,SS2,7 + eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done + eor vword_pair.16b,vmsg\round().16b,vmsg\plus_4().16b + eor vTT1.16b,vdig_A.16b,vdig_B.16b + eor vTT1.16b,vTT1.16b,vdig_C.16b + eor vTT2.16b,vdig_E.16b,vdig_F.16b + eor vTT2.16b,vTT2.16b,vdig_G.16b + add vSS1.4s,vSS1.4s,vmsg\round().4s + add vSS2.4s,vSS2.4s,vword_pair.4s + add vTT1.4s,vTT1.4s,vdig_D.4s + add vTT2.4s,vTT2.4s,vdig_H.4s + ushr vtmp0.4s,vTj.4s,32-1 + add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done + sli vtmp0.4s,vTj.4s,1 + add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + mov vTj.16b,vtmp0.16b + //D=C + mov vdig_D.16b,vdig_C.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli vdig_C.4s,vdig_B.4s,9 + //B=A + mov vdig_B.16b,vdig_A.16b + //A=TT1 + mov vdig_A.16b,vTT1.16b + // H=G + mov vdig_H.16b,vdig_G.16b + //G = ROTL32(F,19) + rol32 dig_G,dig_F,19 + //F = E + mov vdig_F.16b,vdig_E.16b + // E=Target, TT2=src, TT1,SS1,SS2 is free + // E 
= P0(TT2); + ushr vSS2.4s, vTT2.4s, 32 - 9 + ushr vSS1.4s, vTT2.4s, 32 - 17 + sli vSS2.4s, vTT2.4s, 9 + sli vSS1.4s, vTT2.4s, 17 + eor vdig_E.16b, vTT2.16b, vSS1.16b + eor vdig_E.16b, vdig_E.16b, vSS2.16b +.endm + +// round 16-62 +.macro sm3_round_16 round:req,plus_4:req,m0,m1,m2,m3,m4 + rol32 msg\plus_4,msg\m2,15 + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vmsg\m0().16b + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vmsg\m1().16b + rol32 tmp0,msg\plus_4,15 + rol32 word_pair,msg\plus_4,23 + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vtmp0.16b + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vword_pair.16b + rol32 tmp0,msg\m3,7 + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vmsg\m4().16b + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vtmp0.16b + ushr vtmp0.4s,vdig_A.4s,32 - 12 + sli vtmp0.4s,vdig_A.4s,12 + add vSS1.4s,vdig_E.4s,vTj.4s + add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done + rol32 SS1,SS2,7 + eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done + eor vword_pair.16b,vmsg\round().16b,vmsg\plus_4().16b + mov vTT2.16b,vdig_E.16b + orr vTT1.16b,vdig_B.16b,vdig_C.16b + and vtmp0.16b,vdig_B.16b,vdig_C.16b + bsl vTT2.16b,vdig_F.16b,vdig_G.16b + and vTT1.16b,vTT1.16b,vdig_A.16b + add vSS1.4s,vSS1.4s,vmsg\round().4s + orr vTT1.16b,vTT1.16b,vtmp0.16b + add vSS2.4s,vSS2.4s,vword_pair.4s + add vTT1.4s,vTT1.4s,vdig_D.4s + add vTT2.4s,vTT2.4s,vdig_H.4s + ushr vtmp0.4s,vTj.4s,32-1 + add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done + sli vtmp0.4s,vTj.4s,1 + add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + mov vTj.16b,vtmp0.16b + //D=C + mov vdig_D.16b,vdig_C.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli vdig_C.4s,vdig_B.4s,9 + //B=A + mov vdig_B.16b,vdig_A.16b + //A=TT1 + mov vdig_A.16b,vTT1.16b + // H=G + mov vdig_H.16b,vdig_G.16b + //G = ROTL32(F,19) + rol32 dig_G,dig_F,19 + //F = E + mov vdig_F.16b,vdig_E.16b + // E=Target, TT2=src, TT1,SS1,SS2 is free + // E = P0(TT2); + ushr vSS2.4s, vTT2.4s, 32 - 9 + ushr vSS1.4s, vTT2.4s, 32 - 17 + sli vSS2.4s, vTT2.4s, 9 + sli vSS1.4s, vTT2.4s, 17 + eor vdig_E.16b, vTT2.16b, vSS1.16b + eor vdig_E.16b, vdig_E.16b, vSS2.16b +.endm + +//round 63 +.macro sm3_round_63 round:req,plus_4:req,m0,m1,m2,m3,m4 + rol32 msg\plus_4,msg\m2,15 + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vmsg\m0().16b + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vmsg\m1().16b + rol32 tmp0,msg\plus_4,15 + rol32 word_pair,msg\plus_4,23 + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vtmp0.16b + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vword_pair.16b + rol32 tmp0,msg\m3,7 + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vmsg\m4().16b + eor vmsg\plus_4().16b,vmsg\plus_4().16b,vtmp0.16b + ushr vtmp0.4s,vdig_A.4s,32 - 12 + sli vtmp0.4s,vdig_A.4s,12 + add vSS1.4s,vdig_E.4s,vTj.4s + add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done + rol32 SS1,SS2,7 + eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done + eor vword_pair.16b,vmsg\round().16b,vmsg\plus_4().16b + + ldp qmsg0,qmsg1,[sp,dig_off+ 0] + mov vTT2.16b,vdig_E.16b + ldp qmsg2,qmsg3,[sp,dig_off+ 32] + orr vTT1.16b,vdig_B.16b,vdig_C.16b + ldp qmsg4,qmsg5,[sp,dig_off+ 64] + and vtmp0.16b,vdig_B.16b,vdig_C.16b + bsl vTT2.16b,vdig_F.16b,vdig_G.16b + ldp qmsg6,qmsg7,[sp,dig_off+ 96] + and vTT1.16b,vTT1.16b,vdig_A.16b + add vSS1.4s,vSS1.4s,vmsg\round().4s + orr vTT1.16b,vTT1.16b,vtmp0.16b + add vSS2.4s,vSS2.4s,vword_pair.4s + add vTT1.4s,vTT1.4s,vdig_D.4s + add vTT2.4s,vTT2.4s,vdig_H.4s + add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done + add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done + //D=C + eor vdig_D.16b,vdig_C.16b,vmsg3.16b + //C = ROTL32(B, 9); + ushr vdig_C.4s,vdig_B.4s,32 - 9 + sli vdig_C.4s,vdig_B.4s,9 + eor 
vdig_C.16b,vdig_C.16b,vmsg2.16b
+ //B=A
+ eor vdig_B.16b,vdig_A.16b,vmsg1.16b
+ stp qdig_C,qdig_D,[sp,dig_off+ 32]
+ //A=TT1
+ eor vdig_A.16b,vTT1.16b,vmsg0.16b
+ // H=G
+ eor vdig_H.16b,vdig_G.16b,vmsg7.16b
+ stp qdig_A,qdig_B,[sp,dig_off+ 0]
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ eor vdig_G.16b,vdig_G.16b,vmsg6.16b
+ //F = E
+ eor vdig_F.16b,vdig_E.16b,vmsg5.16b
+ stp qdig_G,qdig_H,[sp,dig_off+ 96]
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+ eor vdig_E.16b, vdig_E.16b, vmsg4.16b
+ stp qdig_E,qdig_F,[sp,dig_off+ 64]
+.endm
+
+ .set dig_off , 80
+
+#define STACK_SIZE 224
+ .global sm3_mb_asimd_x4
+ .type sm3_mb_asimd_x4, %function
+sm3_mb_asimd_x4:
+ stp x29,x30, [sp,-STACK_SIZE]!
+ cmp len,0
+ //push d8~d15
+ ldr job0_data, [job0],64
+ stp d8,d9, [sp,16]
+ ldr job1_data, [job1],64
+ stp d10,d11,[sp,32]
+ ldr job2_data, [job2],64
+ stp d12,d13,[sp,48]
+ ldr job3_data, [job3],64
+ stp d14,d15,[sp,64]
+ ble .exit_func
+
+ mov job0_tmp,job0_digest
+ mov job1_tmp,job1_digest
+ mov job2_tmp,job2_digest
+ mov job3_tmp,job3_digest
+ //load digests
+ ld4 {vdig_A.s-vdig_D.s}[0],[job0_tmp],16
+ ld4 {vdig_A.s-vdig_D.s}[1],[job1_tmp],16
+ ld4 {vdig_A.s-vdig_D.s}[2],[job2_tmp],16
+ adrp const_adr, .consts
+ ld4 {vdig_A.s-vdig_D.s}[3],[job3_tmp],16
+ add const_adr, const_adr, #:lo12:.consts
+ ld4 {vdig_E.s-vdig_H.s}[0],[job0_tmp]
+ rev32 vdig_A.16b,vdig_A.16b
+ ld4 {vdig_E.s-vdig_H.s}[1],[job1_tmp]
+ rev32 vdig_B.16b,vdig_B.16b
+ ld4 {vdig_E.s-vdig_H.s}[2],[job2_tmp]
+ rev32 vdig_C.16b,vdig_C.16b
+ ld4 {vdig_E.s-vdig_H.s}[3],[job3_tmp]
+ rev32 vdig_D.16b,vdig_D.16b
+ stp qdig_A,qdig_B,[sp,dig_off+ 0]
+ rev32 vdig_E.16b,vdig_E.16b
+ rev32 vdig_F.16b,vdig_F.16b
+ stp qdig_C,qdig_D,[sp,dig_off+ 32]
+ rev32 vdig_G.16b,vdig_G.16b
+ rev32 vdig_H.16b,vdig_H.16b
+ stp qdig_E,qdig_F,[sp,dig_off+ 64]
+ stp qdig_G,qdig_H,[sp,dig_off+ 96]
+
+.start_loop:
+ ld4 {vmsg0.s-vmsg3.s}[0],[job0_data],16
+ ld4 {vmsg0.s-vmsg3.s}[1],[job1_data],16
+ ld4 {vmsg0.s-vmsg3.s}[2],[job2_data],16
+ ld4 {vmsg0.s-vmsg3.s}[3],[job3_data],16
+ ld4 {vmsg4.s-vmsg7.s}[0],[job0_data],16
+ ld4 {vmsg4.s-vmsg7.s}[1],[job1_data],16
+ ld4 {vmsg4.s-vmsg7.s}[2],[job2_data],16
+ ld4 {vmsg4.s-vmsg7.s}[3],[job3_data],16
+ ld4 {vmsg8.s-vmsg11.s}[0],[job0_data],16
+ ldr qTj,[const_adr]
+
+ sm3_round_0 0, 4
+
+ ld4 {vmsg8.s-vmsg11.s}[1],[job1_data],16
+ sm3_round_0 1, 5
+
+ ld4 {vmsg8.s-vmsg11.s}[2],[job2_data],16
+ sm3_round_0 2, 6
+ ld4 {vmsg8.s-vmsg11.s}[3],[job3_data],16
+ sm3_round_0 3, 7
+
+ ld4 {vmsg12.s-vmsg15.s}[0],[job0_data],16
+
+ sm3_round_4 4, 8
+ ld4 {vmsg12.s-vmsg15.s}[1],[job1_data],16
+ sm3_round_4 5, 9
+ ld4 {vmsg12.s-vmsg15.s}[2],[job2_data],16
+ sm3_round_4 6,10
+ ld4 {vmsg12.s-vmsg15.s}[3],[job3_data],16
+ sm3_round_4 7,11
+ sm3_round_4 8,12
+ sm3_round_4 9,13
+ sm3_round_4 10,14
+ sm3_round_4 11,15
+
+ sm3_round_12 12,16, 0, 7,13, 3,10 //12
+ sm3_round_12 13, 0, 1, 8,14, 4,11 //13
+ sm3_round_12 14, 1, 2, 9,15, 5,12 //14
+ sm3_round_12 15, 2, 3,10,16, 6,13 //15
+
+ ldr qTj,[const_adr,16]
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //16
+#if 0
+ stp sdig_A,sdig_B,[job0_digest]
+ stp sdig_C,sdig_D,[job0_digest,8]
+ stp sdig_E,sdig_F,[job0_digest,16]
+ stp sdig_G,sdig_H,[job0_digest,24]
+ b .exit_func
+#endif
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //17
+
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //18
+ sm3_round_16 2, 6, 7,14, 3,10, 0
//19 + sm3_round_16 3, 7, 8,15, 4,11, 1 //20 + sm3_round_16 4, 8, 9,16, 5,12, 2 //21 + sm3_round_16 5, 9,10, 0, 6,13, 3 //22 + sm3_round_16 6,10,11, 1, 7,14, 4 //23 + sm3_round_16 7,11,12, 2, 8,15, 5 //24 + sm3_round_16 8,12,13, 3, 9,16, 6 //25 + sm3_round_16 9,13,14, 4,10, 0, 7 //26 + sm3_round_16 10,14,15, 5,11, 1, 8 //27 + sm3_round_16 11,15,16, 6,12, 2, 9 //28 + sm3_round_16 12,16, 0, 7,13, 3,10 //29 + sm3_round_16 13, 0, 1, 8,14, 4,11 //30 + sm3_round_16 14, 1, 2, 9,15, 5,12 //31 + sm3_round_16 15, 2, 3,10,16, 6,13 //32 + sm3_round_16 16, 3, 4,11, 0, 7,14 //33 + sm3_round_16 0, 4, 5,12, 1, 8,15 //34 + sm3_round_16 1, 5, 6,13, 2, 9,16 //35 + sm3_round_16 2, 6, 7,14, 3,10, 0 //36 + sm3_round_16 3, 7, 8,15, 4,11, 1 //37 + sm3_round_16 4, 8, 9,16, 5,12, 2 //38 + sm3_round_16 5, 9,10, 0, 6,13, 3 //39 + sm3_round_16 6,10,11, 1, 7,14, 4 //40 + sm3_round_16 7,11,12, 2, 8,15, 5 //41 + sm3_round_16 8,12,13, 3, 9,16, 6 //42 + sm3_round_16 9,13,14, 4,10, 0, 7 //43 + sm3_round_16 10,14,15, 5,11, 1, 8 //44 + sm3_round_16 11,15,16, 6,12, 2, 9 //45 + sm3_round_16 12,16, 0, 7,13, 3,10 //46 + sm3_round_16 13, 0, 1, 8,14, 4,11 //47 + sm3_round_16 14, 1, 2, 9,15, 5,12 //48 + sm3_round_16 15, 2, 3,10,16, 6,13 //49 + sm3_round_16 16, 3, 4,11, 0, 7,14 //50 + sm3_round_16 0, 4, 5,12, 1, 8,15 //51 + sm3_round_16 1, 5, 6,13, 2, 9,16 //52 + sm3_round_16 2, 6, 7,14, 3,10, 0 //53 + sm3_round_16 3, 7, 8,15, 4,11, 1 //54 + sm3_round_16 4, 8, 9,16, 5,12, 2 //55 + sm3_round_16 5, 9,10, 0, 6,13, 3 //56 + sm3_round_16 6,10,11, 1, 7,14, 4 //57 + sm3_round_16 7,11,12, 2, 8,15, 5 //58 + sm3_round_16 8,12,13, 3, 9,16, 6 //59 + sm3_round_16 9,13,14, 4,10, 0, 7 //60 + sm3_round_16 10,14,15, 5,11, 1, 8 //61 + sm3_round_16 11,15,16, 6,12, 2, 9 //62 + sm3_round_63 12,16, 0, 7,13, 3,10 //63 + + subs len,len,1 + bne .start_loop + + //save digests with big endian + rev32 vdig_A.16b,vdig_A.16b + rev32 vdig_B.16b,vdig_B.16b + rev32 vdig_C.16b,vdig_C.16b + rev32 vdig_D.16b,vdig_D.16b + st4 {vdig_A.s-vdig_D.s}[0],[job0_digest],16 + rev32 vdig_E.16b,vdig_E.16b + rev32 vdig_F.16b,vdig_F.16b + st4 {vdig_A.s-vdig_D.s}[1],[job1_digest],16 + rev32 vdig_G.16b,vdig_G.16b + rev32 vdig_H.16b,vdig_H.16b + st4 {vdig_A.s-vdig_D.s}[2],[job2_digest],16 + st4 {vdig_A.s-vdig_D.s}[3],[job3_digest],16 + st4 {vdig_E.s-vdig_H.s}[0],[job0_digest] + st4 {vdig_E.s-vdig_H.s}[1],[job1_digest] + st4 {vdig_E.s-vdig_H.s}[2],[job2_digest] + st4 {vdig_E.s-vdig_H.s}[3],[job3_digest] + +.exit_func: + ldp d8, d9, [sp,16] + ldp d10,d11,[sp,32] + ldp d12,d13,[sp,48] + ldp d14,d15,[sp,64] + ldp x29, x30, [sp], STACK_SIZE + ret +.consts: + .word 0x79cc4519 + .word 0x79cc4519 + .word 0x79cc4519 + .word 0x79cc4519 + .word 0x9d8a7a87 + .word 0x9d8a7a87 + .word 0x9d8a7a87 + .word 0x9d8a7a87 + .size sm3_mb_asimd_x4, .-sm3_mb_asimd_x4 + diff --git a/drv/hash_mb/sm3_mb_sve.S b/drv/hash_mb/sm3_mb_sve.S new file mode 100644 index 0000000..7dd2428 --- /dev/null +++ b/drv/hash_mb/sm3_mb_sve.S @@ -0,0 +1,161 @@ +/********************************************************************** + Copyright(c) 2022 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + .arch armv8.2-a+sve + +.macro copy_mb_16words vecs:req,dest:req + mov src,\vecs + mov dst,\dest + mov ctr,lanes +1: + ldr tmp,[src],8 + ldr tmp,[tmp] + add tmp,tmp,block_ctr,lsl 6 + ld1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [tmp] + st1 {TMPV0.4s,TMPV1.4s,TMPV2.4s,TMPV3.4s}, [dst],64 + subs ctr,ctr,1 + b.ne 1b +.endm + +.macro load_words windex:req + .if \windex == 0 + mov tmpw,16 + index VOFFS.s,0,tmpw + copy_mb_16words job_vec,databuf + mov dataptr,databuf + .endif + ld1w { WORD\windex().s}, p0/z, [dataptr, VOFFS.s, UXTW 2] + add dataptr,dataptr,4 +.endm + +#include "sm3_sve_common.S" + +/* int sm3_mb_sve_max_lanes() + * return : max lanes of SVE vector + */ + .global sm3_mb_sve_max_lanes + .type sm3_mb_sve_max_lanes, %function +sm3_mb_sve_max_lanes: + cntw x0 + ret + .size sm3_mb_sve_max_lanes, .-sm3_mb_sve_max_lanes +/* + * void sm3_mb_sve(int blocks, int total_lanes, SM3_JOB **job_vec) + */ + num_blocks .req w0 + total_lanes .req w1 + job_vec .req x2 + lanes .req x4 + src .req x5 + dst .req x6 + lane_offset .req w7 + lane_offset_x .req x7 + tmp .req x8 + tmpw .req w8 + block_ctr .req x9 + block_ctr_w .req w9 + savedsp .req x10 + databuf .req x11 + dataptr .req x12 + efgh_buf .req x12 + ctr .req x13 + abcd_buf .req x14 + sm3const_adr .req x15 + + .global sm3_mb_sve + .type sm3_mb_sve, %function +sm3_mb_sve: + cbz num_blocks,.return + sm3_sve_save_stack + mov savedsp,sp + mov lane_offset, #0 + whilelo p0.s, wzr, total_lanes + // reserve (32 * max lanes) for abcdefgh buf + cntw tmp + lsl tmp, tmp, 5 + sub abcd_buf,sp,tmp + mov tmp,63 + bic abcd_buf,abcd_buf,tmp + // reserve (64 * lanes) for data buf + cntp lanes,p0,p0.s + lsl tmp,lanes,6 + sub databuf,abcd_buf,tmp + mov sp,databuf + adr sm3const_adr,SM3_CONSTS +.seg_loops: + mov src,job_vec + mov dst,abcd_buf + cntp lanes,p0,p0.s + add efgh_buf,abcd_buf,lanes,lsl 4 + mov ctr,lanes +.ldr_hash: + ldr tmp,[src],8 + add tmp,tmp,64 + ld1 {v0.16b, v1.16b},[tmp] + rev32 v0.16b,v0.16b + rev32 v1.16b,v1.16b + st1 {v0.16b},[dst],16 + st1 {v1.16b},[efgh_buf],16 + subs ctr,ctr,1 + bne .ldr_hash + ld4w {VA.s,VB.s,VC.s,VD.s},p0/z,[abcd_buf] + add tmp,abcd_buf,lanes,lsl 4 + ld4w {VE.s,VF.s,VG.s,VH.s},p0/z,[tmp] + mov block_ctr,0 + // always unpredicated SVE mode in current settings + pred_mode=0 +.block_loop: + 
sm3_single + add block_ctr, block_ctr, 1 + cmp block_ctr_w,num_blocks + bne .block_loop + st4w {VA.s,VB.s,VC.s,VD.s},p0,[abcd_buf] + add efgh_buf,abcd_buf,lanes,lsl 4 + st4w {VE.s,VF.s,VG.s,VH.s},p0,[efgh_buf] + mov dst,job_vec + mov src,abcd_buf + add job_vec,job_vec,lanes,lsl 3 + mov ctr,lanes +.str_hash: + ld1 {v0.16b},[src],16 + ld1 {v1.16b},[efgh_buf],16 + rev32 v0.16b,v0.16b + rev32 v1.16b,v1.16b + ldr tmp,[dst],8 + add tmp,tmp,64 + st1 {v0.16b,v1.16b},[tmp] + subs ctr,ctr,1 + bne .str_hash + incw lane_offset_x + whilelo p0.s, lane_offset, total_lanes + b.mi .seg_loops + mov sp,savedsp + sm3_sve_restore_stack +.return: + ret + .size sm3_mb_sve, .-sm3_mb_sve diff --git a/drv/hash_mb/sm3_sve_common.S b/drv/hash_mb/sm3_sve_common.S new file mode 100644 index 0000000..3d54952 --- /dev/null +++ b/drv/hash_mb/sm3_sve_common.S @@ -0,0 +1,505 @@ +/********************************************************************** + Copyright(c) 2022 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
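The macros below implement the standard SM3 compression step, and their comments cite the same formulas. A scalar C reference to read the vector code against (the j < 16 form, where FF and GG are plain XOR; rounds 16-63 swap in FF = majority and GG = bit-select, exactly as the FUNC_FF/FUNC_BSL macros below do):

#include <stdint.h>

static uint32_t rol32(uint32_t v, int n)
{
	return n ? (v << n) | (v >> (32 - n)) : v;
}

/* P0(x) = x ^ rol32(x, 9) ^ rol32(x, 17) */
static uint32_t p0(uint32_t x)
{
	return x ^ rol32(x, 9) ^ rol32(x, 17);
}

/* one SM3 round for j < 16: s[] holds a..h, wj/wj4 are W[j] and
 * W[j + 4], tj_rot is the precomputed rol32(Tj, j % 32) constant */
static void sm3_round(uint32_t s[8], uint32_t wj, uint32_t wj4,
		      uint32_t tj_rot)
{
	uint32_t ss1 = rol32(rol32(s[0], 12) + s[4] + tj_rot, 7);
	uint32_t ss2 = ss1 ^ rol32(s[0], 12);
	uint32_t tt1 = (s[0] ^ s[1] ^ s[2]) + s[3] + ss2 + (wj ^ wj4);
	uint32_t tt2 = (s[4] ^ s[5] ^ s[6]) + s[7] + ss1 + wj;

	s[3] = s[2]; s[2] = rol32(s[1], 9); s[1] = s[0]; s[0] = tt1;
	s[7] = s[6]; s[6] = rol32(s[5], 19); s[5] = s[4]; s[4] = p0(tt2);
}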
+**********************************************************************/ + VA .req z0 + VB .req z1 + VC .req z2 + VD .req z3 + VE .req z4 + VF .req z5 + VG .req z6 + VH .req z7 + TMPV0 .req v8 + TMPV1 .req v9 + TMPV2 .req v10 + TMPV3 .req v11 + WORD0 .req z8 + WORD1 .req z9 + WORD2 .req z10 + WORD3 .req z11 + WORD4 .req z12 + WORD5 .req z13 + WORD6 .req z14 + WORD7 .req z15 + WORD8 .req z16 + WORD9 .req z17 + WORD10 .req z18 + WORD11 .req z19 + WORD12 .req z20 + WORD13 .req z21 + WORD14 .req z22 + WORD15 .req z23 + WORD16 .req z24 + VOFFS .req z24 // reuse WORD16 + SS1 .req z25 + SS2 .req z26 + VT .req z26 // reuse SS2 + TT2 .req z27 + VT1 .req z28 + VT2 .req z29 + VT3 .req z30 + VT4 .req z31 + VZERO .req z31 + TT .req z0 + +.macro sve_op inst:req,regd,args:vararg + .if pred_mode == 1 + \inst \regd,p0/m,\args + .else + \inst \regd,\args + .endif +.endm + +.macro sve_bitop inst:req,regd:req,regm:req + .if pred_mode == 1 + \inst \regd().s,p0/m,\regd().s,\regm().s + .else + \inst \regd().d,\regd().d,\regm().d + .endif +.endm + +.macro rotate_left0 out:req,in:req,tmp:req,bits:req,args:vararg + .if have_sve2 == 0 + lsl \tmp().s,\in().s,\bits + .else + movprfx \out().d,\in().d + xar \out().s,\out().s,VZERO.s,32-\bits + .endif + + .ifnb \args + rotate_left0 \args + .endif +.endm + +.macro rotate_left1 out:req,in:req,tmp:req,bits:req,args:vararg + .if have_sve2 == 0 + lsr \out().s,\in().s,32-\bits + .endif + + .ifnb \args + rotate_left1 \args + .endif +.endm + +.macro rotate_left2 out:req,in:req,tmp:req,bits:req,args:vararg + .if have_sve2 == 0 + orr \out().d,\out().d,\tmp().d + .endif + + .ifnb \args + rotate_left2 \args + .endif +.endm + +.macro rotate_left args:vararg + rotate_left0 \args + rotate_left1 \args + rotate_left2 \args +.endm + +.macro SVE_EOR3 rd:req,r1:req,r2:req + .if have_sve2 == 0 + sve_bitop eor,\rd,\r1 + sve_bitop eor,\rd,\r2 + .else + eor3 \rd().d,\rd().d,\r1().d,\r2().d + .endif +.endm + +.macro FUNC_EOR3 ret:req,x:req,y:req,z:req + .if have_sve2 == 0 + eor \ret().d,\x().d,\y().d + sve_bitop eor,\ret,\z + .else + movprfx \ret().d,\x().d + eor3 \ret().d,\ret().d,\y().d,\z().d + .endif +.endm + +.macro FUNC_FF windex:req,ret:req,x:req,y:req,z:req,tmp1:req,tmp2:req + and \ret().d,\x().d,\y().d + and \tmp1().d,\x().d,\z().d + and \tmp2().d,\y().d,\z().d + sve_bitop orr,\ret,\tmp1 + sve_bitop orr,\ret,\tmp2 +.endm + +.macro FUNC_BSL ret:req,x:req,y:req,z:req,tmp:req + .if have_sve2 == 0 + bic \ret().d,\z().d,\x().d + and \tmp().d,\x().d,\y().d + sve_bitop orr,\ret,\tmp + .else + movprfx \ret().d,\x().d + bsl \ret().d,\ret().d,\y().d,\z().d + .endif +.endm + +.altmacro +.macro load_next_words windex + .if \windex < 16 + load_words \windex + .endif +.endm + +.macro SM3_STEP_00_11 windex:req,w:req,w4:req + // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7) + ld1rw {VT2.s},p0/z,[sm3const_adr,\windex * 4] + rotate_left SS1,VA,VT1,12 + mov SS2.s,p0/m,SS1.s + sve_op add,SS1.s,SS1.s,VE.s + sve_op add,SS1.s,SS1.s,VT2.s + rotate_left SS1,SS1,VT2,7 + // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index] + add VT2.s,\w().s,VH.s + FUNC_EOR3 TT2,VE,VF,VG + // SS2 = SS1 ^ rol32(a, 12) + sve_bitop eor,SS2,SS1 + sve_op add,TT2.s,TT2.s,VT2.s + // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index] + FUNC_EOR3 VH,VA,VB,VC + eor VT1.d,\w().d,\w4().d + sve_op add,VH.s,VH.s,VD.s + sve_op add,VH.s,VH.s,VT1.s + add VD.s,TT2.s,SS1.s + sve_op add,VH.s,VH.s,SS2.s + // d = P0(TT2) + rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17 + SVE_EOR3 VD,VT1,VT3 + // b = rol32(b, 9) + // f = rol32(f, 19) + rotate_left 
VB,VB,VT3,9,VF,VF,VT4,19 +.endm + +.macro SM3_STEP_12_15 windex:req,w:req,w4:req,w16:req,w13:req,w9:req,w6:req,w3:req + // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7) + rotate_left VT,\w3,VT1,15,\w4,\w13,VT2,7,SS1,VA,VT3,12 + ld1rw {VT1.s},p0/z,[sm3const_adr,\windex * 4] + mov TT2.s,p0/m,SS1.s + sve_bitop eor,VT,\w16 + sve_op add,SS1.s,SS1.s,VE.s + sve_bitop eor,VT,\w9 + sve_op add,SS1.s,SS1.s,VT1.s + rotate_left VT1,VT,VT2,15,VT3,VT,VT4,23 + SVE_EOR3 VT,VT1,VT3 + rotate_left SS1,SS1,VT2,7 + sve_bitop eor,\w4,VT + // SS2 = SS1 ^ rol32(a, 12) + eor SS2.d,TT2.d,SS1.d + sve_bitop eor,\w4,\w6 + // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index] + FUNC_EOR3 TT2,VE,VF,VG + add VT1.s,\w().s,VH.s + sve_op add,TT2.s,TT2.s,VT1.s + // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index] + FUNC_EOR3 VH,VA,VB,VC + eor VT1.d,\w().d,\w4().d + sve_op add,VH.s,VH.s,VD.s + // b = rol32(b, 9) + // f = rol32(f, 19) + rotate_left VB,VB,VT3,9 + sve_op add,VH.s,VH.s,VT1.s + add VD.s,TT2.s,SS1.s + sve_op add,VH.s,VH.s,SS2.s + // d = P0(TT2) + rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17,VF,VF,TT2,19 + SVE_EOR3 VD,VT1,VT3 +.endm + +.macro SM3_STEP_16_62 windex:req,w:req,w4:req,w16:req,w13:req,w9:req,w6:req,w3:req + // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7) + rotate_left VT,\w3,VT1,15,\w4,\w13,VT2,7,SS1,VA,VT3,12 + ld1rw {VT1.s},p0/z,[sm3const_adr,\windex * 4] + mov TT2.s,p0/m,SS1.s + sve_bitop eor,VT,\w16 + sve_op add,SS1.s,SS1.s,VE.s + sve_bitop eor,VT,\w9 + sve_op add,SS1.s,SS1.s,VT1.s + rotate_left VT1,VT,VT2,15,VT3,VT,VT4,23 + SVE_EOR3 \w4,VT,VT1 + rotate_left SS1,SS1,VT2,7 + sve_bitop eor,\w4,VT3 + // SS2 = SS1 ^ rol32(a, 12) + eor SS2.d,TT2.d,SS1.d + sve_bitop eor,\w4,\w6 + // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index] + sve_op add,SS1.s,SS1.s,\w().s + FUNC_BSL TT2,VE,VF,VG,VT1 + sve_op add,SS1.s,SS1.s,VH.s + // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index] + FUNC_FF \windex,VH,VA,VB,VC,VT1,VT2 + eor VT1.d,\w().d,\w4().d + sve_op add,VH.s,VH.s,VD.s + // b = rol32(b, 9) + // f = rol32(f, 19) + rotate_left VB,VB,VT2,9,VF,VF,VT4,19 + sve_op add,VH.s,VH.s,VT1.s + add VD.s,TT2.s,SS1.s + sve_op add,VH.s,VH.s,SS2.s + // d = P0(TT2) + rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17 + SVE_EOR3 VD,VT1,VT3 +.endm + +.macro SM3_STEP_63 windex:req,w:req,w4:req,w16:req,w13:req,w9:req,w6:req,w3:req + // SS1 = rol32(rol32(a, 12) + e + rol32(T, (j % 32)), 7) + rotate_left VT,\w3,VT1,15,\w4,\w13,VT2,7,SS1,VA,VT3,12 + ld1rw {VT1.s},p0/z,[sm3const_adr,\windex * 4] + mov TT2.s,p0/m,SS1.s + sve_bitop eor,VT,\w16 + sve_op add,SS1.s,SS1.s,VE.s + sve_bitop eor,VT,\w9 + sve_op add,SS1.s,SS1.s,VT1.s + rotate_left VT1,VT,VT2,15,VT3,VT,VT4,23 + SVE_EOR3 VT,VT1,VT3 + rotate_left SS1,SS1,VT2,7 + sve_bitop eor,\w4,VT + // SS2 = SS1 ^ rol32(a, 12) + eor SS2.d,TT2.d,SS1.d + sve_bitop eor,\w4,\w6 + // d <- TT2 = GG(index, e, f, g) + h + SS1 + W[index] + FUNC_BSL TT2,VE,VF,VG,VT1 + add VT1.s,\w().s,VH.s + .if \windex == 63 + ld1w {WORD0.s},p0/z,[abcd_buf, 0, MUL VL] + ld1w {WORD1.s},p0/z,[abcd_buf, 1, MUL VL] + ld1w {WORD2.s},p0/z,[abcd_buf, 2, MUL VL] + ld1w {WORD3.s},p0/z,[abcd_buf, 3, MUL VL] + ld1w {WORD4.s},p0/z,[abcd_buf, 4, MUL VL] + ld1w {WORD5.s},p0/z,[abcd_buf, 5, MUL VL] + ld1w {WORD6.s},p0/z,[abcd_buf, 6, MUL VL] + ld1w {WORD7.s},p0/z,[abcd_buf, 7, MUL VL] + .endif + sve_op add,TT2.s,TT2.s,VT1.s + // h <- TT1 = FF(index, a, b, c) + d + SS2 + WB[index] + FUNC_FF \windex,VH,VA,VB,VC,VT1,VT2 + eor VT1.d,\w().d,\w4().d + sve_op add,VH.s,VH.s,VD.s + // b = rol32(b, 9) + // f = rol32(f, 19) + rotate_left 
VB,VB,VT2,9,VF,VF,VT4,19 + sve_op add,VH.s,VH.s,VT1.s + add VD.s,TT2.s,SS1.s + sve_bitop eor,VA,WORD1 + sve_bitop eor,VB,WORD2 + sve_bitop eor,VC,WORD3 + // d = P0(TT2) + rotate_left VT1,VD,VT2,9,VT3,VD,VT4,17 + sve_bitop eor,VF,WORD6 + SVE_EOR3 VD,VT1,VT3 + sve_bitop eor,VG,WORD7 + sve_bitop eor,VD,WORD4 + sve_op add,VH.s,VH.s,SS2.s + sve_bitop eor,VE,WORD5 + sve_bitop eor,VH,WORD0 +.endm + +.macro SWAP_STATES + .unreq TT + TT .req VH + .unreq VH + VH .req VG + .unreq VG + VG .req VF + .unreq VF + VF .req VE + .unreq VE + VE .req VD + .unreq VD + VD .req VC + .unreq VC + VC .req VB + .unreq VB + VB .req VA + .unreq VA + VA .req TT +.endm + +.altmacro +.macro SM3_STEP_WRAPPER windex:req,idx:req,idx4:req,idx16,idx13,idx9,idx6,idx3 + .if \windex <= 11 + revb WORD\idx4().s, p0/m, WORD\idx4().s + next=\idx4+1 + load_next_words %next + SM3_STEP_00_11 \windex,WORD\idx(),WORD\idx4() + .else + .if \windex < 16 + SM3_STEP_12_15 \windex,WORD\idx(),\ + WORD\idx4(),WORD\idx16(),WORD\idx13(),\ + WORD\idx9(),WORD\idx6(),WORD\idx3() + .else + .if \windex == 63 + SM3_STEP_63 \windex,WORD\idx(),WORD\idx4(),\ + WORD\idx16(),WORD\idx13(),WORD\idx9(),\ + WORD\idx6(),WORD\idx3() + .else + SM3_STEP_16_62 \windex,WORD\idx(),WORD\idx4(),\ + WORD\idx16(),WORD\idx13(),WORD\idx9(),\ + WORD\idx6(),WORD\idx3() + .endif + .endif + .endif +.endm + +.macro exec_step windex:req + .if \windex <= 11 + idx4=\windex+4 + SM3_STEP_WRAPPER \windex,\windex,%idx4 + .else + idxp4=\windex + 4 + idx4=idxp4 % 17 + idx16=(idxp4 - 16) % 17 + idx13=(idxp4 - 13) % 17 + idx9=(idxp4 - 9) % 17 + idx6=(idxp4 - 6) % 17 + idx3=(idxp4 - 3) % 17 + idx=\windex % 17 + SM3_STEP_WRAPPER \windex,%idx,%idx4,%idx16,%idx13,%idx9,%idx6,%idx3 + .endif + SWAP_STATES +.endm + +.macro sm3_exec + current_step=0 + .rept 64 + exec_step %current_step + current_step=current_step+1 + .endr +.endm + +.macro sm3_single sve2:vararg + .ifnb \sve2 + have_sve2 = 1 + .else + have_sve2=0 + .endif + st1w {VA.s},p0,[abcd_buf, 0, MUL VL] + st1w {VB.s},p0,[abcd_buf, 1, MUL VL] + st1w {VC.s},p0,[abcd_buf, 2, MUL VL] + st1w {VD.s},p0,[abcd_buf, 3, MUL VL] + st1w {VE.s},p0,[abcd_buf, 4, MUL VL] + st1w {VF.s},p0,[abcd_buf, 5, MUL VL] + st1w {VG.s},p0,[abcd_buf, 6, MUL VL] + st1w {VH.s},p0,[abcd_buf, 7, MUL VL] + load_words 0 + load_words 1 + load_words 2 + load_words 3 + load_words 4 + revb WORD0.s, p0/m, WORD0.s + revb WORD1.s, p0/m, WORD1.s + revb WORD2.s, p0/m, WORD2.s + revb WORD3.s, p0/m, WORD3.s + .if have_sve2 == 1 + mov VZERO.s,p0/m,#0 + .endif + sm3_exec +.endm + +.macro sm3_sve_save_stack + stp d8,d9,[sp, -64]! 
+ stp d10,d11,[sp, 16] + stp d12,d13,[sp, 32] + stp d14,d15,[sp, 48] +.endm + +.macro sm3_sve_restore_stack + ldp d10,d11,[sp, 16] + ldp d12,d13,[sp, 32] + ldp d14,d15,[sp, 48] + ldp d8,d9,[sp],64 +.endm + + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +SM3_CONSTS: + .word 0x79CC4519 + .word 0xF3988A32 + .word 0xE7311465 + .word 0xCE6228CB + .word 0x9CC45197 + .word 0x3988A32F + .word 0x7311465E + .word 0xE6228CBC + .word 0xCC451979 + .word 0x988A32F3 + .word 0x311465E7 + .word 0x6228CBCE + .word 0xC451979C + .word 0x88A32F39 + .word 0x11465E73 + .word 0x228CBCE6 + .word 0x9D8A7A87 + .word 0x3B14F50F + .word 0x7629EA1E + .word 0xEC53D43C + .word 0xD8A7A879 + .word 0xB14F50F3 + .word 0x629EA1E7 + .word 0xC53D43CE + .word 0x8A7A879D + .word 0x14F50F3B + .word 0x29EA1E76 + .word 0x53D43CEC + .word 0xA7A879D8 + .word 0x4F50F3B1 + .word 0x9EA1E762 + .word 0x3D43CEC5 + .word 0x7A879D8A + .word 0xF50F3B14 + .word 0xEA1E7629 + .word 0xD43CEC53 + .word 0xA879D8A7 + .word 0x50F3B14F + .word 0xA1E7629E + .word 0x43CEC53D + .word 0x879D8A7A + .word 0x0F3B14F5 + .word 0x1E7629EA + .word 0x3CEC53D4 + .word 0x79D8A7A8 + .word 0xF3B14F50 + .word 0xE7629EA1 + .word 0xCEC53D43 + .word 0x9D8A7A87 + .word 0x3B14F50F + .word 0x7629EA1E + .word 0xEC53D43C + .word 0xD8A7A879 + .word 0xB14F50F3 + .word 0x629EA1E7 + .word 0xC53D43CE + .word 0x8A7A879D + .word 0x14F50F3B + .word 0x29EA1E76 + .word 0x53D43CEC + .word 0xA7A879D8 + .word 0x4F50F3B1 + .word 0x9EA1E762 + .word 0x3D43CEC5 +
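SM3_CONSTS above is the per-round constant fully pre-rotated, rol32(Tj, j % 32) for j = 0..63 with Tj = 0x79CC4519 for j < 16 and 0x7A879D8A otherwise, so ld1rw can fetch each round's value directly instead of rotating Tj by one every round the way the ASIMD kernels do. A throwaway generator to cross-check the table:

#include <stdint.h>
#include <stdio.h>

static uint32_t rol32(uint32_t v, int n)
{
	return n ? (v << n) | (v >> (32 - n)) : v;
}

int main(void)
{
	int j;

	/* entry 0 prints as 0x79CC4519, entry 16 as 0x9D8A7A87 */
	for (j = 0; j < 64; j++) {
		uint32_t tj = j < 16 ? 0x79CC4519 : 0x7A879D8A;

		printf("\t.word 0x%08X\n", rol32(tj, j % 32));
	}
	return 0;
}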