Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions dev/aarch64_clean/src/pointwise_montgomery.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/* Copyright (c) The mldsa-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/

#include "../../../common.h"
#if defined(MLD_ARITH_BACKEND_AARCH64)

/*
 * montgomery_reduce_long res, inl, inh
 *
 * Reduces four 64-bit values to four 32-bit values.
 *
 *   inl  (in)  : vector holding 64-bit values 0 and 1 (used as .2d)
 *   inh  (in)  : vector holding 64-bit values 2 and 3 (used as .2d)
 *   res  (out) : vector receiving the four 32-bit results (written as .4s)
 *
 * Relies on the file-level aliases modulus, modulus_twisted and t0.
 * Clobbers: t0, inl, inh.
 *
 * The function in this file loads modulus with q and modulus_twisted with
 * -q^-1 (mod 2^32). With those values each result lane is
 * (in + t * q) >> 32, where t = (low 32 bits of in) * (-q^-1) mod 2^32.
 */
.macro montgomery_reduce_long res, inl, inh
// t0 = even-indexed 32-bit lanes of inl:inh, i.e. the low half of each
// 64-bit input (assuming the usual little-endian lane layout)
uzp1 t0.4s, \inl\().4s, \inh\().4s
// t0 = t0 * modulus_twisted, keeping the low 32 bits of each product
mul t0.4s, t0.4s, modulus_twisted.4s
// inl/inh += t0 * modulus (signed widening multiply-accumulate);
// smlal handles lanes 0-1, smlal2 handles lanes 2-3
smlal \inl\().2d, t0.2s, modulus.2s
smlal2 \inh\().2d, t0.4s, modulus.4s
// res = odd-indexed 32-bit lanes of inl:inh, i.e. the high half of each
// 64-bit sum
uzp2 \res\().4s, \inl\().4s, \inh\().4s
.endm


/*
 * pmull dl, dh, a, b
 *
 * Signed widening multiply of two vectors of four 32-bit lanes.
 *
 *   a, b (in)  : multiplicands (read as .4s)
 *   dl   (out) : 64-bit products of lanes 0 and 1 (written as .2d)
 *   dh   (out) : 64-bit products of lanes 2 and 3 (written as .2d)
 *
 * NOTE(review): the macro name coincides with the AArch64 PMULL
 * (polynomial multiply long) instruction mnemonic. Presumably the
 * assembler gives the macro precedence inside this file; confirm before
 * ever using the real instruction here.
 */
.macro pmull dl, dh, a, b
smull \dl\().2d, \a\().2s, \b\().2s
smull2 \dh\().2d, \a\().4s, \b\().4s
.endm

/*
 * pmlal dl, dh, a, b
 *
 * Signed widening multiply-accumulate of two vectors of four 32-bit lanes.
 *
 *   a, b (in)     : multiplicands (read as .4s)
 *   dl   (in/out) : accumulates the 64-bit products of lanes 0 and 1 (.2d)
 *   dh   (in/out) : accumulates the 64-bit products of lanes 2 and 3 (.2d)
 *
 * Not invoked in the visible part of this file.
 */
.macro pmlal dl, dh, a, b
smlal \dl\().2d, \a\().2s, \b\().2s
smlal2 \dh\().2d, \a\().4s, \b\().4s
.endm

/*
 * save_vregs
 *
 * Reserves 4 * 16 = 64 bytes of stack and stores d8-d15 (the low 64 bits
 * of v8-v15) into that area. Slot layout, relative to the new sp:
 *   +0  : d8,  d9
 *   +16 : d10, d11
 *   +32 : d12, d13
 *   +48 : d14, d15
 * Must be paired with restore_vregs.
 */
.macro save_vregs
    // The pre-indexed store allocates the whole frame and fills slot 0.
    stp d8,  d9,  [sp, #-(16*4)]!
    stp d10, d11, [sp, #16*1]
    stp d12, d13, [sp, #16*2]
    stp d14, d15, [sp, #16*3]
.endm

/*
 * restore_vregs
 *
 * Reloads d8-d15 from the 64-byte area at sp written by save_vregs and
 * releases that area. The loads are independent of each other, so they
 * are issued top slot first, and the last one pops the frame.
 */
.macro restore_vregs
    ldp d14, d15, [sp, #16*3]
    ldp d12, d13, [sp, #16*2]
    ldp d10, d11, [sp, #16*1]
    // The post-indexed load reads slot 0 and then frees the whole frame.
    ldp d8,  d9,  [sp], #(16*4)
.endm

/*
 * push_stack
 *
 * Function prologue: currently only spills d8-d15 via save_vregs
 * (64 bytes of stack). No general-purpose registers are saved.
 */
.macro push_stack
save_vregs
.endm

/*
 * pop_stack
 *
 * Function epilogue: undoes push_stack by reloading d8-d15 and
 * releasing the stack area via restore_vregs.
 */
.macro pop_stack
restore_vregs
.endm

/* Register aliases used by the macros above and the function below. */

// Pointer arguments, in the order of the C prototype
// void mld_pointwise_montgomery_asm(int32_t *, const int32_t *, const int32_t *)
out_ptr .req x0
a0_ptr .req x1
b0_ptr .req x2
// count (64-bit view) and wtmp (32-bit view) name the SAME register x3/w3.
// wtmp is only used to build constants before count is initialised.
count .req x3
wtmp .req w3

// Constants broadcast to all four 32-bit lanes
modulus .req v0
modulus_twisted .req v1

// Per-vector working registers
// aa, bb     : the two input vectors
// res        : reduced 32-bit results
// resl, resh : 64-bit products for lanes 0-1 and lanes 2-3
// t0         : scratch for montgomery_reduce_long
aa .req v2
bb .req v3
res .req v4
resl .req v5
resh .req v6
t0 .req v7

// 128-bit (q) views of aa, bb and res, used by ldr/str
q_aa .req q2
q_bb .req q3
q_res .req q4

/*
 * mld_pointwise_montgomery_asm
 *
 * C view (parameter names are descriptive only):
 *   void mld_pointwise_montgomery_asm(int32_t *out,
 *                                     const int32_t *a,
 *                                     const int32_t *b);
 *
 * For each of MLDSA_N 32-bit coefficients, computes the Montgomery-reduced
 * product of a[i] and b[i] and stores it to out[i].
 *
 * In:     x0 = out, x1 = a, x2 = b
 * Out:    MLDSA_N int32 values written at out
 * Clobb:  x0-x3, v0-v7, flags
 * Stack:  64 bytes (push_stack / pop_stack save and restore d8-d15)
 *
 * The code visible here only touches v0-v7; d8-d15 are saved regardless,
 * presumably so that a SLOTHY-optimized variant of the loop may use them.
 *
 * Loop structure: count starts at MLDSA_N / 4 (number of 4-lane vectors)
 * and drops by 4 per iteration; each iteration handles four vectors, i.e.
 * 16 coefficients. Termination relies on MLDSA_N / 4 being a multiple of 4.
 */
.text
.global MLD_ASM_NAMESPACE(mld_pointwise_montgomery_asm)
.balign 4
MLD_ASM_FN_SYMBOL(mld_pointwise_montgomery_asm)
push_stack

// modulus = q = 8380417 = 0x007FE001, broadcast to all lanes
movz wtmp, #57345
movk wtmp, #127, lsl #16
dup modulus.4s, wtmp

// modulus_twisted = -q^-1 mod 2^32 = 4236238847 = 0xFC7FDFFF,
// broadcast to all lanes
movz wtmp, #57343
movk wtmp, #64639, lsl #16
dup modulus_twisted.4s, wtmp
// count shares x3 with wtmp, so it is set only after the constants are built
mov count, #(MLDSA_N / 4)
loop_start:


// Vector 0 of this iteration. The post-indexed accesses advance all three
// pointers by 64 bytes (one full iteration); the remaining three vectors
// are then addressed with negative offsets from the advanced pointers.
ldr q_aa, [a0_ptr], #64
ldr q_bb, [b0_ptr], #64
pmull resl, resh, aa, bb
montgomery_reduce_long res, resl, resh
str q_res, [out_ptr], #64

// Vector 1 (byte offset 16 within the 64-byte group)
ldr q_aa, [a0_ptr, #-48]
ldr q_bb, [b0_ptr, #-48]
pmull resl, resh, aa, bb
montgomery_reduce_long res, resl, resh
str q_res, [out_ptr, #-48]

// Vector 2 (byte offset 32 within the 64-byte group)
ldr q_aa, [a0_ptr, #-32]
ldr q_bb, [b0_ptr, #-32]
pmull resl, resh, aa, bb
montgomery_reduce_long res, resl, resh
str q_res, [out_ptr, #-32]

// Vector 3 (byte offset 48 within the 64-byte group)
ldr q_aa, [a0_ptr, #-16]
ldr q_bb, [b0_ptr, #-16]
pmull resl, resh, aa, bb
montgomery_reduce_long res, resl, resh
str q_res, [out_ptr, #-16]

// Four vectors consumed. cbnz tests the register itself, so the flags
// written by subs are not consumed.
// NOTE(review): the subs/cbnz pair may be the loop shape the SLOTHY
// tooling expects for loop_start - confirm before simplifying.
subs count, count, #4
cbnz count, loop_start

pop_stack
ret
#endif /* MLD_ARITH_BACKEND_AARCH64 */
8 changes: 8 additions & 0 deletions mldsa/native/aarch64/meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
/* Set of primitives that this backend replaces */
#define MLD_USE_NATIVE_NTT
#define MLD_USE_NATIVE_INTT
#define MLD_USE_NATIVE_POINTWISE

/* Identifier for this backend so that source and assembly files
* in the build can be appropriately guarded. */
Expand All @@ -31,6 +32,13 @@ static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N])
mld_aarch64_intt_zetas_layer123456);
}

/* Native backend entry point for the pointwise multiplication primitive
 * (enabled via MLD_USE_NATIVE_POINTWISE above).
 *
 * Forwards its three arguments unchanged to the AArch64 assembly routine
 * mld_pointwise_montgomery_asm:
 *   out: receives MLDSA_N result coefficients
 *   in0: first input polynomial (MLDSA_N coefficients)
 *   in1: second input polynomial (MLDSA_N coefficients)
 *
 * NOTE(review): presumably out[i] is the Montgomery-reduced product of
 * in0[i] and in1[i]; confirm against the assembly implementation. */
static MLD_INLINE void mld_pointwise_montgomery_native(
int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],
const int32_t in1[MLDSA_N])
{
mld_pointwise_montgomery_asm(out, in0, in1);
}

#endif /* !__ASSEMBLER__ */

#endif /* !MLD_NATIVE_AARCH64_META_H */
42 changes: 42 additions & 0 deletions mldsa/native/aarch64/src/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright (c) The mldsa-native project authors
# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT

######
# To run, see the README.md file
######
.PHONY: all clean

# ISA to optimize for
# (passed as the first positional argument to slothy-cli)
TARGET_ISA=Arm_AArch64

# MicroArch target to optimize for
# (passed as the second positional argument to slothy-cli)
TARGET_MICROARCH=Arm_Neoverse_N1_experimental

# Additional slothy-cli options; may be supplied from the environment or the
# make command line. Appended at the end of SLOTHY_FLAGS.
SLOTHY_EXTRA_FLAGS ?=

# Options common to every slothy-cli invocation in this Makefile.
SLOTHY_FLAGS=-c sw_pipelining.enabled=true \
-c inputs_are_outputs \
-c sw_pipelining.minimize_overlapping=False \
-c sw_pipelining.allow_post \
-c variable_size \
-c constraints.stalls_first_attempt=64 \
$(SLOTHY_EXTRA_FLAGS)

# For kernels which stash callee-saved v8-v15 but don't stash callee-saved GPRs x19-x30.
# Allow SLOTHY to use all V-registers, but only caller-saved GPRs.
RESERVE_X_ONLY_FLAG=-c reserved_regs="[x18--x30,sp]"

# Used for kernels which don't stash callee-saved registers.
# Restrict SLOTHY to caller-saved registers.
# (Not referenced by any rule currently in this Makefile.)
RESERVE_ALL_FLAG=-c reserved_regs="[x18--x30,sp,v8--v15]"

# Default target: regenerate every optimized assembly file.
all: pointwise_montgomery.S

# These units explicitly save and restore registers v8-v15, so SLOTHY can freely use
# those registers.
#
# Optimizes the loop labelled loop_start in the clean source and writes the
# result next to this Makefile. Recipe lines must begin with a TAB character.
pointwise_montgomery.S: ../../../../dev/aarch64_clean/src/pointwise_montgomery.S
	slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l loop_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG)


# Remove the generated file; the leading '-' tells make to ignore a failure.
clean:
	-$(RM) -rf pointwise_montgomery.S
3 changes: 3 additions & 0 deletions mldsa/native/aarch64/src/arith_native_aarch64.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,7 @@ void mld_ntt_asm(int32_t *, const int32_t *, const int32_t *);
#define mld_intt_asm MLD_NAMESPACE(intt_asm)
void mld_intt_asm(int32_t *, const int32_t *, const int32_t *);

/* AArch64 assembly routine for pointwise multiplication, called as
 * (out, in0, in1) by mld_pointwise_montgomery_native.
 *
 * NOTE(review): the neighbouring definition passes an unprefixed name to
 * MLD_NAMESPACE (intt_asm), whereas this one passes the mld_-prefixed
 * name. The assembly source uses the same argument with MLD_ASM_NAMESPACE,
 * so any rename must be applied in both places at once. */
#define mld_pointwise_montgomery_asm MLD_NAMESPACE(mld_pointwise_montgomery_asm)
void mld_pointwise_montgomery_asm(int32_t *, const int32_t *, const int32_t *);

#endif /* !MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H */
Loading
Loading