Merge "Merge android-4.19-stable.157 (8ee67bc) into msm-4.19"

commit 58fada0da7
Author: qctecmdr
Date: 2020-12-20 12:38:07 -08:00
Committed by: Gerrit - the friendly Code Review server
662 changed files with 51036 additions and 5409 deletions

View File

@ -566,7 +566,7 @@
loops can be debugged more effectively on production
systems.
clearcpuid=BITNUM [X86]
clearcpuid=BITNUM[,BITNUM...] [X86]
Disable CPUID feature X for the kernel. See
arch/x86/include/asm/cpufeatures.h for the valid bit
numbers. Note the Linux specific bits are not necessarily
@ -5302,6 +5302,14 @@
with /sys/devices/system/xen_memory/xen_memory0/scrub_pages.
Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT.
xen.event_eoi_delay= [XEN]
How long to delay EOI handling in case of event
storms (jiffies). Default is 10.
xen.event_loop_timeout= [XEN]
After which time (jiffies) the event handling loop
should start to delay EOI handling. Default is 2.
xirc2ps_cs= [NET,PCMCIA]
Format:
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]

View File

@ -99,16 +99,20 @@ Coarse and fast_ns access
Some additional variants exist for more specialized cases:
.. c:function:: ktime_t ktime_get_coarse_boottime( void )
.. c:function:: ktime_t ktime_get_coarse( void )
ktime_t ktime_get_coarse_boottime( void )
ktime_t ktime_get_coarse_real( void )
ktime_t ktime_get_coarse_clocktai( void )
ktime_t ktime_get_coarse_raw( void )
.. c:function:: u64 ktime_get_coarse_ns( void )
u64 ktime_get_coarse_boottime_ns( void )
u64 ktime_get_coarse_real_ns( void )
u64 ktime_get_coarse_clocktai_ns( void )
.. c:function:: void ktime_get_coarse_ts64( struct timespec64 * )
void ktime_get_coarse_boottime_ts64( struct timespec64 * )
void ktime_get_coarse_real_ts64( struct timespec64 * )
void ktime_get_coarse_clocktai_ts64( struct timespec64 * )
void ktime_get_coarse_raw_ts64( struct timespec64 * )
These are quicker than the non-coarse versions, but less accurate,
corresponding to CLOCK_MONOTONIC_COARSE and CLOCK_REALTIME_COARSE
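For context, a minimal kernel-side sketch (not taken from this diff) of how the coarse accessors listed above might be used; the function name and log message are illustrative only, and it assumes the accessors documented here are available in the tree:

#include <linux/ktime.h>
#include <linux/printk.h>
#include <linux/timekeeping.h>

/* Illustrative only: take a cheap, tick-granularity timestamp. */
static void example_log_event(void)
{
	struct timespec64 ts;
	u64 now_ns;

	ktime_get_coarse_real_ts64(&ts);	/* coarse wall-clock time */
	now_ns = ktime_get_coarse_ns();		/* coarse monotonic time, in ns */

	pr_debug("event at %lld.%09ld (mono %llu ns)\n",
		 (long long)ts.tv_sec, ts.tv_nsec,
		 (unsigned long long)now_ns);
}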

View File

@ -29,8 +29,7 @@ whole range, 0-255, dividing the angular value by 1.41. The enum
:c:type:`v4l2_hsv_encoding` specifies which encoding is used.
.. note:: The default R'G'B' quantization is full range for all
colorspaces except for BT.2020 which uses limited range R'G'B'
quantization.
colorspaces. HSV formats are always full range.
.. tabularcolumns:: |p{6.0cm}|p{11.5cm}|
@ -162,8 +161,8 @@ whole range, 0-255, dividing the angular value by 1.41. The enum
- Details
* - ``V4L2_QUANTIZATION_DEFAULT``
- Use the default quantization encoding as defined by the
colorspace. This is always full range for R'G'B' (except for the
BT.2020 colorspace) and HSV. It is usually limited range for Y'CbCr.
colorspace. This is always full range for R'G'B' and HSV.
It is usually limited range for Y'CbCr.
* - ``V4L2_QUANTIZATION_FULL_RANGE``
- Use the full range quantization encoding. I.e. the range [0…1] is
mapped to [0…255] (with possible clipping to [1…254] to avoid the
@ -173,4 +172,4 @@ whole range, 0-255, dividing the angular value by 1.41. The enum
* - ``V4L2_QUANTIZATION_LIM_RANGE``
- Use the limited range quantization encoding. I.e. the range [0…1]
is mapped to [16…235]. Cb and Cr are mapped from [-0.5…0.5] to
[16…240].
[16…240]. Limited Range cannot be used with HSV.
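As an illustration of the quantization flags documented above, a hedged userspace sketch (not part of this change) that requests full-range R'G'B' explicitly; the pixel format and frame size are assumptions:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>

/* Illustrative only: ask the driver for full-range RGB explicitly.
 * V4L2_QUANTIZATION_DEFAULT would give the same result for R'G'B'. */
static int set_full_range_rgb(int fd)
{
	struct v4l2_format fmt;

	memset(&fmt, 0, sizeof(fmt));
	fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
	fmt.fmt.pix.width = 640;
	fmt.fmt.pix.height = 480;
	fmt.fmt.pix.pixelformat = V4L2_PIX_FMT_RGB24;
	fmt.fmt.pix.quantization = V4L2_QUANTIZATION_FULL_RANGE;

	return ioctl(fd, VIDIOC_S_FMT, &fmt);
}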

View File

@ -370,9 +370,8 @@ Colorspace BT.2020 (V4L2_COLORSPACE_BT2020)
The :ref:`itu2020` standard defines the colorspace used by Ultra-high
definition television (UHDTV). The default transfer function is
``V4L2_XFER_FUNC_709``. The default Y'CbCr encoding is
``V4L2_YCBCR_ENC_BT2020``. The default R'G'B' quantization is limited
range (!), and so is the default Y'CbCr quantization. The chromaticities
of the primary colors and the white reference are:
``V4L2_YCBCR_ENC_BT2020``. The default Y'CbCr quantization is limited range.
The chromaticities of the primary colors and the white reference are:

View File

@ -949,12 +949,14 @@ icmp_ratelimit - INTEGER
icmp_msgs_per_sec - INTEGER
Limit maximal number of ICMP packets sent per second from this host.
Only messages whose type matches icmp_ratemask (see below) are
controlled by this limit.
controlled by this limit. For security reasons, the precise count
of messages per second is randomized.
Default: 1000
icmp_msgs_burst - INTEGER
icmp_msgs_per_sec controls number of ICMP packets sent per second,
while icmp_msgs_burst controls the burst size of these packets.
For security reasons, the precise burst size is randomized.
Default: 50
icmp_ratemask - INTEGER
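To check the icmp_msgs_per_sec and icmp_msgs_burst limits described above on a running system, a small userspace sketch (not part of this patch); the /proc paths are the standard sysctl locations for these settings:

#include <stdio.h>

/* Illustrative only: print the global ICMP rate-limit settings. */
static long read_sysctl_long(const char *path)
{
	long val = -1;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	printf("icmp_msgs_per_sec = %ld\n",
	       read_sysctl_long("/proc/sys/net/ipv4/icmp_msgs_per_sec"));
	printf("icmp_msgs_burst   = %ld\n",
	       read_sysctl_long("/proc/sys/net/ipv4/icmp_msgs_burst"));
	return 0;
}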

View File

@ -3907,6 +3907,7 @@ F: crypto/
F: drivers/crypto/
F: include/crypto/
F: include/linux/crypto*
F: lib/crypto/
CRYPTOGRAPHIC RANDOM NUMBER GENERATOR
M: Neil Horman <nhorman@tuxdriver.com>
@ -15890,6 +15891,14 @@ L: linux-gpio@vger.kernel.org
S: Maintained
F: drivers/gpio/gpio-ws16c48.c
WIREGUARD SECURE NETWORK TUNNEL
M: Jason A. Donenfeld <Jason@zx2c4.com>
S: Maintained
F: drivers/net/wireguard/
F: tools/testing/selftests/wireguard/
L: wireguard@lists.zx2c4.com
L: netdev@vger.kernel.org
WISTRON LAPTOP BUTTON DRIVER
M: Miloslav Trmac <mitr@volny.cz>
S: Maintained

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
VERSION = 4
PATCHLEVEL = 19
SUBLEVEL = 152
SUBLEVEL = 157
EXTRAVERSION =
NAME = "People's Front"
@ -505,11 +505,7 @@ endif
ifeq ($(cc-name),clang)
ifneq ($(CROSS_COMPILE),)
CLANG_TRIPLE ?= $(CROSS_COMPILE)
CLANG_FLAGS += --target=$(notdir $(CLANG_TRIPLE:%-=%))
ifeq ($(shell $(srctree)/scripts/clang-android.sh $(CC) $(CLANG_FLAGS)), y)
$(error "Clang with Android --target detected. Did you specify CLANG_TRIPLE?")
endif
CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%))
GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit))
CLANG_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE))
GCC_TOOLCHAIN := $(realpath $(GCC_TOOLCHAIN_DIR)/..)

File diff suppressed because it is too large

View File

@ -2348,6 +2348,7 @@
__sock_recv_ts_and_drops
sock_wake_async
sock_wfree
timer_reduce
unregister_net_sysctl_table
__wake_up_sync_key
__xfrm_policy_check

View File

@ -366,6 +366,13 @@ config HAVE_RCU_TABLE_FREE
config HAVE_RCU_TABLE_INVALIDATE
bool
config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
bool
help
Temporary select until all architectures can be converted to have
irqs disabled over activate_mm. Architectures that do IPI based TLB
shootdowns should enable this.
config ARCH_HAVE_NMI_SAFE_CMPXCHG
bool

View File

@ -156,6 +156,7 @@ END(EV_Extension)
tracesys:
; save EFA in case tracer wants the PC of traced task
; using ERET won't work since next-PC has already committed
lr r12, [efa]
GET_CURR_TASK_FIELD_PTR TASK_THREAD, r11
st r12, [r11, THREAD_FAULT_ADDR] ; thread.fault_address
@ -198,9 +199,15 @@ tracesys_exit:
; Breakpoint TRAP
; ---------------------------------------------
trap_with_param:
mov r0, r12 ; EFA in case ptracer/gdb wants stop_pc
; stop_pc info by gdb needs this info
lr r0, [efa]
mov r1, sp
; Now that we have read EFA, it is safe to do "fake" rtie
; and get out of CPU exception mode
FAKE_RET_FROM_EXCPN
; Save callee regs in case gdb wants to have a look
; SP will grow up by size of CALLEE Reg-File
; NOTE: clobbers r12
@ -227,10 +234,6 @@ ENTRY(EV_Trap)
EXCEPTION_PROLOGUE
lr r12, [efa]
FAKE_RET_FROM_EXCPN
;============ TRAP 1 :breakpoints
; Check ECR for trap with arg (PROLOGUE ensures r9 has ECR)
bmsk.f 0, r9, 7
@ -238,6 +241,9 @@ ENTRY(EV_Trap)
;============ TRAP (no param): syscall top level
; First return from Exception to pure K mode (Exception/IRQs renabled)
FAKE_RET_FROM_EXCPN
; If syscall tracing ongoing, invoke pre-post-hooks
GET_CURR_THR_INFO_FLAGS r10
btst r10, TIF_SYSCALL_TRACE

View File

@ -115,7 +115,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs,
int (*consumer_fn) (unsigned int, void *), void *arg)
{
#ifdef CONFIG_ARC_DW2_UNWIND
int ret = 0;
int ret = 0, cnt = 0;
unsigned int address;
struct unwind_frame_info frame_info;
@ -135,6 +135,11 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs,
break;
frame_info.regs.r63 = frame_info.regs.r31;
if (cnt++ > 128) {
printk("unwinder looping too long, aborting !\n");
return 0;
}
}
return address; /* return the last address it saw */

View File

@ -11,5 +11,6 @@ menuconfig ARC_SOC_HSDK
select ARC_HAS_ACCL_REGS
select ARC_IRQ_NO_AUTOSAVE
select CLK_HSDK
select RESET_CONTROLLER
select RESET_HSDK
select MIGHT_HAVE_PCI

View File

@ -622,8 +622,10 @@ config ARCH_S3C24XX
select HAVE_S3C2410_WATCHDOG if WATCHDOG
select HAVE_S3C_RTC if RTC_CLASS
select NEED_MACH_IO_H
select S3C2410_WATCHDOG
select SAMSUNG_ATAGS
select USE_OF
select WATCHDOG
help
Samsung S3C2410, S3C2412, S3C2413, S3C2416, S3C2440, S3C2442, S3C2443
and S3C2450 SoCs based systems, such as the Simtec Electronics BAST

View File

@ -922,8 +922,10 @@
};
rngb: rngb@21b4000 {
compatible = "fsl,imx6sl-rngb", "fsl,imx25-rngb";
reg = <0x021b4000 0x4000>;
interrupts = <0 5 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&clks IMX6SL_CLK_DUMMY>;
};
weim: weim@21b8000 {

View File

@ -192,6 +192,7 @@
fixed-link {
speed = <1000>;
full-duplex;
pause;
};
};
};

View File

@ -516,7 +516,7 @@
status = "disabled";
};
target-module@56000000 {
sgx_module: target-module@56000000 {
compatible = "ti,sysc-omap4", "ti,sysc";
ti,hwmods = "gpu";
reg = <0x5601fc00 0x4>,

View File

@ -74,3 +74,13 @@
};
/include/ "omap443x-clocks.dtsi"
/*
* Use dpll_per for sgx at 153.6MHz like droid4 stock v3.0.8 Android kernel
*/
&sgx_module {
assigned-clocks = <&l3_gfx_clkctrl OMAP4_GPU_CLKCTRL 24>,
<&dpll_per_m7x2_ck>;
assigned-clock-rates = <0>, <153600000>;
assigned-clock-parents = <&dpll_per_m7x2_ck>;
};

View File

@ -85,21 +85,21 @@
global_timer: timer@b0020200 {
compatible = "arm,cortex-a9-global-timer";
reg = <0xb0020200 0x100>;
interrupts = <GIC_PPI 0 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
interrupts = <GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
status = "disabled";
};
twd_timer: timer@b0020600 {
compatible = "arm,cortex-a9-twd-timer";
reg = <0xb0020600 0x20>;
interrupts = <GIC_PPI 2 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
status = "disabled";
};
twd_wdt: wdt@b0020620 {
compatible = "arm,cortex-a9-twd-wdt";
reg = <0xb0020620 0xe0>;
interrupts = <GIC_PPI 3 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
interrupts = <GIC_PPI 14 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
status = "disabled";
};

View File

@ -98,19 +98,16 @@
};
clocks: clock-controller@e0100000 {
compatible = "samsung,s5pv210-clock", "simple-bus";
compatible = "samsung,s5pv210-clock";
reg = <0xe0100000 0x10000>;
clock-names = "xxti", "xusbxti";
clocks = <&xxti>, <&xusbxti>;
#clock-cells = <1>;
#address-cells = <1>;
#size-cells = <1>;
ranges;
};
pmu_syscon: syscon@e0108000 {
compatible = "samsung-s5pv210-pmu", "syscon";
reg = <0xe0108000 0x8000>;
};
pmu_syscon: syscon@e0108000 {
compatible = "samsung-s5pv210-pmu", "syscon";
reg = <0xe0108000 0x8000>;
};
pinctrl0: pinctrl@e0200000 {
@ -126,35 +123,28 @@
};
};
amba {
#address-cells = <1>;
#size-cells = <1>;
compatible = "simple-bus";
ranges;
pdma0: dma@e0900000 {
compatible = "arm,pl330", "arm,primecell";
reg = <0xe0900000 0x1000>;
interrupt-parent = <&vic0>;
interrupts = <19>;
clocks = <&clocks CLK_PDMA0>;
clock-names = "apb_pclk";
#dma-cells = <1>;
#dma-channels = <8>;
#dma-requests = <32>;
};
pdma0: dma@e0900000 {
compatible = "arm,pl330", "arm,primecell";
reg = <0xe0900000 0x1000>;
interrupt-parent = <&vic0>;
interrupts = <19>;
clocks = <&clocks CLK_PDMA0>;
clock-names = "apb_pclk";
#dma-cells = <1>;
#dma-channels = <8>;
#dma-requests = <32>;
};
pdma1: dma@e0a00000 {
compatible = "arm,pl330", "arm,primecell";
reg = <0xe0a00000 0x1000>;
interrupt-parent = <&vic0>;
interrupts = <20>;
clocks = <&clocks CLK_PDMA1>;
clock-names = "apb_pclk";
#dma-cells = <1>;
#dma-channels = <8>;
#dma-requests = <32>;
};
pdma1: dma@e0a00000 {
compatible = "arm,pl330", "arm,primecell";
reg = <0xe0a00000 0x1000>;
interrupt-parent = <&vic0>;
interrupts = <20>;
clocks = <&clocks CLK_PDMA1>;
clock-names = "apb_pclk";
#dma-cells = <1>;
#dma-channels = <8>;
#dma-requests = <32>;
};
spi0: spi@e1300000 {
@ -227,43 +217,36 @@
status = "disabled";
};
audio-subsystem {
compatible = "samsung,s5pv210-audss", "simple-bus";
#address-cells = <1>;
#size-cells = <1>;
ranges;
clk_audss: clock-controller@eee10000 {
compatible = "samsung,s5pv210-audss-clock";
reg = <0xeee10000 0x1000>;
clock-names = "hclk", "xxti",
"fout_epll",
"sclk_audio0";
clocks = <&clocks DOUT_HCLKP>, <&xxti>,
<&clocks FOUT_EPLL>,
<&clocks SCLK_AUDIO0>;
#clock-cells = <1>;
};
clk_audss: clock-controller@eee10000 {
compatible = "samsung,s5pv210-audss-clock";
reg = <0xeee10000 0x1000>;
clock-names = "hclk", "xxti",
"fout_epll",
"sclk_audio0";
clocks = <&clocks DOUT_HCLKP>, <&xxti>,
<&clocks FOUT_EPLL>,
<&clocks SCLK_AUDIO0>;
#clock-cells = <1>;
};
i2s0: i2s@eee30000 {
compatible = "samsung,s5pv210-i2s";
reg = <0xeee30000 0x1000>;
interrupt-parent = <&vic2>;
interrupts = <16>;
dma-names = "rx", "tx", "tx-sec";
dmas = <&pdma1 9>, <&pdma1 10>, <&pdma1 11>;
clock-names = "iis",
"i2s_opclk0",
"i2s_opclk1";
clocks = <&clk_audss CLK_I2S>,
<&clk_audss CLK_I2S>,
<&clk_audss CLK_DOUT_AUD_BUS>;
samsung,idma-addr = <0xc0010000>;
pinctrl-names = "default";
pinctrl-0 = <&i2s0_bus>;
#sound-dai-cells = <0>;
status = "disabled";
};
i2s0: i2s@eee30000 {
compatible = "samsung,s5pv210-i2s";
reg = <0xeee30000 0x1000>;
interrupt-parent = <&vic2>;
interrupts = <16>;
dma-names = "rx", "tx", "tx-sec";
dmas = <&pdma1 9>, <&pdma1 10>, <&pdma1 11>;
clock-names = "iis",
"i2s_opclk0",
"i2s_opclk1";
clocks = <&clk_audss CLK_I2S>,
<&clk_audss CLK_I2S>,
<&clk_audss CLK_DOUT_AUD_BUS>;
samsung,idma-addr = <0xc0010000>;
pinctrl-names = "default";
pinctrl-0 = <&i2s0_bus>;
#sound-dai-cells = <0>;
status = "disabled";
};
i2s1: i2s@e2100000 {

View File

@ -143,7 +143,7 @@
trips {
cpu_alert0: cpu-alert0 {
/* milliCelsius */
temperature = <850000>;
temperature = <85000>;
hysteresis = <2000>;
type = "passive";
};

View File

@ -206,16 +206,16 @@
};
&reg_dc1sw {
regulator-min-microvolt = <3000000>;
regulator-max-microvolt = <3000000>;
regulator-min-microvolt = <3300000>;
regulator-max-microvolt = <3300000>;
regulator-name = "vcc-gmac-phy";
};
&reg_dcdc1 {
regulator-always-on;
regulator-min-microvolt = <3000000>;
regulator-max-microvolt = <3000000>;
regulator-name = "vcc-3v0";
regulator-min-microvolt = <3300000>;
regulator-max-microvolt = <3300000>;
regulator-name = "vcc-3v3";
};
&reg_dcdc2 {

View File

@ -1,3 +1,4 @@
aesbs-core.S
sha256-core.S
sha512-core.S
poly1305-core.S

View File

@ -125,14 +125,24 @@ config CRYPTO_CRC32_ARM_CE
select CRYPTO_HASH
config CRYPTO_CHACHA20_NEON
tristate "NEON accelerated ChaCha stream cipher algorithms"
depends on KERNEL_MODE_NEON
tristate "NEON and scalar accelerated ChaCha stream cipher algorithms"
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
select CRYPTO_ARCH_HAVE_LIB_CHACHA
config CRYPTO_POLY1305_ARM
tristate "Accelerated scalar and SIMD Poly1305 hash implementations"
select CRYPTO_HASH
select CRYPTO_ARCH_HAVE_LIB_POLY1305
config CRYPTO_NHPOLY1305_NEON
tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
depends on KERNEL_MODE_NEON
select CRYPTO_NHPOLY1305
config CRYPTO_CURVE25519_NEON
tristate "NEON accelerated Curve25519 scalar multiplication library"
depends on KERNEL_MODE_NEON
select CRYPTO_LIB_CURVE25519_GENERIC
select CRYPTO_ARCH_HAVE_LIB_CURVE25519
endif

View File

@ -10,7 +10,9 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@ -53,13 +55,19 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
chacha-neon-y := chacha-scalar-core.o chacha-glue.o
chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
poly1305-arm-y := poly1305-core.o poly1305-glue.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
curve25519-neon-y := curve25519-core.o curve25519-glue.o
ifdef REGENERATE_ARM_CRYPTO
quiet_cmd_perl = PERL $@
cmd_perl = $(PERL) $(<) > $(@)
$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl
$(call cmd,perl)
$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
$(call cmd,perl)
@ -67,4 +75,9 @@ $(src)/sha512-core.S_shipped: $(src)/sha512-armv4.pl
$(call cmd,perl)
endif
targets += sha256-core.S sha512-core.S
targets += poly1305-core.S sha256-core.S sha512-core.S
# massage the perlasm code a bit so we only get the NEON routine if we need it
poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
AFLAGS_poly1305-core.o += $(poly1305-aflags-y)

View File

@ -0,0 +1,356 @@
// SPDX-License-Identifier: GPL-2.0
/*
* ARM NEON accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
* Copyright (C) 2015 Martin Willi
*/
#include <crypto/algapi.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/cputype.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
const u32 *state, int nrounds);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
static inline bool neon_usable(void)
{
return static_branch_likely(&use_neon) && may_use_simd();
}
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes, int nrounds)
{
u8 buf[CHACHA_BLOCK_SIZE];
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
chacha_4block_xor_neon(state, dst, src, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
while (bytes >= CHACHA_BLOCK_SIZE) {
chacha_block_xor_neon(state, dst, src, nrounds);
bytes -= CHACHA_BLOCK_SIZE;
src += CHACHA_BLOCK_SIZE;
dst += CHACHA_BLOCK_SIZE;
state[12]++;
}
if (bytes) {
memcpy(buf, src, bytes);
chacha_block_xor_neon(state, buf, buf, nrounds);
memcpy(dst, buf, bytes);
}
}
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
hchacha_block_arm(state, stream, nrounds);
} else {
kernel_neon_begin();
hchacha_block_neon(state, stream, nrounds);
kernel_neon_end();
}
}
EXPORT_SYMBOL(hchacha_block_arch);
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
int nrounds)
{
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
bytes <= CHACHA_BLOCK_SIZE) {
chacha_doarm(dst, src, bytes, state, nrounds);
state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
return;
}
do {
unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
kernel_neon_begin();
chacha_doneon(state, dst, src, todo, nrounds);
kernel_neon_end();
bytes -= todo;
src += todo;
dst += todo;
} while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
static int chacha_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv,
bool neon)
{
struct skcipher_walk walk;
u32 state[16];
int err;
err = skcipher_walk_virt(&walk, req, false);
chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr,
nbytes, state, ctx->nrounds);
state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE);
} else {
kernel_neon_begin();
chacha_doneon(state, walk.dst.virt.addr,
walk.src.virt.addr, nbytes, ctx->nrounds);
kernel_neon_end();
}
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
static int do_chacha(struct skcipher_request *req, bool neon)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
return chacha_stream_xor(req, ctx, req->iv, neon);
}
static int chacha_arm(struct skcipher_request *req)
{
return do_chacha(req, false);
}
static int chacha_neon(struct skcipher_request *req)
{
return do_chacha(req, neon_usable());
}
static int do_xchacha(struct skcipher_request *req, bool neon)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
struct chacha_ctx subctx;
u32 state[16];
u8 real_iv[16];
chacha_init_generic(state, ctx->key, req->iv);
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
hchacha_block_arm(state, subctx.key, ctx->nrounds);
} else {
kernel_neon_begin();
hchacha_block_neon(state, subctx.key, ctx->nrounds);
kernel_neon_end();
}
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
memcpy(&real_iv[8], req->iv + 16, 8);
return chacha_stream_xor(req, &subctx, real_iv, neon);
}
static int xchacha_arm(struct skcipher_request *req)
{
return do_xchacha(req, false);
}
static int xchacha_neon(struct skcipher_request *req)
{
return do_xchacha(req, neon_usable());
}
static struct skcipher_alg arm_algs[] = {
{
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-arm",
.base.cra_priority = 200,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = chacha_arm,
.decrypt = chacha_arm,
}, {
.base.cra_name = "xchacha20",
.base.cra_driver_name = "xchacha20-arm",
.base.cra_priority = 200,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = xchacha_arm,
.decrypt = xchacha_arm,
}, {
.base.cra_name = "xchacha12",
.base.cra_driver_name = "xchacha12-arm",
.base.cra_priority = 200,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha12_setkey,
.encrypt = xchacha_arm,
.decrypt = xchacha_arm,
},
};
static struct skcipher_alg neon_algs[] = {
{
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = chacha_neon,
.decrypt = chacha_neon,
}, {
.base.cra_name = "xchacha20",
.base.cra_driver_name = "xchacha20-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}, {
.base.cra_name = "xchacha12",
.base.cra_driver_name = "xchacha12-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = chacha12_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}
};
static int __init chacha_simd_mod_init(void)
{
int err = 0;
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
if (err)
return err;
}
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
int i;
switch (read_cpuid_part()) {
case ARM_CPU_PART_CORTEX_A7:
case ARM_CPU_PART_CORTEX_A5:
/*
* The Cortex-A7 and Cortex-A5 do not perform well with
* the NEON implementation but do incredibly well with the
* scalar one and use less power.
*/
for (i = 0; i < ARRAY_SIZE(neon_algs); i++)
neon_algs[i].base.cra_priority = 0;
break;
default:
static_branch_enable(&use_neon);
}
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
if (err)
crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
}
}
return err;
}
static void __exit chacha_simd_mod_fini(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON))
crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
}
}
module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (scalar and NEON accelerated)");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-arm");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-arm");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-arm");
#ifdef CONFIG_KERNEL_MODE_NEON
MODULE_ALIAS_CRYPTO("chacha20-neon");
MODULE_ALIAS_CRYPTO("xchacha20-neon");
MODULE_ALIAS_CRYPTO("xchacha12-neon");
#endif
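To make the library interface exported by this glue file concrete, a hedged caller sketch (not part of the commit); it only uses functions and constants visible above, and the buffer and key names are made up:

/* Illustrative kernel-side caller of the exported ChaCha library routines. */
static void example_chacha20_encrypt(u8 *dst, const u8 *src, unsigned int len,
				     const u32 key[8], /* 256-bit key as words */
				     const u8 iv[CHACHA_IV_SIZE])
{
	u32 state[16];

	chacha_init_arch(state, key, iv);
	chacha_crypt_arch(state, dst, src, len, 20);	/* 20 rounds = ChaCha20 */
}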

View File

@ -0,0 +1,460 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2018 Google, Inc.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
/*
* Design notes:
*
* 16 registers would be needed to hold the state matrix, but only 14 are
* available because 'sp' and 'pc' cannot be used. So we spill the elements
* (x8, x9) to the stack and swap them out with (x10, x11). This adds one
* 'ldrd' and one 'strd' instruction per round.
*
* All rotates are performed using the implicit rotate operand accepted by the
* 'add' and 'eor' instructions. This is faster than using explicit rotate
* instructions. To make this work, we allow the values in the second and last
* rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
* wrong rotation amount. The rotation amount is then fixed up just in time
* when the values are used. 'brot' is the number of bits the values in row 'b'
* need to be rotated right to arrive at the correct values, and 'drot'
* similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
* that they end up as (25, 24) after every round.
*/
// ChaCha state registers
X0 .req r0
X1 .req r1
X2 .req r2
X3 .req r3
X4 .req r4
X5 .req r5
X6 .req r6
X7 .req r7
X8_X10 .req r8 // shared by x8 and x10
X9_X11 .req r9 // shared by x9 and x11
X12 .req r10
X13 .req r11
X14 .req r12
X15 .req r14
.macro __rev out, in, t0, t1, t2
.if __LINUX_ARM_ARCH__ >= 6
rev \out, \in
.else
lsl \t0, \in, #24
and \t1, \in, #0xff00
and \t2, \in, #0xff0000
orr \out, \t0, \in, lsr #24
orr \out, \out, \t1, lsl #8
orr \out, \out, \t2, lsr #8
.endif
.endm
.macro _le32_bswap x, t0, t1, t2
#ifdef __ARMEB__
__rev \x, \x, \t0, \t1, \t2
#endif
.endm
.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
_le32_bswap \a, \t0, \t1, \t2
_le32_bswap \b, \t0, \t1, \t2
_le32_bswap \c, \t0, \t1, \t2
_le32_bswap \d, \t0, \t1, \t2
.endm
.macro __ldrd a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
ldrd \a, \b, [\src, #\offset]
#else
ldr \a, [\src, #\offset]
ldr \b, [\src, #\offset + 4]
#endif
.endm
.macro __strd a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
strd \a, \b, [\dst, #\offset]
#else
str \a, [\dst, #\offset]
str \b, [\dst, #\offset + 4]
#endif
.endm
.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
// a += b; d ^= a; d = rol(d, 16);
add \a1, \a1, \b1, ror #brot
add \a2, \a2, \b2, ror #brot
eor \d1, \a1, \d1, ror #drot
eor \d2, \a2, \d2, ror #drot
// drot == 32 - 16 == 16
// c += d; b ^= c; b = rol(b, 12);
add \c1, \c1, \d1, ror #16
add \c2, \c2, \d2, ror #16
eor \b1, \c1, \b1, ror #brot
eor \b2, \c2, \b2, ror #brot
// brot == 32 - 12 == 20
// a += b; d ^= a; d = rol(d, 8);
add \a1, \a1, \b1, ror #20
add \a2, \a2, \b2, ror #20
eor \d1, \a1, \d1, ror #16
eor \d2, \a2, \d2, ror #16
// drot == 32 - 8 == 24
// c += d; b ^= c; b = rol(b, 7);
add \c1, \c1, \d1, ror #24
add \c2, \c2, \d2, ror #24
eor \b1, \c1, \b1, ror #20
eor \b2, \c2, \b2, ror #20
// brot == 32 - 7 == 25
.endm
.macro _doubleround
// column round
// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
_halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
// save (x8, x9); restore (x10, x11)
__strd X8_X10, X9_X11, sp, 0
__ldrd X8_X10, X9_X11, sp, 8
// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
_halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
.set brot, 25
.set drot, 24
// diagonal round
// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
_halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
// save (x10, x11); restore (x8, x9)
__strd X8_X10, X9_X11, sp, 8
__ldrd X8_X10, X9_X11, sp, 0
// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
_halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
.endm
.macro _chacha_permute nrounds
.set brot, 0
.set drot, 0
.rept \nrounds / 2
_doubleround
.endr
.endm
.macro _chacha nrounds
.Lnext_block\@:
// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
// Registers contain x0-x9,x12-x15.
// Do the core ChaCha permutation to update x0-x15.
_chacha_permute \nrounds
add sp, #8
// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
// Registers contain x0-x9,x12-x15.
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
push {X8_X10, X9_X11, X12, X13, X14, X15}
// Load (OUT, IN, LEN).
ldr r14, [sp, #96]
ldr r12, [sp, #100]
ldr r11, [sp, #104]
orr r10, r14, r12
// Use slow path if fewer than 64 bytes remain.
cmp r11, #64
blt .Lxor_slowpath\@
// Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
// ARMv6+, since ldmia and stmia (used below) still require alignment.
tst r10, #3
bne .Lxor_slowpath\@
// Fast path: XOR 64 bytes of aligned data.
// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
// x0-x3
__ldrd r8, r9, sp, 32
__ldrd r10, r11, sp, 40
add X0, X0, r8
add X1, X1, r9
add X2, X2, r10
add X3, X3, r11
_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
ldmia r12!, {r8-r11}
eor X0, X0, r8
eor X1, X1, r9
eor X2, X2, r10
eor X3, X3, r11
stmia r14!, {X0-X3}
// x4-x7
__ldrd r8, r9, sp, 48
__ldrd r10, r11, sp, 56
add X4, r8, X4, ror #brot
add X5, r9, X5, ror #brot
ldmia r12!, {X0-X3}
add X6, r10, X6, ror #brot
add X7, r11, X7, ror #brot
_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
eor X4, X4, X0
eor X5, X5, X1
eor X6, X6, X2
eor X7, X7, X3
stmia r14!, {X4-X7}
// x8-x15
pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
__ldrd r8, r9, sp, 32
__ldrd r10, r11, sp, 40
add r0, r0, r8 // x8
add r1, r1, r9 // x9
add r6, r6, r10 // x10
add r7, r7, r11 // x11
_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
ldmia r12!, {r8-r11}
eor r0, r0, r8 // x8
eor r1, r1, r9 // x9
eor r6, r6, r10 // x10
eor r7, r7, r11 // x11
stmia r14!, {r0,r1,r6,r7}
ldmia r12!, {r0,r1,r6,r7}
__ldrd r8, r9, sp, 48
__ldrd r10, r11, sp, 56
add r2, r8, r2, ror #drot // x12
add r3, r9, r3, ror #drot // x13
add r4, r10, r4, ror #drot // x14
add r5, r11, r5, ror #drot // x15
_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
ldr r9, [sp, #72] // load LEN
eor r2, r2, r0 // x12
eor r3, r3, r1 // x13
eor r4, r4, r6 // x14
eor r5, r5, r7 // x15
subs r9, #64 // decrement and check LEN
stmia r14!, {r2-r5}
beq .Ldone\@
.Lprepare_for_next_block\@:
// Stack: x0-x15 OUT IN LEN
// Increment block counter (x12)
add r8, #1
// Store updated (OUT, IN, LEN)
str r14, [sp, #64]
str r12, [sp, #68]
str r9, [sp, #72]
mov r14, sp
// Store updated block counter (x12)
str r8, [sp, #48]
sub sp, #16
// Reload state and do next block
ldmia r14!, {r0-r11} // load x0-x11
__strd r10, r11, sp, 8 // store x10-x11 before state
ldmia r14, {r10-r12,r14} // load x12-x15
b .Lnext_block\@
.Lxor_slowpath\@:
// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
// We handle it by storing the 64 bytes of keystream to the stack, then
// XOR-ing the needed portion with the data.
// Allocate keystream buffer
sub sp, #64
mov r14, sp
// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
// Save keystream for x0-x3
__ldrd r8, r9, sp, 96
__ldrd r10, r11, sp, 104
add X0, X0, r8
add X1, X1, r9
add X2, X2, r10
add X3, X3, r11
_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
stmia r14!, {X0-X3}
// Save keystream for x4-x7
__ldrd r8, r9, sp, 112
__ldrd r10, r11, sp, 120
add X4, r8, X4, ror #brot
add X5, r9, X5, ror #brot
add X6, r10, X6, ror #brot
add X7, r11, X7, ror #brot
_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
add r8, sp, #64
stmia r14!, {X4-X7}
// Save keystream for x8-x15
ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
__ldrd r8, r9, sp, 128
__ldrd r10, r11, sp, 136
add r0, r0, r8 // x8
add r1, r1, r9 // x9
add r6, r6, r10 // x10
add r7, r7, r11 // x11
_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
stmia r14!, {r0,r1,r6,r7}
__ldrd r8, r9, sp, 144
__ldrd r10, r11, sp, 152
add r2, r8, r2, ror #drot // x12
add r3, r9, r3, ror #drot // x13
add r4, r10, r4, ror #drot // x14
add r5, r11, r5, ror #drot // x15
_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
stmia r14, {r2-r5}
// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
// Registers: r8 is block counter, r12 is IN.
ldr r9, [sp, #168] // LEN
ldr r14, [sp, #160] // OUT
cmp r9, #64
mov r0, sp
movle r1, r9
movgt r1, #64
// r1 is number of bytes to XOR, in range [1, 64]
.if __LINUX_ARM_ARCH__ < 6
orr r2, r12, r14
tst r2, #3 // IN or OUT misaligned?
bne .Lxor_next_byte\@
.endif
// XOR a word at a time
.rept 16
subs r1, #4
blt .Lxor_words_done\@
ldr r2, [r12], #4
ldr r3, [r0], #4
eor r2, r2, r3
str r2, [r14], #4
.endr
b .Lxor_slowpath_done\@
.Lxor_words_done\@:
ands r1, r1, #3
beq .Lxor_slowpath_done\@
// XOR a byte at a time
.Lxor_next_byte\@:
ldrb r2, [r12], #1
ldrb r3, [r0], #1
eor r2, r2, r3
strb r2, [r14], #1
subs r1, #1
bne .Lxor_next_byte\@
.Lxor_slowpath_done\@:
subs r9, #64
add sp, #96
bgt .Lprepare_for_next_block\@
.Ldone\@:
.endm // _chacha
/*
* void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
* const u32 *state, int nrounds);
*/
ENTRY(chacha_doarm)
cmp r2, #0 // len == 0?
reteq lr
ldr ip, [sp]
cmp ip, #12
push {r0-r2,r4-r11,lr}
// Push state x0-x15 onto stack.
// Also store an extra copy of x10-x11 just before the state.
add X12, r3, #48
ldm X12, {X12,X13,X14,X15}
push {X12,X13,X14,X15}
sub sp, sp, #64
__ldrd X8_X10, X9_X11, r3, 40
__strd X8_X10, X9_X11, sp, 8
__strd X8_X10, X9_X11, sp, 56
ldm r3, {X0-X9_X11}
__strd X0, X1, sp, 16
__strd X2, X3, sp, 24
__strd X4, X5, sp, 32
__strd X6, X7, sp, 40
__strd X8_X10, X9_X11, sp, 48
beq 1f
_chacha 20
0: add sp, #76
pop {r4-r11, pc}
1: _chacha 12
b 0b
ENDPROC(chacha_doarm)
/*
* void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
*/
ENTRY(hchacha_block_arm)
push {r1,r4-r11,lr}
cmp r2, #12 // ChaCha12 ?
mov r14, r0
ldmia r14!, {r0-r11} // load x0-x11
push {r10-r11} // store x10-x11 to stack
ldm r14, {r10-r12,r14} // load x12-x15
sub sp, #8
beq 1f
_chacha_permute 20
// Skip over (unused0-unused1, x10-x11)
0: add sp, #16
// Fix up rotations of x12-x15
ror X12, X12, #drot
ror X13, X13, #drot
pop {r4} // load 'out'
ror X14, X14, #drot
ror X15, X15, #drot
// Store (x0-x3,x12-x15) to 'out'
stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
pop {r4-r11,pc}
1: _chacha_permute 12
b 0b
ENDPROC(hchacha_block_arm)
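For reference, the plain quarter-round that the deferred-rotation scheme described in this file's design notes computes; a standalone C model, not part of the assembly:

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* Reference ChaCha quarter-round; the assembly above folds the rotates into
 * the implicit 'ror' operand of add/eor and fixes them up lazily via the
 * brot/drot bookkeeping. */
static void chacha_quarterround(uint32_t *a, uint32_t *b,
				uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = ROTL32(*d, 16);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 12);
	*a += *b; *d ^= *a; *d = ROTL32(*d, 8);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}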

File diff suppressed because it is too large

View File

@ -0,0 +1,135 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*
* Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
* began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
* manually reworked for use in kernel space.
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/internal/kpp.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/scatterlist.h>
#include <crypto/curve25519.h>
asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
const u8 secret[CURVE25519_KEY_SIZE],
const u8 basepoint[CURVE25519_KEY_SIZE]);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
const u8 scalar[CURVE25519_KEY_SIZE],
const u8 point[CURVE25519_KEY_SIZE])
{
if (static_branch_likely(&have_neon) && may_use_simd()) {
kernel_neon_begin();
curve25519_neon(out, scalar, point);
kernel_neon_end();
} else {
curve25519_generic(out, scalar, point);
}
}
EXPORT_SYMBOL(curve25519_arch);
void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
const u8 secret[CURVE25519_KEY_SIZE])
{
return curve25519_arch(pub, secret, curve25519_base_point);
}
EXPORT_SYMBOL(curve25519_base_arch);
static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
unsigned int len)
{
u8 *secret = kpp_tfm_ctx(tfm);
if (!len)
curve25519_generate_secret(secret);
else if (len == CURVE25519_KEY_SIZE &&
crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
memcpy(secret, buf, CURVE25519_KEY_SIZE);
else
return -EINVAL;
return 0;
}
static int curve25519_compute_value(struct kpp_request *req)
{
struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
const u8 *secret = kpp_tfm_ctx(tfm);
u8 public_key[CURVE25519_KEY_SIZE];
u8 buf[CURVE25519_KEY_SIZE];
int copied, nbytes;
u8 const *bp;
if (req->src) {
copied = sg_copy_to_buffer(req->src,
sg_nents_for_len(req->src,
CURVE25519_KEY_SIZE),
public_key, CURVE25519_KEY_SIZE);
if (copied != CURVE25519_KEY_SIZE)
return -EINVAL;
bp = public_key;
} else {
bp = curve25519_base_point;
}
curve25519_arch(buf, secret, bp);
/* might want less than we've got */
nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
nbytes),
buf, nbytes);
if (copied != nbytes)
return -EINVAL;
return 0;
}
static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
{
return CURVE25519_KEY_SIZE;
}
static struct kpp_alg curve25519_alg = {
.base.cra_name = "curve25519",
.base.cra_driver_name = "curve25519-neon",
.base.cra_priority = 200,
.base.cra_module = THIS_MODULE,
.base.cra_ctxsize = CURVE25519_KEY_SIZE,
.set_secret = curve25519_set_secret,
.generate_public_key = curve25519_compute_value,
.compute_shared_secret = curve25519_compute_value,
.max_size = curve25519_max_size,
};
static int __init mod_init(void)
{
if (elf_hwcap & HWCAP_NEON) {
static_branch_enable(&have_neon);
return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
crypto_register_kpp(&curve25519_alg) : 0;
}
return 0;
}
static void __exit mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && elf_hwcap & HWCAP_NEON)
crypto_unregister_kpp(&curve25519_alg);
}
module_init(mod_init);
module_exit(mod_exit);
MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-neon");
MODULE_LICENSE("GPL v2");

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,272 @@
// SPDX-License-Identifier: GPL-2.0
/*
* OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
*
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/poly1305.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/jump_label.h>
#include <linux/module.h>
void poly1305_init_arm(void *state, const u8 *key);
void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
{
}
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
{
poly1305_init_arm(&dctx->h, key);
dctx->s[0] = get_unaligned_le32(key + 16);
dctx->s[1] = get_unaligned_le32(key + 20);
dctx->s[2] = get_unaligned_le32(key + 24);
dctx->s[3] = get_unaligned_le32(key + 28);
dctx->buflen = 0;
}
EXPORT_SYMBOL(poly1305_init_arch);
static int arm_poly1305_init(struct shash_desc *desc)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
dctx->buflen = 0;
dctx->rset = 0;
dctx->sset = false;
return 0;
}
static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
u32 len, u32 hibit, bool do_neon)
{
if (unlikely(!dctx->sset)) {
if (!dctx->rset) {
poly1305_init_arm(&dctx->h, src);
src += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
dctx->rset = 1;
}
if (len >= POLY1305_BLOCK_SIZE) {
dctx->s[0] = get_unaligned_le32(src + 0);
dctx->s[1] = get_unaligned_le32(src + 4);
dctx->s[2] = get_unaligned_le32(src + 8);
dctx->s[3] = get_unaligned_le32(src + 12);
src += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
dctx->sset = true;
}
if (len < POLY1305_BLOCK_SIZE)
return;
}
len &= ~(POLY1305_BLOCK_SIZE - 1);
if (static_branch_likely(&have_neon) && likely(do_neon))
poly1305_blocks_neon(&dctx->h, src, len, hibit);
else
poly1305_blocks_arm(&dctx->h, src, len, hibit);
}
static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
const u8 *src, u32 len, bool do_neon)
{
if (unlikely(dctx->buflen)) {
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
memcpy(dctx->buf + dctx->buflen, src, bytes);
src += bytes;
len -= bytes;
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
arm_poly1305_blocks(dctx, dctx->buf,
POLY1305_BLOCK_SIZE, 1, false);
dctx->buflen = 0;
}
}
if (likely(len >= POLY1305_BLOCK_SIZE)) {
arm_poly1305_blocks(dctx, src, len, 1, do_neon);
src += round_down(len, POLY1305_BLOCK_SIZE);
len %= POLY1305_BLOCK_SIZE;
}
if (unlikely(len)) {
dctx->buflen = len;
memcpy(dctx->buf, src, len);
}
}
static int arm_poly1305_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
arm_poly1305_do_update(dctx, src, srclen, false);
return 0;
}
static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
const u8 *src,
unsigned int srclen)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
bool do_neon = may_use_simd() && srclen > 128;
if (static_branch_likely(&have_neon) && do_neon)
kernel_neon_begin();
arm_poly1305_do_update(dctx, src, srclen, do_neon);
if (static_branch_likely(&have_neon) && do_neon)
kernel_neon_end();
return 0;
}
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
unsigned int nbytes)
{
bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
may_use_simd();
if (unlikely(dctx->buflen)) {
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
memcpy(dctx->buf + dctx->buflen, src, bytes);
src += bytes;
nbytes -= bytes;
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
poly1305_blocks_arm(&dctx->h, dctx->buf,
POLY1305_BLOCK_SIZE, 1);
dctx->buflen = 0;
}
}
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
if (static_branch_likely(&have_neon) && do_neon) {
do {
unsigned int todo = min_t(unsigned int, len, SZ_4K);
kernel_neon_begin();
poly1305_blocks_neon(&dctx->h, src, todo, 1);
kernel_neon_end();
len -= todo;
src += todo;
} while (len);
} else {
poly1305_blocks_arm(&dctx->h, src, len, 1);
src += len;
}
nbytes %= POLY1305_BLOCK_SIZE;
}
if (unlikely(nbytes)) {
dctx->buflen = nbytes;
memcpy(dctx->buf, src, nbytes);
}
}
EXPORT_SYMBOL(poly1305_update_arch);
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
if (unlikely(dctx->buflen)) {
dctx->buf[dctx->buflen++] = 1;
memset(dctx->buf + dctx->buflen, 0,
POLY1305_BLOCK_SIZE - dctx->buflen);
poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
}
poly1305_emit_arm(&dctx->h, dst, dctx->s);
*dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);
static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
if (unlikely(!dctx->sset))
return -ENOKEY;
poly1305_final_arch(dctx, dst);
return 0;
}
static struct shash_alg arm_poly1305_algs[] = {{
.init = arm_poly1305_init,
.update = arm_poly1305_update,
.final = arm_poly1305_final,
.digestsize = POLY1305_DIGEST_SIZE,
.descsize = sizeof(struct poly1305_desc_ctx),
.base.cra_name = "poly1305",
.base.cra_driver_name = "poly1305-arm",
.base.cra_priority = 150,
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
#ifdef CONFIG_KERNEL_MODE_NEON
}, {
.init = arm_poly1305_init,
.update = arm_poly1305_update_neon,
.final = arm_poly1305_final,
.digestsize = POLY1305_DIGEST_SIZE,
.descsize = sizeof(struct poly1305_desc_ctx),
.base.cra_name = "poly1305",
.base.cra_driver_name = "poly1305-neon",
.base.cra_priority = 200,
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
#endif
}};
static int __init arm_poly1305_mod_init(void)
{
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
(elf_hwcap & HWCAP_NEON))
static_branch_enable(&have_neon);
else if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
/* register only the first entry */
return crypto_register_shash(&arm_poly1305_algs[0]);
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
crypto_register_shashes(arm_poly1305_algs,
ARRAY_SIZE(arm_poly1305_algs)) : 0;
}
static void __exit arm_poly1305_mod_exit(void)
{
if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
return;
if (!static_branch_likely(&have_neon)) {
crypto_unregister_shash(&arm_poly1305_algs[0]);
return;
}
crypto_unregister_shashes(arm_poly1305_algs,
ARRAY_SIZE(arm_poly1305_algs));
}
module_init(arm_poly1305_mod_init);
module_exit(arm_poly1305_mod_exit);
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("poly1305");
MODULE_ALIAS_CRYPTO("poly1305-arm");
MODULE_ALIAS_CRYPTO("poly1305-neon");

View File

@ -688,6 +688,40 @@ static void disable_single_step(struct perf_event *bp)
arch_install_hw_breakpoint(bp);
}
/*
* Arm32 hardware does not always report a watchpoint hit address that matches
* one of the watchpoints set. It can also report an address "near" the
* watchpoint if a single instruction accesses both watched and unwatched
* addresses. There is no straightforward way, short of disassembling the
* offending instruction, to map that address back to the watchpoint. This
* function computes the distance of the memory access from the watchpoint as a
* heuristic for the likelihood that a given access triggered the watchpoint.
*
* See this same function in the arm64 platform code, which has the same
* problem.
*
* The function returns the distance of the address from the bytes watched by
* the watchpoint. In case of an exact match, it returns 0.
*/
static u32 get_distance_from_watchpoint(unsigned long addr, u32 val,
struct arch_hw_breakpoint_ctrl *ctrl)
{
u32 wp_low, wp_high;
u32 lens, lene;
lens = __ffs(ctrl->len);
lene = __fls(ctrl->len);
wp_low = val + lens;
wp_high = val + lene;
if (addr < wp_low)
return wp_low - addr;
else if (addr > wp_high)
return addr - wp_high;
else
return 0;
}
static int watchpoint_fault_on_uaccess(struct pt_regs *regs,
struct arch_hw_breakpoint *info)
{
@ -697,23 +731,25 @@ static int watchpoint_fault_on_uaccess(struct pt_regs *regs,
static void watchpoint_handler(unsigned long addr, unsigned int fsr,
struct pt_regs *regs)
{
int i, access;
u32 val, ctrl_reg, alignment_mask;
int i, access, closest_match = 0;
u32 min_dist = -1, dist;
u32 val, ctrl_reg;
struct perf_event *wp, **slots;
struct arch_hw_breakpoint *info;
struct arch_hw_breakpoint_ctrl ctrl;
slots = this_cpu_ptr(wp_on_reg);
/*
* Find all watchpoints that match the reported address. If no exact
* match is found. Attribute the hit to the closest watchpoint.
*/
rcu_read_lock();
for (i = 0; i < core_num_wrps; ++i) {
rcu_read_lock();
wp = slots[i];
if (wp == NULL)
goto unlock;
continue;
info = counter_arch_bp(wp);
/*
* The DFAR is an unknown value on debug architectures prior
* to 7.1. Since we only allow a single watchpoint on these
@ -722,33 +758,31 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
*/
if (debug_arch < ARM_DEBUG_ARCH_V7_1) {
BUG_ON(i > 0);
info = counter_arch_bp(wp);
info->trigger = wp->attr.bp_addr;
} else {
if (info->ctrl.len == ARM_BREAKPOINT_LEN_8)
alignment_mask = 0x7;
else
alignment_mask = 0x3;
/* Check if the watchpoint value matches. */
val = read_wb_reg(ARM_BASE_WVR + i);
if (val != (addr & ~alignment_mask))
goto unlock;
/* Possible match, check the byte address select. */
ctrl_reg = read_wb_reg(ARM_BASE_WCR + i);
decode_ctrl_reg(ctrl_reg, &ctrl);
if (!((1 << (addr & alignment_mask)) & ctrl.len))
goto unlock;
/* Check that the access type matches. */
if (debug_exception_updates_fsr()) {
access = (fsr & ARM_FSR_ACCESS_MASK) ?
HW_BREAKPOINT_W : HW_BREAKPOINT_R;
if (!(access & hw_breakpoint_type(wp)))
goto unlock;
continue;
}
val = read_wb_reg(ARM_BASE_WVR + i);
ctrl_reg = read_wb_reg(ARM_BASE_WCR + i);
decode_ctrl_reg(ctrl_reg, &ctrl);
dist = get_distance_from_watchpoint(addr, val, &ctrl);
if (dist < min_dist) {
min_dist = dist;
closest_match = i;
}
/* Is this an exact match? */
if (dist != 0)
continue;
/* We have a winner. */
info = counter_arch_bp(wp);
info->trigger = addr;
}
@ -770,13 +804,23 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
* we can single-step over the watchpoint trigger.
*/
if (!is_default_overflow_handler(wp))
goto unlock;
continue;
step:
enable_single_step(wp, instruction_pointer(regs));
unlock:
rcu_read_unlock();
}
if (min_dist > 0 && min_dist != -1) {
/* No exact match found. */
wp = slots[closest_match];
info = counter_arch_bp(wp);
info->trigger = addr;
pr_debug("watchpoint fired: address = 0x%x\n", info->trigger);
perf_bp_event(wp, regs);
if (is_default_overflow_handler(wp))
enable_single_step(wp, instruction_pointer(regs));
}
rcu_read_unlock();
}
static void watchpoint_single_step_handler(unsigned long pc)

View File

@ -1261,20 +1261,28 @@ static void __init l2c310_of_parse(const struct device_node *np,
ret = of_property_read_u32(np, "prefetch-data", &val);
if (ret == 0) {
if (val)
if (val) {
prefetch |= L310_PREFETCH_CTRL_DATA_PREFETCH;
else
*aux_val |= L310_PREFETCH_CTRL_DATA_PREFETCH;
} else {
prefetch &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
*aux_val &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
}
*aux_mask &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
} else if (ret != -EINVAL) {
pr_err("L2C-310 OF prefetch-data property value is missing\n");
}
ret = of_property_read_u32(np, "prefetch-instr", &val);
if (ret == 0) {
if (val)
if (val) {
prefetch |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
else
*aux_val |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
} else {
prefetch &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
*aux_val &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
}
*aux_mask &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
} else if (ret != -EINVAL) {
pr_err("L2C-310 OF prefetch-instr property value is missing\n");
}

View File

@ -240,6 +240,7 @@ config SAMSUNG_PM_DEBUG
bool "Samsung PM Suspend debug"
depends on PM && DEBUG_KERNEL
depends on DEBUG_EXYNOS_UART || DEBUG_S3C24XX_UART || DEBUG_S3C2410_UART
depends on DEBUG_LL && MMU
help
Say Y here if you want verbose debugging from the PM Suspend and
Resume code. See <file:Documentation/arm/Samsung-S3C24XX/Suspend.txt>

View File

@ -46,6 +46,7 @@ config ARCH_BCM_IPROC
config ARCH_BERLIN
bool "Marvell Berlin SoC Family"
select DW_APB_ICTL
select DW_APB_TIMER_OF
select GPIOLIB
select PINCTRL
help

View File

@ -10,7 +10,7 @@
#
# Copyright (C) 1995-2001 by Russell King
LDFLAGS_vmlinux :=--no-undefined -X
LDFLAGS_vmlinux :=--no-undefined -X -z norelro
CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET)
GZFLAGS :=-9
@ -18,7 +18,7 @@ ifeq ($(CONFIG_RELOCATABLE), y)
# Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour
# for relative relocs, since this leads to better Image compression
# with the relocation offsets always being zero.
LDFLAGS_vmlinux += -shared -Bsymbolic -z notext -z norelro \
LDFLAGS_vmlinux += -shared -Bsymbolic -z notext \
$(call ld-option, --no-apply-dynamic-relocs)
endif

View File

@ -21,6 +21,10 @@
aliases {
ethernet0 = &eth0;
/* for dsa slave device */
ethernet1 = &switch0port1;
ethernet2 = &switch0port2;
ethernet3 = &switch0port3;
serial0 = &uart0;
serial1 = &uart1;
};
@ -136,25 +140,25 @@
#address-cells = <1>;
#size-cells = <0>;
port@0 {
switch0port0: port@0 {
reg = <0>;
label = "cpu";
ethernet = <&eth0>;
};
port@1 {
switch0port1: port@1 {
reg = <1>;
label = "wan";
phy-handle = <&switch0phy0>;
};
port@2 {
switch0port2: port@2 {
reg = <2>;
label = "lan0";
phy-handle = <&switch0phy1>;
};
port@3 {
switch0port3: port@3 {
reg = <3>;
label = "lan1";
phy-handle = <&switch0phy2>;

View File

@ -877,7 +877,7 @@
reg-names = "mdp_phys";
interrupt-parent = <&mdss>;
interrupts = <0 0>;
interrupts = <0>;
clocks = <&gcc GCC_MDSS_AHB_CLK>,
<&gcc GCC_MDSS_AXI_CLK>,
@ -909,7 +909,7 @@
reg-names = "dsi_ctrl";
interrupt-parent = <&mdss>;
interrupts = <4 0>;
interrupts = <4>;
assigned-clocks = <&gcc BYTE0_CLK_SRC>,
<&gcc PCLK0_CLK_SRC>;

View File

@ -99,7 +99,7 @@
wcd_codec: codec@f000 {
compatible = "qcom,pm8916-wcd-analog-codec";
reg = <0xf000 0x200>;
reg = <0xf000>;
reg-names = "pmic-codec-core";
clocks = <&gcc GCC_CODEC_DIGCODEC_CLK>;
clock-names = "mclk";

View File

@ -430,6 +430,7 @@
bus-width = <8>;
mmc-hs200-1_8v;
non-removable;
full-pwr-cycle-in-suspend;
status = "okay";
};

View File

@ -411,7 +411,7 @@
};
i2c0: i2c@ff020000 {
compatible = "cdns,i2c-r1p14", "cdns,i2c-r1p10";
compatible = "cdns,i2c-r1p14";
status = "disabled";
interrupt-parent = <&gic>;
interrupts = <0 17 4>;
@ -421,7 +421,7 @@
};
i2c1: i2c@ff030000 {
compatible = "cdns,i2c-r1p14", "cdns,i2c-r1p10";
compatible = "cdns,i2c-r1p14";
status = "disabled";
interrupt-parent = <&gic>;
interrupts = <0 18 4>;

View File

@ -77,7 +77,6 @@ CONFIG_ARM_SCMI_PROTOCOL=y
CONFIG_ARM_SCPI_PROTOCOL=y
# CONFIG_ARM_SCPI_POWER_DOMAIN is not set
# CONFIG_EFI_ARMSTUB_DTB_LOADER is not set
CONFIG_ARM64_CRYPTO=y
CONFIG_CRYPTO_SHA2_ARM64_CE=y
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
CONFIG_JUMP_LABEL=y
@ -246,6 +245,7 @@ CONFIG_DM_VERITY_FEC=y
CONFIG_DM_BOW=y
CONFIG_NETDEVICES=y
CONFIG_DUMMY=y
CONFIG_WIREGUARD=y
CONFIG_TUN=y
CONFIG_VETH=y
# CONFIG_ETHERNET is not set
@ -358,6 +358,7 @@ CONFIG_HID_NINTENDO=y
CONFIG_HID_SONY=y
CONFIG_HID_STEAM=y
CONFIG_USB_HIDDEV=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_OTG=y
CONFIG_USB_XHCI_HCD=y
CONFIG_USB_GADGET=y
@ -503,6 +504,7 @@ CONFIG_CRC8=y
CONFIG_XZ_DEC=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_DWARF4=y
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
CONFIG_MAGIC_SYSRQ=y

View File

@ -1,2 +1,3 @@
sha256-core.S
sha512-core.S
poly1305-core.S

View File

@ -106,10 +106,17 @@ config CRYPTO_AES_ARM64_NEON_BLK
select CRYPTO_SIMD
config CRYPTO_CHACHA20_NEON
tristate "NEON accelerated ChaCha20 symmetric cipher"
tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
select CRYPTO_LIB_CHACHA_GENERIC
select CRYPTO_ARCH_HAVE_LIB_CHACHA
config CRYPTO_POLY1305_NEON
tristate "Poly1305 hash function using scalar or NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_ARCH_HAVE_LIB_POLY1305
config CRYPTO_AES_ARM64_BS
tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"

View File

@ -53,8 +53,12 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
sha512-arm64-y := sha512-glue.o sha512-core.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
poly1305-neon-y := poly1305-core.o poly1305-glue.o
AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64
obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
@ -71,6 +75,9 @@ ifdef REGENERATE_ARM64_CRYPTO
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $(<) void $(@)
$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl
$(call cmd,perlasm)
$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
$(call cmd,perlasm)
@ -78,4 +85,4 @@ $(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
$(call cmd,perlasm)
endif
targets += sha256-core.S sha512-core.S
targets += poly1305-core.S sha256-core.S sha512-core.S


@ -1,13 +1,13 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
* ChaCha/XChaCha NEON helper functions
*
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
* Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Based on:
* Originally based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
*
* Copyright (C) 2015 Martin Willi
@ -19,29 +19,27 @@
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>
.text
.align 6
ENTRY(chacha20_block_xor_neon)
// x0: Input state matrix, s
// x1: 1 data block output, o
// x2: 1 data block input, i
/*
* chacha_permute - permute one block
*
* Permute one 64-byte block where the state matrix is stored in the four NEON
* registers v0-v3. It performs matrix operations on four words in parallel,
* but requires shuffling to rearrange the words after each round.
*
* The round count is given in w3.
*
* Clobbers: w3, x10, v4, v12
*/
chacha_permute:
//
// This function encrypts one ChaCha20 block by loading the state matrix
// in four NEON registers. It performs matrix operation on four words in
// parallel, but requires shuffling to rearrange the words after each
// round.
//
// x0..3 = s0..3
adr x3, ROT8
ld1 {v0.4s-v3.4s}, [x0]
ld1 {v8.4s-v11.4s}, [x0]
ld1 {v12.4s}, [x3]
mov x3, #10
adr_l x10, ROT8
ld1 {v12.4s}, [x10]
.Ldoubleround:
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
@ -102,9 +100,27 @@ ENTRY(chacha20_block_xor_neon)
// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
ext v3.16b, v3.16b, v3.16b, #4
subs x3, x3, #1
subs w3, w3, #2
b.ne .Ldoubleround
ret
ENDPROC(chacha_permute)
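For readers following the register comments above: chacha_permute runs nrounds/2 double rounds over the 4x4 state held in v0-v3, i.e. the generic ChaCha permutation. A minimal C sketch of that permutation (word-at-a-time form rather than the NEON lane layout; the function names here are illustrative, not the kernel's):

#include <stdint.h>

static uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* one quarter round, matching the "x0 += x4, x12 = rotl32(x12 ^ x0, 16)" comments */
static void qround(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
	x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
	x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
}

/* nrounds/2 double rounds: a column round followed by a diagonal round */
static void chacha_permute_c(uint32_t x[16], int nrounds)
{
	int i;

	for (i = 0; i < nrounds; i += 2) {
		qround(x, 0, 4,  8, 12);	/* columns */
		qround(x, 1, 5,  9, 13);
		qround(x, 2, 6, 10, 14);
		qround(x, 3, 7, 11, 15);
		qround(x, 0, 5, 10, 15);	/* diagonals */
		qround(x, 1, 6, 11, 12);
		qround(x, 2, 7,  8, 13);
		qround(x, 3, 4,  9, 14);
	}
}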
ENTRY(chacha_block_xor_neon)
// x0: Input state matrix, s
// x1: 1 data block output, o
// x2: 1 data block input, i
// w3: nrounds
stp x29, x30, [sp, #-16]!
mov x29, sp
// x0..3 = s0..3
ld1 {v0.4s-v3.4s}, [x0]
ld1 {v8.4s-v11.4s}, [x0]
bl chacha_permute
ld1 {v4.16b-v7.16b}, [x2]
// o0 = i0 ^ (x0 + s0)
@ -125,71 +141,156 @@ ENTRY(chacha20_block_xor_neon)
st1 {v0.16b-v3.16b}, [x1]
ldp x29, x30, [sp], #16
ret
ENDPROC(chacha20_block_xor_neon)
ENDPROC(chacha_block_xor_neon)
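A rough C model of what chacha_block_xor_neon computes, assuming little-endian keystream serialization and reusing chacha_permute_c from the sketch above (the block counter in state[12] is advanced by the caller, as in the glue code):

#include <stdint.h>
#include <string.h>

/* keystream = permute(state) + state, then XOR one 64-byte block */
static void chacha_block_xor_c(const uint32_t state[16], uint8_t *dst,
			       const uint8_t *src, int nrounds)
{
	uint32_t x[16];
	uint8_t stream[64];
	int i;

	memcpy(x, state, sizeof(x));
	chacha_permute_c(x, nrounds);		/* see the earlier sketch */
	for (i = 0; i < 16; i++) {
		uint32_t w = x[i] + state[i];	/* feed-forward */
		stream[4 * i + 0] = (uint8_t)(w >> 0);
		stream[4 * i + 1] = (uint8_t)(w >> 8);
		stream[4 * i + 2] = (uint8_t)(w >> 16);
		stream[4 * i + 3] = (uint8_t)(w >> 24);
	}
	for (i = 0; i < 64; i++)
		dst[i] = src[i] ^ stream[i];
}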
ENTRY(hchacha_block_neon)
// x0: Input state matrix, s
// x1: output (8 32-bit words)
// w2: nrounds
stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {v0.4s-v3.4s}, [x0]
mov w3, w2
bl chacha_permute
st1 {v0.4s}, [x1], #16
st1 {v3.4s}, [x1]
ldp x29, x30, [sp], #16
ret
ENDPROC(hchacha_block_neon)
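hchacha_block_neon stores only words 0-3 and 12-15 of the permuted state and skips the feed-forward; this is the HChaCha construction XChaCha uses to derive a sub-key from the key and the first 128 bits of the extended nonce. In C (illustrative names, again reusing chacha_permute_c from above):

#include <stdint.h>
#include <string.h>

static void hchacha_block_c(const uint32_t state[16], uint32_t out[8], int nrounds)
{
	uint32_t x[16];

	memcpy(x, state, sizeof(x));
	chacha_permute_c(x, nrounds);
	memcpy(&out[0], &x[0],  4 * sizeof(uint32_t));	/* words 0..3  */
	memcpy(&out[4], &x[12], 4 * sizeof(uint32_t));	/* words 12..15 */
}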
a0 .req w12
a1 .req w13
a2 .req w14
a3 .req w15
a4 .req w16
a5 .req w17
a6 .req w19
a7 .req w20
a8 .req w21
a9 .req w22
a10 .req w23
a11 .req w24
a12 .req w25
a13 .req w26
a14 .req w27
a15 .req w28
.align 6
ENTRY(chacha20_4block_xor_neon)
ENTRY(chacha_4block_xor_neon)
frame_push 10
// x0: Input state matrix, s
// x1: 4 data blocks output, o
// x2: 4 data blocks input, i
// w3: nrounds
// x4: byte count
adr_l x10, .Lpermute
and x5, x4, #63
add x10, x10, x5
add x11, x10, #64
//
// This function encrypts four consecutive ChaCha20 blocks by loading
// This function encrypts four consecutive ChaCha blocks by loading
// the state matrix in NEON registers four times. The algorithm performs
// each operation on the corresponding word of each state matrix, hence
// requires no word shuffling. For final XORing step we transpose the
// matrix by interleaving 32- and then 64-bit words, which allows us to
// do XOR in NEON registers.
//
adr x3, CTRINC // ... and ROT8
ld1 {v30.4s-v31.4s}, [x3]
// At the same time, a fifth block is encrypted in parallel using
// scalar registers
//
adr_l x9, CTRINC // ... and ROT8
ld1 {v30.4s-v31.4s}, [x9]
// x0..15[0-3] = s0..3[0..3]
mov x4, x0
ld4r { v0.4s- v3.4s}, [x4], #16
ld4r { v4.4s- v7.4s}, [x4], #16
ld4r { v8.4s-v11.4s}, [x4], #16
ld4r {v12.4s-v15.4s}, [x4]
add x8, x0, #16
ld4r { v0.4s- v3.4s}, [x0]
ld4r { v4.4s- v7.4s}, [x8], #16
ld4r { v8.4s-v11.4s}, [x8], #16
ld4r {v12.4s-v15.4s}, [x8]
// x12 += counter values 0-3
mov a0, v0.s[0]
mov a1, v1.s[0]
mov a2, v2.s[0]
mov a3, v3.s[0]
mov a4, v4.s[0]
mov a5, v5.s[0]
mov a6, v6.s[0]
mov a7, v7.s[0]
mov a8, v8.s[0]
mov a9, v9.s[0]
mov a10, v10.s[0]
mov a11, v11.s[0]
mov a12, v12.s[0]
mov a13, v13.s[0]
mov a14, v14.s[0]
mov a15, v15.s[0]
// x12 += counter values 1-4
add v12.4s, v12.4s, v30.4s
mov x3, #10
.Ldoubleround4:
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
add v0.4s, v0.4s, v4.4s
add a0, a0, a4
add v1.4s, v1.4s, v5.4s
add a1, a1, a5
add v2.4s, v2.4s, v6.4s
add a2, a2, a6
add v3.4s, v3.4s, v7.4s
add a3, a3, a7
eor v12.16b, v12.16b, v0.16b
eor a12, a12, a0
eor v13.16b, v13.16b, v1.16b
eor a13, a13, a1
eor v14.16b, v14.16b, v2.16b
eor a14, a14, a2
eor v15.16b, v15.16b, v3.16b
eor a15, a15, a3
rev32 v12.8h, v12.8h
ror a12, a12, #16
rev32 v13.8h, v13.8h
ror a13, a13, #16
rev32 v14.8h, v14.8h
ror a14, a14, #16
rev32 v15.8h, v15.8h
ror a15, a15, #16
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
add v8.4s, v8.4s, v12.4s
add a8, a8, a12
add v9.4s, v9.4s, v13.4s
add a9, a9, a13
add v10.4s, v10.4s, v14.4s
add a10, a10, a14
add v11.4s, v11.4s, v15.4s
add a11, a11, a15
eor v16.16b, v4.16b, v8.16b
eor a4, a4, a8
eor v17.16b, v5.16b, v9.16b
eor a5, a5, a9
eor v18.16b, v6.16b, v10.16b
eor a6, a6, a10
eor v19.16b, v7.16b, v11.16b
eor a7, a7, a11
shl v4.4s, v16.4s, #12
shl v5.4s, v17.4s, #12
@ -197,42 +298,66 @@ ENTRY(chacha20_4block_xor_neon)
shl v7.4s, v19.4s, #12
sri v4.4s, v16.4s, #20
ror a4, a4, #20
sri v5.4s, v17.4s, #20
ror a5, a5, #20
sri v6.4s, v18.4s, #20
ror a6, a6, #20
sri v7.4s, v19.4s, #20
ror a7, a7, #20
// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
add v0.4s, v0.4s, v4.4s
add a0, a0, a4
add v1.4s, v1.4s, v5.4s
add a1, a1, a5
add v2.4s, v2.4s, v6.4s
add a2, a2, a6
add v3.4s, v3.4s, v7.4s
add a3, a3, a7
eor v12.16b, v12.16b, v0.16b
eor a12, a12, a0
eor v13.16b, v13.16b, v1.16b
eor a13, a13, a1
eor v14.16b, v14.16b, v2.16b
eor a14, a14, a2
eor v15.16b, v15.16b, v3.16b
eor a15, a15, a3
tbl v12.16b, {v12.16b}, v31.16b
ror a12, a12, #24
tbl v13.16b, {v13.16b}, v31.16b
ror a13, a13, #24
tbl v14.16b, {v14.16b}, v31.16b
ror a14, a14, #24
tbl v15.16b, {v15.16b}, v31.16b
ror a15, a15, #24
// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
add v8.4s, v8.4s, v12.4s
add a8, a8, a12
add v9.4s, v9.4s, v13.4s
add a9, a9, a13
add v10.4s, v10.4s, v14.4s
add a10, a10, a14
add v11.4s, v11.4s, v15.4s
add a11, a11, a15
eor v16.16b, v4.16b, v8.16b
eor a4, a4, a8
eor v17.16b, v5.16b, v9.16b
eor a5, a5, a9
eor v18.16b, v6.16b, v10.16b
eor a6, a6, a10
eor v19.16b, v7.16b, v11.16b
eor a7, a7, a11
shl v4.4s, v16.4s, #7
shl v5.4s, v17.4s, #7
@ -240,42 +365,66 @@ ENTRY(chacha20_4block_xor_neon)
shl v7.4s, v19.4s, #7
sri v4.4s, v16.4s, #25
ror a4, a4, #25
sri v5.4s, v17.4s, #25
ror a5, a5, #25
sri v6.4s, v18.4s, #25
ror a6, a6, #25
sri v7.4s, v19.4s, #25
ror a7, a7, #25
// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
add v0.4s, v0.4s, v5.4s
add a0, a0, a5
add v1.4s, v1.4s, v6.4s
add a1, a1, a6
add v2.4s, v2.4s, v7.4s
add a2, a2, a7
add v3.4s, v3.4s, v4.4s
add a3, a3, a4
eor v15.16b, v15.16b, v0.16b
eor a15, a15, a0
eor v12.16b, v12.16b, v1.16b
eor a12, a12, a1
eor v13.16b, v13.16b, v2.16b
eor a13, a13, a2
eor v14.16b, v14.16b, v3.16b
eor a14, a14, a3
rev32 v15.8h, v15.8h
ror a15, a15, #16
rev32 v12.8h, v12.8h
ror a12, a12, #16
rev32 v13.8h, v13.8h
ror a13, a13, #16
rev32 v14.8h, v14.8h
ror a14, a14, #16
// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
add v10.4s, v10.4s, v15.4s
add a10, a10, a15
add v11.4s, v11.4s, v12.4s
add a11, a11, a12
add v8.4s, v8.4s, v13.4s
add a8, a8, a13
add v9.4s, v9.4s, v14.4s
add a9, a9, a14
eor v16.16b, v5.16b, v10.16b
eor a5, a5, a10
eor v17.16b, v6.16b, v11.16b
eor a6, a6, a11
eor v18.16b, v7.16b, v8.16b
eor a7, a7, a8
eor v19.16b, v4.16b, v9.16b
eor a4, a4, a9
shl v5.4s, v16.4s, #12
shl v6.4s, v17.4s, #12
@ -283,42 +432,66 @@ ENTRY(chacha20_4block_xor_neon)
shl v4.4s, v19.4s, #12
sri v5.4s, v16.4s, #20
ror a5, a5, #20
sri v6.4s, v17.4s, #20
ror a6, a6, #20
sri v7.4s, v18.4s, #20
ror a7, a7, #20
sri v4.4s, v19.4s, #20
ror a4, a4, #20
// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
add v0.4s, v0.4s, v5.4s
add a0, a0, a5
add v1.4s, v1.4s, v6.4s
add a1, a1, a6
add v2.4s, v2.4s, v7.4s
add a2, a2, a7
add v3.4s, v3.4s, v4.4s
add a3, a3, a4
eor v15.16b, v15.16b, v0.16b
eor a15, a15, a0
eor v12.16b, v12.16b, v1.16b
eor a12, a12, a1
eor v13.16b, v13.16b, v2.16b
eor a13, a13, a2
eor v14.16b, v14.16b, v3.16b
eor a14, a14, a3
tbl v15.16b, {v15.16b}, v31.16b
ror a15, a15, #24
tbl v12.16b, {v12.16b}, v31.16b
ror a12, a12, #24
tbl v13.16b, {v13.16b}, v31.16b
ror a13, a13, #24
tbl v14.16b, {v14.16b}, v31.16b
ror a14, a14, #24
// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
add v10.4s, v10.4s, v15.4s
add a10, a10, a15
add v11.4s, v11.4s, v12.4s
add a11, a11, a12
add v8.4s, v8.4s, v13.4s
add a8, a8, a13
add v9.4s, v9.4s, v14.4s
add a9, a9, a14
eor v16.16b, v5.16b, v10.16b
eor a5, a5, a10
eor v17.16b, v6.16b, v11.16b
eor a6, a6, a11
eor v18.16b, v7.16b, v8.16b
eor a7, a7, a8
eor v19.16b, v4.16b, v9.16b
eor a4, a4, a9
shl v5.4s, v16.4s, #7
shl v6.4s, v17.4s, #7
@ -326,11 +499,15 @@ ENTRY(chacha20_4block_xor_neon)
shl v4.4s, v19.4s, #7
sri v5.4s, v16.4s, #25
ror a5, a5, #25
sri v6.4s, v17.4s, #25
ror a6, a6, #25
sri v7.4s, v18.4s, #25
ror a7, a7, #25
sri v4.4s, v19.4s, #25
ror a4, a4, #25
subs x3, x3, #1
subs w3, w3, #2
b.ne .Ldoubleround4
ld4r {v16.4s-v19.4s}, [x0], #16
@ -344,9 +521,21 @@ ENTRY(chacha20_4block_xor_neon)
// x2[0-3] += s0[2]
// x3[0-3] += s0[3]
add v0.4s, v0.4s, v16.4s
mov w6, v16.s[0]
mov w7, v17.s[0]
add v1.4s, v1.4s, v17.4s
mov w8, v18.s[0]
mov w9, v19.s[0]
add v2.4s, v2.4s, v18.4s
add a0, a0, w6
add a1, a1, w7
add v3.4s, v3.4s, v19.4s
add a2, a2, w8
add a3, a3, w9
CPU_BE( rev a0, a0 )
CPU_BE( rev a1, a1 )
CPU_BE( rev a2, a2 )
CPU_BE( rev a3, a3 )
ld4r {v24.4s-v27.4s}, [x0], #16
ld4r {v28.4s-v31.4s}, [x0]
@ -356,95 +545,316 @@ ENTRY(chacha20_4block_xor_neon)
// x6[0-3] += s1[2]
// x7[0-3] += s1[3]
add v4.4s, v4.4s, v20.4s
mov w6, v20.s[0]
mov w7, v21.s[0]
add v5.4s, v5.4s, v21.4s
mov w8, v22.s[0]
mov w9, v23.s[0]
add v6.4s, v6.4s, v22.4s
add a4, a4, w6
add a5, a5, w7
add v7.4s, v7.4s, v23.4s
add a6, a6, w8
add a7, a7, w9
CPU_BE( rev a4, a4 )
CPU_BE( rev a5, a5 )
CPU_BE( rev a6, a6 )
CPU_BE( rev a7, a7 )
// x8[0-3] += s2[0]
// x9[0-3] += s2[1]
// x10[0-3] += s2[2]
// x11[0-3] += s2[3]
add v8.4s, v8.4s, v24.4s
mov w6, v24.s[0]
mov w7, v25.s[0]
add v9.4s, v9.4s, v25.4s
mov w8, v26.s[0]
mov w9, v27.s[0]
add v10.4s, v10.4s, v26.4s
add a8, a8, w6
add a9, a9, w7
add v11.4s, v11.4s, v27.4s
add a10, a10, w8
add a11, a11, w9
CPU_BE( rev a8, a8 )
CPU_BE( rev a9, a9 )
CPU_BE( rev a10, a10 )
CPU_BE( rev a11, a11 )
// x12[0-3] += s3[0]
// x13[0-3] += s3[1]
// x14[0-3] += s3[2]
// x15[0-3] += s3[3]
add v12.4s, v12.4s, v28.4s
mov w6, v28.s[0]
mov w7, v29.s[0]
add v13.4s, v13.4s, v29.4s
mov w8, v30.s[0]
mov w9, v31.s[0]
add v14.4s, v14.4s, v30.4s
add a12, a12, w6
add a13, a13, w7
add v15.4s, v15.4s, v31.4s
add a14, a14, w8
add a15, a15, w9
CPU_BE( rev a12, a12 )
CPU_BE( rev a13, a13 )
CPU_BE( rev a14, a14 )
CPU_BE( rev a15, a15 )
// interleave 32-bit words in state n, n+1
ldp w6, w7, [x2], #64
zip1 v16.4s, v0.4s, v1.4s
ldp w8, w9, [x2, #-56]
eor a0, a0, w6
zip2 v17.4s, v0.4s, v1.4s
eor a1, a1, w7
zip1 v18.4s, v2.4s, v3.4s
eor a2, a2, w8
zip2 v19.4s, v2.4s, v3.4s
eor a3, a3, w9
ldp w6, w7, [x2, #-48]
zip1 v20.4s, v4.4s, v5.4s
ldp w8, w9, [x2, #-40]
eor a4, a4, w6
zip2 v21.4s, v4.4s, v5.4s
eor a5, a5, w7
zip1 v22.4s, v6.4s, v7.4s
eor a6, a6, w8
zip2 v23.4s, v6.4s, v7.4s
eor a7, a7, w9
ldp w6, w7, [x2, #-32]
zip1 v24.4s, v8.4s, v9.4s
ldp w8, w9, [x2, #-24]
eor a8, a8, w6
zip2 v25.4s, v8.4s, v9.4s
eor a9, a9, w7
zip1 v26.4s, v10.4s, v11.4s
eor a10, a10, w8
zip2 v27.4s, v10.4s, v11.4s
eor a11, a11, w9
ldp w6, w7, [x2, #-16]
zip1 v28.4s, v12.4s, v13.4s
ldp w8, w9, [x2, #-8]
eor a12, a12, w6
zip2 v29.4s, v12.4s, v13.4s
eor a13, a13, w7
zip1 v30.4s, v14.4s, v15.4s
eor a14, a14, w8
zip2 v31.4s, v14.4s, v15.4s
eor a15, a15, w9
mov x3, #64
subs x5, x4, #128
add x6, x5, x2
csel x3, x3, xzr, ge
csel x2, x2, x6, ge
// interleave 64-bit words in state n, n+2
zip1 v0.2d, v16.2d, v18.2d
zip2 v4.2d, v16.2d, v18.2d
stp a0, a1, [x1], #64
zip1 v8.2d, v17.2d, v19.2d
zip2 v12.2d, v17.2d, v19.2d
ld1 {v16.16b-v19.16b}, [x2], #64
stp a2, a3, [x1, #-56]
ld1 {v16.16b-v19.16b}, [x2], x3
subs x6, x4, #192
ccmp x3, xzr, #4, lt
add x7, x6, x2
csel x3, x3, xzr, eq
csel x2, x2, x7, eq
zip1 v1.2d, v20.2d, v22.2d
zip2 v5.2d, v20.2d, v22.2d
stp a4, a5, [x1, #-48]
zip1 v9.2d, v21.2d, v23.2d
zip2 v13.2d, v21.2d, v23.2d
ld1 {v20.16b-v23.16b}, [x2], #64
stp a6, a7, [x1, #-40]
ld1 {v20.16b-v23.16b}, [x2], x3
subs x7, x4, #256
ccmp x3, xzr, #4, lt
add x8, x7, x2
csel x3, x3, xzr, eq
csel x2, x2, x8, eq
zip1 v2.2d, v24.2d, v26.2d
zip2 v6.2d, v24.2d, v26.2d
stp a8, a9, [x1, #-32]
zip1 v10.2d, v25.2d, v27.2d
zip2 v14.2d, v25.2d, v27.2d
ld1 {v24.16b-v27.16b}, [x2], #64
stp a10, a11, [x1, #-24]
ld1 {v24.16b-v27.16b}, [x2], x3
subs x8, x4, #320
ccmp x3, xzr, #4, lt
add x9, x8, x2
csel x2, x2, x9, eq
zip1 v3.2d, v28.2d, v30.2d
zip2 v7.2d, v28.2d, v30.2d
stp a12, a13, [x1, #-16]
zip1 v11.2d, v29.2d, v31.2d
zip2 v15.2d, v29.2d, v31.2d
stp a14, a15, [x1, #-8]
ld1 {v28.16b-v31.16b}, [x2]
// xor with corresponding input, write to output
tbnz x5, #63, 0f
eor v16.16b, v16.16b, v0.16b
eor v17.16b, v17.16b, v1.16b
eor v18.16b, v18.16b, v2.16b
eor v19.16b, v19.16b, v3.16b
st1 {v16.16b-v19.16b}, [x1], #64
cbz x5, .Lout
tbnz x6, #63, 1f
eor v20.16b, v20.16b, v4.16b
eor v21.16b, v21.16b, v5.16b
st1 {v16.16b-v19.16b}, [x1], #64
eor v22.16b, v22.16b, v6.16b
eor v23.16b, v23.16b, v7.16b
st1 {v20.16b-v23.16b}, [x1], #64
cbz x6, .Lout
tbnz x7, #63, 2f
eor v24.16b, v24.16b, v8.16b
eor v25.16b, v25.16b, v9.16b
st1 {v20.16b-v23.16b}, [x1], #64
eor v26.16b, v26.16b, v10.16b
eor v27.16b, v27.16b, v11.16b
eor v28.16b, v28.16b, v12.16b
st1 {v24.16b-v27.16b}, [x1], #64
cbz x7, .Lout
tbnz x8, #63, 3f
eor v28.16b, v28.16b, v12.16b
eor v29.16b, v29.16b, v13.16b
eor v30.16b, v30.16b, v14.16b
eor v31.16b, v31.16b, v15.16b
st1 {v28.16b-v31.16b}, [x1]
.Lout: frame_pop
ret
ENDPROC(chacha20_4block_xor_neon)
CTRINC: .word 0, 1, 2, 3
// fewer than 128 bytes of in/output
0: ld1 {v8.16b}, [x10]
ld1 {v9.16b}, [x11]
movi v10.16b, #16
sub x2, x1, #64
add x1, x1, x5
ld1 {v16.16b-v19.16b}, [x2]
tbl v4.16b, {v0.16b-v3.16b}, v8.16b
tbx v20.16b, {v16.16b-v19.16b}, v9.16b
add v8.16b, v8.16b, v10.16b
add v9.16b, v9.16b, v10.16b
tbl v5.16b, {v0.16b-v3.16b}, v8.16b
tbx v21.16b, {v16.16b-v19.16b}, v9.16b
add v8.16b, v8.16b, v10.16b
add v9.16b, v9.16b, v10.16b
tbl v6.16b, {v0.16b-v3.16b}, v8.16b
tbx v22.16b, {v16.16b-v19.16b}, v9.16b
add v8.16b, v8.16b, v10.16b
add v9.16b, v9.16b, v10.16b
tbl v7.16b, {v0.16b-v3.16b}, v8.16b
tbx v23.16b, {v16.16b-v19.16b}, v9.16b
eor v20.16b, v20.16b, v4.16b
eor v21.16b, v21.16b, v5.16b
eor v22.16b, v22.16b, v6.16b
eor v23.16b, v23.16b, v7.16b
st1 {v20.16b-v23.16b}, [x1]
b .Lout
// fewer than 192 bytes of in/output
1: ld1 {v8.16b}, [x10]
ld1 {v9.16b}, [x11]
movi v10.16b, #16
add x1, x1, x6
tbl v0.16b, {v4.16b-v7.16b}, v8.16b
tbx v20.16b, {v16.16b-v19.16b}, v9.16b
add v8.16b, v8.16b, v10.16b
add v9.16b, v9.16b, v10.16b
tbl v1.16b, {v4.16b-v7.16b}, v8.16b
tbx v21.16b, {v16.16b-v19.16b}, v9.16b
add v8.16b, v8.16b, v10.16b
add v9.16b, v9.16b, v10.16b
tbl v2.16b, {v4.16b-v7.16b}, v8.16b
tbx v22.16b, {v16.16b-v19.16b}, v9.16b
add v8.16b, v8.16b, v10.16b
add v9.16b, v9.16b, v10.16b
tbl v3.16b, {v4.16b-v7.16b}, v8.16b
tbx v23.16b, {v16.16b-v19.16b}, v9.16b
eor v20.16b, v20.16b, v0.16b
eor v21.16b, v21.16b, v1.16b
eor v22.16b, v22.16b, v2.16b
eor v23.16b, v23.16b, v3.16b
st1 {v20.16b-v23.16b}, [x1]
b .Lout
// fewer than 256 bytes of in/output
2: ld1 {v4.16b}, [x10]
ld1 {v5.16b}, [x11]
movi v6.16b, #16
add x1, x1, x7
tbl v0.16b, {v8.16b-v11.16b}, v4.16b
tbx v24.16b, {v20.16b-v23.16b}, v5.16b
add v4.16b, v4.16b, v6.16b
add v5.16b, v5.16b, v6.16b
tbl v1.16b, {v8.16b-v11.16b}, v4.16b
tbx v25.16b, {v20.16b-v23.16b}, v5.16b
add v4.16b, v4.16b, v6.16b
add v5.16b, v5.16b, v6.16b
tbl v2.16b, {v8.16b-v11.16b}, v4.16b
tbx v26.16b, {v20.16b-v23.16b}, v5.16b
add v4.16b, v4.16b, v6.16b
add v5.16b, v5.16b, v6.16b
tbl v3.16b, {v8.16b-v11.16b}, v4.16b
tbx v27.16b, {v20.16b-v23.16b}, v5.16b
eor v24.16b, v24.16b, v0.16b
eor v25.16b, v25.16b, v1.16b
eor v26.16b, v26.16b, v2.16b
eor v27.16b, v27.16b, v3.16b
st1 {v24.16b-v27.16b}, [x1]
b .Lout
// fewer than 320 bytes of in/output
3: ld1 {v4.16b}, [x10]
ld1 {v5.16b}, [x11]
movi v6.16b, #16
add x1, x1, x8
tbl v0.16b, {v12.16b-v15.16b}, v4.16b
tbx v28.16b, {v24.16b-v27.16b}, v5.16b
add v4.16b, v4.16b, v6.16b
add v5.16b, v5.16b, v6.16b
tbl v1.16b, {v12.16b-v15.16b}, v4.16b
tbx v29.16b, {v24.16b-v27.16b}, v5.16b
add v4.16b, v4.16b, v6.16b
add v5.16b, v5.16b, v6.16b
tbl v2.16b, {v12.16b-v15.16b}, v4.16b
tbx v30.16b, {v24.16b-v27.16b}, v5.16b
add v4.16b, v4.16b, v6.16b
add v5.16b, v5.16b, v6.16b
tbl v3.16b, {v12.16b-v15.16b}, v4.16b
tbx v31.16b, {v24.16b-v27.16b}, v5.16b
eor v28.16b, v28.16b, v0.16b
eor v29.16b, v29.16b, v1.16b
eor v30.16b, v30.16b, v2.16b
eor v31.16b, v31.16b, v3.16b
st1 {v28.16b-v31.16b}, [x1]
b .Lout
ENDPROC(chacha_4block_xor_neon)
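Because a fifth block is now computed in scalar registers alongside the four NEON lanes, the scalar block keeps the caller's counter value and the NEON lanes take counter+1..counter+4, which is why CTRINC changed from {0,1,2,3} to {1,2,3,4}. A small C sketch of that assignment (names are illustrative only):

#include <stdint.h>

/* counter layout for one iteration of the 4+1-block routine above */
static void assign_block_counters(uint32_t base, uint32_t neon_ctr[4],
				  uint32_t *scalar_ctr)
{
	int i;

	*scalar_ctr = base;			/* fifth block, scalar registers, written first */
	for (i = 0; i < 4; i++)
		neon_ctr[i] = base + 1 + i;	/* CTRINC = { 1, 2, 3, 4 } */
}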
.section ".rodata", "a", %progbits
.align L1_CACHE_SHIFT
.Lpermute:
.set .Li, 0
.rept 192
.byte (.Li - 64)
.set .Li, .Li + 1
.endr
CTRINC: .word 1, 2, 3, 4
ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f



@ -1,8 +1,8 @@
/*
* ARM NEON accelerated ChaCha and XChaCha stream ciphers,
* ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@ -20,8 +20,9 @@
*/
#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
@ -29,40 +30,78 @@
#include <asm/neon.h>
#include <asm/simd.h>
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
int nrounds, int bytes);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes, int nrounds)
{
u8 buf[CHACHA_BLOCK_SIZE];
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
chacha_4block_xor_neon(state, dst, src, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
while (bytes >= CHACHA_BLOCK_SIZE) {
chacha_block_xor_neon(state, dst, src, nrounds);
bytes -= CHACHA_BLOCK_SIZE;
src += CHACHA_BLOCK_SIZE;
dst += CHACHA_BLOCK_SIZE;
state[12]++;
}
if (bytes) {
memcpy(buf, src, bytes);
chacha_block_xor_neon(state, buf, buf, nrounds);
memcpy(dst, buf, bytes);
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
int bytes, int nrounds)
{
while (bytes > 0) {
int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
if (l <= CHACHA_BLOCK_SIZE) {
u8 buf[CHACHA_BLOCK_SIZE];
memcpy(buf, src, l);
chacha_block_xor_neon(state, buf, buf, nrounds);
memcpy(dst, buf, l);
state[12] += 1;
break;
}
chacha_4block_xor_neon(state, dst, src, nrounds, l);
bytes -= l;
src += l;
dst += l;
state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
}
}
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
if (!static_branch_likely(&have_neon) || !may_use_simd()) {
hchacha_block_generic(state, stream, nrounds);
} else {
kernel_neon_begin();
hchacha_block_neon(state, stream, nrounds);
kernel_neon_end();
}
}
EXPORT_SYMBOL(hchacha_block_arch);
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
int nrounds)
{
if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
!may_use_simd())
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
do {
unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
kernel_neon_begin();
chacha_doneon(state, dst, src, todo, nrounds);
kernel_neon_end();
bytes -= todo;
src += todo;
dst += todo;
} while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
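The three exports above form the arch backend of the ChaCha library interface; consumers normally reach it through chacha_init()/chacha_crypt() in <crypto/chacha.h>, which dispatch to these _arch entry points. A hedged sketch of direct use, where the buffer, key, and IV names are made up for illustration and the IV is assumed to be the usual 4-byte little-endian block counter followed by the 12-byte nonce:

#include <crypto/chacha.h>

static void example_chacha20_encrypt(u8 *buf, unsigned int len,
				     const u32 key[8], const u8 iv[16])
{
	u32 state[16];

	chacha_init_arch(state, key, iv);		/* key, counter and nonce into state */
	chacha_crypt_arch(state, buf, buf, len, 20);	/* 20 rounds, in place */
}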
static int chacha_neon_stream_xor(struct skcipher_request *req,
struct chacha_ctx *ctx, u8 *iv)
const struct chacha_ctx *ctx, const u8 *iv)
{
struct skcipher_walk walk;
u32 state[16];
@ -70,18 +109,25 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
err = skcipher_walk_virt(&walk, req, false);
crypto_chacha_init(state, ctx, iv);
chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
nbytes = rounddown(nbytes, walk.stride);
kernel_neon_begin();
chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
nbytes, ctx->nrounds);
kernel_neon_end();
if (!static_branch_likely(&have_neon) ||
!may_use_simd()) {
chacha_crypt_generic(state, walk.dst.virt.addr,
walk.src.virt.addr, nbytes,
ctx->nrounds);
} else {
kernel_neon_begin();
chacha_doneon(state, walk.dst.virt.addr,
walk.src.virt.addr, nbytes, ctx->nrounds);
kernel_neon_end();
}
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
@ -93,9 +139,6 @@ static int chacha_neon(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
return crypto_chacha_crypt(req);
return chacha_neon_stream_xor(req, ctx, req->iv);
}
@ -107,14 +150,8 @@ static int xchacha_neon(struct skcipher_request *req)
u32 state[16];
u8 real_iv[16];
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
return crypto_xchacha_crypt(req);
crypto_chacha_init(state, ctx, req->iv);
kernel_neon_begin();
hchacha_block_neon(state, subctx.key, ctx->nrounds);
kernel_neon_end();
chacha_init_generic(state, ctx->key, req->iv);
hchacha_block_arch(state, subctx.key, ctx->nrounds);
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
@ -135,8 +172,8 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = chacha_neon,
.decrypt = chacha_neon,
}, {
@ -151,8 +188,8 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}, {
@ -167,8 +204,8 @@ static struct skcipher_alg algs[] = {
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha12_setkey,
.walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = chacha12_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}
@ -176,15 +213,19 @@ static struct skcipher_alg algs[] = {
static int __init chacha_simd_mod_init(void)
{
if (!(elf_hwcap & HWCAP_NEON))
return -ENODEV;
if (!(elf_hwcap & HWCAP_ASIMD))
return 0;
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
static_branch_enable(&have_neon);
return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}
static void __exit chacha_simd_mod_fini(void)
{
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && (elf_hwcap & HWCAP_ASIMD))
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);


@ -1,133 +0,0 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
*
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
{
u8 buf[CHACHA_BLOCK_SIZE];
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
kernel_neon_begin();
chacha20_4block_xor_neon(state, dst, src);
kernel_neon_end();
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
if (!bytes)
return;
kernel_neon_begin();
while (bytes >= CHACHA_BLOCK_SIZE) {
chacha20_block_xor_neon(state, dst, src);
bytes -= CHACHA_BLOCK_SIZE;
src += CHACHA_BLOCK_SIZE;
dst += CHACHA_BLOCK_SIZE;
state[12]++;
}
if (bytes) {
memcpy(buf, src, bytes);
chacha20_block_xor_neon(state, buf, buf);
memcpy(dst, buf, bytes);
}
kernel_neon_end();
}
static int chacha20_neon(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
u32 state[16];
int err;
if (!may_use_simd() || req->cryptlen <= CHACHA_BLOCK_SIZE)
return crypto_chacha_crypt(req);
err = skcipher_walk_virt(&walk, req, false);
crypto_chacha_init(state, ctx, walk.iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
nbytes);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
static struct skcipher_alg alg = {
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = chacha20_neon,
.decrypt = chacha20_neon,
};
static int __init chacha20_simd_mod_init(void)
{
if (!(elf_hwcap & HWCAP_ASIMD))
return -ENODEV;
return crypto_register_skcipher(&alg);
}
static void __exit chacha20_simd_mod_fini(void)
{
crypto_unregister_skcipher(&alg);
}
module_init(chacha20_simd_mod_init);
module_exit(chacha20_simd_mod_fini);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");


@ -0,0 +1,913 @@
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
# IALU/gcc-4.9 NEON
#
# Apple A7 1.86/+5% 0.72
# Cortex-A53 2.69/+58% 1.47
# Cortex-A57 2.70/+7% 1.14
# Denver 1.64/+50% 1.18(*)
# X-Gene 2.13/+68% 2.27
# Mongoose 1.77/+75% 1.12
# Kryo 2.70/+55% 1.13
# ThunderX2 1.17/+95% 1.36
#
# (*) estimate based on resources availability is less than 1.0,
# i.e. measured result is worse than expected, presumably binary
# translator is not almighty;
$flavour=shift;
$output=shift;
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
.extern OPENSSL_armcap_P
#endif
.text
// forward "declarations" are required for Apple
.globl poly1305_blocks
.globl poly1305_emit
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
cmp $inp,xzr
stp xzr,xzr,[$ctx] // zero hash value
stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
csel x0,xzr,x0,eq
b.eq .Lno_key
#ifndef __KERNEL__
adrp x17,OPENSSL_armcap_P
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif
ldp $r0,$r1,[$inp] // load key
mov $s1,#0xfffffffc0fffffff
movk $s1,#0x0fff,lsl#48
#ifdef __AARCH64EB__
rev $r0,$r0 // flip bytes
rev $r1,$r1
#endif
and $r0,$r0,$s1 // &=0ffffffc0fffffff
and $s1,$s1,#-4
and $r1,$r1,$s1 // &=0ffffffc0ffffffc
mov w#$s1,#-1
stp $r0,$r1,[$ctx,#32] // save key value
str w#$s1,[$ctx,#48] // impossible key power value
#ifndef __KERNEL__
tst w17,#ARMV7_NEON
adr $d0,.Lpoly1305_blocks
adr $r0,.Lpoly1305_blocks_neon
adr $d1,.Lpoly1305_emit
csel $d0,$d0,$r0,eq
# ifdef __ILP32__
stp w#$d0,w#$d1,[$len]
# else
stp $d0,$d1,[$len]
# endif
#endif
mov x0,#1
.Lno_key:
ret
.size poly1305_init,.-poly1305_init
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
.Lpoly1305_blocks:
ands $len,$len,#-16
b.eq .Lno_data
ldp $h0,$h1,[$ctx] // load hash value
ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
ldp $r0,$r1,[$ctx,#32] // load key value
#ifdef __AARCH64EB__
lsr $d0,$h0,#32
mov w#$d1,w#$h0
lsr $d2,$h1,#32
mov w15,w#$h1
lsr x16,$h2,#32
#else
mov w#$d0,w#$h0
lsr $d1,$h0,#32
mov w#$d2,w#$h1
lsr x15,$h1,#32
mov w16,w#$h2
#endif
add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
lsr $d1,$d2,#12
adds $d0,$d0,$d2,lsl#52
add $d1,$d1,x15,lsl#14
adc $d1,$d1,xzr
lsr $d2,x16,#24
adds $d1,$d1,x16,lsl#40
adc $d2,$d2,xzr
cmp x17,#0 // is_base2_26?
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
csel $h0,$h0,$d0,eq // choose between radixes
csel $h1,$h1,$d1,eq
csel $h2,$h2,$d2,eq
.Loop:
ldp $t0,$t1,[$inp],#16 // load input
sub $len,$len,#16
#ifdef __AARCH64EB__
rev $t0,$t0
rev $t1,$t1
#endif
adds $h0,$h0,$t0 // accumulate input
adcs $h1,$h1,$t1
mul $d0,$h0,$r0 // h0*r0
adc $h2,$h2,$padbit
umulh $d1,$h0,$r0
mul $t0,$h1,$s1 // h1*5*r1
umulh $t1,$h1,$s1
adds $d0,$d0,$t0
mul $t0,$h0,$r1 // h0*r1
adc $d1,$d1,$t1
umulh $d2,$h0,$r1
adds $d1,$d1,$t0
mul $t0,$h1,$r0 // h1*r0
adc $d2,$d2,xzr
umulh $t1,$h1,$r0
adds $d1,$d1,$t0
mul $t0,$h2,$s1 // h2*5*r1
adc $d2,$d2,$t1
mul $t1,$h2,$r0 // h2*r0
adds $d1,$d1,$t0
adc $d2,$d2,$t1
and $t0,$d2,#-4 // final reduction
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$d0,$t0
adcs $h1,$d1,xzr
adc $h2,$h2,xzr
cbnz $len,.Loop
stp $h0,$h1,[$ctx] // store hash value
stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
.Lno_data:
ret
.size poly1305_blocks,.-poly1305_blocks
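The scalar .Loop above is a two-limb schoolbook multiply with the 2^130 ≡ 5 wraparound pre-folded into s1 = r1 + (r1 >> 2) (valid because key clamping clears the low bits of r1). A C model of one block step using unsigned __int128, with illustrative names rather than the kernel's:

#include <stdint.h>

typedef unsigned __int128 u128;

/* h = (h + m + padbit * 2^128) * r  (mod 2^130 - 5); h held as h[0], h[1] and a few bits in h[2] */
static void poly1305_block_c(uint64_t h[3], const uint64_t m[2],
			     uint64_t r0, uint64_t r1, uint64_t padbit)
{
	uint64_t s1 = r1 + (r1 >> 2);	/* r1 * 5 / 4: the 2^130 = 5 fold, usable since r1 % 4 == 0 */
	u128 t, d0, d1, d2;

	/* accumulate the 128-bit message block plus the pad bit */
	t = (u128)h[0] + m[0];			h[0] = (uint64_t)t;
	t = (u128)h[1] + m[1] + (t >> 64);	h[1] = (uint64_t)t;
	h[2] += padbit + (uint64_t)(t >> 64);

	/* schoolbook multiply by r1:r0, folding columns at or above 2^128 through s1 */
	d0 = (u128)h[0] * r0 + (u128)h[1] * s1;
	d1 = (u128)h[0] * r1 + (u128)h[1] * r0 + (u128)h[2] * s1 + (d0 >> 64);
	d2 = (u128)h[2] * r0 + (d1 >> 64);

	/* partial ("final" in the asm comments) reduction: fold d2 >> 2 back in times 5 */
	h[0] = (uint64_t)d0;
	h[1] = (uint64_t)d1;
	h[2] = (uint64_t)d2 & 3;
	t = (u128)h[0] + (((uint64_t)d2 & ~(uint64_t)3) + ((uint64_t)d2 >> 2));
	h[0] = (uint64_t)t;
	t = (u128)h[1] + (t >> 64);
	h[1] = (uint64_t)t;
	h[2] += (uint64_t)(t >> 64);
}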
.type poly1305_emit,%function
.align 5
poly1305_emit:
.Lpoly1305_emit:
ldp $h0,$h1,[$ctx] // load hash base 2^64
ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
ldp $t0,$t1,[$nonce] // load nonce
#ifdef __AARCH64EB__
lsr $d0,$h0,#32
mov w#$d1,w#$h0
lsr $d2,$h1,#32
mov w15,w#$h1
lsr x16,$h2,#32
#else
mov w#$d0,w#$h0
lsr $d1,$h0,#32
mov w#$d2,w#$h1
lsr x15,$h1,#32
mov w16,w#$h2
#endif
add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
lsr $d1,$d2,#12
adds $d0,$d0,$d2,lsl#52
add $d1,$d1,x15,lsl#14
adc $d1,$d1,xzr
lsr $d2,x16,#24
adds $d1,$d1,x16,lsl#40
adc $d2,$d2,xzr
cmp $r0,#0 // is_base2_26?
csel $h0,$h0,$d0,eq // choose between radixes
csel $h1,$h1,$d1,eq
csel $h2,$h2,$d2,eq
adds $d0,$h0,#5 // compare to modulus
adcs $d1,$h1,xzr
adc $d2,$h2,xzr
tst $d2,#-4 // see if it's carried/borrowed
csel $h0,$h0,$d0,eq
csel $h1,$h1,$d1,eq
#ifdef __AARCH64EB__
ror $t0,$t0,#32 // flip nonce words
ror $t1,$t1,#32
#endif
adds $h0,$h0,$t0 // accumulate nonce
adc $h1,$h1,$t1
#ifdef __AARCH64EB__
rev $h0,$h0 // flip output bytes
rev $h1,$h1
#endif
stp $h0,$h1,[$mac] // write result
ret
.size poly1305_emit,.-poly1305_emit
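poly1305_emit performs the final conditional subtraction of p = 2^130 - 5 and adds the nonce modulo 2^128; the radix conversion at its top only matters when the NEON path left the accumulator in base 2^26. A C model of the tail, assuming a little-endian host for the byte store and using illustrative names:

#include <stdint.h>
#include <string.h>

typedef unsigned __int128 u128;

static void poly1305_emit_c(const uint64_t h[3], uint8_t mac[16],
			    const uint64_t nonce[2])
{
	uint64_t h0 = h[0], h1 = h[1], h2 = h[2];
	uint64_t g0, g1, g2;
	u128 t;

	/* g = h + 5; if that carries past bit 129 then h >= 2^130 - 5 and g is the reduced value */
	t = (u128)h0 + 5;		g0 = (uint64_t)t;
	t = (u128)h1 + (t >> 64);	g1 = (uint64_t)t;
	g2 = h2 + (uint64_t)(t >> 64);
	if (g2 & ~(uint64_t)3) {	/* the "tst ...,#-4" test above */
		h0 = g0;
		h1 = g1;
	}

	/* mac = (h + nonce) mod 2^128 */
	t = (u128)h0 + nonce[0];	h0 = (uint64_t)t;
	h1 = h1 + nonce[1] + (uint64_t)(t >> 64);
	memcpy(mac, &h0, 8);		/* little-endian host assumed */
	memcpy(mac + 8, &h1, 8);
}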
___
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
my ($T0,$T1,$MASK) = map("v$_",(29..31));
my ($in2,$zeros)=("x16","x17");
my $is_base2_26 = $zeros; # borrow
$code.=<<___;
.type poly1305_mult,%function
.align 5
poly1305_mult:
mul $d0,$h0,$r0 // h0*r0
umulh $d1,$h0,$r0
mul $t0,$h1,$s1 // h1*5*r1
umulh $t1,$h1,$s1
adds $d0,$d0,$t0
mul $t0,$h0,$r1 // h0*r1
adc $d1,$d1,$t1
umulh $d2,$h0,$r1
adds $d1,$d1,$t0
mul $t0,$h1,$r0 // h1*r0
adc $d2,$d2,xzr
umulh $t1,$h1,$r0
adds $d1,$d1,$t0
mul $t0,$h2,$s1 // h2*5*r1
adc $d2,$d2,$t1
mul $t1,$h2,$r0 // h2*r0
adds $d1,$d1,$t0
adc $d2,$d2,$t1
and $t0,$d2,#-4 // final reduction
and $h2,$d2,#3
add $t0,$t0,$d2,lsr#2
adds $h0,$d0,$t0
adcs $h1,$d1,xzr
adc $h2,$h2,xzr
ret
.size poly1305_mult,.-poly1305_mult
.type poly1305_splat,%function
.align 4
poly1305_splat:
and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
ubfx x13,$h0,#26,#26
extr x14,$h1,$h0,#52
and x14,x14,#0x03ffffff
ubfx x15,$h1,#14,#26
extr x16,$h2,$h1,#40
str w12,[$ctx,#16*0] // r0
add w12,w13,w13,lsl#2 // r1*5
str w13,[$ctx,#16*1] // r1
add w13,w14,w14,lsl#2 // r2*5
str w12,[$ctx,#16*2] // s1
str w14,[$ctx,#16*3] // r2
add w14,w15,w15,lsl#2 // r3*5
str w13,[$ctx,#16*4] // s2
str w15,[$ctx,#16*5] // r3
add w15,w16,w16,lsl#2 // r4*5
str w14,[$ctx,#16*6] // s3
str w16,[$ctx,#16*7] // r4
str w15,[$ctx,#16*8] // s4
ret
.size poly1305_splat,.-poly1305_splat
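poly1305_splat re-expresses the base 2^64 value as five 26-bit limbs and caches 5*r1..5*r4 next to them, which is the form the NEON columns consume. The same split in C (illustrative names):

#include <stdint.h>

static void poly1305_to_base26(uint32_t r[5], uint32_t s[5],
			       uint64_t h0, uint64_t h1, uint64_t h2)
{
	int i;

	r[0] = (uint32_t)h0 & 0x03ffffff;
	r[1] = (uint32_t)(h0 >> 26) & 0x03ffffff;
	r[2] = (uint32_t)((h0 >> 52) | (h1 << 12)) & 0x03ffffff;
	r[3] = (uint32_t)(h1 >> 14) & 0x03ffffff;
	r[4] = (uint32_t)((h1 >> 40) | (h2 << 24));

	for (i = 0; i < 5; i++)
		s[i] = r[i] * 5;	/* the asm table stores only s1..s4; s0 is never needed */
}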
#ifdef __KERNEL__
.globl poly1305_blocks_neon
#endif
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
ldr $is_base2_26,[$ctx,#24]
cmp $len,#128
b.lo .Lpoly1305_blocks
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-80]!
add x29,sp,#0
stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]
cbz $is_base2_26,.Lbase2_64_neon
ldp w10,w11,[$ctx] // load hash value base 2^26
ldp w12,w13,[$ctx,#8]
ldr w14,[$ctx,#16]
tst $len,#31
b.eq .Leven_neon
ldp $r0,$r1,[$ctx,#32] // load key value
add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr $h1,x12,#12
adds $h0,$h0,x12,lsl#52
add $h1,$h1,x13,lsl#14
adc $h1,$h1,xzr
lsr $h2,x14,#24
adds $h1,$h1,x14,lsl#40
adc $d2,$h2,xzr // can be partially reduced...
ldp $d0,$d1,[$inp],#16 // load input
sub $len,$len,#16
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __AARCH64EB__
rev $d0,$d0
rev $d1,$d1
#endif
adds $h0,$h0,$d0 // accumulate input
adcs $h1,$h1,$d1
adc $h2,$h2,$padbit
bl poly1305_mult
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,$h0,#26,#26
extr x12,$h1,$h0,#52
and x12,x12,#0x03ffffff
ubfx x13,$h1,#14,#26
extr x14,$h2,$h1,#40
b .Leven_neon
.align 4
.Lbase2_64_neon:
ldp $r0,$r1,[$ctx,#32] // load key value
ldp $h0,$h1,[$ctx] // load hash value base 2^64
ldr $h2,[$ctx,#16]
tst $len,#31
b.eq .Linit_neon
ldp $d0,$d1,[$inp],#16 // load input
sub $len,$len,#16
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __AARCH64EB__
rev $d0,$d0
rev $d1,$d1
#endif
adds $h0,$h0,$d0 // accumulate input
adcs $h1,$h1,$d1
adc $h2,$h2,$padbit
bl poly1305_mult
.Linit_neon:
ldr w17,[$ctx,#48] // first table element
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,$h0,#26,#26
extr x12,$h1,$h0,#52
and x12,x12,#0x03ffffff
ubfx x13,$h1,#14,#26
extr x14,$h2,$h1,#40
cmp w17,#-1 // is value impossible?
b.ne .Leven_neon
fmov ${H0},x10
fmov ${H1},x11
fmov ${H2},x12
fmov ${H3},x13
fmov ${H4},x14
////////////////////////////////// initialize r^n table
mov $h0,$r0 // r^1
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
mov $h1,$r1
mov $h2,xzr
add $ctx,$ctx,#48+12
bl poly1305_splat
bl poly1305_mult // r^2
sub $ctx,$ctx,#4
bl poly1305_splat
bl poly1305_mult // r^3
sub $ctx,$ctx,#4
bl poly1305_splat
bl poly1305_mult // r^4
sub $ctx,$ctx,#4
bl poly1305_splat
sub $ctx,$ctx,#48 // restore original $ctx
b .Ldo_neon
.align 4
.Leven_neon:
fmov ${H0},x10
fmov ${H1},x11
fmov ${H2},x12
fmov ${H3},x13
fmov ${H4},x14
.Ldo_neon:
ldp x8,x12,[$inp,#32] // inp[2:3]
subs $len,$len,#64
ldp x9,x13,[$inp,#48]
add $in2,$inp,#96
adr $zeros,.Lzeros
lsl $padbit,$padbit,#24
add x15,$ctx,#48
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov $IN23_0,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,$padbit,x12,lsr#40
add x13,$padbit,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov $IN23_1,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
fmov $IN23_2,x8
fmov $IN23_3,x10
fmov $IN23_4,x12
ldp x8,x12,[$inp],#16 // inp[0:1]
ldp x9,x13,[$inp],#48
ld1 {$R0,$R1,$S1,$R2},[x15],#64
ld1 {$S2,$R3,$S3,$R4},[x15],#64
ld1 {$S4},[x15]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov $IN01_0,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,$padbit,x12,lsr#40
add x13,$padbit,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov $IN01_1,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
movi $MASK.2d,#-1
fmov $IN01_2,x8
fmov $IN01_3,x10
fmov $IN01_4,x12
ushr $MASK.2d,$MASK.2d,#38
b.ls .Lskip_loop
.align 4
.Loop_neon:
////////////////////////////////////////////////////////////////
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
// \___________________/
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
// \___________________/ \____________________/
//
// Note that we start with inp[2:3]*r^2. This is because it
// doesn't depend on reduction in previous iteration.
////////////////////////////////////////////////////////////////
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
subs $len,$len,#64
umull $ACC4,$IN23_0,${R4}[2]
csel $in2,$zeros,$in2,lo
umull $ACC3,$IN23_0,${R3}[2]
umull $ACC2,$IN23_0,${R2}[2]
ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
umull $ACC1,$IN23_0,${R1}[2]
ldp x9,x13,[$in2],#48
umull $ACC0,$IN23_0,${R0}[2]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
umlal $ACC4,$IN23_1,${R3}[2]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal $ACC3,$IN23_1,${R2}[2]
and x5,x9,#0x03ffffff
umlal $ACC2,$IN23_1,${R1}[2]
ubfx x6,x8,#26,#26
umlal $ACC1,$IN23_1,${R0}[2]
ubfx x7,x9,#26,#26
umlal $ACC0,$IN23_1,${S4}[2]
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal $ACC4,$IN23_2,${R2}[2]
extr x8,x12,x8,#52
umlal $ACC3,$IN23_2,${R1}[2]
extr x9,x13,x9,#52
umlal $ACC2,$IN23_2,${R0}[2]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal $ACC1,$IN23_2,${S4}[2]
fmov $IN23_0,x4
umlal $ACC0,$IN23_2,${S3}[2]
and x8,x8,#0x03ffffff
umlal $ACC4,$IN23_3,${R1}[2]
and x9,x9,#0x03ffffff
umlal $ACC3,$IN23_3,${R0}[2]
ubfx x10,x12,#14,#26
umlal $ACC2,$IN23_3,${S4}[2]
ubfx x11,x13,#14,#26
umlal $ACC1,$IN23_3,${S3}[2]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal $ACC0,$IN23_3,${S2}[2]
fmov $IN23_1,x6
add $IN01_2,$IN01_2,$H2
add x12,$padbit,x12,lsr#40
umlal $ACC4,$IN23_4,${R0}[2]
add x13,$padbit,x13,lsr#40
umlal $ACC3,$IN23_4,${S4}[2]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal $ACC2,$IN23_4,${S3}[2]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal $ACC1,$IN23_4,${S2}[2]
fmov $IN23_2,x8
umlal $ACC0,$IN23_4,${S1}[2]
fmov $IN23_3,x10
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4 and accumulate
add $IN01_0,$IN01_0,$H0
fmov $IN23_4,x12
umlal $ACC3,$IN01_2,${R1}[0]
ldp x8,x12,[$inp],#16 // inp[0:1]
umlal $ACC0,$IN01_2,${S3}[0]
ldp x9,x13,[$inp],#48
umlal $ACC4,$IN01_2,${R2}[0]
umlal $ACC1,$IN01_2,${S4}[0]
umlal $ACC2,$IN01_2,${R0}[0]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
add $IN01_1,$IN01_1,$H1
umlal $ACC3,$IN01_0,${R3}[0]
umlal $ACC4,$IN01_0,${R4}[0]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal $ACC2,$IN01_0,${R2}[0]
and x5,x9,#0x03ffffff
umlal $ACC0,$IN01_0,${R0}[0]
ubfx x6,x8,#26,#26
umlal $ACC1,$IN01_0,${R1}[0]
ubfx x7,x9,#26,#26
add $IN01_3,$IN01_3,$H3
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal $ACC3,$IN01_1,${R2}[0]
extr x8,x12,x8,#52
umlal $ACC4,$IN01_1,${R3}[0]
extr x9,x13,x9,#52
umlal $ACC0,$IN01_1,${S4}[0]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal $ACC2,$IN01_1,${R1}[0]
fmov $IN01_0,x4
umlal $ACC1,$IN01_1,${R0}[0]
and x8,x8,#0x03ffffff
add $IN01_4,$IN01_4,$H4
and x9,x9,#0x03ffffff
umlal $ACC3,$IN01_3,${R0}[0]
ubfx x10,x12,#14,#26
umlal $ACC0,$IN01_3,${S2}[0]
ubfx x11,x13,#14,#26
umlal $ACC4,$IN01_3,${R1}[0]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal $ACC1,$IN01_3,${S3}[0]
fmov $IN01_1,x6
umlal $ACC2,$IN01_3,${S4}[0]
add x12,$padbit,x12,lsr#40
umlal $ACC3,$IN01_4,${S4}[0]
add x13,$padbit,x13,lsr#40
umlal $ACC0,$IN01_4,${S1}[0]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal $ACC4,$IN01_4,${R0}[0]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal $ACC1,$IN01_4,${S2}[0]
fmov $IN01_2,x8
umlal $ACC2,$IN01_4,${S3}[0]
fmov $IN01_3,x10
fmov $IN01_4,x12
/////////////////////////////////////////////////////////////////
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
// and P. Schwabe
//
// [see discussion in poly1305-armv4 module]
ushr $T0.2d,$ACC3,#26
xtn $H3,$ACC3
ushr $T1.2d,$ACC0,#26
and $ACC0,$ACC0,$MASK.2d
add $ACC4,$ACC4,$T0.2d // h3 -> h4
bic $H3,#0xfc,lsl#24 // &=0x03ffffff
add $ACC1,$ACC1,$T1.2d // h0 -> h1
ushr $T0.2d,$ACC4,#26
xtn $H4,$ACC4
ushr $T1.2d,$ACC1,#26
xtn $H1,$ACC1
bic $H4,#0xfc,lsl#24
add $ACC2,$ACC2,$T1.2d // h1 -> h2
add $ACC0,$ACC0,$T0.2d
shl $T0.2d,$T0.2d,#2
shrn $T1.2s,$ACC2,#26
xtn $H2,$ACC2
add $ACC0,$ACC0,$T0.2d // h4 -> h0
bic $H1,#0xfc,lsl#24
add $H3,$H3,$T1.2s // h2 -> h3
bic $H2,#0xfc,lsl#24
shrn $T0.2s,$ACC0,#26
xtn $H0,$ACC0
ushr $T1.2s,$H3,#26
bic $H3,#0xfc,lsl#24
bic $H0,#0xfc,lsl#24
add $H1,$H1,$T0.2s // h0 -> h1
add $H4,$H4,$T1.2s // h3 -> h4
b.hi .Loop_neon
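The d0..d4 comments above spell out the radix 2^26 schoolbook product with the columns at or above 2^130 folded through 5*r, and the carry code that follows is only a lazy, partial reduction (limbs may stay slightly above 2^26 until poly1305_emit). The same two steps in scalar C, one message block at a time (the NEON code runs two blocks per vector; names are illustrative):

#include <stdint.h>

/* d = h * r in radix 2^26; s[i] = 5 * r[i] carries the 2^130 = 5 wraparound */
static void poly1305_mul26(uint64_t d[5], const uint32_t h[5],
			   const uint32_t r[5], const uint32_t s[5])
{
	d[0] = (uint64_t)h[0]*r[0] + (uint64_t)h[1]*s[4] + (uint64_t)h[2]*s[3] +
	       (uint64_t)h[3]*s[2] + (uint64_t)h[4]*s[1];
	d[1] = (uint64_t)h[0]*r[1] + (uint64_t)h[1]*r[0] + (uint64_t)h[2]*s[4] +
	       (uint64_t)h[3]*s[3] + (uint64_t)h[4]*s[2];
	d[2] = (uint64_t)h[0]*r[2] + (uint64_t)h[1]*r[1] + (uint64_t)h[2]*r[0] +
	       (uint64_t)h[3]*s[4] + (uint64_t)h[4]*s[3];
	d[3] = (uint64_t)h[0]*r[3] + (uint64_t)h[1]*r[2] + (uint64_t)h[2]*r[1] +
	       (uint64_t)h[3]*r[0] + (uint64_t)h[4]*s[4];
	d[4] = (uint64_t)h[0]*r[4] + (uint64_t)h[1]*r[3] + (uint64_t)h[2]*r[2] +
	       (uint64_t)h[3]*r[1] + (uint64_t)h[4]*r[0];
}

/* the lazy carry chain, in the same order as the vector code above */
static void poly1305_reduce26(uint64_t d[5])
{
	uint64_t c;

	c = d[3] >> 26; d[3] &= 0x03ffffff; d[4] += c;		/* h3 -> h4 */
	c = d[0] >> 26; d[0] &= 0x03ffffff; d[1] += c;		/* h0 -> h1 */
	c = d[4] >> 26; d[4] &= 0x03ffffff; d[0] += c * 5;	/* h4 -> h0 (2^130 = 5) */
	c = d[1] >> 26; d[1] &= 0x03ffffff; d[2] += c;		/* h1 -> h2 */
	c = d[2] >> 26; d[2] &= 0x03ffffff; d[3] += c;		/* h2 -> h3 */
	c = d[0] >> 26; d[0] &= 0x03ffffff; d[1] += c;		/* h0 -> h1 */
	c = d[3] >> 26; d[3] &= 0x03ffffff; d[4] += c;		/* h3 -> h4 */
}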
.Lskip_loop:
dup $IN23_2,${IN23_2}[0]
add $IN01_2,$IN01_2,$H2
////////////////////////////////////////////////////////////////
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
adds $len,$len,#32
b.ne .Long_tail
dup $IN23_2,${IN01_2}[0]
add $IN23_0,$IN01_0,$H0
add $IN23_3,$IN01_3,$H3
add $IN23_1,$IN01_1,$H1
add $IN23_4,$IN01_4,$H4
.Long_tail:
dup $IN23_0,${IN23_0}[0]
umull2 $ACC0,$IN23_2,${S3}
umull2 $ACC3,$IN23_2,${R1}
umull2 $ACC4,$IN23_2,${R2}
umull2 $ACC2,$IN23_2,${R0}
umull2 $ACC1,$IN23_2,${S4}
dup $IN23_1,${IN23_1}[0]
umlal2 $ACC0,$IN23_0,${R0}
umlal2 $ACC2,$IN23_0,${R2}
umlal2 $ACC3,$IN23_0,${R3}
umlal2 $ACC4,$IN23_0,${R4}
umlal2 $ACC1,$IN23_0,${R1}
dup $IN23_3,${IN23_3}[0]
umlal2 $ACC0,$IN23_1,${S4}
umlal2 $ACC3,$IN23_1,${R2}
umlal2 $ACC2,$IN23_1,${R1}
umlal2 $ACC4,$IN23_1,${R3}
umlal2 $ACC1,$IN23_1,${R0}
dup $IN23_4,${IN23_4}[0]
umlal2 $ACC3,$IN23_3,${R0}
umlal2 $ACC4,$IN23_3,${R1}
umlal2 $ACC0,$IN23_3,${S2}
umlal2 $ACC1,$IN23_3,${S3}
umlal2 $ACC2,$IN23_3,${S4}
umlal2 $ACC3,$IN23_4,${S4}
umlal2 $ACC0,$IN23_4,${S1}
umlal2 $ACC4,$IN23_4,${R0}
umlal2 $ACC1,$IN23_4,${S2}
umlal2 $ACC2,$IN23_4,${S3}
b.eq .Lshort_tail
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4:r^3 and accumulate
add $IN01_0,$IN01_0,$H0
umlal $ACC3,$IN01_2,${R1}
umlal $ACC0,$IN01_2,${S3}
umlal $ACC4,$IN01_2,${R2}
umlal $ACC1,$IN01_2,${S4}
umlal $ACC2,$IN01_2,${R0}
add $IN01_1,$IN01_1,$H1
umlal $ACC3,$IN01_0,${R3}
umlal $ACC0,$IN01_0,${R0}
umlal $ACC4,$IN01_0,${R4}
umlal $ACC1,$IN01_0,${R1}
umlal $ACC2,$IN01_0,${R2}
add $IN01_3,$IN01_3,$H3
umlal $ACC3,$IN01_1,${R2}
umlal $ACC0,$IN01_1,${S4}
umlal $ACC4,$IN01_1,${R3}
umlal $ACC1,$IN01_1,${R0}
umlal $ACC2,$IN01_1,${R1}
add $IN01_4,$IN01_4,$H4
umlal $ACC3,$IN01_3,${R0}
umlal $ACC0,$IN01_3,${S2}
umlal $ACC4,$IN01_3,${R1}
umlal $ACC1,$IN01_3,${S3}
umlal $ACC2,$IN01_3,${S4}
umlal $ACC3,$IN01_4,${S4}
umlal $ACC0,$IN01_4,${S1}
umlal $ACC4,$IN01_4,${R0}
umlal $ACC1,$IN01_4,${S2}
umlal $ACC2,$IN01_4,${S3}
.Lshort_tail:
////////////////////////////////////////////////////////////////
// horizontal add
addp $ACC3,$ACC3,$ACC3
ldp d8,d9,[sp,#16] // meet ABI requirements
addp $ACC0,$ACC0,$ACC0
ldp d10,d11,[sp,#32]
addp $ACC4,$ACC4,$ACC4
ldp d12,d13,[sp,#48]
addp $ACC1,$ACC1,$ACC1
ldp d14,d15,[sp,#64]
addp $ACC2,$ACC2,$ACC2
ldr x30,[sp,#8]
////////////////////////////////////////////////////////////////
// lazy reduction, but without narrowing
ushr $T0.2d,$ACC3,#26
and $ACC3,$ACC3,$MASK.2d
ushr $T1.2d,$ACC0,#26
and $ACC0,$ACC0,$MASK.2d
add $ACC4,$ACC4,$T0.2d // h3 -> h4
add $ACC1,$ACC1,$T1.2d // h0 -> h1
ushr $T0.2d,$ACC4,#26
and $ACC4,$ACC4,$MASK.2d
ushr $T1.2d,$ACC1,#26
and $ACC1,$ACC1,$MASK.2d
add $ACC2,$ACC2,$T1.2d // h1 -> h2
add $ACC0,$ACC0,$T0.2d
shl $T0.2d,$T0.2d,#2
ushr $T1.2d,$ACC2,#26
and $ACC2,$ACC2,$MASK.2d
add $ACC0,$ACC0,$T0.2d // h4 -> h0
add $ACC3,$ACC3,$T1.2d // h2 -> h3
ushr $T0.2d,$ACC0,#26
and $ACC0,$ACC0,$MASK.2d
ushr $T1.2d,$ACC3,#26
and $ACC3,$ACC3,$MASK.2d
add $ACC1,$ACC1,$T0.2d // h0 -> h1
add $ACC4,$ACC4,$T1.2d // h3 -> h4
////////////////////////////////////////////////////////////////
// write the result, can be partially reduced
st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
mov x4,#1
st1 {$ACC4}[0],[$ctx]
str x4,[$ctx,#8] // set is_base2_26
ldr x29,[sp],#80
.inst 0xd50323bf // autiasp
ret
.size poly1305_blocks_neon,.-poly1305_blocks_neon
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
.align 2
#if !defined(__KERNEL__) && !defined(_WIN64)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
___
foreach (split("\n",$code)) {
s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
s/\.[124]([sd])\[/.$1\[/;
s/w#x([0-9]+)/w$1/g;
print $_,"\n";
}
close STDOUT;


@ -0,0 +1,835 @@
#ifndef __KERNEL__
# include "arm_arch.h"
.extern OPENSSL_armcap_P
#endif
.text
// forward "declarations" are required for Apple
.globl poly1305_blocks
.globl poly1305_emit
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
cmp x1,xzr
stp xzr,xzr,[x0] // zero hash value
stp xzr,xzr,[x0,#16] // [along with is_base2_26]
csel x0,xzr,x0,eq
b.eq .Lno_key
#ifndef __KERNEL__
adrp x17,OPENSSL_armcap_P
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif
ldp x7,x8,[x1] // load key
mov x9,#0xfffffffc0fffffff
movk x9,#0x0fff,lsl#48
#ifdef __AARCH64EB__
rev x7,x7 // flip bytes
rev x8,x8
#endif
and x7,x7,x9 // &=0ffffffc0fffffff
and x9,x9,#-4
and x8,x8,x9 // &=0ffffffc0ffffffc
mov w9,#-1
stp x7,x8,[x0,#32] // save key value
str w9,[x0,#48] // impossible key power value
#ifndef __KERNEL__
tst w17,#ARMV7_NEON
adr x12,.Lpoly1305_blocks
adr x7,.Lpoly1305_blocks_neon
adr x13,.Lpoly1305_emit
csel x12,x12,x7,eq
# ifdef __ILP32__
stp w12,w13,[x2]
# else
stp x12,x13,[x2]
# endif
#endif
mov x0,#1
.Lno_key:
ret
.size poly1305_init,.-poly1305_init
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
.Lpoly1305_blocks:
ands x2,x2,#-16
b.eq .Lno_data
ldp x4,x5,[x0] // load hash value
ldp x6,x17,[x0,#16] // [along with is_base2_26]
ldp x7,x8,[x0,#32] // load key value
#ifdef __AARCH64EB__
lsr x12,x4,#32
mov w13,w4
lsr x14,x5,#32
mov w15,w5
lsr x16,x6,#32
#else
mov w12,w4
lsr x13,x4,#32
mov w14,w5
lsr x15,x5,#32
mov w16,w6
#endif
add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
lsr x13,x14,#12
adds x12,x12,x14,lsl#52
add x13,x13,x15,lsl#14
adc x13,x13,xzr
lsr x14,x16,#24
adds x13,x13,x16,lsl#40
adc x14,x14,xzr
cmp x17,#0 // is_base2_26?
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
csel x4,x4,x12,eq // choose between radixes
csel x5,x5,x13,eq
csel x6,x6,x14,eq
.Loop:
ldp x10,x11,[x1],#16 // load input
sub x2,x2,#16
#ifdef __AARCH64EB__
rev x10,x10
rev x11,x11
#endif
adds x4,x4,x10 // accumulate input
adcs x5,x5,x11
mul x12,x4,x7 // h0*r0
adc x6,x6,x3
umulh x13,x4,x7
mul x10,x5,x9 // h1*5*r1
umulh x11,x5,x9
adds x12,x12,x10
mul x10,x4,x8 // h0*r1
adc x13,x13,x11
umulh x14,x4,x8
adds x13,x13,x10
mul x10,x5,x7 // h1*r0
adc x14,x14,xzr
umulh x11,x5,x7
adds x13,x13,x10
mul x10,x6,x9 // h2*5*r1
adc x14,x14,x11
mul x11,x6,x7 // h2*r0
adds x13,x13,x10
adc x14,x14,x11
and x10,x14,#-4 // final reduction
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x12,x10
adcs x5,x13,xzr
adc x6,x6,xzr
cbnz x2,.Loop
stp x4,x5,[x0] // store hash value
stp x6,xzr,[x0,#16] // [and clear is_base2_26]
.Lno_data:
ret
.size poly1305_blocks,.-poly1305_blocks
.type poly1305_emit,%function
.align 5
poly1305_emit:
.Lpoly1305_emit:
ldp x4,x5,[x0] // load hash base 2^64
ldp x6,x7,[x0,#16] // [along with is_base2_26]
ldp x10,x11,[x2] // load nonce
#ifdef __AARCH64EB__
lsr x12,x4,#32
mov w13,w4
lsr x14,x5,#32
mov w15,w5
lsr x16,x6,#32
#else
mov w12,w4
lsr x13,x4,#32
mov w14,w5
lsr x15,x5,#32
mov w16,w6
#endif
add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
lsr x13,x14,#12
adds x12,x12,x14,lsl#52
add x13,x13,x15,lsl#14
adc x13,x13,xzr
lsr x14,x16,#24
adds x13,x13,x16,lsl#40
adc x14,x14,xzr
cmp x7,#0 // is_base2_26?
csel x4,x4,x12,eq // choose between radixes
csel x5,x5,x13,eq
csel x6,x6,x14,eq
adds x12,x4,#5 // compare to modulus
adcs x13,x5,xzr
adc x14,x6,xzr
tst x14,#-4 // see if it's carried/borrowed
csel x4,x4,x12,eq
csel x5,x5,x13,eq
#ifdef __AARCH64EB__
ror x10,x10,#32 // flip nonce words
ror x11,x11,#32
#endif
adds x4,x4,x10 // accumulate nonce
adc x5,x5,x11
#ifdef __AARCH64EB__
rev x4,x4 // flip output bytes
rev x5,x5
#endif
stp x4,x5,[x1] // write result
ret
.size poly1305_emit,.-poly1305_emit
.type poly1305_mult,%function
.align 5
poly1305_mult:
mul x12,x4,x7 // h0*r0
umulh x13,x4,x7
mul x10,x5,x9 // h1*5*r1
umulh x11,x5,x9
adds x12,x12,x10
mul x10,x4,x8 // h0*r1
adc x13,x13,x11
umulh x14,x4,x8
adds x13,x13,x10
mul x10,x5,x7 // h1*r0
adc x14,x14,xzr
umulh x11,x5,x7
adds x13,x13,x10
mul x10,x6,x9 // h2*5*r1
adc x14,x14,x11
mul x11,x6,x7 // h2*r0
adds x13,x13,x10
adc x14,x14,x11
and x10,x14,#-4 // final reduction
and x6,x14,#3
add x10,x10,x14,lsr#2
adds x4,x12,x10
adcs x5,x13,xzr
adc x6,x6,xzr
ret
.size poly1305_mult,.-poly1305_mult
.type poly1305_splat,%function
.align 4
poly1305_splat:
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x13,x4,#26,#26
extr x14,x5,x4,#52
and x14,x14,#0x03ffffff
ubfx x15,x5,#14,#26
extr x16,x6,x5,#40
str w12,[x0,#16*0] // r0
add w12,w13,w13,lsl#2 // r1*5
str w13,[x0,#16*1] // r1
add w13,w14,w14,lsl#2 // r2*5
str w12,[x0,#16*2] // s1
str w14,[x0,#16*3] // r2
add w14,w15,w15,lsl#2 // r3*5
str w13,[x0,#16*4] // s2
str w15,[x0,#16*5] // r3
add w15,w16,w16,lsl#2 // r4*5
str w14,[x0,#16*6] // s3
str w16,[x0,#16*7] // r4
str w15,[x0,#16*8] // s4
ret
.size poly1305_splat,.-poly1305_splat
#ifdef __KERNEL__
.globl poly1305_blocks_neon
#endif
.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
ldr x17,[x0,#24]
cmp x2,#128
b.lo .Lpoly1305_blocks
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-80]!
add x29,sp,#0
stp d8,d9,[sp,#16] // meet ABI requirements
stp d10,d11,[sp,#32]
stp d12,d13,[sp,#48]
stp d14,d15,[sp,#64]
cbz x17,.Lbase2_64_neon
ldp w10,w11,[x0] // load hash value base 2^26
ldp w12,w13,[x0,#8]
ldr w14,[x0,#16]
tst x2,#31
b.eq .Leven_neon
ldp x7,x8,[x0,#32] // load key value
add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
lsr x5,x12,#12
adds x4,x4,x12,lsl#52
add x5,x5,x13,lsl#14
adc x5,x5,xzr
lsr x6,x14,#24
adds x5,x5,x14,lsl#40
adc x14,x6,xzr // can be partially reduced...
ldp x12,x13,[x1],#16 // load input
sub x2,x2,#16
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __AARCH64EB__
rev x12,x12
rev x13,x13
#endif
adds x4,x4,x12 // accumulate input
adcs x5,x5,x13
adc x6,x6,x3
bl poly1305_mult
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,x4,#26,#26
extr x12,x5,x4,#52
and x12,x12,#0x03ffffff
ubfx x13,x5,#14,#26
extr x14,x6,x5,#40
b .Leven_neon
.align 4
.Lbase2_64_neon:
ldp x7,x8,[x0,#32] // load key value
ldp x4,x5,[x0] // load hash value base 2^64
ldr x6,[x0,#16]
tst x2,#31
b.eq .Linit_neon
ldp x12,x13,[x1],#16 // load input
sub x2,x2,#16
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
#ifdef __AARCH64EB__
rev x12,x12
rev x13,x13
#endif
adds x4,x4,x12 // accumulate input
adcs x5,x5,x13
adc x6,x6,x3
bl poly1305_mult
.Linit_neon:
ldr w17,[x0,#48] // first table element
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
ubfx x11,x4,#26,#26
extr x12,x5,x4,#52
and x12,x12,#0x03ffffff
ubfx x13,x5,#14,#26
extr x14,x6,x5,#40
cmp w17,#-1 // is value impossible?
b.ne .Leven_neon
fmov d24,x10
fmov d25,x11
fmov d26,x12
fmov d27,x13
fmov d28,x14
////////////////////////////////// initialize r^n table
mov x4,x7 // r^1
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
mov x5,x8
mov x6,xzr
add x0,x0,#48+12
bl poly1305_splat
bl poly1305_mult // r^2
sub x0,x0,#4
bl poly1305_splat
bl poly1305_mult // r^3
sub x0,x0,#4
bl poly1305_splat
bl poly1305_mult // r^4
sub x0,x0,#4
bl poly1305_splat
sub x0,x0,#48 // restore original x0
b .Ldo_neon
.align 4
.Leven_neon:
fmov d24,x10
fmov d25,x11
fmov d26,x12
fmov d27,x13
fmov d28,x14
.Ldo_neon:
ldp x8,x12,[x1,#32] // inp[2:3]
subs x2,x2,#64
ldp x9,x13,[x1,#48]
add x16,x1,#96
adr x17,.Lzeros
lsl x3,x3,#24
add x15,x0,#48
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov d14,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,x3,x12,lsr#40
add x13,x3,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov d15,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
fmov d16,x8
fmov d17,x10
fmov d18,x12
ldp x8,x12,[x1],#16 // inp[0:1]
ldp x9,x13,[x1],#48
ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
ld1 {v8.4s},[x15]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
and x5,x9,#0x03ffffff
ubfx x6,x8,#26,#26
ubfx x7,x9,#26,#26
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
extr x8,x12,x8,#52
extr x9,x13,x9,#52
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
fmov d9,x4
and x8,x8,#0x03ffffff
and x9,x9,#0x03ffffff
ubfx x10,x12,#14,#26
ubfx x11,x13,#14,#26
add x12,x3,x12,lsr#40
add x13,x3,x13,lsr#40
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
fmov d10,x6
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
movi v31.2d,#-1
fmov d11,x8
fmov d12,x10
fmov d13,x12
ushr v31.2d,v31.2d,#38
b.ls .Lskip_loop
.align 4
.Loop_neon:
////////////////////////////////////////////////////////////////
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
// ___________________/
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
// ___________________/ ____________________/
//
// Note that we start with inp[2:3]*r^2. This is because it
// doesn't depend on reduction in previous iteration.
////////////////////////////////////////////////////////////////
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
subs x2,x2,#64
umull v23.2d,v14.2s,v7.s[2]
csel x16,x17,x16,lo
umull v22.2d,v14.2s,v5.s[2]
umull v21.2d,v14.2s,v3.s[2]
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
umull v20.2d,v14.2s,v1.s[2]
ldp x9,x13,[x16],#48
umull v19.2d,v14.2s,v0.s[2]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
umlal v23.2d,v15.2s,v5.s[2]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal v22.2d,v15.2s,v3.s[2]
and x5,x9,#0x03ffffff
umlal v21.2d,v15.2s,v1.s[2]
ubfx x6,x8,#26,#26
umlal v20.2d,v15.2s,v0.s[2]
ubfx x7,x9,#26,#26
umlal v19.2d,v15.2s,v8.s[2]
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal v23.2d,v16.2s,v3.s[2]
extr x8,x12,x8,#52
umlal v22.2d,v16.2s,v1.s[2]
extr x9,x13,x9,#52
umlal v21.2d,v16.2s,v0.s[2]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal v20.2d,v16.2s,v8.s[2]
fmov d14,x4
umlal v19.2d,v16.2s,v6.s[2]
and x8,x8,#0x03ffffff
umlal v23.2d,v17.2s,v1.s[2]
and x9,x9,#0x03ffffff
umlal v22.2d,v17.2s,v0.s[2]
ubfx x10,x12,#14,#26
umlal v21.2d,v17.2s,v8.s[2]
ubfx x11,x13,#14,#26
umlal v20.2d,v17.2s,v6.s[2]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal v19.2d,v17.2s,v4.s[2]
fmov d15,x6
add v11.2s,v11.2s,v26.2s
add x12,x3,x12,lsr#40
umlal v23.2d,v18.2s,v0.s[2]
add x13,x3,x13,lsr#40
umlal v22.2d,v18.2s,v8.s[2]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal v21.2d,v18.2s,v6.s[2]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal v20.2d,v18.2s,v4.s[2]
fmov d16,x8
umlal v19.2d,v18.2s,v2.s[2]
fmov d17,x10
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4 and accumulate
add v9.2s,v9.2s,v24.2s
fmov d18,x12
umlal v22.2d,v11.2s,v1.s[0]
ldp x8,x12,[x1],#16 // inp[0:1]
umlal v19.2d,v11.2s,v6.s[0]
ldp x9,x13,[x1],#48
umlal v23.2d,v11.2s,v3.s[0]
umlal v20.2d,v11.2s,v8.s[0]
umlal v21.2d,v11.2s,v0.s[0]
#ifdef __AARCH64EB__
rev x8,x8
rev x12,x12
rev x9,x9
rev x13,x13
#endif
add v10.2s,v10.2s,v25.2s
umlal v22.2d,v9.2s,v5.s[0]
umlal v23.2d,v9.2s,v7.s[0]
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
umlal v21.2d,v9.2s,v3.s[0]
and x5,x9,#0x03ffffff
umlal v19.2d,v9.2s,v0.s[0]
ubfx x6,x8,#26,#26
umlal v20.2d,v9.2s,v1.s[0]
ubfx x7,x9,#26,#26
add v12.2s,v12.2s,v27.2s
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
umlal v22.2d,v10.2s,v3.s[0]
extr x8,x12,x8,#52
umlal v23.2d,v10.2s,v5.s[0]
extr x9,x13,x9,#52
umlal v19.2d,v10.2s,v8.s[0]
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
umlal v21.2d,v10.2s,v1.s[0]
fmov d9,x4
umlal v20.2d,v10.2s,v0.s[0]
and x8,x8,#0x03ffffff
add v13.2s,v13.2s,v28.2s
and x9,x9,#0x03ffffff
umlal v22.2d,v12.2s,v0.s[0]
ubfx x10,x12,#14,#26
umlal v19.2d,v12.2s,v4.s[0]
ubfx x11,x13,#14,#26
umlal v23.2d,v12.2s,v1.s[0]
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
umlal v20.2d,v12.2s,v6.s[0]
fmov d10,x6
umlal v21.2d,v12.2s,v8.s[0]
add x12,x3,x12,lsr#40
umlal v22.2d,v13.2s,v8.s[0]
add x13,x3,x13,lsr#40
umlal v19.2d,v13.2s,v2.s[0]
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
umlal v23.2d,v13.2s,v0.s[0]
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
umlal v20.2d,v13.2s,v4.s[0]
fmov d11,x8
umlal v21.2d,v13.2s,v6.s[0]
fmov d12,x10
fmov d13,x12
/////////////////////////////////////////////////////////////////
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
// and P. Schwabe
//
// [see discussion in poly1305-armv4 module]
ushr v29.2d,v22.2d,#26
xtn v27.2s,v22.2d
ushr v30.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b
add v23.2d,v23.2d,v29.2d // h3 -> h4
bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
add v20.2d,v20.2d,v30.2d // h0 -> h1
ushr v29.2d,v23.2d,#26
xtn v28.2s,v23.2d
ushr v30.2d,v20.2d,#26
xtn v25.2s,v20.2d
bic v28.2s,#0xfc,lsl#24
add v21.2d,v21.2d,v30.2d // h1 -> h2
add v19.2d,v19.2d,v29.2d
shl v29.2d,v29.2d,#2
shrn v30.2s,v21.2d,#26
xtn v26.2s,v21.2d
add v19.2d,v19.2d,v29.2d // h4 -> h0
bic v25.2s,#0xfc,lsl#24
add v27.2s,v27.2s,v30.2s // h2 -> h3
bic v26.2s,#0xfc,lsl#24
shrn v29.2s,v19.2d,#26
xtn v24.2s,v19.2d
ushr v30.2s,v27.2s,#26
bic v27.2s,#0xfc,lsl#24
bic v24.2s,#0xfc,lsl#24
add v25.2s,v25.2s,v29.2s // h0 -> h1
add v28.2s,v28.2s,v30.2s // h3 -> h4
b.hi .Loop_neon
.Lskip_loop:
dup v16.2d,v16.d[0]
add v11.2s,v11.2s,v26.2s
////////////////////////////////////////////////////////////////
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
adds x2,x2,#32
b.ne .Long_tail
dup v16.2d,v11.d[0]
add v14.2s,v9.2s,v24.2s
add v17.2s,v12.2s,v27.2s
add v15.2s,v10.2s,v25.2s
add v18.2s,v13.2s,v28.2s
.Long_tail:
dup v14.2d,v14.d[0]
umull2 v19.2d,v16.4s,v6.4s
umull2 v22.2d,v16.4s,v1.4s
umull2 v23.2d,v16.4s,v3.4s
umull2 v21.2d,v16.4s,v0.4s
umull2 v20.2d,v16.4s,v8.4s
dup v15.2d,v15.d[0]
umlal2 v19.2d,v14.4s,v0.4s
umlal2 v21.2d,v14.4s,v3.4s
umlal2 v22.2d,v14.4s,v5.4s
umlal2 v23.2d,v14.4s,v7.4s
umlal2 v20.2d,v14.4s,v1.4s
dup v17.2d,v17.d[0]
umlal2 v19.2d,v15.4s,v8.4s
umlal2 v22.2d,v15.4s,v3.4s
umlal2 v21.2d,v15.4s,v1.4s
umlal2 v23.2d,v15.4s,v5.4s
umlal2 v20.2d,v15.4s,v0.4s
dup v18.2d,v18.d[0]
umlal2 v22.2d,v17.4s,v0.4s
umlal2 v23.2d,v17.4s,v1.4s
umlal2 v19.2d,v17.4s,v4.4s
umlal2 v20.2d,v17.4s,v6.4s
umlal2 v21.2d,v17.4s,v8.4s
umlal2 v22.2d,v18.4s,v8.4s
umlal2 v19.2d,v18.4s,v2.4s
umlal2 v23.2d,v18.4s,v0.4s
umlal2 v20.2d,v18.4s,v4.4s
umlal2 v21.2d,v18.4s,v6.4s
b.eq .Lshort_tail
////////////////////////////////////////////////////////////////
// (hash+inp[0:1])*r^4:r^3 and accumulate
add v9.2s,v9.2s,v24.2s
umlal v22.2d,v11.2s,v1.2s
umlal v19.2d,v11.2s,v6.2s
umlal v23.2d,v11.2s,v3.2s
umlal v20.2d,v11.2s,v8.2s
umlal v21.2d,v11.2s,v0.2s
add v10.2s,v10.2s,v25.2s
umlal v22.2d,v9.2s,v5.2s
umlal v19.2d,v9.2s,v0.2s
umlal v23.2d,v9.2s,v7.2s
umlal v20.2d,v9.2s,v1.2s
umlal v21.2d,v9.2s,v3.2s
add v12.2s,v12.2s,v27.2s
umlal v22.2d,v10.2s,v3.2s
umlal v19.2d,v10.2s,v8.2s
umlal v23.2d,v10.2s,v5.2s
umlal v20.2d,v10.2s,v0.2s
umlal v21.2d,v10.2s,v1.2s
add v13.2s,v13.2s,v28.2s
umlal v22.2d,v12.2s,v0.2s
umlal v19.2d,v12.2s,v4.2s
umlal v23.2d,v12.2s,v1.2s
umlal v20.2d,v12.2s,v6.2s
umlal v21.2d,v12.2s,v8.2s
umlal v22.2d,v13.2s,v8.2s
umlal v19.2d,v13.2s,v2.2s
umlal v23.2d,v13.2s,v0.2s
umlal v20.2d,v13.2s,v4.2s
umlal v21.2d,v13.2s,v6.2s
.Lshort_tail:
////////////////////////////////////////////////////////////////
// horizontal add
addp v22.2d,v22.2d,v22.2d
ldp d8,d9,[sp,#16] // meet ABI requirements
addp v19.2d,v19.2d,v19.2d
ldp d10,d11,[sp,#32]
addp v23.2d,v23.2d,v23.2d
ldp d12,d13,[sp,#48]
addp v20.2d,v20.2d,v20.2d
ldp d14,d15,[sp,#64]
addp v21.2d,v21.2d,v21.2d
ldr x30,[sp,#8]
////////////////////////////////////////////////////////////////
// lazy reduction, but without narrowing
ushr v29.2d,v22.2d,#26
and v22.16b,v22.16b,v31.16b
ushr v30.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b
add v23.2d,v23.2d,v29.2d // h3 -> h4
add v20.2d,v20.2d,v30.2d // h0 -> h1
ushr v29.2d,v23.2d,#26
and v23.16b,v23.16b,v31.16b
ushr v30.2d,v20.2d,#26
and v20.16b,v20.16b,v31.16b
add v21.2d,v21.2d,v30.2d // h1 -> h2
add v19.2d,v19.2d,v29.2d
shl v29.2d,v29.2d,#2
ushr v30.2d,v21.2d,#26
and v21.16b,v21.16b,v31.16b
add v19.2d,v19.2d,v29.2d // h4 -> h0
add v22.2d,v22.2d,v30.2d // h2 -> h3
ushr v29.2d,v19.2d,#26
and v19.16b,v19.16b,v31.16b
ushr v30.2d,v22.2d,#26
and v22.16b,v22.16b,v31.16b
add v20.2d,v20.2d,v29.2d // h0 -> h1
add v23.2d,v23.2d,v30.2d // h3 -> h4
////////////////////////////////////////////////////////////////
// write the result, can be partially reduced
st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
mov x4,#1
st1 {v23.s}[0],[x0]
str x4,[x0,#8] // set is_base2_26
ldr x29,[sp],#80
.inst 0xd50323bf // autiasp
ret
.size poly1305_blocks_neon,.-poly1305_blocks_neon
.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
.align 2
#if !defined(__KERNEL__) && !defined(_WIN64)
.comm OPENSSL_armcap_P,4,4
.hidden OPENSSL_armcap_P
#endif
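
For reference, the arithmetic behind the base 2^26 conversion and the "lazy reduction" comments in the assembly above, stated as a sketch: the accumulator is carried in five 26-bit limbs and reduced modulo p = 2^130 - 5,

h = h_0 + 2^{26} h_1 + 2^{52} h_2 + 2^{78} h_3 + 2^{104} h_4, \qquad p = 2^{130} - 5

2^{130} \equiv 5 \pmod{p} \quad\Rightarrow\quad c \cdot 2^{130} \equiv 5c = (c \ll 2) + c \pmod{p}

so a carry c out of the top limb is folded back into the bottom limb as c plus (c << 2), which is also why poly1305_splat stores 5*r_i alongside each r_i (the "s" values). The carry chain (h3 -> h4, h0 -> h1, ...) only partially reduces each limb, which is safe because the limbs keep enough headroom in their vector lanes.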


@ -0,0 +1,230 @@
// SPDX-License-Identifier: GPL-2.0
/*
* OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
*
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/poly1305.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/jump_label.h>
#include <linux/module.h>
asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
{
poly1305_init_arm64(&dctx->h, key);
dctx->s[0] = get_unaligned_le32(key + 16);
dctx->s[1] = get_unaligned_le32(key + 20);
dctx->s[2] = get_unaligned_le32(key + 24);
dctx->s[3] = get_unaligned_le32(key + 28);
dctx->buflen = 0;
}
EXPORT_SYMBOL(poly1305_init_arch);
static int neon_poly1305_init(struct shash_desc *desc)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
dctx->buflen = 0;
dctx->rset = 0;
dctx->sset = false;
return 0;
}
static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
u32 len, u32 hibit, bool do_neon)
{
if (unlikely(!dctx->sset)) {
if (!dctx->rset) {
poly1305_init_arch(dctx, src);
src += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
dctx->rset = 1;
}
if (len >= POLY1305_BLOCK_SIZE) {
dctx->s[0] = get_unaligned_le32(src + 0);
dctx->s[1] = get_unaligned_le32(src + 4);
dctx->s[2] = get_unaligned_le32(src + 8);
dctx->s[3] = get_unaligned_le32(src + 12);
src += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
dctx->sset = true;
}
if (len < POLY1305_BLOCK_SIZE)
return;
}
len &= ~(POLY1305_BLOCK_SIZE - 1);
if (static_branch_likely(&have_neon) && likely(do_neon))
poly1305_blocks_neon(&dctx->h, src, len, hibit);
else
poly1305_blocks(&dctx->h, src, len, hibit);
}
static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
const u8 *src, u32 len, bool do_neon)
{
if (unlikely(dctx->buflen)) {
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
memcpy(dctx->buf + dctx->buflen, src, bytes);
src += bytes;
len -= bytes;
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
neon_poly1305_blocks(dctx, dctx->buf,
POLY1305_BLOCK_SIZE, 1, false);
dctx->buflen = 0;
}
}
if (likely(len >= POLY1305_BLOCK_SIZE)) {
neon_poly1305_blocks(dctx, src, len, 1, do_neon);
src += round_down(len, POLY1305_BLOCK_SIZE);
len %= POLY1305_BLOCK_SIZE;
}
if (unlikely(len)) {
dctx->buflen = len;
memcpy(dctx->buf, src, len);
}
}
static int neon_poly1305_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
{
bool do_neon = may_use_simd() && srclen > 128;
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
if (static_branch_likely(&have_neon) && do_neon)
kernel_neon_begin();
neon_poly1305_do_update(dctx, src, srclen, do_neon);
if (static_branch_likely(&have_neon) && do_neon)
kernel_neon_end();
return 0;
}
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
unsigned int nbytes)
{
if (unlikely(dctx->buflen)) {
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
memcpy(dctx->buf + dctx->buflen, src, bytes);
src += bytes;
nbytes -= bytes;
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
dctx->buflen = 0;
}
}
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
if (static_branch_likely(&have_neon) && may_use_simd()) {
do {
unsigned int todo = min_t(unsigned int, len, SZ_4K);
kernel_neon_begin();
poly1305_blocks_neon(&dctx->h, src, todo, 1);
kernel_neon_end();
len -= todo;
src += todo;
} while (len);
} else {
poly1305_blocks(&dctx->h, src, len, 1);
src += len;
}
nbytes %= POLY1305_BLOCK_SIZE;
}
if (unlikely(nbytes)) {
dctx->buflen = nbytes;
memcpy(dctx->buf, src, nbytes);
}
}
EXPORT_SYMBOL(poly1305_update_arch);
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
if (unlikely(dctx->buflen)) {
dctx->buf[dctx->buflen++] = 1;
memset(dctx->buf + dctx->buflen, 0,
POLY1305_BLOCK_SIZE - dctx->buflen);
poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
}
poly1305_emit(&dctx->h, dst, dctx->s);
*dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);
static int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
if (unlikely(!dctx->sset))
return -ENOKEY;
poly1305_final_arch(dctx, dst);
return 0;
}
static struct shash_alg neon_poly1305_alg = {
.init = neon_poly1305_init,
.update = neon_poly1305_update,
.final = neon_poly1305_final,
.digestsize = POLY1305_DIGEST_SIZE,
.descsize = sizeof(struct poly1305_desc_ctx),
.base.cra_name = "poly1305",
.base.cra_driver_name = "poly1305-neon",
.base.cra_priority = 200,
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
};
static int __init neon_poly1305_mod_init(void)
{
if (!(elf_hwcap & HWCAP_ASIMD))
return 0;
static_branch_enable(&have_neon);
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
crypto_register_shash(&neon_poly1305_alg) : 0;
}
static void __exit neon_poly1305_mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && (elf_hwcap & HWCAP_ASIMD))
crypto_unregister_shash(&neon_poly1305_alg);
}
module_init(neon_poly1305_mod_init);
module_exit(neon_poly1305_mod_exit);
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("poly1305");
MODULE_ALIAS_CRYPTO("poly1305-neon");


@ -192,6 +192,7 @@ enum vcpu_sysreg {
#define cp14_DBGWCR0 (DBGWCR0_EL1 * 2)
#define cp14_DBGWVR0 (DBGWVR0_EL1 * 2)
#define cp14_DBGDCCINT (MDCCINT_EL1 * 2)
#define cp14_DBGVCR (DBGVCR32_EL2 * 2)
#define NR_COPRO_REGS (NR_SYS_REGS * 2)


@ -25,6 +25,9 @@ const struct cpumask *cpumask_of_node(int node);
/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
static inline const struct cpumask *cpumask_of_node(int node)
{
if (node == NUMA_NO_NODE)
return cpu_all_mask;
return node_to_cpumask_map[node];
}
#endif


@ -620,6 +620,12 @@ check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope)
return (need_wa > 0);
}
static void
cpu_enable_branch_predictor_hardening(const struct arm64_cpu_capabilities *cap)
{
cap->matches(cap, SCOPE_LOCAL_CPU);
}
static const __maybe_unused struct midr_range tx2_family_cpus[] = {
MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
@ -860,9 +866,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
},
#endif
{
.desc = "Branch predictor hardening",
.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
.matches = check_branch_predictor,
.cpu_enable = cpu_enable_branch_predictor_hardening,
},
#ifdef CONFIG_HARDEN_EL2_VECTORS
{


@ -290,21 +290,23 @@ void store_cpu_topology(unsigned int cpuid)
if (mpidr & MPIDR_UP_BITMASK)
return;
/* Create cpu topology mapping based on MPIDR. */
if (mpidr & MPIDR_MT_BITMASK) {
/* Multiprocessor system : Multi-threads per core */
cpuid_topo->thread_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 1);
cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) |
MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8;
} else {
/* Multiprocessor system : Single-thread per core */
cpuid_topo->thread_id = -1;
cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) |
MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 |
MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16;
}
/*
* This would be the place to create cpu topology based on MPIDR.
*
* However, it cannot be trusted to depict the actual topology; some
* pieces of the architecture enforce an artificial cap on Aff0 values
* (e.g. GICv3's ICC_SGI1R_EL1 limits it to 15), leading to an
* artificial cycling of Aff1, Aff2 and Aff3 values. IOW, these end up
* having absolutely no relationship to the actual underlying system
* topology, and cannot be reasonably used as core / package ID.
*
* If the MT bit is set, Aff0 *could* be used to define a thread ID, but
* we still wouldn't be able to obtain a sane core ID. This means we
* need to entirely ignore MPIDR for any topology deduction.
*/
cpuid_topo->thread_id = -1;
cpuid_topo->core_id = cpuid;
cpuid_topo->package_id = cpu_to_node(cpuid);
pr_debug("CPU%u: cluster %d core %d thread %d mpidr %#016llx\n",
cpuid, cpuid_topo->package_id, cpuid_topo->core_id,


@ -1555,9 +1555,9 @@ static const struct sys_reg_desc cp14_regs[] = {
{ Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi },
DBG_BCR_BVR_WCR_WVR(1),
/* DBGDCCINT */
{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32 },
{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32, NULL, cp14_DBGDCCINT },
/* DBGDSCRext */
{ Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32 },
{ Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32, NULL, cp14_DBGDSCRext },
DBG_BCR_BVR_WCR_WVR(2),
/* DBGDTR[RT]Xint */
{ Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi },
@ -1572,7 +1572,7 @@ static const struct sys_reg_desc cp14_regs[] = {
{ Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi },
DBG_BCR_BVR_WCR_WVR(6),
/* DBGVCR */
{ Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32 },
{ Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32, NULL, cp14_DBGVCR },
DBG_BCR_BVR_WCR_WVR(7),
DBG_BCR_BVR_WCR_WVR(8),
DBG_BCR_BVR_WCR_WVR(9),


@ -58,7 +58,11 @@ EXPORT_SYMBOL(node_to_cpumask_map);
*/
const struct cpumask *cpumask_of_node(int node)
{
if (WARN_ON(node >= nr_node_ids))
if (node == NUMA_NO_NODE)
return cpu_all_mask;
if (WARN_ON(node < 0 || node >= nr_node_ids))
return cpu_none_mask;
if (WARN_ON(node_to_cpumask_map[node] == NULL))


@ -42,7 +42,7 @@ obj-y += esi_stub.o # must be in kernel proper
endif
obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o
obj-$(CONFIG_BINFMT_ELF) += elfcore.o
obj-$(CONFIG_ELF_CORE) += elfcore.o
# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31


@ -409,83 +409,9 @@ static void kretprobe_trampoline(void)
{
}
/*
* At this point the target function has been tricked into
* returning into our trampoline. Lookup the associated instance
* and then:
* - call the handler function
* - cleanup by marking the instance as unused
* - long jump back to the original return address
*/
int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
struct hlist_node *tmp;
unsigned long flags, orig_ret_address = 0;
unsigned long trampoline_address =
((struct fnptr *)kretprobe_trampoline)->ip;
INIT_HLIST_HEAD(&empty_rp);
kretprobe_hash_lock(current, &head, &flags);
/*
* It is possible to have multiple instances associated with a given
* task either because an multiple functions in the call path
* have a return probe installed on them, and/or more than one return
* return probe was registered for a target function.
*
* We can handle this because:
* - instances are always inserted at the head of the list
* - when multiple return probes are registered for the same
* function, the first instance's ret_addr will point to the
* real return address, and all the rest will point to
* kretprobe_trampoline
*/
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
orig_ret_address = (unsigned long)ri->ret_addr;
if (orig_ret_address != trampoline_address)
/*
* This is the real return address. Any other
* instances associated with this task are for
* other calls deeper on the call stack
*/
break;
}
regs->cr_iip = orig_ret_address;
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
if (ri->task != current)
/* another task is sharing our hash bucket */
continue;
if (ri->rp && ri->rp->handler)
ri->rp->handler(ri, regs);
orig_ret_address = (unsigned long)ri->ret_addr;
recycle_rp_inst(ri, &empty_rp);
if (orig_ret_address != trampoline_address)
/*
* This is the real return address. Any other
* instances associated with this task are for
* other calls deeper on the call stack
*/
break;
}
kretprobe_assert(ri, orig_ret_address, trampoline_address);
kretprobe_hash_unlock(current, &flags);
hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
}
regs->cr_iip = __kretprobe_trampoline_handler(regs, kretprobe_trampoline, NULL);
/*
* By returning a non-zero value, we are telling
* kprobe_handler() that we don't want the post_handler
@ -498,6 +424,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
struct pt_regs *regs)
{
ri->ret_addr = (kprobe_opcode_t *)regs->b0;
ri->fp = NULL;
/* Replace the return addr with trampoline addr */
regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip;


@ -339,7 +339,7 @@ libs-y += arch/mips/math-emu/
# See arch/mips/Kbuild for content of core part of the kernel
core-y += arch/mips/
drivers-$(CONFIG_MIPS_CRC_SUPPORT) += arch/mips/crypto/
drivers-y += arch/mips/crypto/
drivers-$(CONFIG_OPROFILE) += arch/mips/oprofile/
# suspend and hibernation support


@ -4,3 +4,21 @@
#
obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o
obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
chacha-mips-y := chacha-core.o chacha-glue.o
AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
poly1305-mips-y := poly1305-core.o poly1305-glue.o
perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32
perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
$(call if_changed,perlasm)
targets += poly1305-core.S


@ -0,0 +1,497 @@
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
* Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#define MASK_U32 0x3c
#define CHACHA20_BLOCK_SIZE 64
#define STACK_SIZE 32
#define X0 $t0
#define X1 $t1
#define X2 $t2
#define X3 $t3
#define X4 $t4
#define X5 $t5
#define X6 $t6
#define X7 $t7
#define X8 $t8
#define X9 $t9
#define X10 $v1
#define X11 $s6
#define X12 $s5
#define X13 $s4
#define X14 $s3
#define X15 $s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0 $s1
#define T1 $s0
#define T(n) T ## n
#define X(n) X ## n
/* Input arguments */
#define STATE $a0
#define OUT $a1
#define IN $a2
#define BYTES $a3
/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
* We don't want to touch original value in memory.
* Must be incremented every loop iteration.
*/
#define NONCE_0 $v0
/* SAVED_X and SAVED_CA are set in the jump table.
* Use regs which are overwritten on exit else we don't leak clear data.
* They are used to handling the last bytes which are not multiple of 4.
*/
#define SAVED_X X15
#define SAVED_CA $s7
#define IS_UNALIGNED $s7
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define CPU_TO_LE32(n) \
wsbh n; \
rotr n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif
#define FOR_EACH_WORD(x) \
x( 0); \
x( 1); \
x( 2); \
x( 3); \
x( 4); \
x( 5); \
x( 6); \
x( 7); \
x( 8); \
x( 9); \
x(10); \
x(11); \
x(12); \
x(13); \
x(14); \
x(15);
#define FOR_EACH_WORD_REV(x) \
x(15); \
x(14); \
x(13); \
x(12); \
x(11); \
x(10); \
x( 9); \
x( 8); \
x( 7); \
x( 6); \
x( 5); \
x( 4); \
x( 3); \
x( 2); \
x( 1); \
x( 0);
#define PLUS_ONE_0 1
#define PLUS_ONE_1 2
#define PLUS_ONE_2 3
#define PLUS_ONE_3 4
#define PLUS_ONE_4 5
#define PLUS_ONE_5 6
#define PLUS_ONE_6 7
#define PLUS_ONE_7 8
#define PLUS_ONE_8 9
#define PLUS_ONE_9 10
#define PLUS_ONE_10 11
#define PLUS_ONE_11 12
#define PLUS_ONE_12 13
#define PLUS_ONE_13 14
#define PLUS_ONE_14 15
#define PLUS_ONE_15 16
#define PLUS_ONE(x) PLUS_ONE_ ## x
#define _CONCAT3(a,b,c) a ## b ## c
#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
.if (x != 12); \
lw T0, (x*4)(STATE); \
.endif; \
lwl T1, (x*4)+MSB ## (IN); \
lwr T1, (x*4)+LSB ## (IN); \
.if (x == 12); \
addu X ## x, NONCE_0; \
.else; \
addu X ## x, T0; \
.endif; \
CPU_TO_LE32(X ## x); \
xor X ## x, T1; \
swl X ## x, (x*4)+MSB ## (OUT); \
swr X ## x, (x*4)+LSB ## (OUT);
#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
.if (x != 12); \
lw T0, (x*4)(STATE); \
.endif; \
lw T1, (x*4) ## (IN); \
.if (x == 12); \
addu X ## x, NONCE_0; \
.else; \
addu X ## x, T0; \
.endif; \
CPU_TO_LE32(X ## x); \
xor X ## x, T1; \
sw X ## x, (x*4) ## (OUT);
/* Jump table macro.
* Used for setup and handling the last bytes, which are not multiple of 4.
* X15 is free to store Xn
* Every jumptable entry must be equal in size.
*/
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
.set noreorder; \
b .Lchacha_mips_xor_aligned_ ## x ## _b; \
.if (x == 12); \
addu SAVED_X, X ## x, NONCE_0; \
.else; \
addu SAVED_X, X ## x, SAVED_CA; \
.endif; \
.set reorder
#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
.set noreorder; \
b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
.if (x == 12); \
addu SAVED_X, X ## x, NONCE_0; \
.else; \
addu SAVED_X, X ## x, SAVED_CA; \
.endif; \
.set reorder
#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
addu X(A), X(K); \
addu X(B), X(L); \
addu X(C), X(M); \
addu X(D), X(N); \
xor X(V), X(A); \
xor X(W), X(B); \
xor X(Y), X(C); \
xor X(Z), X(D); \
rotl X(V), S; \
rotl X(W), S; \
rotl X(Y), S; \
rotl X(Z), S;
.text
.set reorder
.set noat
.globl chacha_crypt_arch
.ent chacha_crypt_arch
chacha_crypt_arch:
.frame $sp, STACK_SIZE, $ra
/* Load number of rounds */
lw $at, 16($sp)
addiu $sp, -STACK_SIZE
/* Return bytes = 0. */
beqz BYTES, .Lchacha_mips_end
lw NONCE_0, 48(STATE)
/* Save s0-s7 */
sw $s0, 0($sp)
sw $s1, 4($sp)
sw $s2, 8($sp)
sw $s3, 12($sp)
sw $s4, 16($sp)
sw $s5, 20($sp)
sw $s6, 24($sp)
sw $s7, 28($sp)
/* Test IN or OUT is unaligned.
* IS_UNALIGNED = ( IN | OUT ) & 0x00000003
*/
or IS_UNALIGNED, IN, OUT
andi IS_UNALIGNED, 0x3
b .Lchacha_rounds_start
.align 4
.Loop_chacha_rounds:
addiu IN, CHACHA20_BLOCK_SIZE
addiu OUT, CHACHA20_BLOCK_SIZE
addiu NONCE_0, 1
.Lchacha_rounds_start:
lw X0, 0(STATE)
lw X1, 4(STATE)
lw X2, 8(STATE)
lw X3, 12(STATE)
lw X4, 16(STATE)
lw X5, 20(STATE)
lw X6, 24(STATE)
lw X7, 28(STATE)
lw X8, 32(STATE)
lw X9, 36(STATE)
lw X10, 40(STATE)
lw X11, 44(STATE)
move X12, NONCE_0
lw X13, 52(STATE)
lw X14, 56(STATE)
lw X15, 60(STATE)
.Loop_chacha_xor_rounds:
addiu $at, -2
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
bnez $at, .Loop_chacha_xor_rounds
addiu BYTES, -(CHACHA20_BLOCK_SIZE)
/* Is data src/dst unaligned? Jump */
bnez IS_UNALIGNED, .Loop_chacha_unaligned
/* Set number rounds here to fill delayslot. */
lw $at, (STACK_SIZE+16)($sp)
/* BYTES < 0, it has no full block. */
bltz BYTES, .Lchacha_mips_no_full_block_aligned
FOR_EACH_WORD_REV(STORE_ALIGNED)
/* BYTES > 0? Loop again. */
bgtz BYTES, .Loop_chacha_rounds
/* Place this here to fill delay slot */
addiu NONCE_0, 1
/* BYTES < 0? Handle last bytes */
bltz BYTES, .Lchacha_mips_xor_bytes
.Lchacha_mips_xor_done:
/* Restore used registers */
lw $s0, 0($sp)
lw $s1, 4($sp)
lw $s2, 8($sp)
lw $s3, 12($sp)
lw $s4, 16($sp)
lw $s5, 20($sp)
lw $s6, 24($sp)
lw $s7, 28($sp)
/* Write NONCE_0 back to right location in state */
sw NONCE_0, 48(STATE)
.Lchacha_mips_end:
addiu $sp, STACK_SIZE
jr $ra
.Lchacha_mips_no_full_block_aligned:
/* Restore the offset on BYTES */
addiu BYTES, CHACHA20_BLOCK_SIZE
/* Get number of full WORDS */
andi $at, BYTES, MASK_U32
/* Load upper half of jump table addr */
lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
/* Calculate lower half jump table offset */
ins T0, $at, 1, 6
/* Add offset to STATE */
addu T1, STATE, $at
/* Add lower half jump table addr */
addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
/* Read value from STATE */
lw SAVED_CA, 0(T1)
/* Store remaining bytecounter as negative value */
subu BYTES, $at, BYTES
jr T0
/* Jump table */
FOR_EACH_WORD(JMPTBL_ALIGNED)
.Loop_chacha_unaligned:
/* Set number rounds here to fill delayslot. */
lw $at, (STACK_SIZE+16)($sp)
/* BYTES > 0, it has no full block. */
bltz BYTES, .Lchacha_mips_no_full_block_unaligned
FOR_EACH_WORD_REV(STORE_UNALIGNED)
/* BYTES > 0? Loop again. */
bgtz BYTES, .Loop_chacha_rounds
/* Write NONCE_0 back to right location in state */
sw NONCE_0, 48(STATE)
.set noreorder
/* Fall through to byte handling */
bgez BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
/* Place this here to fill delay slot */
addiu NONCE_0, 1
.set reorder
.Lchacha_mips_xor_bytes:
addu IN, $at
addu OUT, $at
/* First byte */
lbu T1, 0(IN)
addiu $at, BYTES, 1
CPU_TO_LE32(SAVED_X)
ROTR(SAVED_X)
xor T1, SAVED_X
sb T1, 0(OUT)
beqz $at, .Lchacha_mips_xor_done
/* Second byte */
lbu T1, 1(IN)
addiu $at, BYTES, 2
ROTx SAVED_X, 8
xor T1, SAVED_X
sb T1, 1(OUT)
beqz $at, .Lchacha_mips_xor_done
/* Third byte */
lbu T1, 2(IN)
ROTx SAVED_X, 8
xor T1, SAVED_X
sb T1, 2(OUT)
b .Lchacha_mips_xor_done
.Lchacha_mips_no_full_block_unaligned:
/* Restore the offset on BYTES */
addiu BYTES, CHACHA20_BLOCK_SIZE
/* Get number of full WORDS */
andi $at, BYTES, MASK_U32
/* Load upper half of jump table addr */
lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
/* Calculate lower half jump table offset */
ins T0, $at, 1, 6
/* Add offset to STATE */
addu T1, STATE, $at
/* Add lower half jump table addr */
addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
/* Read value from STATE */
lw SAVED_CA, 0(T1)
/* Store remaining bytecounter as negative value */
subu BYTES, $at, BYTES
jr T0
/* Jump table */
FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at
/* Input arguments
* STATE $a0
* OUT $a1
* NROUND $a2
*/
#undef X12
#undef X13
#undef X14
#undef X15
#define X12 $a3
#define X13 $at
#define X14 $v0
#define X15 STATE
.set noat
.globl hchacha_block_arch
.ent hchacha_block_arch
hchacha_block_arch:
.frame $sp, STACK_SIZE, $ra
addiu $sp, -STACK_SIZE
/* Save X11(s6) */
sw X11, 0($sp)
lw X0, 0(STATE)
lw X1, 4(STATE)
lw X2, 8(STATE)
lw X3, 12(STATE)
lw X4, 16(STATE)
lw X5, 20(STATE)
lw X6, 24(STATE)
lw X7, 28(STATE)
lw X8, 32(STATE)
lw X9, 36(STATE)
lw X10, 40(STATE)
lw X11, 44(STATE)
lw X12, 48(STATE)
lw X13, 52(STATE)
lw X14, 56(STATE)
lw X15, 60(STATE)
.Loop_hchacha_xor_rounds:
addiu $a2, -2
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
bnez $a2, .Loop_hchacha_xor_rounds
/* Restore used register */
lw X11, 0($sp)
sw X0, 0(OUT)
sw X1, 4(OUT)
sw X2, 8(OUT)
sw X3, 12(OUT)
sw X12, 16(OUT)
sw X13, 20(OUT)
sw X14, 24(OUT)
sw X15, 28(OUT)
addiu $sp, STACK_SIZE
jr $ra
.end hchacha_block_arch
.set at


@ -0,0 +1,152 @@
// SPDX-License-Identifier: GPL-2.0
/*
* MIPS accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*/
#include <asm/byteorder.h>
#include <crypto/algapi.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes, int nrounds);
EXPORT_SYMBOL(chacha_crypt_arch);
asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds);
EXPORT_SYMBOL(hchacha_block_arch);
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);
static int chacha_mips_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
struct skcipher_walk walk;
u32 state[16];
int err;
err = skcipher_walk_virt(&walk, req, false);
chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr,
nbytes, ctx->nrounds);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
static int chacha_mips(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
return chacha_mips_stream_xor(req, ctx, req->iv);
}
static int xchacha_mips(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
struct chacha_ctx subctx;
u32 state[16];
u8 real_iv[16];
chacha_init_generic(state, ctx->key, req->iv);
hchacha_block(state, subctx.key, ctx->nrounds);
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
memcpy(&real_iv[8], req->iv + 16, 8);
return chacha_mips_stream_xor(req, &subctx, real_iv);
}
static struct skcipher_alg algs[] = {
{
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-mips",
.base.cra_priority = 200,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = chacha_mips,
.decrypt = chacha_mips,
}, {
.base.cra_name = "xchacha20",
.base.cra_driver_name = "xchacha20-mips",
.base.cra_priority = 200,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = xchacha_mips,
.decrypt = xchacha_mips,
}, {
.base.cra_name = "xchacha12",
.base.cra_driver_name = "xchacha12-mips",
.base.cra_priority = 200,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha12_setkey,
.encrypt = xchacha_mips,
.decrypt = xchacha_mips,
}
};
static int __init chacha_simd_mod_init(void)
{
return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}
static void __exit chacha_simd_mod_fini(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-mips");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-mips");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-mips");


@ -0,0 +1,191 @@
// SPDX-License-Identifier: GPL-2.0
/*
* OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
*
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
*/
#include <asm/unaligned.h>
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/poly1305.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>
asmlinkage void poly1305_init_mips(void *state, const u8 *key);
asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
asmlinkage void poly1305_emit_mips(void *state, u8 *digest, const u32 *nonce);
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
{
poly1305_init_mips(&dctx->h, key);
dctx->s[0] = get_unaligned_le32(key + 16);
dctx->s[1] = get_unaligned_le32(key + 20);
dctx->s[2] = get_unaligned_le32(key + 24);
dctx->s[3] = get_unaligned_le32(key + 28);
dctx->buflen = 0;
}
EXPORT_SYMBOL(poly1305_init_arch);
static int mips_poly1305_init(struct shash_desc *desc)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
dctx->buflen = 0;
dctx->rset = 0;
dctx->sset = false;
return 0;
}
static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
u32 len, u32 hibit)
{
if (unlikely(!dctx->sset)) {
if (!dctx->rset) {
poly1305_init_mips(&dctx->h, src);
src += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
dctx->rset = 1;
}
if (len >= POLY1305_BLOCK_SIZE) {
dctx->s[0] = get_unaligned_le32(src + 0);
dctx->s[1] = get_unaligned_le32(src + 4);
dctx->s[2] = get_unaligned_le32(src + 8);
dctx->s[3] = get_unaligned_le32(src + 12);
src += POLY1305_BLOCK_SIZE;
len -= POLY1305_BLOCK_SIZE;
dctx->sset = true;
}
if (len < POLY1305_BLOCK_SIZE)
return;
}
len &= ~(POLY1305_BLOCK_SIZE - 1);
poly1305_blocks_mips(&dctx->h, src, len, hibit);
}
static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
unsigned int len)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
if (unlikely(dctx->buflen)) {
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
memcpy(dctx->buf + dctx->buflen, src, bytes);
src += bytes;
len -= bytes;
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
dctx->buflen = 0;
}
}
if (likely(len >= POLY1305_BLOCK_SIZE)) {
mips_poly1305_blocks(dctx, src, len, 1);
src += round_down(len, POLY1305_BLOCK_SIZE);
len %= POLY1305_BLOCK_SIZE;
}
if (unlikely(len)) {
dctx->buflen = len;
memcpy(dctx->buf, src, len);
}
return 0;
}
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
unsigned int nbytes)
{
if (unlikely(dctx->buflen)) {
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
memcpy(dctx->buf + dctx->buflen, src, bytes);
src += bytes;
nbytes -= bytes;
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
poly1305_blocks_mips(&dctx->h, dctx->buf,
POLY1305_BLOCK_SIZE, 1);
dctx->buflen = 0;
}
}
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
poly1305_blocks_mips(&dctx->h, src, len, 1);
src += len;
nbytes %= POLY1305_BLOCK_SIZE;
}
if (unlikely(nbytes)) {
dctx->buflen = nbytes;
memcpy(dctx->buf, src, nbytes);
}
}
EXPORT_SYMBOL(poly1305_update_arch);
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
if (unlikely(dctx->buflen)) {
dctx->buf[dctx->buflen++] = 1;
memset(dctx->buf + dctx->buflen, 0,
POLY1305_BLOCK_SIZE - dctx->buflen);
poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
}
poly1305_emit_mips(&dctx->h, dst, dctx->s);
*dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);
static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
if (unlikely(!dctx->sset))
return -ENOKEY;
poly1305_final_arch(dctx, dst);
return 0;
}
static struct shash_alg mips_poly1305_alg = {
.init = mips_poly1305_init,
.update = mips_poly1305_update,
.final = mips_poly1305_final,
.digestsize = POLY1305_DIGEST_SIZE,
.descsize = sizeof(struct poly1305_desc_ctx),
.base.cra_name = "poly1305",
.base.cra_driver_name = "poly1305-mips",
.base.cra_priority = 200,
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
};
static int __init mips_poly1305_mod_init(void)
{
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
crypto_register_shash(&mips_poly1305_alg) : 0;
}
static void __exit mips_poly1305_mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
crypto_unregister_shash(&mips_poly1305_alg);
}
module_init(mips_poly1305_mod_init);
module_exit(mips_poly1305_mod_exit);
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("poly1305");
MODULE_ALIAS_CRYPTO("poly1305-mips");

File diff suppressed because it is too large.


@ -152,6 +152,7 @@ config PPC
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
select ARCH_WANT_IPC_PARSE_VERSION
select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
select ARCH_WEAK_RELEASE_ACQUIRE
select BINFMT_ELF
select BUILDTIME_EXTABLE_SORT
@ -1009,6 +1010,19 @@ config FSL_RIO
source "drivers/rapidio/Kconfig"
config PPC_RTAS_FILTER
bool "Enable filtering of RTAS syscalls"
default y
depends on PPC_RTAS
help
The RTAS syscall API has security issues that could be used to
compromise system integrity. This option enforces restrictions on the
RTAS calls and arguments passed by userspace programs to mitigate
these issues.
Say Y unless you know what you are doing and the filter is causing
problems for you.
endmenu
config NONSTATIC_KERNEL


@ -12,6 +12,8 @@
#ifndef _ASM_POWERPC_LMB_H
#define _ASM_POWERPC_LMB_H
#include <linux/sched.h>
struct drmem_lmb {
u64 base_addr;
u32 drc_index;
@ -22,13 +24,27 @@ struct drmem_lmb {
struct drmem_lmb_info {
struct drmem_lmb *lmbs;
int n_lmbs;
u32 lmb_size;
u64 lmb_size;
};
extern struct drmem_lmb_info *drmem_info;
static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb,
const struct drmem_lmb *start)
{
/*
* DLPAR code paths can take several milliseconds per element
* when interacting with firmware. Ensure that we don't
* unfairly monopolize the CPU.
*/
if (((++lmb - start) % 16) == 0)
cond_resched();
return lmb;
}
#define for_each_drmem_lmb_in_range(lmb, start, end) \
for ((lmb) = (start); (lmb) < (end); (lmb)++)
for ((lmb) = (start); (lmb) < (end); lmb = drmem_lmb_next(lmb, start))
#define for_each_drmem_lmb(lmb) \
for_each_drmem_lmb_in_range((lmb), \
@ -67,7 +83,7 @@ struct of_drconf_cell_v2 {
#define DRCONF_MEM_AI_INVALID 0x00000040
#define DRCONF_MEM_RESERVED 0x00000080
static inline u32 drmem_lmb_size(void)
static inline u64 drmem_lmb_size(void)
{
return drmem_info->lmb_size;
}
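
For reference, a sketch of walking LMBs with the iterator defined above (the helper and the pr_debug format are illustrative; the fields come from struct drmem_lmb as shown):

#include <asm/drmem.h>
#include <linux/printk.h>

/* Walk every LMB; drmem_lmb_next() now drops in a cond_resched() every
 * 16 elements, so long walks no longer monopolize the CPU.
 */
static void drmem_walk_example(void)
{
	struct drmem_lmb *lmb;

	for_each_drmem_lmb(lmb)
		pr_debug("LMB base 0x%llx, drc index 0x%x\n",
			 lmb->base_addr, lmb->drc_index);
}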


@ -204,7 +204,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
*/
static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
switch_mm(prev, next, current);
switch_mm_irqs_off(prev, next, current);
}
/* We don't currently use enter_lazy_tlb() for anything */


@ -788,7 +788,7 @@
#define THRM1_TIN (1 << 31)
#define THRM1_TIV (1 << 30)
#define THRM1_THRES(x) ((x&0x7f)<<23)
#define THRM3_SITV(x) ((x&0x3fff)<<1)
#define THRM3_SITV(x) ((x & 0x1fff) << 1)
#define THRM1_TID (1<<2)
#define THRM1_TIE (1<<1)
#define THRM1_V (1<<0)


@ -76,19 +76,6 @@ static inline int mm_is_thread_local(struct mm_struct *mm)
return false;
return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm));
}
static inline void mm_reset_thread_local(struct mm_struct *mm)
{
WARN_ON(atomic_read(&mm->context.copros) > 0);
/*
* It's possible for mm_access to take a reference on mm_users to
* access the remote mm from another thread, but it's not allowed
* to set mm_cpumask, so mm_users may be > 1 here.
*/
WARN_ON(current->mm != mm);
atomic_set(&mm->context.active_cpus, 1);
cpumask_clear(mm_cpumask(mm));
cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
}
#else /* CONFIG_PPC_BOOK3S_64 */
static inline int mm_is_thread_local(struct mm_struct *mm)
{


@ -1057,6 +1057,147 @@ struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log,
return NULL;
}
#ifdef CONFIG_PPC_RTAS_FILTER
/*
* The sys_rtas syscall, as originally designed, allows root to pass
* arbitrary physical addresses to RTAS calls. A number of RTAS calls
* can be abused to write to arbitrary memory and do other things that
* are potentially harmful to system integrity, and thus should only
* be used inside the kernel and not exposed to userspace.
*
* All known legitimate users of the sys_rtas syscall will only ever
* pass addresses that fall within the RMO buffer, and use a known
* subset of RTAS calls.
*
* Accordingly, we filter RTAS requests to check that the call is
* permitted, and that provided pointers fall within the RMO buffer.
* The rtas_filters list contains an entry for each permitted call,
* with the indexes of the parameters which are expected to contain
* addresses and sizes of buffers allocated inside the RMO buffer.
*/
struct rtas_filter {
const char *name;
int token;
/* Indexes into the args buffer, -1 if not used */
int buf_idx1;
int size_idx1;
int buf_idx2;
int size_idx2;
int fixed_size;
};
static struct rtas_filter rtas_filters[] __ro_after_init = {
{ "ibm,activate-firmware", -1, -1, -1, -1, -1 },
{ "ibm,configure-connector", -1, 0, -1, 1, -1, 4096 }, /* Special cased */
{ "display-character", -1, -1, -1, -1, -1 },
{ "ibm,display-message", -1, 0, -1, -1, -1 },
{ "ibm,errinjct", -1, 2, -1, -1, -1, 1024 },
{ "ibm,close-errinjct", -1, -1, -1, -1, -1 },
{ "ibm,open-errinct", -1, -1, -1, -1, -1 },
{ "ibm,get-config-addr-info2", -1, -1, -1, -1, -1 },
{ "ibm,get-dynamic-sensor-state", -1, 1, -1, -1, -1 },
{ "ibm,get-indices", -1, 2, 3, -1, -1 },
{ "get-power-level", -1, -1, -1, -1, -1 },
{ "get-sensor-state", -1, -1, -1, -1, -1 },
{ "ibm,get-system-parameter", -1, 1, 2, -1, -1 },
{ "get-time-of-day", -1, -1, -1, -1, -1 },
{ "ibm,get-vpd", -1, 0, -1, 1, 2 },
{ "ibm,lpar-perftools", -1, 2, 3, -1, -1 },
{ "ibm,platform-dump", -1, 4, 5, -1, -1 },
{ "ibm,read-slot-reset-state", -1, -1, -1, -1, -1 },
{ "ibm,scan-log-dump", -1, 0, 1, -1, -1 },
{ "ibm,set-dynamic-indicator", -1, 2, -1, -1, -1 },
{ "ibm,set-eeh-option", -1, -1, -1, -1, -1 },
{ "set-indicator", -1, -1, -1, -1, -1 },
{ "set-power-level", -1, -1, -1, -1, -1 },
{ "set-time-for-power-on", -1, -1, -1, -1, -1 },
{ "ibm,set-system-parameter", -1, 1, -1, -1, -1 },
{ "set-time-of-day", -1, -1, -1, -1, -1 },
{ "ibm,suspend-me", -1, -1, -1, -1, -1 },
{ "ibm,update-nodes", -1, 0, -1, -1, -1, 4096 },
{ "ibm,update-properties", -1, 0, -1, -1, -1, 4096 },
{ "ibm,physical-attestation", -1, 0, 1, -1, -1 },
};
static bool in_rmo_buf(u32 base, u32 end)
{
return base >= rtas_rmo_buf &&
base < (rtas_rmo_buf + RTAS_RMOBUF_MAX) &&
base <= end &&
end >= rtas_rmo_buf &&
end < (rtas_rmo_buf + RTAS_RMOBUF_MAX);
}
static bool block_rtas_call(int token, int nargs,
struct rtas_args *args)
{
int i;
for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) {
struct rtas_filter *f = &rtas_filters[i];
u32 base, size, end;
if (token != f->token)
continue;
if (f->buf_idx1 != -1) {
base = be32_to_cpu(args->args[f->buf_idx1]);
if (f->size_idx1 != -1)
size = be32_to_cpu(args->args[f->size_idx1]);
else if (f->fixed_size)
size = f->fixed_size;
else
size = 1;
end = base + size - 1;
if (!in_rmo_buf(base, end))
goto err;
}
if (f->buf_idx2 != -1) {
base = be32_to_cpu(args->args[f->buf_idx2]);
if (f->size_idx2 != -1)
size = be32_to_cpu(args->args[f->size_idx2]);
else if (f->fixed_size)
size = f->fixed_size;
else
size = 1;
end = base + size - 1;
/*
* Special case for ibm,configure-connector where the
* address can be 0
*/
if (!strcmp(f->name, "ibm,configure-connector") &&
base == 0)
return false;
if (!in_rmo_buf(base, end))
goto err;
}
return false;
}
err:
pr_err_ratelimited("sys_rtas: RTAS call blocked - exploit attempt?\n");
pr_err_ratelimited("sys_rtas: token=0x%x, nargs=%d (called by %s)\n",
token, nargs, current->comm);
return true;
}
#else
static bool block_rtas_call(int token, int nargs,
struct rtas_args *args)
{
return false;
}
#endif /* CONFIG_PPC_RTAS_FILTER */
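/*
 * Worked example of the filter encoding above (illustrative): the entry
 * { "ibm,get-system-parameter", -1, 1, 2, -1, -1 } sets buf_idx1 = 1 and
 * size_idx1 = 2, so block_rtas_call() requires
 *   args[1] ... args[1] + args[2] - 1
 * to lie entirely within [rtas_rmo_buf, rtas_rmo_buf + RTAS_RMOBUF_MAX);
 * otherwise the syscall below rejects the call with -EINVAL.
 */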
/* We assume to be passed big endian arguments */
SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
{
@ -1094,6 +1235,9 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
args.rets = &args.args[nargs];
memset(args.rets, 0, nret * sizeof(rtas_arg_t));
if (block_rtas_call(token, nargs, &args))
return -EINVAL;
/* Need to handle ibm,suspend_me call specially */
if (token == ibm_suspend_me_token) {
@ -1155,6 +1299,9 @@ void __init rtas_initialize(void)
unsigned long rtas_region = RTAS_INSTANTIATE_MAX;
u32 base, size, entry;
int no_base, no_size, no_entry;
#ifdef CONFIG_PPC_RTAS_FILTER
int i;
#endif
/* Get RTAS dev node and fill up our "rtas" structure with infos
* about it.
@ -1190,6 +1337,12 @@ void __init rtas_initialize(void)
#ifdef CONFIG_RTAS_ERROR_LOGGING
rtas_last_error_token = rtas_token("rtas-last-error");
#endif
#ifdef CONFIG_PPC_RTAS_FILTER
for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) {
rtas_filters[i].token = rtas_token(rtas_filters[i].name);
}
#endif
}
int __init early_init_dt_scan_rtas(unsigned long node,


@ -29,29 +29,27 @@
static DEFINE_PER_CPU(struct cpu, cpu_devices);
/*
* SMT snooze delay stuff, 64-bit only for now
*/
#ifdef CONFIG_PPC64
/* Time in microseconds we delay before sleeping in the idle loop */
static DEFINE_PER_CPU(long, smt_snooze_delay) = { 100 };
/*
* Snooze delay has not been hooked up since 3fa8cad82b94 ("powerpc/pseries/cpuidle:
* smt-snooze-delay cleanup.") and has been broken even longer. As was foretold in
* 2014:
*
* "ppc64_util currently utilises it. Once we fix ppc64_util, propose to clean
* up the kernel code."
*
* powerpc-utils stopped using it as of 1.3.8. At some point in the future this
* code should be removed.
*/
static ssize_t store_smt_snooze_delay(struct device *dev,
struct device_attribute *attr,
const char *buf,
size_t count)
{
struct cpu *cpu = container_of(dev, struct cpu, dev);
ssize_t ret;
long snooze;
ret = sscanf(buf, "%ld", &snooze);
if (ret != 1)
return -EINVAL;
per_cpu(smt_snooze_delay, cpu->dev.id) = snooze;
pr_warn_once("%s (%d) stored to unsupported smt_snooze_delay, which has no effect.\n",
current->comm, current->pid);
return count;
}
@ -59,9 +57,9 @@ static ssize_t show_smt_snooze_delay(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct cpu *cpu = container_of(dev, struct cpu, dev);
return sprintf(buf, "%ld\n", per_cpu(smt_snooze_delay, cpu->dev.id));
pr_warn_once("%s (%d) read from unsupported smt_snooze_delay\n",
current->comm, current->pid);
return sprintf(buf, "100\n");
}
static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
@ -69,16 +67,10 @@ static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
static int __init setup_smt_snooze_delay(char *str)
{
unsigned int cpu;
long snooze;
if (!cpu_has_feature(CPU_FTR_SMT))
return 1;
snooze = simple_strtol(str, NULL, 10);
for_each_possible_cpu(cpu)
per_cpu(smt_snooze_delay, cpu) = snooze;
pr_warn("smt-snooze-delay command line option has no effect\n");
return 1;
}
__setup("smt-snooze-delay=", setup_smt_snooze_delay);


@ -13,13 +13,14 @@
*/
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/workqueue.h>
#include <asm/io.h>
#include <asm/reg.h>
@ -39,9 +40,7 @@ static struct tau_temp
unsigned char grew;
} tau[NR_CPUS];
struct timer_list tau_timer;
#undef DEBUG
static bool tau_int_enable;
/* TODO: put these in a /proc interface, with some sanity checks, and maybe
* dynamic adjustment to minimize # of interrupts */
@ -50,72 +49,49 @@ struct timer_list tau_timer;
#define step_size 2 /* step size when temp goes out of range */
#define window_expand 1 /* expand the window by this much */
/* configurable values for shrinking the window */
#define shrink_timer 2*HZ /* period between shrinking the window */
#define shrink_timer 2000 /* period between shrinking the window */
#define min_window 2 /* minimum window size, degrees C */
static void set_thresholds(unsigned long cpu)
{
#ifdef CONFIG_TAU_INT
/*
* setup THRM1,
* threshold, valid bit, enable interrupts, interrupt when below threshold
*/
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TIE | THRM1_TID);
u32 maybe_tie = tau_int_enable ? THRM1_TIE : 0;
/* setup THRM2,
* threshold, valid bit, enable interrupts, interrupt when above threshold
*/
mtspr (SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | THRM1_TIE);
#else
/* same thing but don't enable interrupts */
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TID);
mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V);
#endif
/* setup THRM1, threshold, valid bit, interrupt when below threshold */
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | maybe_tie | THRM1_TID);
/* setup THRM2, threshold, valid bit, interrupt when above threshold */
mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | maybe_tie);
}
static void TAUupdate(int cpu)
{
unsigned thrm;
#ifdef DEBUG
printk("TAUupdate ");
#endif
u32 thrm;
u32 bits = THRM1_TIV | THRM1_TIN | THRM1_V;
/* if both thresholds are crossed, the step_sizes cancel out
* and the window winds up getting expanded twice. */
if((thrm = mfspr(SPRN_THRM1)) & THRM1_TIV){ /* is valid? */
if(thrm & THRM1_TIN){ /* crossed low threshold */
if (tau[cpu].low >= step_size){
tau[cpu].low -= step_size;
tau[cpu].high -= (step_size - window_expand);
}
tau[cpu].grew = 1;
#ifdef DEBUG
printk("low threshold crossed ");
#endif
thrm = mfspr(SPRN_THRM1);
if ((thrm & bits) == bits) {
mtspr(SPRN_THRM1, 0);
if (tau[cpu].low >= step_size) {
tau[cpu].low -= step_size;
tau[cpu].high -= (step_size - window_expand);
}
tau[cpu].grew = 1;
pr_debug("%s: low threshold crossed\n", __func__);
}
if((thrm = mfspr(SPRN_THRM2)) & THRM1_TIV){ /* is valid? */
if(thrm & THRM1_TIN){ /* crossed high threshold */
if (tau[cpu].high <= 127-step_size){
tau[cpu].low += (step_size - window_expand);
tau[cpu].high += step_size;
}
tau[cpu].grew = 1;
#ifdef DEBUG
printk("high threshold crossed ");
#endif
thrm = mfspr(SPRN_THRM2);
if ((thrm & bits) == bits) {
mtspr(SPRN_THRM2, 0);
if (tau[cpu].high <= 127 - step_size) {
tau[cpu].low += (step_size - window_expand);
tau[cpu].high += step_size;
}
tau[cpu].grew = 1;
pr_debug("%s: high threshold crossed\n", __func__);
}
#ifdef DEBUG
printk("grew = %d\n", tau[cpu].grew);
#endif
#ifndef CONFIG_TAU_INT /* tau_timeout will do this if not using interrupts */
set_thresholds(cpu);
#endif
}
#ifdef CONFIG_TAU_INT
@ -140,17 +116,16 @@ void TAUException(struct pt_regs * regs)
static void tau_timeout(void * info)
{
int cpu;
unsigned long flags;
int size;
int shrink;
/* disabling interrupts *should* be okay */
local_irq_save(flags);
cpu = smp_processor_id();
#ifndef CONFIG_TAU_INT
TAUupdate(cpu);
#endif
if (!tau_int_enable)
TAUupdate(cpu);
/* Stop thermal sensor comparisons and interrupts */
mtspr(SPRN_THRM3, 0);
size = tau[cpu].high - tau[cpu].low;
if (size > min_window && ! tau[cpu].grew) {
@ -173,32 +148,26 @@ static void tau_timeout(void * info)
set_thresholds(cpu);
/*
* Do the enable every time, since otherwise a bunch of (relatively)
* complex sleep code needs to be added. One mtspr every time
* tau_timeout is called is probably not a big deal.
*
* Enable thermal sensor and set up sample interval timer
* need 20 us to do the compare.. until a nice 'cpu_speed' function
* call is implemented, just assume a 500 mhz clock. It doesn't really
* matter if we take too long for a compare since it's all interrupt
* driven anyway.
*
* use a extra long time.. (60 us @ 500 mhz)
/* Restart thermal sensor comparisons and interrupts.
* The "PowerPC 740 and PowerPC 750 Microprocessor Datasheet"
* recommends that "the maximum value be set in THRM3 under all
* conditions."
*/
mtspr(SPRN_THRM3, THRM3_SITV(500*60) | THRM3_E);
local_irq_restore(flags);
mtspr(SPRN_THRM3, THRM3_SITV(0x1fff) | THRM3_E);
}
static void tau_timeout_smp(struct timer_list *unused)
static struct workqueue_struct *tau_workq;
static void tau_work_func(struct work_struct *work)
{
/* schedule ourselves to be run again */
mod_timer(&tau_timer, jiffies + shrink_timer) ;
msleep(shrink_timer);
on_each_cpu(tau_timeout, NULL, 0);
/* schedule ourselves to be run again */
queue_work(tau_workq, work);
}
DECLARE_WORK(tau_work, tau_work_func);
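The hunk above replaces the self-rearming timer with a work item on an unbound workqueue that sleeps for shrink_timer and then requeues itself, keeping the periodic shrink pass in sleepable process context. A minimal, self-contained sketch of that self-requeueing pattern follows; the names (poll_wq, poll_work, poll_interval_ms) are illustrative, not taken from the patch.

#include <linux/workqueue.h>
#include <linux/delay.h>

static struct workqueue_struct *poll_wq;
static unsigned int poll_interval_ms = 1000;

static void poll_work_func(struct work_struct *work)
{
	msleep(poll_interval_ms);	/* fine here: runs in process context */
	/* ... periodic work goes here ... */
	queue_work(poll_wq, work);	/* requeue ourselves, as tau_work_func() does */
}
static DECLARE_WORK(poll_work, poll_work_func);

static int __init poll_example_init(void)
{
	poll_wq = alloc_workqueue("poll_example", WQ_UNBOUND, 1);
	if (!poll_wq)
		return -ENOMEM;
	queue_work(poll_wq, &poll_work);
	return 0;
}

Compared with mod_timer(), the work item runs in process context, where sleeping via msleep() is allowed.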
/*
* setup the TAU
*
@ -231,21 +200,19 @@ static int __init TAU_init(void)
return 1;
}
tau_int_enable = IS_ENABLED(CONFIG_TAU_INT) &&
!strcmp(cur_cpu_spec->platform, "ppc750");
/* first, set up the window shrinking timer */
timer_setup(&tau_timer, tau_timeout_smp, 0);
tau_timer.expires = jiffies + shrink_timer;
add_timer(&tau_timer);
tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1);
if (!tau_workq)
return -ENOMEM;
on_each_cpu(TAU_init_smp, NULL, 0);
printk("Thermal assist unit ");
#ifdef CONFIG_TAU_INT
printk("using interrupts, ");
#else
printk("using timers, ");
#endif
printk("shrink_timer: %d jiffies\n", shrink_timer);
queue_work(tau_workq, &tau_work);
pr_info("Thermal assist unit using %s, shrink_timer: %d ms\n",
tau_int_enable ? "interrupts" : "workqueue", shrink_timer);
tau_initialized = 1;
return 0;


@ -794,7 +794,7 @@ static void p9_hmi_special_emu(struct pt_regs *regs)
{
unsigned int ra, rb, t, i, sel, instr, rc;
const void __user *addr;
u8 vbuf[16], *vdst;
u8 vbuf[16] __aligned(16), *vdst;
unsigned long ea, msr, msr_mask;
bool swap;


@ -598,19 +598,29 @@ static void do_exit_flush_lazy_tlb(void *arg)
struct mm_struct *mm = arg;
unsigned long pid = mm->context.id;
/*
* A kthread could have done a mmget_not_zero() after the flushing CPU
* checked mm_is_singlethreaded, and be in the process of
* kthread_use_mm when interrupted here. In that case, current->mm will
* be set to mm, because kthread_use_mm() setting ->mm and switching to
* the mm is done with interrupts off.
*/
if (current->mm == mm)
return; /* Local CPU */
goto out_flush;
if (current->active_mm == mm) {
/*
* Must be a kernel thread because sender is single-threaded.
*/
BUG_ON(current->mm);
WARN_ON_ONCE(current->mm != NULL);
/* Is a kernel thread and is using mm as the lazy tlb */
mmgrab(&init_mm);
switch_mm(mm, &init_mm, current);
current->active_mm = &init_mm;
switch_mm_irqs_off(mm, &init_mm, current);
mmdrop(mm);
}
atomic_dec(&mm->context.active_cpus);
cpumask_clear_cpu(smp_processor_id(), mm_cpumask(mm));
out_flush:
_tlbiel_pid(pid, RIC_FLUSH_ALL);
}
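The comment above describes a kernel thread temporarily adopting a user mm, which is the window in which current->mm == mm on a remote CPU. A hedged sketch of that kthread-side pattern, using the helpers named in the comment (in older trees they are spelled use_mm()/unuse_mm()); this is illustrative, not code from the patch.

#include <linux/kthread.h>
#include <linux/mm_types.h>

static void kthread_borrows_mm(struct mm_struct *mm)
{
	kthread_use_mm(mm);	/* sets current->mm and switches to @mm with IRQs off */
	/* ... access user mappings of @mm ... */
	kthread_unuse_mm(mm);	/* hand the mm back */
}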
@ -625,7 +635,6 @@ static void exit_flush_lazy_tlbs(struct mm_struct *mm)
*/
smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
(void *)mm, 1);
mm_reset_thread_local(mm);
}
void radix__flush_tlb_mm(struct mm_struct *mm)


@ -95,7 +95,7 @@ REQUEST(__field(0, 8, partition_id)
#define REQUEST_NAME system_performance_capabilities
#define REQUEST_NUM 0x40
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
#include I(REQUEST_BEGIN)
REQUEST(__field(0, 1, perf_collect_privileged)
__field(0x1, 1, capability_mask)
@ -223,7 +223,7 @@ REQUEST(__field(0, 2, partition_id)
#define REQUEST_NAME system_hypervisor_times
#define REQUEST_NUM 0xF0
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
#include I(REQUEST_BEGIN)
REQUEST(__count(0, 8, time_spent_to_dispatch_virtual_processors)
__count(0x8, 8, time_spent_processing_virtual_processor_timers)
@ -234,7 +234,7 @@ REQUEST(__count(0, 8, time_spent_to_dispatch_virtual_processors)
#define REQUEST_NAME system_tlbie_count_and_time
#define REQUEST_NUM 0xF4
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
#include I(REQUEST_BEGIN)
REQUEST(__count(0, 8, tlbie_instructions_issued)
/*


@ -273,6 +273,15 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
mask |= CNST_PMC_MASK(pmc);
value |= CNST_PMC_VAL(pmc);
/*
* PMC5 and PMC6 are used to count cycles and instructions and
* they do not support most of the constraint bits. Add a check
* to exclude PMC5/6 from most of the constraints except for
* EBB/BHRB.
*/
if (pmc >= 5)
goto ebb_bhrb;
}
if (pmc <= 4) {
@ -331,6 +340,7 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
}
}
ebb_bhrb:
if (!pmc && ebb)
/* EBB events must specify the PMC */
return -1;


@ -238,12 +238,11 @@ config TAU
temperature within 2-4 degrees Celsius. This option shows the current
on-die temperature in /proc/cpuinfo if the cpu supports it.
Unfortunately, on some chip revisions, this sensor is very inaccurate
and in many cases, does not work at all, so don't assume the cpu
temp is actually what /proc/cpuinfo says it is.
Unfortunately, this sensor is very inaccurate when uncalibrated, so
don't assume the cpu temp is actually what /proc/cpuinfo says it is.
config TAU_INT
bool "Interrupt driven TAU driver (DANGEROUS)"
bool "Interrupt driven TAU driver (EXPERIMENTAL)"
depends on TAU
---help---
The TAU supports an interrupt driven mode which causes an interrupt
@ -251,12 +250,7 @@ config TAU_INT
to get notified the temp has exceeded a range. With this option off,
a timer is used to re-check the temperature periodically.
However, on some cpus it appears that the TAU interrupt hardware
is buggy and can cause a situation which would lead unexplained hard
lockups.
Unless you are extending the TAU driver, or enjoy kernel/hardware
debugging, leave this option off.
If in doubt, say N here.
config TAU_AVERAGE
bool "Average high and low temp"


@ -322,15 +322,14 @@ static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj,
return count;
}
static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
uint32_t type)
static void create_dump_obj(uint32_t id, size_t size, uint32_t type)
{
struct dump_obj *dump;
int rc;
dump = kzalloc(sizeof(*dump), GFP_KERNEL);
if (!dump)
return NULL;
return;
dump->kobj.kset = dump_kset;
@ -350,21 +349,39 @@ static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
if (rc) {
kobject_put(&dump->kobj);
return NULL;
return;
}
/*
* As soon as the sysfs file for this dump is created/activated there is
* a chance the opal_errd daemon (or any userspace) might read and
* acknowledge the dump before kobject_uevent() is called. If that
* happens then there is a potential race between
* dump_ack_store->kobject_put() and kobject_uevent() which leads to a
* use-after-free of a kernfs object resulting in a kernel crash.
*
* To avoid that, we need to take a reference on behalf of the bin file,
* so that our reference remains valid while we call kobject_uevent().
* We then drop our reference before exiting the function, leaving the
* bin file to drop the last reference (if it hasn't already).
*/
/* Take a reference for the bin file */
kobject_get(&dump->kobj);
rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
if (rc) {
if (rc == 0) {
kobject_uevent(&dump->kobj, KOBJ_ADD);
pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
__func__, dump->id, dump->size);
} else {
/* Drop reference count taken for bin file */
kobject_put(&dump->kobj);
return NULL;
}
pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
__func__, dump->id, dump->size);
kobject_uevent(&dump->kobj, KOBJ_ADD);
return dump;
/* Drop our reference */
kobject_put(&dump->kobj);
return;
}
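The comment block above boils down to a short reference dance around kobject_uevent(): pin the kobject before publishing the bin file so a racing userspace acknowledgement cannot drop the last reference mid-uevent. A condensed, hedged sketch of that pattern (the helper and its arguments are illustrative; see create_dump_obj() above for the real code):

#include <linux/kobject.h>
#include <linux/sysfs.h>

static void publish_obj(struct kobject *kobj, const struct bin_attribute *attr)
{
	int rc;

	kobject_get(kobj);			/* extra ref on behalf of the bin file */
	rc = sysfs_create_bin_file(kobj, attr);
	if (rc == 0)
		kobject_uevent(kobj, KOBJ_ADD);	/* safe: our extra ref pins kobj */
	else
		kobject_put(kobj);		/* bin file never took its ref */
	kobject_put(kobj);			/* drop our reference; the ack path drops the last one */
}

The same pattern is applied to the OPAL elog objects in the next file.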
static irqreturn_t process_dump(int irq, void *data)


@ -183,14 +183,14 @@ static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj,
return count;
}
static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
static void create_elog_obj(uint64_t id, size_t size, uint64_t type)
{
struct elog_obj *elog;
int rc;
elog = kzalloc(sizeof(*elog), GFP_KERNEL);
if (!elog)
return NULL;
return;
elog->kobj.kset = elog_kset;
@ -223,18 +223,37 @@ static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
rc = kobject_add(&elog->kobj, NULL, "0x%llx", id);
if (rc) {
kobject_put(&elog->kobj);
return NULL;
return;
}
/*
* As soon as the sysfs file for this elog is created/activated there is
* a chance the opal_errd daemon (or any userspace) might read and
* acknowledge the elog before kobject_uevent() is called. If that
* happens then there is a potential race between
* elog_ack_store->kobject_put() and kobject_uevent() which leads to a
* use-after-free of a kernfs object resulting in a kernel crash.
*
* To avoid that, we need to take a reference on behalf of the bin file,
* so that our reference remains valid while we call kobject_uevent().
* We then drop our reference before exiting the function, leaving the
* bin file to drop the last reference (if it hasn't already).
*/
/* Take a reference for the bin file */
kobject_get(&elog->kobj);
rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr);
if (rc) {
if (rc == 0) {
kobject_uevent(&elog->kobj, KOBJ_ADD);
} else {
/* Drop the reference taken for the bin file */
kobject_put(&elog->kobj);
return NULL;
}
kobject_uevent(&elog->kobj, KOBJ_ADD);
/* Drop our reference */
kobject_put(&elog->kobj);
return elog;
return;
}
static irqreturn_t elog_event(int irq, void *data)


@ -47,7 +47,7 @@
#include <asm/udbg.h>
#define DBG(fmt...) udbg_printf(fmt)
#else
#define DBG(fmt...)
#define DBG(fmt...) do { } while (0)
#endif
static void pnv_smp_setup_cpu(int cpu)


@ -40,6 +40,7 @@ static __init int rng_init(void)
ppc_md.get_random_seed = pseries_get_random_long;
of_node_put(dn);
return 0;
}
machine_subsys_initcall(pseries, rng_init);


@ -179,6 +179,7 @@ int icp_hv_init(void)
icp_ops = &icp_hv_ops;
of_node_put(np);
return 0;
}


@ -21,4 +21,7 @@
/* vDSO location */
#define AT_SYSINFO_EHDR 33
/* entries in ARCH_DLINFO */
#define AT_VECTOR_SIZE_ARCH 1
#endif /* _UAPI_ASM_RISCV_AUXVEC_H */


@ -354,8 +354,9 @@ static DEFINE_PER_CPU(atomic_t, clock_sync_word);
static DEFINE_MUTEX(clock_sync_mutex);
static unsigned long clock_sync_flags;
#define CLOCK_SYNC_HAS_STP 0
#define CLOCK_SYNC_STP 1
#define CLOCK_SYNC_HAS_STP 0
#define CLOCK_SYNC_STP 1
#define CLOCK_SYNC_STPINFO_VALID 2
/*
* The get_clock function for the physical clock. It will get the current
@ -592,6 +593,22 @@ void stp_queue_work(void)
queue_work(time_sync_wq, &stp_work);
}
static int __store_stpinfo(void)
{
int rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
if (rc)
clear_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
else
set_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
return rc;
}
static int stpinfo_valid(void)
{
return stp_online && test_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
}
static int stp_sync_clock(void *data)
{
struct clock_sync_data *sync = data;
@ -613,8 +630,7 @@ static int stp_sync_clock(void *data)
if (rc == 0) {
sync->clock_delta = clock_delta;
clock_sync_global(clock_delta);
rc = chsc_sstpi(stp_page, &stp_info,
sizeof(struct stp_sstpi));
rc = __store_stpinfo();
if (rc == 0 && stp_info.tmd != 2)
rc = -EAGAIN;
}
@ -659,7 +675,7 @@ static void stp_work_fn(struct work_struct *work)
if (rc)
goto out_unlock;
rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
rc = __store_stpinfo();
if (rc || stp_info.c == 0)
goto out_unlock;
@ -696,10 +712,14 @@ static ssize_t stp_ctn_id_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
if (!stp_online)
return -ENODATA;
return sprintf(buf, "%016llx\n",
*(unsigned long long *) stp_info.ctnid);
ssize_t ret = -ENODATA;
mutex_lock(&stp_work_mutex);
if (stpinfo_valid())
ret = sprintf(buf, "%016llx\n",
*(unsigned long long *) stp_info.ctnid);
mutex_unlock(&stp_work_mutex);
return ret;
}
static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL);
@ -708,9 +728,13 @@ static ssize_t stp_ctn_type_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
if (!stp_online)
return -ENODATA;
return sprintf(buf, "%i\n", stp_info.ctn);
ssize_t ret = -ENODATA;
mutex_lock(&stp_work_mutex);
if (stpinfo_valid())
ret = sprintf(buf, "%i\n", stp_info.ctn);
mutex_unlock(&stp_work_mutex);
return ret;
}
static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL);
@ -719,9 +743,13 @@ static ssize_t stp_dst_offset_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
if (!stp_online || !(stp_info.vbits & 0x2000))
return -ENODATA;
return sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
ssize_t ret = -ENODATA;
mutex_lock(&stp_work_mutex);
if (stpinfo_valid() && (stp_info.vbits & 0x2000))
ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
mutex_unlock(&stp_work_mutex);
return ret;
}
static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL);
@ -730,9 +758,13 @@ static ssize_t stp_leap_seconds_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
if (!stp_online || !(stp_info.vbits & 0x8000))
return -ENODATA;
return sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
ssize_t ret = -ENODATA;
mutex_lock(&stp_work_mutex);
if (stpinfo_valid() && (stp_info.vbits & 0x8000))
ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
mutex_unlock(&stp_work_mutex);
return ret;
}
static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL);
@ -741,9 +773,13 @@ static ssize_t stp_stratum_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
if (!stp_online)
return -ENODATA;
return sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
ssize_t ret = -ENODATA;
mutex_lock(&stp_work_mutex);
if (stpinfo_valid())
ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
mutex_unlock(&stp_work_mutex);
return ret;
}
static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL);
@ -752,9 +788,13 @@ static ssize_t stp_time_offset_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
if (!stp_online || !(stp_info.vbits & 0x0800))
return -ENODATA;
return sprintf(buf, "%i\n", (int) stp_info.tto);
ssize_t ret = -ENODATA;
mutex_lock(&stp_work_mutex);
if (stpinfo_valid() && (stp_info.vbits & 0x0800))
ret = sprintf(buf, "%i\n", (int) stp_info.tto);
mutex_unlock(&stp_work_mutex);
return ret;
}
static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL);
@ -763,9 +803,13 @@ static ssize_t stp_time_zone_offset_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
if (!stp_online || !(stp_info.vbits & 0x4000))
return -ENODATA;
return sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
ssize_t ret = -ENODATA;
mutex_lock(&stp_work_mutex);
if (stpinfo_valid() && (stp_info.vbits & 0x4000))
ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
mutex_unlock(&stp_work_mutex);
return ret;
}
static DEVICE_ATTR(time_zone_offset, 0400,
@ -775,9 +819,13 @@ static ssize_t stp_timing_mode_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
if (!stp_online)
return -ENODATA;
return sprintf(buf, "%i\n", stp_info.tmd);
ssize_t ret = -ENODATA;
mutex_lock(&stp_work_mutex);
if (stpinfo_valid())
ret = sprintf(buf, "%i\n", stp_info.tmd);
mutex_unlock(&stp_work_mutex);
return ret;
}
static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL);
@ -786,9 +834,13 @@ static ssize_t stp_timing_state_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
if (!stp_online)
return -ENODATA;
return sprintf(buf, "%i\n", stp_info.tst);
ssize_t ret = -ENODATA;
mutex_lock(&stp_work_mutex);
if (stpinfo_valid())
ret = sprintf(buf, "%i\n", stp_info.tst);
mutex_unlock(&stp_work_mutex);
return ret;
}
static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL);


@ -1039,38 +1039,9 @@ void smp_fetch_global_pmu(void)
* are flush_tlb_*() routines, and these run after flush_cache_*()
* which performs the flushw.
*
* The SMP TLB coherency scheme we use works as follows:
*
* 1) mm->cpu_vm_mask is a bit mask of which cpus an address
* space has (potentially) executed on, this is the heuristic
* we use to avoid doing cross calls.
*
* Also, for flushing from kswapd and also for clones, we
* use cpu_vm_mask as the list of cpus to make run the TLB.
*
* 2) TLB context numbers are shared globally across all processors
* in the system, this allows us to play several games to avoid
* cross calls.
*
* One invariant is that when a cpu switches to a process, and
* that processes tsk->active_mm->cpu_vm_mask does not have the
* current cpu's bit set, that tlb context is flushed locally.
*
* If the address space is non-shared (ie. mm->count == 1) we avoid
* cross calls when we want to flush the currently running process's
* tlb state. This is done by clearing all cpu bits except the current
* processor's in current->mm->cpu_vm_mask and performing the
* flush locally only. This will force any subsequent cpus which run
* this task to flush the context from the local tlb if the process
* migrates to another cpu (again).
*
* 3) For shared address spaces (threads) and swapping we bite the
* bullet for most cases and perform the cross call (but only to
* the cpus listed in cpu_vm_mask).
*
* The performance gain from "optimizing" away the cross call for threads is
* questionable (in theory the big win for threads is the massive sharing of
* address space state across processors).
* mm->cpu_vm_mask is a bit mask of which cpus an address
* space has (potentially) executed on, this is the heuristic
* we use to limit cross calls.
*/
/* This currently is only used by the hugetlb arch pre-fault
@ -1080,18 +1051,13 @@ void smp_fetch_global_pmu(void)
void smp_flush_tlb_mm(struct mm_struct *mm)
{
u32 ctx = CTX_HWBITS(mm->context);
int cpu = get_cpu();
if (atomic_read(&mm->mm_users) == 1) {
cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
goto local_flush_and_out;
}
get_cpu();
smp_cross_call_masked(&xcall_flush_tlb_mm,
ctx, 0, 0,
mm_cpumask(mm));
local_flush_and_out:
__flush_tlb_mm(ctx, SECONDARY_CONTEXT);
put_cpu();
@ -1114,17 +1080,15 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long
{
u32 ctx = CTX_HWBITS(mm->context);
struct tlb_pending_info info;
int cpu = get_cpu();
get_cpu();
info.ctx = ctx;
info.nr = nr;
info.vaddrs = vaddrs;
if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
else
smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
&info, 1);
smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
&info, 1);
__flush_tlb_pending(ctx, nr, vaddrs);
@ -1134,14 +1098,13 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long
void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
{
unsigned long context = CTX_HWBITS(mm->context);
int cpu = get_cpu();
if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
else
smp_cross_call_masked(&xcall_flush_tlb_page,
context, vaddr, 0,
mm_cpumask(mm));
get_cpu();
smp_cross_call_masked(&xcall_flush_tlb_page,
context, vaddr, 0,
mm_cpumask(mm));
__flush_tlb_page(context, vaddr);
put_cpu();


@ -36,14 +36,14 @@ int write_sigio_irq(int fd)
}
/* These are called from os-Linux/sigio.c to protect its pollfds arrays. */
static DEFINE_SPINLOCK(sigio_spinlock);
static DEFINE_MUTEX(sigio_mutex);
void sigio_lock(void)
{
spin_lock(&sigio_spinlock);
mutex_lock(&sigio_mutex);
}
void sigio_unlock(void)
{
spin_unlock(&sigio_spinlock);
mutex_unlock(&sigio_mutex);
}


@ -200,9 +200,10 @@ avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
adx_instr := $(call as-instr,adox %r10$(comma)%r10,-DCONFIG_AS_ADX=1)
KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)


@ -40,6 +40,7 @@ CONFIG_EMBEDDED=y
# CONFIG_SLAB_MERGE_DEFAULT is not set
CONFIG_PROFILING=y
CONFIG_SMP=y
CONFIG_X86_X2APIC=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
CONFIG_NR_CPUS=32
@ -213,6 +214,7 @@ CONFIG_DM_VERITY_FEC=y
CONFIG_DM_BOW=y
CONFIG_NETDEVICES=y
CONFIG_DUMMY=y
CONFIG_WIREGUARD=y
CONFIG_TUN=y
CONFIG_VETH=y
# CONFIG_ETHERNET is not set
@ -310,6 +312,7 @@ CONFIG_HID_NINTENDO=y
CONFIG_HID_SONY=y
CONFIG_HID_STEAM=y
CONFIG_USB_HIDDEV=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_XHCI_HCD=y
CONFIG_USB_GADGET=y
CONFIG_USB_GADGET_VBUS_DRAW=500
@ -436,6 +439,7 @@ CONFIG_CRC8=y
CONFIG_XZ_DEC=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_DWARF4=y
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
CONFIG_MAGIC_SYSRQ=y

arch/x86/crypto/.gitignore (new file)

@ -0,0 +1 @@
poly1305-x86_64-cryptogams.S


@ -8,8 +8,10 @@ OBJECT_FILES_NON_STANDARD := y
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
adx_supported := $(call as-instr,adox %r10$(comma)%r10,yes,no)
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
@ -23,7 +25,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@ -46,6 +48,11 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
# These modules require the assembler to support ADX.
ifeq ($(adx_supported),yes)
obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
endif
# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
@ -54,6 +61,7 @@ ifeq ($(avx_supported),yes)
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
endif
# These modules require assembler to support AVX2.
@ -77,7 +85,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
@ -87,6 +95,12 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
targets += poly1305-x86_64-cryptogams.S
endif
ifeq ($(avx_supported),yes)
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
camellia_aesni_avx_glue.o
@ -100,20 +114,22 @@ endif
ifeq ($(avx2_supported),yes)
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
chacha20-x86_64-y += chacha20-avx2-x86_64.o
chacha-x86_64-y += chacha-avx2-x86_64.o
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
endif
ifeq ($(avx512_supported),yes)
chacha-x86_64-y += chacha-avx512vl-x86_64.o
endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
ifeq ($(avx2_supported),yes)
sha1-ssse3-y += sha1_avx2_x86_64_asm.o
poly1305-x86_64-y += poly1305-avx2-x86_64.o
endif
ifeq ($(sha1_ni_supported),yes)
sha1-ssse3-y += sha1_ni_asm.o
@ -127,3 +143,8 @@ sha256-ssse3-y += sha256_ni_asm.o
endif
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
quiet_cmd_perlasm = PERLASM $@
cmd_perlasm = $(PERL) $< > $@
$(obj)/%.S: $(src)/%.pl FORCE
$(call if_changed,perlasm)


@ -0,0 +1,258 @@
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
*/
#include <linux/linkage.h>
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
.octa 0x5BE0CD191F83D9AB9B05688C510E527F
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
#endif /* CONFIG_AS_AVX512 */
.text
#ifdef CONFIG_AS_SSSE3
ENTRY(blake2s_compress_ssse3)
testq %rdx,%rdx
je .Lendofloop
movdqu (%rdi),%xmm0
movdqu 0x10(%rdi),%xmm1
movdqa ROT16(%rip),%xmm12
movdqa ROR328(%rip),%xmm13
movdqu 0x20(%rdi),%xmm14
movq %rcx,%xmm15
leaq SIGMA+0xa0(%rip),%r8
jmp .Lbeginofloop
.align 32
.Lbeginofloop:
movdqa %xmm0,%xmm10
movdqa %xmm1,%xmm11
paddq %xmm15,%xmm14
movdqa IV(%rip),%xmm2
movdqa %xmm14,%xmm3
pxor IV+0x10(%rip),%xmm3
leaq SIGMA(%rip),%rcx
.Lroundloop:
movzbl (%rcx),%eax
movd (%rsi,%rax,4),%xmm4
movzbl 0x1(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
movzbl 0x2(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
movzbl 0x3(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
punpckldq %xmm5,%xmm4
punpckldq %xmm7,%xmm6
punpcklqdq %xmm6,%xmm4
paddd %xmm4,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm12,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0xc,%xmm1
pslld $0x14,%xmm8
por %xmm8,%xmm1
movzbl 0x4(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
movzbl 0x5(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
movzbl 0x6(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
movzbl 0x7(%rcx),%eax
movd (%rsi,%rax,4),%xmm4
punpckldq %xmm6,%xmm5
punpckldq %xmm4,%xmm7
punpcklqdq %xmm7,%xmm5
paddd %xmm5,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm13,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0x7,%xmm1
pslld $0x19,%xmm8
por %xmm8,%xmm1
pshufd $0x93,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
movzbl 0x8(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
movzbl 0x9(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
movzbl 0xa(%rcx),%eax
movd (%rsi,%rax,4),%xmm4
movzbl 0xb(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
punpckldq %xmm7,%xmm6
punpckldq %xmm5,%xmm4
punpcklqdq %xmm4,%xmm6
paddd %xmm6,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm12,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0xc,%xmm1
pslld $0x14,%xmm8
por %xmm8,%xmm1
movzbl 0xc(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
movzbl 0xd(%rcx),%eax
movd (%rsi,%rax,4),%xmm4
movzbl 0xe(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
movzbl 0xf(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
punpckldq %xmm4,%xmm7
punpckldq %xmm6,%xmm5
punpcklqdq %xmm5,%xmm7
paddd %xmm7,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm13,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0x7,%xmm1
pslld $0x19,%xmm8
por %xmm8,%xmm1
pshufd $0x39,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x93,%xmm2,%xmm2
addq $0x10,%rcx
cmpq %r8,%rcx
jnz .Lroundloop
pxor %xmm2,%xmm0
pxor %xmm3,%xmm1
pxor %xmm10,%xmm0
pxor %xmm11,%xmm1
addq $0x40,%rsi
decq %rdx
jnz .Lbeginofloop
movdqu %xmm0,(%rdi)
movdqu %xmm1,0x10(%rdi)
movdqu %xmm14,0x20(%rdi)
.Lendofloop:
ret
ENDPROC(blake2s_compress_ssse3)
#endif /* CONFIG_AS_SSSE3 */
#ifdef CONFIG_AS_AVX512
ENTRY(blake2s_compress_avx512)
vmovdqu (%rdi),%xmm0
vmovdqu 0x10(%rdi),%xmm1
vmovdqu 0x20(%rdi),%xmm4
vmovq %rcx,%xmm5
vmovdqa IV(%rip),%xmm14
vmovdqa IV+16(%rip),%xmm15
jmp .Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
vmovdqa %xmm0,%xmm10
vmovdqa %xmm1,%xmm11
vpaddq %xmm5,%xmm4,%xmm4
vmovdqa %xmm14,%xmm2
vpxor %xmm15,%xmm4,%xmm3
vmovdqu (%rsi),%ymm6
vmovdqu 0x20(%rsi),%ymm7
addq $0x40,%rsi
leaq SIGMA2(%rip),%rax
movb $0xa,%cl
.Lblake2s_compress_avx512_roundloop:
addq $0x40,%rax
vmovdqa -0x40(%rax),%ymm8
vmovdqa -0x20(%rax),%ymm9
vpermi2d %ymm7,%ymm6,%ymm8
vpermi2d %ymm7,%ymm6,%ymm9
vmovdqa %ymm8,%ymm6
vmovdqa %ymm9,%ymm7
vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x10,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0xc,%xmm1,%xmm1
vextracti128 $0x1,%ymm8,%xmm8
vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0x7,%xmm1,%xmm1
vpshufd $0x93,%xmm0,%xmm0
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x39,%xmm2,%xmm2
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x10,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0xc,%xmm1,%xmm1
vextracti128 $0x1,%ymm9,%xmm9
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0x7,%xmm1,%xmm1
vpshufd $0x39,%xmm0,%xmm0
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x93,%xmm2,%xmm2
decb %cl
jne .Lblake2s_compress_avx512_roundloop
vpxor %xmm10,%xmm0,%xmm0
vpxor %xmm11,%xmm1,%xmm1
vpxor %xmm2,%xmm0,%xmm0
vpxor %xmm3,%xmm1,%xmm1
decq %rdx
jne .Lblake2s_compress_avx512_mainloop
vmovdqu %xmm0,(%rdi)
vmovdqu %xmm1,0x10(%rdi)
vmovdqu %xmm4,0x20(%rdi)
vzeroupper
retq
ENDPROC(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */


@ -0,0 +1,232 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <crypto/internal/blake2s.h>
#include <crypto/internal/hash.h>
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/processor.h>
#include <asm/simd.h>
asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
const u8 *block, const size_t nblocks,
const u32 inc);
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
const u8 *block, const size_t nblocks,
const u32 inc);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
void blake2s_compress_arch(struct blake2s_state *state,
const u8 *block, size_t nblocks,
const u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
blake2s_compress_generic(state, block, nblocks, inc);
return;
}
do {
const size_t blocks = min_t(size_t, nblocks,
SZ_4K / BLAKE2S_BLOCK_SIZE);
kernel_fpu_begin();
if (IS_ENABLED(CONFIG_AS_AVX512) &&
static_branch_likely(&blake2s_use_avx512))
blake2s_compress_avx512(state, block, blocks, inc);
else
blake2s_compress_ssse3(state, block, blocks, inc);
kernel_fpu_end();
nblocks -= blocks;
block += blocks * BLAKE2S_BLOCK_SIZE;
} while (nblocks);
}
EXPORT_SYMBOL(blake2s_compress_arch);
static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
unsigned int keylen)
{
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
memcpy(tctx->key, key, keylen);
tctx->keylen = keylen;
return 0;
}
static int crypto_blake2s_init(struct shash_desc *desc)
{
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
struct blake2s_state *state = shash_desc_ctx(desc);
const int outlen = crypto_shash_digestsize(desc->tfm);
if (tctx->keylen)
blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
else
blake2s_init(state, outlen);
return 0;
}
static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
unsigned int inlen)
{
struct blake2s_state *state = shash_desc_ctx(desc);
const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
if (unlikely(!inlen))
return 0;
if (inlen > fill) {
memcpy(state->buf + state->buflen, in, fill);
blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
state->buflen = 0;
in += fill;
inlen -= fill;
}
if (inlen > BLAKE2S_BLOCK_SIZE) {
const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
/* Hash one less (full) block than strictly possible */
blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
}
memcpy(state->buf + state->buflen, in, inlen);
state->buflen += inlen;
return 0;
}
static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
{
struct blake2s_state *state = shash_desc_ctx(desc);
blake2s_set_lastblock(state);
memset(state->buf + state->buflen, 0,
BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
blake2s_compress_arch(state, state->buf, 1, state->buflen);
cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
memcpy(out, state->h, state->outlen);
memzero_explicit(state, sizeof(*state));
return 0;
}
static struct shash_alg blake2s_algs[] = {{
.base.cra_name = "blake2s-128",
.base.cra_driver_name = "blake2s-128-x86",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_128_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}, {
.base.cra_name = "blake2s-160",
.base.cra_driver_name = "blake2s-160-x86",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_160_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}, {
.base.cra_name = "blake2s-224",
.base.cra_driver_name = "blake2s-224-x86",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_224_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}, {
.base.cra_name = "blake2s-256",
.base.cra_driver_name = "blake2s-256-x86",
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
.base.cra_priority = 200,
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.digestsize = BLAKE2S_256_HASH_SIZE,
.setkey = crypto_blake2s_setkey,
.init = crypto_blake2s_init,
.update = crypto_blake2s_update,
.final = crypto_blake2s_final,
.descsize = sizeof(struct blake2s_state),
}};
static int __init blake2s_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_SSSE3))
return 0;
static_branch_enable(&blake2s_use_ssse3);
if (IS_ENABLED(CONFIG_AS_AVX512) &&
boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
boot_cpu_has(X86_FEATURE_AVX512F) &&
boot_cpu_has(X86_FEATURE_AVX512VL) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
XFEATURE_MASK_AVX512, NULL))
static_branch_enable(&blake2s_use_avx512);
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
crypto_register_shashes(blake2s_algs,
ARRAY_SIZE(blake2s_algs)) : 0;
}
static void __exit blake2s_mod_exit(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
}
module_init(blake2s_mod_init);
module_exit(blake2s_mod_exit);
MODULE_ALIAS_CRYPTO("blake2s-128");
MODULE_ALIAS_CRYPTO("blake2s-128-x86");
MODULE_ALIAS_CRYPTO("blake2s-160");
MODULE_ALIAS_CRYPTO("blake2s-160-x86");
MODULE_ALIAS_CRYPTO("blake2s-224");
MODULE_ALIAS_CRYPTO("blake2s-224-x86");
MODULE_ALIAS_CRYPTO("blake2s-256");
MODULE_ALIAS_CRYPTO("blake2s-256-x86");
MODULE_LICENSE("GPL v2");
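For context, here is a hedged usage sketch of the shashes registered above, going through the generic crypto API. The helper name and error handling are illustrative; on kernels of this generation the stack descriptor may also need its flags field cleared, as noted in the comment.

#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/types.h>

static int blake2s256_digest(const u8 *data, unsigned int len, u8 out[32])
{
	struct crypto_shash *tfm;
	int ret;

	tfm = crypto_alloc_shash("blake2s-256", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		/* on kernels where shash_desc still has a flags field,
		 * also clear desc->flags here */
		ret = crypto_shash_digest(desc, data, len, out);
	}

	crypto_free_shash(tfm);
	return ret;
}

With the module above loaded on an SSSE3-capable CPU, the higher cra_priority (200) is what makes these x86 implementations preferred over the generic ones for the same algorithm names.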

File diff suppressed because it is too large.


@ -0,0 +1,836 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
* ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
*
* Copyright (C) 2018 Martin Willi
*/
#include <linux/linkage.h>
.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL: .octa 0x00000000000000000000000000000000
.octa 0x00000000000000000000000000000001
.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL: .octa 0x00000000000000000000000000000002
.octa 0x00000000000000000000000000000003
.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL: .octa 0x00000003000000020000000100000000
.octa 0x00000007000000060000000500000004
.text
ENTRY(chacha_2block_xor_avx512vl)
# %rdi: Input state matrix, s
# %rsi: up to 2 data blocks output, o
# %rdx: up to 2 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
# This function encrypts two ChaCha blocks by loading the state
# matrix twice across four AVX registers. It performs matrix operations
# on four words in each matrix in parallel, but requires shuffling to
# rearrange the words after each round.
vzeroupper
# x0..3[0-2] = s0..3
vbroadcasti128 0x00(%rdi),%ymm0
vbroadcasti128 0x10(%rdi),%ymm1
vbroadcasti128 0x20(%rdi),%ymm2
vbroadcasti128 0x30(%rdi),%ymm3
vpaddd CTR2BL(%rip),%ymm3,%ymm3
vmovdqa %ymm0,%ymm8
vmovdqa %ymm1,%ymm9
vmovdqa %ymm2,%ymm10
vmovdqa %ymm3,%ymm11
.Ldoubleround:
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $16,%ymm3,%ymm3
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $12,%ymm1,%ymm1
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $8,%ymm3,%ymm3
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $7,%ymm1,%ymm1
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm1,%ymm1
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm3,%ymm3
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $16,%ymm3,%ymm3
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $12,%ymm1,%ymm1
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $8,%ymm3,%ymm3
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $7,%ymm1,%ymm1
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm1,%ymm1
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm3,%ymm3
sub $2,%r8d
jnz .Ldoubleround
# o0 = i0 ^ (x0 + s0)
vpaddd %ymm8,%ymm0,%ymm7
cmp $0x10,%rcx
jl .Lxorpart2
vpxord 0x00(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x00(%rsi)
vextracti128 $1,%ymm7,%xmm0
# o1 = i1 ^ (x1 + s1)
vpaddd %ymm9,%ymm1,%ymm7
cmp $0x20,%rcx
jl .Lxorpart2
vpxord 0x10(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x10(%rsi)
vextracti128 $1,%ymm7,%xmm1
# o2 = i2 ^ (x2 + s2)
vpaddd %ymm10,%ymm2,%ymm7
cmp $0x30,%rcx
jl .Lxorpart2
vpxord 0x20(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x20(%rsi)
vextracti128 $1,%ymm7,%xmm2
# o3 = i3 ^ (x3 + s3)
vpaddd %ymm11,%ymm3,%ymm7
cmp $0x40,%rcx
jl .Lxorpart2
vpxord 0x30(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x30(%rsi)
vextracti128 $1,%ymm7,%xmm3
# xor and write second block
vmovdqa %xmm0,%xmm7
cmp $0x50,%rcx
jl .Lxorpart2
vpxord 0x40(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x40(%rsi)
vmovdqa %xmm1,%xmm7
cmp $0x60,%rcx
jl .Lxorpart2
vpxord 0x50(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x50(%rsi)
vmovdqa %xmm2,%xmm7
cmp $0x70,%rcx
jl .Lxorpart2
vpxord 0x60(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x60(%rsi)
vmovdqa %xmm3,%xmm7
cmp $0x80,%rcx
jl .Lxorpart2
vpxord 0x70(%rdx),%xmm7,%xmm6
vmovdqu %xmm6,0x70(%rsi)
.Ldone2:
vzeroupper
ret
.Lxorpart2:
# xor remaining bytes from partial register into output
mov %rcx,%rax
and $0xf,%rcx
jz .Ldone8
mov %rax,%r9
and $~0xf,%r9
mov $1,%rax
shld %cl,%rax,%rax
sub $1,%rax
kmovq %rax,%k1
vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
vpxord %xmm7,%xmm1,%xmm1
vmovdqu8 %xmm1,(%rsi,%r9){%k1}
jmp .Ldone2
ENDPROC(chacha_2block_xor_avx512vl)
ENTRY(chacha_4block_xor_avx512vl)
# %rdi: Input state matrix, s
# %rsi: up to 4 data blocks output, o
# %rdx: up to 4 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
# This function encrypts four ChaCha blocks by loading the state
# matrix four times across eight AVX registers. It performs matrix
# operations on four words in two matrices in parallel, sequentially
# to the operations on the four words of the other two matrices. The
# required word shuffling has a rather high latency, we can do the
# arithmetic on two matrix-pairs without much slowdown.
vzeroupper
# x0..3[0-4] = s0..3
vbroadcasti128 0x00(%rdi),%ymm0
vbroadcasti128 0x10(%rdi),%ymm1
vbroadcasti128 0x20(%rdi),%ymm2
vbroadcasti128 0x30(%rdi),%ymm3
vmovdqa %ymm0,%ymm4
vmovdqa %ymm1,%ymm5
vmovdqa %ymm2,%ymm6
vmovdqa %ymm3,%ymm7
vpaddd CTR2BL(%rip),%ymm3,%ymm3
vpaddd CTR4BL(%rip),%ymm7,%ymm7
vmovdqa %ymm0,%ymm11
vmovdqa %ymm1,%ymm12
vmovdqa %ymm2,%ymm13
vmovdqa %ymm3,%ymm14
vmovdqa %ymm7,%ymm15
.Ldoubleround4:
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $16,%ymm3,%ymm3
vpaddd %ymm5,%ymm4,%ymm4
vpxord %ymm4,%ymm7,%ymm7
vprold $16,%ymm7,%ymm7
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $12,%ymm1,%ymm1
vpaddd %ymm7,%ymm6,%ymm6
vpxord %ymm6,%ymm5,%ymm5
vprold $12,%ymm5,%ymm5
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $8,%ymm3,%ymm3
vpaddd %ymm5,%ymm4,%ymm4
vpxord %ymm4,%ymm7,%ymm7
vprold $8,%ymm7,%ymm7
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $7,%ymm1,%ymm1
vpaddd %ymm7,%ymm6,%ymm6
vpxord %ymm6,%ymm5,%ymm5
vprold $7,%ymm5,%ymm5
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm1,%ymm1
vpshufd $0x39,%ymm5,%ymm5
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
vpshufd $0x4e,%ymm6,%ymm6
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm3,%ymm3
vpshufd $0x93,%ymm7,%ymm7
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $16,%ymm3,%ymm3
vpaddd %ymm5,%ymm4,%ymm4
vpxord %ymm4,%ymm7,%ymm7
vprold $16,%ymm7,%ymm7
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $12,%ymm1,%ymm1
vpaddd %ymm7,%ymm6,%ymm6
vpxord %ymm6,%ymm5,%ymm5
vprold $12,%ymm5,%ymm5
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vpaddd %ymm1,%ymm0,%ymm0
vpxord %ymm0,%ymm3,%ymm3
vprold $8,%ymm3,%ymm3
vpaddd %ymm5,%ymm4,%ymm4
vpxord %ymm4,%ymm7,%ymm7
vprold $8,%ymm7,%ymm7
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vpaddd %ymm3,%ymm2,%ymm2
vpxord %ymm2,%ymm1,%ymm1
vprold $7,%ymm1,%ymm1
vpaddd %ymm7,%ymm6,%ymm6
vpxord %ymm6,%ymm5,%ymm5
vprold $7,%ymm5,%ymm5
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
vpshufd $0x93,%ymm1,%ymm1
vpshufd $0x93,%ymm5,%ymm5
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vpshufd $0x4e,%ymm2,%ymm2
vpshufd $0x4e,%ymm6,%ymm6
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
vpshufd $0x39,%ymm3,%ymm3
vpshufd $0x39,%ymm7,%ymm7
sub $2,%r8d
jnz .Ldoubleround4
# o0 = i0 ^ (x0 + s0), first block
vpaddd %ymm11,%ymm0,%ymm10
cmp $0x10,%rcx
jl .Lxorpart4
vpxord 0x00(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x00(%rsi)
vextracti128 $1,%ymm10,%xmm0
# o1 = i1 ^ (x1 + s1), first block
vpaddd %ymm12,%ymm1,%ymm10
cmp $0x20,%rcx
jl .Lxorpart4
vpxord 0x10(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x10(%rsi)
vextracti128 $1,%ymm10,%xmm1
# o2 = i2 ^ (x2 + s2), first block
vpaddd %ymm13,%ymm2,%ymm10
cmp $0x30,%rcx
jl .Lxorpart4
vpxord 0x20(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x20(%rsi)
vextracti128 $1,%ymm10,%xmm2
# o3 = i3 ^ (x3 + s3), first block
vpaddd %ymm14,%ymm3,%ymm10
cmp $0x40,%rcx
jl .Lxorpart4
vpxord 0x30(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x30(%rsi)
vextracti128 $1,%ymm10,%xmm3
# xor and write second block
vmovdqa %xmm0,%xmm10
cmp $0x50,%rcx
jl .Lxorpart4
vpxord 0x40(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x40(%rsi)
vmovdqa %xmm1,%xmm10
cmp $0x60,%rcx
jl .Lxorpart4
vpxord 0x50(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x50(%rsi)
vmovdqa %xmm2,%xmm10
cmp $0x70,%rcx
jl .Lxorpart4
vpxord 0x60(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x60(%rsi)
vmovdqa %xmm3,%xmm10
cmp $0x80,%rcx
jl .Lxorpart4
vpxord 0x70(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x70(%rsi)
# o0 = i0 ^ (x0 + s0), third block
vpaddd %ymm11,%ymm4,%ymm10
cmp $0x90,%rcx
jl .Lxorpart4
vpxord 0x80(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x80(%rsi)
vextracti128 $1,%ymm10,%xmm4
# o1 = i1 ^ (x1 + s1), third block
vpaddd %ymm12,%ymm5,%ymm10
cmp $0xa0,%rcx
jl .Lxorpart4
vpxord 0x90(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0x90(%rsi)
vextracti128 $1,%ymm10,%xmm5
# o2 = i2 ^ (x2 + s2), third block
vpaddd %ymm13,%ymm6,%ymm10
cmp $0xb0,%rcx
jl .Lxorpart4
vpxord 0xa0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xa0(%rsi)
vextracti128 $1,%ymm10,%xmm6
# o3 = i3 ^ (x3 + s3), third block
vpaddd %ymm15,%ymm7,%ymm10
cmp $0xc0,%rcx
jl .Lxorpart4
vpxord 0xb0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xb0(%rsi)
vextracti128 $1,%ymm10,%xmm7
# xor and write fourth block
vmovdqa %xmm4,%xmm10
cmp $0xd0,%rcx
jl .Lxorpart4
vpxord 0xc0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xc0(%rsi)
vmovdqa %xmm5,%xmm10
cmp $0xe0,%rcx
jl .Lxorpart4
vpxord 0xd0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xd0(%rsi)
vmovdqa %xmm6,%xmm10
cmp $0xf0,%rcx
jl .Lxorpart4
vpxord 0xe0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xe0(%rsi)
vmovdqa %xmm7,%xmm10
cmp $0x100,%rcx
jl .Lxorpart4
vpxord 0xf0(%rdx),%xmm10,%xmm9
vmovdqu %xmm9,0xf0(%rsi)
.Ldone4:
vzeroupper
ret
.Lxorpart4:
# xor remaining bytes from partial register into output
mov %rcx,%rax
and $0xf,%rcx
jz .Ldone8
mov %rax,%r9
and $~0xf,%r9
mov $1,%rax
shld %cl,%rax,%rax
sub $1,%rax
kmovq %rax,%k1
vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
vpxord %xmm10,%xmm1,%xmm1
vmovdqu8 %xmm1,(%rsi,%r9){%k1}
jmp .Ldone4
ENDPROC(chacha_4block_xor_avx512vl)
ENTRY(chacha_8block_xor_avx512vl)
# %rdi: Input state matrix, s
# %rsi: up to 8 data blocks output, o
# %rdx: up to 8 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
# This function encrypts eight consecutive ChaCha blocks by loading
# the state matrix in AVX registers eight times. Compared to AVX2, this
# mostly benefits from the new rotate instructions in VL and the
# additional registers.
vzeroupper
# x0..15[0-7] = s[0..15]
vpbroadcastd 0x00(%rdi),%ymm0
vpbroadcastd 0x04(%rdi),%ymm1
vpbroadcastd 0x08(%rdi),%ymm2
vpbroadcastd 0x0c(%rdi),%ymm3
vpbroadcastd 0x10(%rdi),%ymm4
vpbroadcastd 0x14(%rdi),%ymm5
vpbroadcastd 0x18(%rdi),%ymm6
vpbroadcastd 0x1c(%rdi),%ymm7
vpbroadcastd 0x20(%rdi),%ymm8
vpbroadcastd 0x24(%rdi),%ymm9
vpbroadcastd 0x28(%rdi),%ymm10
vpbroadcastd 0x2c(%rdi),%ymm11
vpbroadcastd 0x30(%rdi),%ymm12
vpbroadcastd 0x34(%rdi),%ymm13
vpbroadcastd 0x38(%rdi),%ymm14
vpbroadcastd 0x3c(%rdi),%ymm15
# x12 += counter values 0-3
vpaddd CTR8BL(%rip),%ymm12,%ymm12
vmovdqa64 %ymm0,%ymm16
vmovdqa64 %ymm1,%ymm17
vmovdqa64 %ymm2,%ymm18
vmovdqa64 %ymm3,%ymm19
vmovdqa64 %ymm4,%ymm20
vmovdqa64 %ymm5,%ymm21
vmovdqa64 %ymm6,%ymm22
vmovdqa64 %ymm7,%ymm23
vmovdqa64 %ymm8,%ymm24
vmovdqa64 %ymm9,%ymm25
vmovdqa64 %ymm10,%ymm26
vmovdqa64 %ymm11,%ymm27
vmovdqa64 %ymm12,%ymm28
vmovdqa64 %ymm13,%ymm29
vmovdqa64 %ymm14,%ymm30
vmovdqa64 %ymm15,%ymm31
.Ldoubleround8:
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
vpaddd %ymm0,%ymm4,%ymm0
vpxord %ymm0,%ymm12,%ymm12
vprold $16,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
vpaddd %ymm1,%ymm5,%ymm1
vpxord %ymm1,%ymm13,%ymm13
vprold $16,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
vpaddd %ymm2,%ymm6,%ymm2
vpxord %ymm2,%ymm14,%ymm14
vprold $16,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
vpaddd %ymm3,%ymm7,%ymm3
vpxord %ymm3,%ymm15,%ymm15
vprold $16,%ymm15,%ymm15
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
vpaddd %ymm12,%ymm8,%ymm8
vpxord %ymm8,%ymm4,%ymm4
vprold $12,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
vpaddd %ymm13,%ymm9,%ymm9
vpxord %ymm9,%ymm5,%ymm5
vprold $12,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
vpaddd %ymm14,%ymm10,%ymm10
vpxord %ymm10,%ymm6,%ymm6
vprold $12,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
vpaddd %ymm15,%ymm11,%ymm11
vpxord %ymm11,%ymm7,%ymm7
vprold $12,%ymm7,%ymm7
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
vpaddd %ymm0,%ymm4,%ymm0
vpxord %ymm0,%ymm12,%ymm12
vprold $8,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
vpaddd %ymm1,%ymm5,%ymm1
vpxord %ymm1,%ymm13,%ymm13
vprold $8,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
vpaddd %ymm2,%ymm6,%ymm2
vpxord %ymm2,%ymm14,%ymm14
vprold $8,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
vpaddd %ymm3,%ymm7,%ymm3
vpxord %ymm3,%ymm15,%ymm15
vprold $8,%ymm15,%ymm15
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
vpaddd %ymm12,%ymm8,%ymm8
vpxord %ymm8,%ymm4,%ymm4
vprold $7,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
vpaddd %ymm13,%ymm9,%ymm9
vpxord %ymm9,%ymm5,%ymm5
vprold $7,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
vpaddd %ymm14,%ymm10,%ymm10
vpxord %ymm10,%ymm6,%ymm6
vprold $7,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
vpaddd %ymm15,%ymm11,%ymm11
vpxord %ymm11,%ymm7,%ymm7
vprold $7,%ymm7,%ymm7
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
vpaddd %ymm0,%ymm5,%ymm0
vpxord %ymm0,%ymm15,%ymm15
vprold $16,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
vpaddd %ymm1,%ymm6,%ymm1
vpxord %ymm1,%ymm12,%ymm12
vprold $16,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
vpaddd %ymm2,%ymm7,%ymm2
vpxord %ymm2,%ymm13,%ymm13
vprold $16,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
vpaddd %ymm3,%ymm4,%ymm3
vpxord %ymm3,%ymm14,%ymm14
vprold $16,%ymm14,%ymm14
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
vpaddd %ymm15,%ymm10,%ymm10
vpxord %ymm10,%ymm5,%ymm5
vprold $12,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
vpaddd %ymm12,%ymm11,%ymm11
vpxord %ymm11,%ymm6,%ymm6
vprold $12,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
vpaddd %ymm13,%ymm8,%ymm8
vpxord %ymm8,%ymm7,%ymm7
vprold $12,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
vpaddd %ymm14,%ymm9,%ymm9
vpxord %ymm9,%ymm4,%ymm4
vprold $12,%ymm4,%ymm4
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
vpaddd %ymm0,%ymm5,%ymm0
vpxord %ymm0,%ymm15,%ymm15
vprold $8,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
vpaddd %ymm1,%ymm6,%ymm1
vpxord %ymm1,%ymm12,%ymm12
vprold $8,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
vpaddd %ymm2,%ymm7,%ymm2
vpxord %ymm2,%ymm13,%ymm13
vprold $8,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
vpaddd %ymm3,%ymm4,%ymm3
vpxord %ymm3,%ymm14,%ymm14
vprold $8,%ymm14,%ymm14
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
vpaddd %ymm15,%ymm10,%ymm10
vpxord %ymm10,%ymm5,%ymm5
vprold $7,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
vpaddd %ymm12,%ymm11,%ymm11
vpxord %ymm11,%ymm6,%ymm6
vprold $7,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
vpaddd %ymm13,%ymm8,%ymm8
vpxord %ymm8,%ymm7,%ymm7
vprold $7,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
vpaddd %ymm14,%ymm9,%ymm9
vpxord %ymm9,%ymm4,%ymm4
vprold $7,%ymm4,%ymm4
sub $2,%r8d
jnz .Ldoubleround8
# x0..15[0-3] += s[0..15]
vpaddd %ymm16,%ymm0,%ymm0
vpaddd %ymm17,%ymm1,%ymm1
vpaddd %ymm18,%ymm2,%ymm2
vpaddd %ymm19,%ymm3,%ymm3
vpaddd %ymm20,%ymm4,%ymm4
vpaddd %ymm21,%ymm5,%ymm5
vpaddd %ymm22,%ymm6,%ymm6
vpaddd %ymm23,%ymm7,%ymm7
vpaddd %ymm24,%ymm8,%ymm8
vpaddd %ymm25,%ymm9,%ymm9
vpaddd %ymm26,%ymm10,%ymm10
vpaddd %ymm27,%ymm11,%ymm11
vpaddd %ymm28,%ymm12,%ymm12
vpaddd %ymm29,%ymm13,%ymm13
vpaddd %ymm30,%ymm14,%ymm14
vpaddd %ymm31,%ymm15,%ymm15
# interleave 32-bit words in state n, n+1
vpunpckldq %ymm1,%ymm0,%ymm16
vpunpckhdq %ymm1,%ymm0,%ymm17
vpunpckldq %ymm3,%ymm2,%ymm18
vpunpckhdq %ymm3,%ymm2,%ymm19
vpunpckldq %ymm5,%ymm4,%ymm20
vpunpckhdq %ymm5,%ymm4,%ymm21
vpunpckldq %ymm7,%ymm6,%ymm22
vpunpckhdq %ymm7,%ymm6,%ymm23
vpunpckldq %ymm9,%ymm8,%ymm24
vpunpckhdq %ymm9,%ymm8,%ymm25
vpunpckldq %ymm11,%ymm10,%ymm26
vpunpckhdq %ymm11,%ymm10,%ymm27
vpunpckldq %ymm13,%ymm12,%ymm28
vpunpckhdq %ymm13,%ymm12,%ymm29
vpunpckldq %ymm15,%ymm14,%ymm30
vpunpckhdq %ymm15,%ymm14,%ymm31
# interleave 64-bit words in state n, n+2
vpunpcklqdq %ymm18,%ymm16,%ymm0
vpunpcklqdq %ymm19,%ymm17,%ymm1
vpunpckhqdq %ymm18,%ymm16,%ymm2
vpunpckhqdq %ymm19,%ymm17,%ymm3
vpunpcklqdq %ymm22,%ymm20,%ymm4
vpunpcklqdq %ymm23,%ymm21,%ymm5
vpunpckhqdq %ymm22,%ymm20,%ymm6
vpunpckhqdq %ymm23,%ymm21,%ymm7
vpunpcklqdq %ymm26,%ymm24,%ymm8
vpunpcklqdq %ymm27,%ymm25,%ymm9
vpunpckhqdq %ymm26,%ymm24,%ymm10
vpunpckhqdq %ymm27,%ymm25,%ymm11
vpunpcklqdq %ymm30,%ymm28,%ymm12
vpunpcklqdq %ymm31,%ymm29,%ymm13
vpunpckhqdq %ymm30,%ymm28,%ymm14
vpunpckhqdq %ymm31,%ymm29,%ymm15
# interleave 128-bit words in state n, n+4
# xor/write first four blocks
vmovdqa64 %ymm0,%ymm16
vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
cmp $0x0020,%rcx
jl .Lxorpart8
vpxord 0x0000(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0000(%rsi)
vmovdqa64 %ymm16,%ymm0
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
cmp $0x0040,%rcx
jl .Lxorpart8
vpxord 0x0020(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0020(%rsi)
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
cmp $0x0060,%rcx
jl .Lxorpart8
vpxord 0x0040(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0040(%rsi)
vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
cmp $0x0080,%rcx
jl .Lxorpart8
vpxord 0x0060(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0060(%rsi)
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
cmp $0x00a0,%rcx
jl .Lxorpart8
vpxord 0x0080(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0080(%rsi)
vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
cmp $0x00c0,%rcx
jl .Lxorpart8
vpxord 0x00a0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x00a0(%rsi)
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
cmp $0x00e0,%rcx
jl .Lxorpart8
vpxord 0x00c0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x00c0(%rsi)
vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
cmp $0x0100,%rcx
jl .Lxorpart8
vpxord 0x00e0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x00e0(%rsi)
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
# xor remaining blocks, write to output
vmovdqa64 %ymm4,%ymm0
cmp $0x0120,%rcx
jl .Lxorpart8
vpxord 0x0100(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0100(%rsi)
vmovdqa64 %ymm12,%ymm0
cmp $0x0140,%rcx
jl .Lxorpart8
vpxord 0x0120(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0120(%rsi)
vmovdqa64 %ymm6,%ymm0
cmp $0x0160,%rcx
jl .Lxorpart8
vpxord 0x0140(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0140(%rsi)
vmovdqa64 %ymm14,%ymm0
cmp $0x0180,%rcx
jl .Lxorpart8
vpxord 0x0160(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0160(%rsi)
vmovdqa64 %ymm5,%ymm0
cmp $0x01a0,%rcx
jl .Lxorpart8
vpxord 0x0180(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0180(%rsi)
vmovdqa64 %ymm13,%ymm0
cmp $0x01c0,%rcx
jl .Lxorpart8
vpxord 0x01a0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x01a0(%rsi)
vmovdqa64 %ymm7,%ymm0
cmp $0x01e0,%rcx
jl .Lxorpart8
vpxord 0x01c0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x01c0(%rsi)
vmovdqa64 %ymm15,%ymm0
cmp $0x0200,%rcx
jl .Lxorpart8
vpxord 0x01e0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x01e0(%rsi)
.Ldone8:
vzeroupper
ret
.Lxorpart8:
# xor remaining bytes from partial register into output
mov %rcx,%rax
and $0x1f,%rcx
jz .Ldone8
mov %rax,%r9
and $~0x1f,%r9
mov $1,%rax
shld %cl,%rax,%rax
sub $1,%rax
kmovq %rax,%k1
vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
vpxord %ymm0,%ymm1,%ymm1
vmovdqu8 %ymm1,(%rsi,%r9){%k1}
jmp .Ldone8
ENDPROC(chacha_8block_xor_avx512vl)
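The .Lxorpart8 tail above covers lengths that are not a multiple of 32 bytes: it builds a byte mask of (1 << (len mod 32)) - 1 with the shld trick, then uses masked AVX-512 loads and stores to XOR only the remaining bytes. A minimal plain-C sketch of that tail handling follows; the function and parameter names are illustrative, not kernel symbols.

/* Illustrative only: byte-wise equivalent of the masked tail XOR above. */
#include <stddef.h>
#include <stdint.h>

static void chacha_xor_tail32(uint8_t *dst, const uint8_t *src,
			      const uint8_t keystream[32], size_t len)
{
	size_t rem = len & 0x1f;          /* bytes left in the partial 32-byte chunk */
	size_t off = len & ~(size_t)0x1f; /* where that partial chunk starts         */

	for (size_t i = 0; i < rem; i++)
		dst[off + i] = src[off + i] ^ keystream[i];
}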


@@ -1,5 +1,5 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
* ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
*
* Copyright (C) 2015 Martin Willi
*
@@ -10,6 +10,7 @@
*/
#include <linux/linkage.h>
#include <asm/frame.h>
.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
@@ -23,35 +24,25 @@ CTRINC: .octa 0x00000003000000020000000100000000
.text
ENTRY(chacha20_block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: 1 data block output, o
# %rdx: 1 data block input, i
# This function encrypts one ChaCha20 block by loading the state matrix
# in four SSE registers. It performs matrix operations on four words in
# parallel, but requires shuffling to rearrange the words after each
# round. 8/16-bit word rotation is done with the slightly better
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
# traditional shift+OR.
# x0..3 = s0..3
movdqa 0x00(%rdi),%xmm0
movdqa 0x10(%rdi),%xmm1
movdqa 0x20(%rdi),%xmm2
movdqa 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
movdqa %xmm3,%xmm11
/*
* chacha_permute - permute one block
*
* Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
* function performs matrix operations on four words in parallel, but requires
* shuffling to rearrange the words after each round. 8/16-bit word rotation is
* done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
* rotation uses traditional shift+OR.
*
* The round count is given in %r8d.
*
* Clobbers: %r8d, %xmm4-%xmm7
*/
chacha_permute:
movdqa ROT8(%rip),%xmm4
movdqa ROT16(%rip),%xmm5
mov $10,%ecx
.Ldoubleround:
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
@@ -118,39 +109,129 @@ ENTRY(chacha20_block_xor_ssse3)
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
pshufd $0x39,%xmm3,%xmm3
dec %ecx
sub $2,%r8d
jnz .Ldoubleround
ret
ENDPROC(chacha_permute)
ENTRY(chacha_block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: up to 1 data block output, o
# %rdx: up to 1 data block input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
FRAME_BEGIN
# x0..3 = s0..3
movdqu 0x00(%rdi),%xmm0
movdqu 0x10(%rdi),%xmm1
movdqu 0x20(%rdi),%xmm2
movdqu 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
movdqa %xmm3,%xmm11
mov %rcx,%rax
call chacha_permute
# o0 = i0 ^ (x0 + s0)
movdqu 0x00(%rdx),%xmm4
paddd %xmm8,%xmm0
cmp $0x10,%rax
jl .Lxorpart
movdqu 0x00(%rdx),%xmm4
pxor %xmm4,%xmm0
movdqu %xmm0,0x00(%rsi)
# o1 = i1 ^ (x1 + s1)
movdqu 0x10(%rdx),%xmm5
paddd %xmm9,%xmm1
pxor %xmm5,%xmm1
movdqu %xmm1,0x10(%rsi)
movdqa %xmm1,%xmm0
cmp $0x20,%rax
jl .Lxorpart
movdqu 0x10(%rdx),%xmm0
pxor %xmm1,%xmm0
movdqu %xmm0,0x10(%rsi)
# o2 = i2 ^ (x2 + s2)
movdqu 0x20(%rdx),%xmm6
paddd %xmm10,%xmm2
pxor %xmm6,%xmm2
movdqu %xmm2,0x20(%rsi)
movdqa %xmm2,%xmm0
cmp $0x30,%rax
jl .Lxorpart
movdqu 0x20(%rdx),%xmm0
pxor %xmm2,%xmm0
movdqu %xmm0,0x20(%rsi)
# o3 = i3 ^ (x3 + s3)
movdqu 0x30(%rdx),%xmm7
paddd %xmm11,%xmm3
pxor %xmm7,%xmm3
movdqu %xmm3,0x30(%rsi)
movdqa %xmm3,%xmm0
cmp $0x40,%rax
jl .Lxorpart
movdqu 0x30(%rdx),%xmm0
pxor %xmm3,%xmm0
movdqu %xmm0,0x30(%rsi)
.Ldone:
FRAME_END
ret
ENDPROC(chacha20_block_xor_ssse3)
ENTRY(chacha20_4block_xor_ssse3)
.Lxorpart:
# xor remaining bytes from partial register into output
mov %rax,%r9
and $0x0f,%r9
jz .Ldone
and $~0x0f,%rax
mov %rsi,%r11
lea 8(%rsp),%r10
sub $0x10,%rsp
and $~31,%rsp
lea (%rdx,%rax),%rsi
mov %rsp,%rdi
mov %r9,%rcx
rep movsb
pxor 0x00(%rsp),%xmm0
movdqa %xmm0,0x00(%rsp)
mov %rsp,%rsi
lea (%r11,%rax),%rdi
mov %r9,%rcx
rep movsb
lea -8(%r10),%rsp
jmp .Ldone
ENDPROC(chacha_block_xor_ssse3)
ENTRY(hchacha_block_ssse3)
# %rdi: Input state matrix, s
# %rsi: 4 data blocks output, o
# %rdx: 4 data blocks input, i
# %rsi: output (8 32-bit words)
# %edx: nrounds
FRAME_BEGIN
# This function encrypts four consecutive ChaCha20 blocks by loading the
movdqu 0x00(%rdi),%xmm0
movdqu 0x10(%rdi),%xmm1
movdqu 0x20(%rdi),%xmm2
movdqu 0x30(%rdi),%xmm3
mov %edx,%r8d
call chacha_permute
movdqu %xmm0,0x00(%rsi)
movdqu %xmm3,0x10(%rsi)
FRAME_END
ret
ENDPROC(hchacha_block_ssse3)
ENTRY(chacha_4block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: up to 4 data blocks output, o
# %rdx: up to 4 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
# This function encrypts four consecutive ChaCha blocks by loading the
# state matrix in SSE registers four times. As we need some scratch
# registers, we save the first four registers on the stack. The
# algorithm performs each operation on the corresponding word of each
@@ -163,6 +244,7 @@ ENTRY(chacha20_4block_xor_ssse3)
lea 8(%rsp),%r10
sub $0x80,%rsp
and $~63,%rsp
mov %rcx,%rax
# x0..15[0-3] = s0..3[0..3]
movq 0x00(%rdi),%xmm1
@@ -202,8 +284,6 @@ ENTRY(chacha20_4block_xor_ssse3)
# x12 += counter values 0-3
paddd %xmm1,%xmm12
mov $10,%ecx
.Ldoubleround4:
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
movdqa 0x00(%rsp),%xmm0
@@ -421,7 +501,7 @@ ENTRY(chacha20_4block_xor_ssse3)
psrld $25,%xmm4
por %xmm0,%xmm4
dec %ecx
sub $2,%r8d
jnz .Ldoubleround4
# x0[0-3] += s0[0]
@@ -573,58 +653,143 @@ ENTRY(chacha20_4block_xor_ssse3)
# xor with corresponding input, write to output
movdqa 0x00(%rsp),%xmm0
cmp $0x10,%rax
jl .Lxorpart4
movdqu 0x00(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x00(%rsi)
movdqa 0x10(%rsp),%xmm0
movdqu 0x80(%rdx),%xmm1
movdqu %xmm4,%xmm0
cmp $0x20,%rax
jl .Lxorpart4
movdqu 0x10(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x80(%rsi)
movdqu %xmm0,0x10(%rsi)
movdqu %xmm8,%xmm0
cmp $0x30,%rax
jl .Lxorpart4
movdqu 0x20(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x20(%rsi)
movdqu %xmm12,%xmm0
cmp $0x40,%rax
jl .Lxorpart4
movdqu 0x30(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x30(%rsi)
movdqa 0x20(%rsp),%xmm0
cmp $0x50,%rax
jl .Lxorpart4
movdqu 0x40(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x40(%rsi)
movdqu %xmm6,%xmm0
cmp $0x60,%rax
jl .Lxorpart4
movdqu 0x50(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x50(%rsi)
movdqu %xmm10,%xmm0
cmp $0x70,%rax
jl .Lxorpart4
movdqu 0x60(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x60(%rsi)
movdqu %xmm14,%xmm0
cmp $0x80,%rax
jl .Lxorpart4
movdqu 0x70(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x70(%rsi)
movdqa 0x10(%rsp),%xmm0
cmp $0x90,%rax
jl .Lxorpart4
movdqu 0x80(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x80(%rsi)
movdqu %xmm5,%xmm0
cmp $0xa0,%rax
jl .Lxorpart4
movdqu 0x90(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x90(%rsi)
movdqu %xmm9,%xmm0
cmp $0xb0,%rax
jl .Lxorpart4
movdqu 0xa0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xa0(%rsi)
movdqu %xmm13,%xmm0
cmp $0xc0,%rax
jl .Lxorpart4
movdqu 0xb0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xb0(%rsi)
movdqa 0x30(%rsp),%xmm0
cmp $0xd0,%rax
jl .Lxorpart4
movdqu 0xc0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xc0(%rsi)
movdqu 0x10(%rdx),%xmm1
pxor %xmm1,%xmm4
movdqu %xmm4,0x10(%rsi)
movdqu 0x90(%rdx),%xmm1
pxor %xmm1,%xmm5
movdqu %xmm5,0x90(%rsi)
movdqu 0x50(%rdx),%xmm1
pxor %xmm1,%xmm6
movdqu %xmm6,0x50(%rsi)
movdqu 0xd0(%rdx),%xmm1
pxor %xmm1,%xmm7
movdqu %xmm7,0xd0(%rsi)
movdqu 0x20(%rdx),%xmm1
pxor %xmm1,%xmm8
movdqu %xmm8,0x20(%rsi)
movdqu 0xa0(%rdx),%xmm1
pxor %xmm1,%xmm9
movdqu %xmm9,0xa0(%rsi)
movdqu 0x60(%rdx),%xmm1
pxor %xmm1,%xmm10
movdqu %xmm10,0x60(%rsi)
movdqu 0xe0(%rdx),%xmm1
pxor %xmm1,%xmm11
movdqu %xmm11,0xe0(%rsi)
movdqu 0x30(%rdx),%xmm1
pxor %xmm1,%xmm12
movdqu %xmm12,0x30(%rsi)
movdqu 0xb0(%rdx),%xmm1
pxor %xmm1,%xmm13
movdqu %xmm13,0xb0(%rsi)
movdqu 0x70(%rdx),%xmm1
pxor %xmm1,%xmm14
movdqu %xmm14,0x70(%rsi)
movdqu 0xf0(%rdx),%xmm1
pxor %xmm1,%xmm15
movdqu %xmm15,0xf0(%rsi)
movdqu %xmm7,%xmm0
cmp $0xe0,%rax
jl .Lxorpart4
movdqu 0xd0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xd0(%rsi)
movdqu %xmm11,%xmm0
cmp $0xf0,%rax
jl .Lxorpart4
movdqu 0xe0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xe0(%rsi)
movdqu %xmm15,%xmm0
cmp $0x100,%rax
jl .Lxorpart4
movdqu 0xf0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xf0(%rsi)
.Ldone4:
lea -8(%r10),%rsp
ret
ENDPROC(chacha20_4block_xor_ssse3)
.Lxorpart4:
# xor remaining bytes from partial register into output
mov %rax,%r9
and $0x0f,%r9
jz .Ldone4
and $~0x0f,%rax
mov %rsi,%r11
lea (%rdx,%rax),%rsi
mov %rsp,%rdi
mov %r9,%rcx
rep movsb
pxor 0x00(%rsp),%xmm0
movdqa %xmm0,0x00(%rsp)
mov %rsp,%rsi
lea (%r11,%rax),%rdi
mov %r9,%rcx
rep movsb
jmp .Ldone4
ENDPROC(chacha_4block_xor_ssse3)


@@ -1,448 +0,0 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/linkage.h>
.section .rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
.octa 0x0e0d0c0f0a09080b0605040702010003
.section .rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
.octa 0x0d0c0f0e09080b0a0504070601000302
.section .rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC: .octa 0x00000003000000020000000100000000
.octa 0x00000007000000060000000500000004
.text
ENTRY(chacha20_8block_xor_avx2)
# %rdi: Input state matrix, s
# %rsi: 8 data blocks output, o
# %rdx: 8 data blocks input, i
# This function encrypts eight consecutive ChaCha20 blocks by loading
# the state matrix in AVX registers eight times. As we need some
# scratch registers, we save the first four registers on the stack. The
# algorithm performs each operation on the corresponding word of each
# state matrix, hence requires no word shuffling. For final XORing step
# we transpose the matrix by interleaving 32-, 64- and then 128-bit
# words, which allows us to do XOR in AVX registers. 8/16-bit word
# rotation is done with the slightly better performing byte shuffling,
# 7/12-bit word rotation uses traditional shift+OR.
vzeroupper
# 4 * 32 byte stack, 32-byte aligned
lea 8(%rsp),%r10
and $~31, %rsp
sub $0x80, %rsp
# x0..15[0-7] = s[0..15]
vpbroadcastd 0x00(%rdi),%ymm0
vpbroadcastd 0x04(%rdi),%ymm1
vpbroadcastd 0x08(%rdi),%ymm2
vpbroadcastd 0x0c(%rdi),%ymm3
vpbroadcastd 0x10(%rdi),%ymm4
vpbroadcastd 0x14(%rdi),%ymm5
vpbroadcastd 0x18(%rdi),%ymm6
vpbroadcastd 0x1c(%rdi),%ymm7
vpbroadcastd 0x20(%rdi),%ymm8
vpbroadcastd 0x24(%rdi),%ymm9
vpbroadcastd 0x28(%rdi),%ymm10
vpbroadcastd 0x2c(%rdi),%ymm11
vpbroadcastd 0x30(%rdi),%ymm12
vpbroadcastd 0x34(%rdi),%ymm13
vpbroadcastd 0x38(%rdi),%ymm14
vpbroadcastd 0x3c(%rdi),%ymm15
# x0..3 on stack
vmovdqa %ymm0,0x00(%rsp)
vmovdqa %ymm1,0x20(%rsp)
vmovdqa %ymm2,0x40(%rsp)
vmovdqa %ymm3,0x60(%rsp)
vmovdqa CTRINC(%rip),%ymm1
vmovdqa ROT8(%rip),%ymm2
vmovdqa ROT16(%rip),%ymm3
# x12 += counter values 0-3
vpaddd %ymm1,%ymm12,%ymm12
mov $10,%ecx
.Ldoubleround8:
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
vpaddd 0x00(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm3,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
vpaddd 0x20(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm3,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
vpaddd 0x40(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm3,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
vpaddd 0x60(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm3,%ymm15,%ymm15
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
vpaddd %ymm12,%ymm8,%ymm8
vpxor %ymm8,%ymm4,%ymm4
vpslld $12,%ymm4,%ymm0
vpsrld $20,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
vpaddd %ymm13,%ymm9,%ymm9
vpxor %ymm9,%ymm5,%ymm5
vpslld $12,%ymm5,%ymm0
vpsrld $20,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
vpaddd %ymm14,%ymm10,%ymm10
vpxor %ymm10,%ymm6,%ymm6
vpslld $12,%ymm6,%ymm0
vpsrld $20,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
vpaddd %ymm15,%ymm11,%ymm11
vpxor %ymm11,%ymm7,%ymm7
vpslld $12,%ymm7,%ymm0
vpsrld $20,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
vpaddd 0x00(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm2,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
vpaddd 0x20(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm2,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
vpaddd 0x40(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm2,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
vpaddd 0x60(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm2,%ymm15,%ymm15
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
vpaddd %ymm12,%ymm8,%ymm8
vpxor %ymm8,%ymm4,%ymm4
vpslld $7,%ymm4,%ymm0
vpsrld $25,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
vpaddd %ymm13,%ymm9,%ymm9
vpxor %ymm9,%ymm5,%ymm5
vpslld $7,%ymm5,%ymm0
vpsrld $25,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
vpaddd %ymm14,%ymm10,%ymm10
vpxor %ymm10,%ymm6,%ymm6
vpslld $7,%ymm6,%ymm0
vpsrld $25,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
vpaddd %ymm15,%ymm11,%ymm11
vpxor %ymm11,%ymm7,%ymm7
vpslld $7,%ymm7,%ymm0
vpsrld $25,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
vpaddd 0x00(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm3,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
vpaddd 0x20(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm3,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
vpaddd 0x40(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm3,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
vpaddd 0x60(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm3,%ymm14,%ymm14
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
vpaddd %ymm15,%ymm10,%ymm10
vpxor %ymm10,%ymm5,%ymm5
vpslld $12,%ymm5,%ymm0
vpsrld $20,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
vpaddd %ymm12,%ymm11,%ymm11
vpxor %ymm11,%ymm6,%ymm6
vpslld $12,%ymm6,%ymm0
vpsrld $20,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
vpaddd %ymm13,%ymm8,%ymm8
vpxor %ymm8,%ymm7,%ymm7
vpslld $12,%ymm7,%ymm0
vpsrld $20,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
vpaddd %ymm14,%ymm9,%ymm9
vpxor %ymm9,%ymm4,%ymm4
vpslld $12,%ymm4,%ymm0
vpsrld $20,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
vpaddd 0x00(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm2,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
vpaddd 0x20(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm2,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
vpaddd 0x40(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm2,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
vpaddd 0x60(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm2,%ymm14,%ymm14
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
vpaddd %ymm15,%ymm10,%ymm10
vpxor %ymm10,%ymm5,%ymm5
vpslld $7,%ymm5,%ymm0
vpsrld $25,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
vpaddd %ymm12,%ymm11,%ymm11
vpxor %ymm11,%ymm6,%ymm6
vpslld $7,%ymm6,%ymm0
vpsrld $25,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
vpaddd %ymm13,%ymm8,%ymm8
vpxor %ymm8,%ymm7,%ymm7
vpslld $7,%ymm7,%ymm0
vpsrld $25,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
vpaddd %ymm14,%ymm9,%ymm9
vpxor %ymm9,%ymm4,%ymm4
vpslld $7,%ymm4,%ymm0
vpsrld $25,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
dec %ecx
jnz .Ldoubleround8
# x0..15[0-3] += s[0..15]
vpbroadcastd 0x00(%rdi),%ymm0
vpaddd 0x00(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpbroadcastd 0x04(%rdi),%ymm0
vpaddd 0x20(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpbroadcastd 0x08(%rdi),%ymm0
vpaddd 0x40(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpbroadcastd 0x0c(%rdi),%ymm0
vpaddd 0x60(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpbroadcastd 0x10(%rdi),%ymm0
vpaddd %ymm0,%ymm4,%ymm4
vpbroadcastd 0x14(%rdi),%ymm0
vpaddd %ymm0,%ymm5,%ymm5
vpbroadcastd 0x18(%rdi),%ymm0
vpaddd %ymm0,%ymm6,%ymm6
vpbroadcastd 0x1c(%rdi),%ymm0
vpaddd %ymm0,%ymm7,%ymm7
vpbroadcastd 0x20(%rdi),%ymm0
vpaddd %ymm0,%ymm8,%ymm8
vpbroadcastd 0x24(%rdi),%ymm0
vpaddd %ymm0,%ymm9,%ymm9
vpbroadcastd 0x28(%rdi),%ymm0
vpaddd %ymm0,%ymm10,%ymm10
vpbroadcastd 0x2c(%rdi),%ymm0
vpaddd %ymm0,%ymm11,%ymm11
vpbroadcastd 0x30(%rdi),%ymm0
vpaddd %ymm0,%ymm12,%ymm12
vpbroadcastd 0x34(%rdi),%ymm0
vpaddd %ymm0,%ymm13,%ymm13
vpbroadcastd 0x38(%rdi),%ymm0
vpaddd %ymm0,%ymm14,%ymm14
vpbroadcastd 0x3c(%rdi),%ymm0
vpaddd %ymm0,%ymm15,%ymm15
# x12 += counter values 0-3
vpaddd %ymm1,%ymm12,%ymm12
# interleave 32-bit words in state n, n+1
vmovdqa 0x00(%rsp),%ymm0
vmovdqa 0x20(%rsp),%ymm1
vpunpckldq %ymm1,%ymm0,%ymm2
vpunpckhdq %ymm1,%ymm0,%ymm1
vmovdqa %ymm2,0x00(%rsp)
vmovdqa %ymm1,0x20(%rsp)
vmovdqa 0x40(%rsp),%ymm0
vmovdqa 0x60(%rsp),%ymm1
vpunpckldq %ymm1,%ymm0,%ymm2
vpunpckhdq %ymm1,%ymm0,%ymm1
vmovdqa %ymm2,0x40(%rsp)
vmovdqa %ymm1,0x60(%rsp)
vmovdqa %ymm4,%ymm0
vpunpckldq %ymm5,%ymm0,%ymm4
vpunpckhdq %ymm5,%ymm0,%ymm5
vmovdqa %ymm6,%ymm0
vpunpckldq %ymm7,%ymm0,%ymm6
vpunpckhdq %ymm7,%ymm0,%ymm7
vmovdqa %ymm8,%ymm0
vpunpckldq %ymm9,%ymm0,%ymm8
vpunpckhdq %ymm9,%ymm0,%ymm9
vmovdqa %ymm10,%ymm0
vpunpckldq %ymm11,%ymm0,%ymm10
vpunpckhdq %ymm11,%ymm0,%ymm11
vmovdqa %ymm12,%ymm0
vpunpckldq %ymm13,%ymm0,%ymm12
vpunpckhdq %ymm13,%ymm0,%ymm13
vmovdqa %ymm14,%ymm0
vpunpckldq %ymm15,%ymm0,%ymm14
vpunpckhdq %ymm15,%ymm0,%ymm15
# interleave 64-bit words in state n, n+2
vmovdqa 0x00(%rsp),%ymm0
vmovdqa 0x40(%rsp),%ymm2
vpunpcklqdq %ymm2,%ymm0,%ymm1
vpunpckhqdq %ymm2,%ymm0,%ymm2
vmovdqa %ymm1,0x00(%rsp)
vmovdqa %ymm2,0x40(%rsp)
vmovdqa 0x20(%rsp),%ymm0
vmovdqa 0x60(%rsp),%ymm2
vpunpcklqdq %ymm2,%ymm0,%ymm1
vpunpckhqdq %ymm2,%ymm0,%ymm2
vmovdqa %ymm1,0x20(%rsp)
vmovdqa %ymm2,0x60(%rsp)
vmovdqa %ymm4,%ymm0
vpunpcklqdq %ymm6,%ymm0,%ymm4
vpunpckhqdq %ymm6,%ymm0,%ymm6
vmovdqa %ymm5,%ymm0
vpunpcklqdq %ymm7,%ymm0,%ymm5
vpunpckhqdq %ymm7,%ymm0,%ymm7
vmovdqa %ymm8,%ymm0
vpunpcklqdq %ymm10,%ymm0,%ymm8
vpunpckhqdq %ymm10,%ymm0,%ymm10
vmovdqa %ymm9,%ymm0
vpunpcklqdq %ymm11,%ymm0,%ymm9
vpunpckhqdq %ymm11,%ymm0,%ymm11
vmovdqa %ymm12,%ymm0
vpunpcklqdq %ymm14,%ymm0,%ymm12
vpunpckhqdq %ymm14,%ymm0,%ymm14
vmovdqa %ymm13,%ymm0
vpunpcklqdq %ymm15,%ymm0,%ymm13
vpunpckhqdq %ymm15,%ymm0,%ymm15
# interleave 128-bit words in state n, n+4
vmovdqa 0x00(%rsp),%ymm0
vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
vmovdqa %ymm1,0x00(%rsp)
vmovdqa 0x20(%rsp),%ymm0
vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
vmovdqa %ymm1,0x20(%rsp)
vmovdqa 0x40(%rsp),%ymm0
vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
vmovdqa %ymm1,0x40(%rsp)
vmovdqa 0x60(%rsp),%ymm0
vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
vmovdqa %ymm1,0x60(%rsp)
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
vmovdqa %ymm0,%ymm8
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
vmovdqa %ymm0,%ymm9
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
vmovdqa %ymm0,%ymm10
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
vmovdqa %ymm0,%ymm11
# xor with corresponding input, write to output
vmovdqa 0x00(%rsp),%ymm0
vpxor 0x0000(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0000(%rsi)
vmovdqa 0x20(%rsp),%ymm0
vpxor 0x0080(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0080(%rsi)
vmovdqa 0x40(%rsp),%ymm0
vpxor 0x0040(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0040(%rsi)
vmovdqa 0x60(%rsp),%ymm0
vpxor 0x00c0(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x00c0(%rsi)
vpxor 0x0100(%rdx),%ymm4,%ymm4
vmovdqu %ymm4,0x0100(%rsi)
vpxor 0x0180(%rdx),%ymm5,%ymm5
vmovdqu %ymm5,0x00180(%rsi)
vpxor 0x0140(%rdx),%ymm6,%ymm6
vmovdqu %ymm6,0x0140(%rsi)
vpxor 0x01c0(%rdx),%ymm7,%ymm7
vmovdqu %ymm7,0x01c0(%rsi)
vpxor 0x0020(%rdx),%ymm8,%ymm8
vmovdqu %ymm8,0x0020(%rsi)
vpxor 0x00a0(%rdx),%ymm9,%ymm9
vmovdqu %ymm9,0x00a0(%rsi)
vpxor 0x0060(%rdx),%ymm10,%ymm10
vmovdqu %ymm10,0x0060(%rsi)
vpxor 0x00e0(%rdx),%ymm11,%ymm11
vmovdqu %ymm11,0x00e0(%rsi)
vpxor 0x0120(%rdx),%ymm12,%ymm12
vmovdqu %ymm12,0x0120(%rsi)
vpxor 0x01a0(%rdx),%ymm13,%ymm13
vmovdqu %ymm13,0x01a0(%rsi)
vpxor 0x0160(%rdx),%ymm14,%ymm14
vmovdqu %ymm14,0x0160(%rsi)
vpxor 0x01e0(%rdx),%ymm15,%ymm15
vmovdqu %ymm15,0x01e0(%rsi)
vzeroupper
lea -8(%r10),%rsp
ret
ENDPROC(chacha20_8block_xor_avx2)


@@ -1,146 +0,0 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
#include <asm/simd.h>
#define CHACHA20_STATE_ALIGN 16
asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
#ifdef CONFIG_AS_AVX2
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
static bool chacha20_use_avx2;
#endif
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
{
u8 buf[CHACHA_BLOCK_SIZE];
#ifdef CONFIG_AS_AVX2
if (chacha20_use_avx2) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha20_8block_xor_avx2(state, dst, src);
bytes -= CHACHA_BLOCK_SIZE * 8;
src += CHACHA_BLOCK_SIZE * 8;
dst += CHACHA_BLOCK_SIZE * 8;
state[12] += 8;
}
}
#endif
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
chacha20_4block_xor_ssse3(state, dst, src);
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
while (bytes >= CHACHA_BLOCK_SIZE) {
chacha20_block_xor_ssse3(state, dst, src);
bytes -= CHACHA_BLOCK_SIZE;
src += CHACHA_BLOCK_SIZE;
dst += CHACHA_BLOCK_SIZE;
state[12]++;
}
if (bytes) {
memcpy(buf, src, bytes);
chacha20_block_xor_ssse3(state, buf, buf);
memcpy(dst, buf, bytes);
}
}
static int chacha20_simd(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
u32 *state, state_buf[16 + 2] __aligned(8);
struct skcipher_walk walk;
int err;
BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
return crypto_chacha_crypt(req);
err = skcipher_walk_virt(&walk, req, true);
crypto_chacha_init(state, ctx, walk.iv);
kernel_fpu_begin();
while (walk.nbytes >= CHACHA_BLOCK_SIZE) {
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
rounddown(walk.nbytes, CHACHA_BLOCK_SIZE));
err = skcipher_walk_done(&walk,
walk.nbytes % CHACHA_BLOCK_SIZE);
}
if (walk.nbytes) {
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
walk.nbytes);
err = skcipher_walk_done(&walk, 0);
}
kernel_fpu_end();
return err;
}
static struct skcipher_alg alg = {
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-simd",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = chacha20_simd,
.decrypt = chacha20_simd,
};
static int __init chacha20_simd_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_SSSE3))
return -ENODEV;
#ifdef CONFIG_AS_AVX2
chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
#endif
return crypto_register_skcipher(&alg);
}
static void __exit chacha20_simd_mod_fini(void)
{
crypto_unregister_skcipher(&alg);
}
module_init(chacha20_simd_mod_init);
module_exit(chacha20_simd_mod_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd");


@@ -0,0 +1,322 @@
/*
* x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
#include <asm/simd.h>
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
unsigned int len, int nrounds);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
{
len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
}
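/*
 * Worked example (illustrative): for bytes = 100 and maxblocks = 4,
 * chacha_advance() clamps 100 to min(100, 4 * 64) = 100 and rounds up to
 * the next 64-byte block boundary, i.e. 128 / 64 = 2, so the block counter
 * advances by the two blocks of keystream a partial call actually consumes.
 */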
static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes, int nrounds)
{
if (IS_ENABLED(CONFIG_AS_AVX512) &&
static_branch_likely(&chacha_use_avx512vl)) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha_8block_xor_avx512vl(state, dst, src, bytes,
nrounds);
bytes -= CHACHA_BLOCK_SIZE * 8;
src += CHACHA_BLOCK_SIZE * 8;
dst += CHACHA_BLOCK_SIZE * 8;
state[12] += 8;
}
if (bytes > CHACHA_BLOCK_SIZE * 4) {
chacha_8block_xor_avx512vl(state, dst, src, bytes,
nrounds);
state[12] += chacha_advance(bytes, 8);
return;
}
if (bytes > CHACHA_BLOCK_SIZE * 2) {
chacha_4block_xor_avx512vl(state, dst, src, bytes,
nrounds);
state[12] += chacha_advance(bytes, 4);
return;
}
if (bytes) {
chacha_2block_xor_avx512vl(state, dst, src, bytes,
nrounds);
state[12] += chacha_advance(bytes, 2);
return;
}
}
if (IS_ENABLED(CONFIG_AS_AVX2) &&
static_branch_likely(&chacha_use_avx2)) {
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 8;
src += CHACHA_BLOCK_SIZE * 8;
dst += CHACHA_BLOCK_SIZE * 8;
state[12] += 8;
}
if (bytes > CHACHA_BLOCK_SIZE * 4) {
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
state[12] += chacha_advance(bytes, 8);
return;
}
if (bytes > CHACHA_BLOCK_SIZE * 2) {
chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
state[12] += chacha_advance(bytes, 4);
return;
}
if (bytes > CHACHA_BLOCK_SIZE) {
chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
state[12] += chacha_advance(bytes, 2);
return;
}
}
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
if (bytes > CHACHA_BLOCK_SIZE) {
chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
state[12] += chacha_advance(bytes, 4);
return;
}
if (bytes) {
chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
state[12]++;
}
}
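/*
 * Note on the dispatch above: every *block_xor_* routine takes the byte
 * length and handles a trailing partial block itself, so only full
 * multiples of eight (or four) blocks go through the loops; a shorter tail
 * is passed once to the widest suitable kernel, with chacha_advance() (or a
 * plain increment for the last single block) working out how far to bump
 * the counter.
 */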
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
if (!static_branch_likely(&chacha_use_simd) || !may_use_simd()) {
hchacha_block_generic(state, stream, nrounds);
} else {
kernel_fpu_begin();
hchacha_block_ssse3(state, stream, nrounds);
kernel_fpu_end();
}
}
EXPORT_SYMBOL(hchacha_block_arch);
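/*
 * Illustrative sketch (not part of this file): what the HChaCha block
 * function computes, in generic C terms -- run the ChaCha permutation over
 * the 16-word state and emit words 0..3 and 12..15, without the final
 * feed-forward addition of the input state.  Names are hypothetical.
 */
#define CHACHA_QR_SKETCH(a, b, c, d) do {				\
	a += b; d ^= a; d = (d << 16) | (d >> 16);			\
	c += d; b ^= c; b = (b << 12) | (b >> 20);			\
	a += b; d ^= a; d = (d <<  8) | (d >> 24);			\
	c += d; b ^= c; b = (b <<  7) | (b >> 25);			\
} while (0)
static void __maybe_unused hchacha_block_sketch(const u32 state[16],
						u32 out[8], int nrounds)
{
	u32 x[16];
	int i;
	memcpy(x, state, sizeof(x));
	for (i = 0; i < nrounds; i += 2) {
		CHACHA_QR_SKETCH(x[0], x[4], x[ 8], x[12]);	/* column rounds   */
		CHACHA_QR_SKETCH(x[1], x[5], x[ 9], x[13]);
		CHACHA_QR_SKETCH(x[2], x[6], x[10], x[14]);
		CHACHA_QR_SKETCH(x[3], x[7], x[11], x[15]);
		CHACHA_QR_SKETCH(x[0], x[5], x[10], x[15]);	/* diagonal rounds */
		CHACHA_QR_SKETCH(x[1], x[6], x[11], x[12]);
		CHACHA_QR_SKETCH(x[2], x[7], x[ 8], x[13]);
		CHACHA_QR_SKETCH(x[3], x[4], x[ 9], x[14]);
	}
	memcpy(&out[0], &x[0],  4 * sizeof(u32));	/* words 0..3   */
	memcpy(&out[4], &x[12], 4 * sizeof(u32));	/* words 12..15 */
}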
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
int nrounds)
{
if (!static_branch_likely(&chacha_use_simd) || !may_use_simd() ||
bytes <= CHACHA_BLOCK_SIZE)
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
do {
unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
kernel_fpu_begin();
chacha_dosimd(state, dst, src, todo, nrounds);
kernel_fpu_end();
bytes -= todo;
src += todo;
dst += todo;
} while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
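/*
 * The SZ_4K chunking in chacha_crypt_arch() above keeps each
 * kernel_fpu_begin()/kernel_fpu_end() section bounded, so encrypting a large
 * buffer does not keep preemption disabled for the whole message.
 */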
static int chacha_simd_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
u32 state[CHACHA_STATE_WORDS] __aligned(8);
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
chacha_init_generic(state, ctx->key, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
if (!static_branch_likely(&chacha_use_simd) ||
!may_use_simd()) {
chacha_crypt_generic(state, walk.dst.virt.addr,
walk.src.virt.addr, nbytes,
ctx->nrounds);
} else {
kernel_fpu_begin();
chacha_dosimd(state, walk.dst.virt.addr,
walk.src.virt.addr, nbytes,
ctx->nrounds);
kernel_fpu_end();
}
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
static int chacha_simd(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
return chacha_simd_stream_xor(req, ctx, req->iv);
}
static int xchacha_simd(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
u32 state[CHACHA_STATE_WORDS] __aligned(8);
struct chacha_ctx subctx;
u8 real_iv[16];
chacha_init_generic(state, ctx->key, req->iv);
if (req->cryptlen > CHACHA_BLOCK_SIZE && irq_fpu_usable()) {
kernel_fpu_begin();
hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
kernel_fpu_end();
} else {
hchacha_block_generic(state, subctx.key, ctx->nrounds);
}
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
memcpy(&real_iv[8], req->iv + 16, 8);
return chacha_simd_stream_xor(req, &subctx, real_iv);
}
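/*
 * XChaCha construction, as wired up above: the key and the first 16 bytes of
 * the 32-byte IV go through HChaCha to derive the per-request subkey, and the
 * real ChaCha IV is then assembled from IV bytes 24..31 followed by bytes
 * 16..23 before the ordinary stream XOR runs.
 */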
static struct skcipher_alg algs[] = {
{
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-simd",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = chacha_simd,
.decrypt = chacha_simd,
}, {
.base.cra_name = "xchacha20",
.base.cra_driver_name = "xchacha20-simd",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha20_setkey,
.encrypt = xchacha_simd,
.decrypt = xchacha_simd,
}, {
.base.cra_name = "xchacha12",
.base.cra_driver_name = "xchacha12-simd",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.setkey = chacha12_setkey,
.encrypt = xchacha_simd,
.decrypt = xchacha_simd,
},
};
static int __init chacha_simd_mod_init(void)
{
if (!boot_cpu_has(X86_FEATURE_SSSE3))
return 0;
static_branch_enable(&chacha_use_simd);
if (IS_ENABLED(CONFIG_AS_AVX2) &&
boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
static_branch_enable(&chacha_use_avx2);
if (IS_ENABLED(CONFIG_AS_AVX512) &&
boot_cpu_has(X86_FEATURE_AVX512VL) &&
boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
static_branch_enable(&chacha_use_avx512vl);
}
return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}
static void __exit chacha_simd_mod_fini(void)
{
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3))
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-simd");
