/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copy to/from user space, handling exceptions as we go.  This
 * isn't exactly pretty.
 *
 * This is essentially the same as "memcpy()", but with a few twists.
 * Notably, we have to make sure that $18 is always up-to-date and
 * contains the right "bytes left to copy" value (and that it is updated
 * only _after_ a successful copy).  There is also some rather minor
 * exception setup stuff.
 *
 * Inputs:
 *      length in $18
 *      destination address in $16
 *      source address in $17
 *      return address in $26
 *
 * Outputs:
 *      bytes left to copy in $0
 *
 * Clobbers:
 *      $1,$2,$3,$4,$5,$16,$17
 *      (the SIMD paths below also write $f10-$f15 and $f22/$f23)
 *
 */
/* Author: Copy_user simd version 1.1 (20190904) by Gao Xiuwu. */
#include <asm/export.h>
/*
 * Allow an exception for an insn; exit if we get one.  EXI wraps loads
 * from the user source (a fault exits via $exitin, which zero-fills the
 * rest of the destination); EXO wraps stores to the user destination
 * (a fault exits via $exitout and simply reports the bytes left).
 */
#define EXI(x, y...)                    \
        99: x, ##y;                     \
        .section __ex_table, "a";       \
        .long 99b - .;                  \
        ldi $31, $exitin-99b($31);      \
        .previous

#define EXO(x, y...)                    \
        99: x, ##y;                     \
        .section __ex_table, "a";       \
        .long 99b - .;                  \
        ldi $31, $exitout-99b($31);     \
        .previous
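
/*
 * How the fixup is encoded (our reading of the code, not from the
 * original comments): each __ex_table entry is the faulting insn's
 * PC-relative offset (.long 99b - .) followed by an ldi whose 16-bit
 * displacement field carries the signed distance from the insn to its
 * fixup label; the fault handler decodes that displacement to decide
 * where execution resumes.
 */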
        .set noat
        .align 4
        .globl __copy_user
        .ent __copy_user
__copy_user:
        .prologue 0
        subl $18, 32, $1
        beq $18, $zerolength

        and $16, 7, $3
        ble $1, $onebyteloop
        beq $3, $destaligned
        subl $3, 8, $3
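/*
 * Here $3 = (dest & 7) - 8, i.e. minus the number of bytes still
 * needed to align the destination; $aligndest counts it back up to
 * zero one byte at a time.
 */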
/*
 * The fetcher stall also hides the 1 cycle cross-cluster stall for $3 (L --> U).
 * This loop aligns the destination a byte at a time.
 * We know we have at least one trip through this loop.
 */
$aligndest:
        EXI(ldbu $1, 0($17))
        addl $16, 1, $16
        addl $3, 1, $3

/*
 * The -1 compensates for the increment of $16 done in the previous
 * quadpack, which gives us zero dependencies within either quadpack
 * in the loop.
 */
        EXO(stb $1, -1($16))
        addl $17, 1, $17
        subl $18, 1, $18
        bne $3, $aligndest
/*
 * If we fell through into here, we have a minimum of 33 - 7 bytes.
 * If we arrived via branch, we have a minimum of 32 bytes.
 */
$destaligned:
        and $17, 7, $1
        bic $18, 7, $4
        #EXI(ldl_u $3, 0($17))
        beq $1, $quadaligned

#ifndef MISQUAD_SCALAR
$misquad:
        and $16, 31, $1
        beq $1, $dest32Baligned
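
/*
 * Copy single bytes until the destination is 32-byte aligned, so the
 * vector stores below can be issued on full 32-byte boundaries.
 */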
$align_32B:
        EXI(ldbu $1, 0($17))
        addl $17, 1, $17
        EXO(stb $1, 0($16))
        subl $18, 1, $18
        addl $16, 1, $16
        and $16, 31, $1
        beq $18, $exitout
        bne $1, $align_32B

$dest32Baligned:
        ldi $2, 256($31)
        andnot $17, 31, $3
        EXI(vldd $f10, 0($3))
        and $17, 31, $5
        sll $5, 3, $5
        subw $2, $5, $4
        ifmovs $5, $f15
        ifmovs $4, $f14

        cmple $18, 63, $1
        bne $1, $misalign_tail_simd
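
/*
 * Misaligned-source SIMD loop (our reading of the code): $3 is the
 * source rounded down to 32 bytes, $f15 holds the misalignment in bits
 * and $f14 holds 256 minus that.  Each iteration shifts the previous
 * aligned 32-byte block right (srlow) and the next block left (sllow),
 * then combines the two disjoint halves with vlogfc (evidently an
 * OR-style merge) to reconstruct one aligned 32-byte store.  fillcs
 * looks like a software prefetch of the source a few cache lines ahead.
 */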
$misalign_body_simd:
        EXI(vldd $f11, 32($3))
        fillcs 128*5($3)

        srlow $f10, $f15, $f12
        sllow $f11, $f14, $f13
        #fillde 128*5($16)
        vlogfc $f12, $f13, $f31, $f12

        EXI(vldd $f10, 64($3))
        srlow $f11, $f15, $f22
        sllow $f10, $f14, $f23
        vlogfc $f22, $f23, $f31, $f22

        EXO(vstd $f12, 0($16))
        EXO(vstd $f22, 32($16))

        addl $16, 64, $16
        addl $3, 64, $3
        subl $18, 64, $18

        cmple $18, 63, $1
        beq $1, $misalign_body_simd
        br $misalign_tail_simd
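
/*
 * Tail of the misaligned path: peel off one more 32-byte chunk if
 * 32..63 bytes remain, then drain what is left of the merged vector
 * a quadword at a time, and finally byte by byte.
 */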
$misalign_tail_simd:
        cmple $18, 31, $1
        bne $1, $before_misalign_tail_quads

        EXI(vldd $f11, 32($3))
        srlow $f10, $f15, $f12
        sllow $f11, $f14, $f13
        vlogfc $f12, $f13, $f31, $f12
        EXO(vstd $f12, 0($16))

        subl $18, 32, $18
        addl $16, 32, $16
        addl $3, 32, $3
        vfmov $f11, $f10

$before_misalign_tail_quads:
        srlow $f10, $f15, $f12
        s8subl $18, $4, $1
        ble $1, $tail_quads

        EXI(vldd $f11, 32($3))
        sllow $f11, $f14, $f13
        vlogfc $f12, $f13, $f31, $f12

$tail_quads:
        subl $18, 8, $1
        blt $1, $less_than_8
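
/*
 * Drain the merged vector a quadword at a time: fimovd copies the low
 * 64 bits of $f12 to $1 for the store, and "srlow $f12, 64, $f12"
 * shifts the vector down to expose the next quadword.
 */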
$move_a_quad:
        fimovd $f12, $1
        srlow $f12, 64, $f12
        EXO(stl $1, 0($16))

        subl $18, 8, $18
        addl $16, 8, $16
        subl $18, 8, $1
        bge $1, $move_a_quad

$less_than_8:
        .align 4
        beq $18, $exitout
        fimovd $f12, $1

$tail_bytes:
        EXO(stb $1, 0($16))
        subl $18, 1, $18
        srl $1, 8, $1
        addl $16, 1, $16
        bgt $18, $tail_bytes
        br $exitout

#else
/*
 * In the worst case, we've just executed an ldl_u here from 0($17)
 * and we'll repeat it once if we take the branch.
 */

/* Misaligned quadword loop - not unrolled.  Leave it that way. */
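/*
 * Classic unaligned-quadword technique (assuming Alpha-style semantics
 * of these insns): extll extracts the low part of the straddling
 * quadword from $3, exthl extracts the high part from $2, and bis ORs
 * them into one aligned quadword; $2 is recycled as the next
 * iteration's low half, so each trip needs only one new load.  Note
 * that this path expects $3 to have been preloaded by the
 * EXI(ldl_u $3, 0($17)) at $destaligned, which is currently commented
 * out there.
 */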
$misquad:
        EXI(ldl_u $2, 8($17))
        subl $4, 8, $4
        extll $3, $17, $3
        exthl $2, $17, $1

        bis $3, $1, $1
        EXO(stl $1, 0($16))
        addl $17, 8, $17
        subl $18, 8, $18

        addl $16, 8, $16
        bis $2, $2, $3
        bne $4, $misquad
        beq $18, $zerolength

        /* We know we have at least one trip through the byte loop */
        /* Do the trailing byte-loop load, then hop into the store part of the loop */
        EXI(ldbu $2, 0($17))
        addl $16, 1, $16
        br $31, $dirtyentry
#endif

/*
 * A minimum of (33 - 7) bytes is needed to do a quad at a time.
 * Based upon the usage context, it's worth the effort to unroll this loop.
 *      $18 - number of bytes to be moved
 *      $4  - number of bytes to move as quadwords
 *      $16 - current destination address
 *      $17 - current source address
 */
$quadaligned:
        and $16, 31, $1
        beq $1, $quadaligned_dest32Baligned
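
/*
 * Both source and destination are quadword aligned here; move quads
 * until the destination also reaches a 32-byte boundary, then check
 * whether the source happens to share that 32-byte alignment.
 */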
$quadaligned_align_32B:
        EXI(ldl $1, 0($17))
        addl $17, 8, $17
        EXO(stl $1, 0($16))
        subl $18, 8, $18
        subl $4, 8, $4
        addl $16, 8, $16
        and $16, 31, $1
        beq $4, $onebyteloop
        bne $1, $quadaligned_align_32B

$quadaligned_dest32Baligned:
        and $17, 31, $2
        bne $2, $dest32Baligned

$quad32Baligned:
        subl $4, 64, $2
        blt $2, $onequad

/*
 * There is a significant assumption here that the source and destination
 * addresses differ by more than 32 bytes.  In this particular case, a
 * sparsity of registers further bounds this to be a minimum of 8 bytes.
 * But if this isn't met, then the output result will be incorrect.
 * Furthermore, due to a lack of available registers, we really can't
 * unroll this to be an 8x loop (which would enable us to use the wh64
 * memory-hint instruction).
 */
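/*
 * Fully 32-byte-aligned main loop: two vldd/vstd pairs move 64 bytes
 * per iteration; fillcs appears to prefetch the source several cache
 * lines ahead (the matching destination hint, fillde, is left
 * commented out).
 */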
$simd_quadalign_unroll2:
        fillcs 128 * 5($17)
        EXI(vldd $f22, 0($17))
        EXI(vldd $f23, 32($17))
        EXO(vstd $f22, 0($16))
        EXO(vstd $f23, 32($16))
        #fillde 128 * 5($16)

        subl $4, 64, $4
        subl $18, 64, $18
        addl $17, 64, $17
        addl $16, 64, $16
        subl $4, 64, $3
        bge $3, $simd_quadalign_unroll2

        bne $4, $onequad
        br $31, $noquads

$onequad:
        EXI(ldl $1, 0($17))
        subl $4, 8, $4
        addl $17, 8, $17

        EXO(stl $1, 0($16))
        subl $18, 8, $18
        addl $16, 8, $16
        bne $4, $onequad

$noquads:
        beq $18, $zerolength
/*
 * For small copies (or the tail of a larger copy), do a very simple byte loop.
 * There's no point in doing a lot of complex alignment calculations to try to
 * use quadword operations on a small amount of data.
 *      $18 - remaining number of bytes left to copy
 *      $16 - current dest addr
 *      $17 - current source addr
 */
$onebyteloop:
        EXI(ldbu $2, 0($17))
        addl $16, 1, $16

$dirtyentry:
/*
 * The -1 compensates for the increment of $16 done in the previous
 * quadpack, which gives us zero dependencies within either quadpack
 * in the loop.
 */
        EXO(stb $2, -1($16))
        addl $17, 1, $17
        subl $18, 1, $18
        bgt $18, $onebyteloop

$zerolength:
$exitout:
        bis $31, $18, $0
        ret $31, ($26), 1

$exitin:
        /*
         * A stupid byte-by-byte zeroing of the rest of the output
         * buffer.  This cures security holes by never leaving
         * random kernel data around to be copied elsewhere.
         */
        mov $18, $1
$101:
        EXO(stb $31, 0($16))
        subl $1, 1, $1
        addl $16, 1, $16
        bgt $1, $101

        bis $31, $18, $0
        ret $31, ($26), 1

        .end __copy_user
EXPORT_SYMBOL(__copy_user)