134 lines
2.3 KiB
ArmAsm
134 lines
2.3 KiB
ArmAsm
/* SPDX-License-Identifier: GPL-2.0 */
|
|
|
|
/*
|
|
* template for memset-style (fill) routines with SIMD
|
|
*
|
|
* $7: SIMD status
|
|
* 0: not in simd loop
|
|
* 1: in simd loop
|
|
* 2: in simd_u loop (never set by the code in this template)
|
|
* $16: latest dest, clobbered
|
|
* $17: 8-byte data to set
|
|
* $18: bytes left to set
|
|
*
|
|
*/
|
|
|
|
/*
 * Remaining-byte threshold checked in $prep_simd_loop: when $18 is less
 * than or equal to this value the vstd loop ($simd_loop) is used, otherwise
 * the vstd_nc loop ($simd_loop_nc) is used.
 */
#define NC_STORE_THRESHOLD 2048
|
|
|
|
/*
 * SAVE_SIMD_REGS: spill $f1 before it is overwritten with the fill pattern.
 *   - moves $sp down by 0x40,
 *   - sets $23 to $sp rounded up to a multiple of 32
 *     (add 0x1f, then clear the low five bits),
 *   - stores $f1 at 0($23) with vstd,
 *   - sets $7 to 1 ("in simd loop" per the register table in the file header).
 * $23 is clobbered and must stay intact until RESTORE_SIMD_REGS, which
 * reloads $f1 from 0($23).
 */
#define SAVE_SIMD_REGS \
|
|
ldi $sp, -0x40($sp); \
|
|
addl $sp, 0x1f, $23; \
|
|
bic $23, 0x1f, $23; \
|
|
vstd $f1, 0($23); \
|
|
ldi $7, 1
|
|
|
|
/*
 * RESTORE_SIMD_REGS: counterpart of SAVE_SIMD_REGS.
 *   - reloads $f1 from 0($23) with vldd,
 *   - moves $sp back up by 0x40,
 *   - writes ($31 | $31) into $7, i.e. clears the SIMD status back to 0
 *     ("not in simd loop" per the register table in the file header).
 * Relies on $23 still holding the address computed by SAVE_SIMD_REGS.
 */
#define RESTORE_SIMD_REGS \
|
|
vldd $f1, 0($23); \
|
|
ldi $sp, 0x40($sp); \
|
|
bis $31, $31, $7
|
|
|
|
/*
 * Template entry: $16 = destination, $17 = 8-byte fill data, $18 = byte count.
 * $out is not defined in this template; presumably the including file
 * provides it -- confirm.
 */
/* Nothing to do when the byte count is <= 0. */
ble $18, $out
|
|
/* $1 = low three bits of the destination address. */
and $16, 7, $1
|
|
/* Destination already 8-byte aligned: skip the leading byte loop. */
beq $1, $dest_aligned_8
|
|
|
|
/*
 * Leading byte loop: store one byte of $17 (stb) at a time until $16 becomes
 * 8-byte aligned or the count in $18 is used up.
 * FIXUP_LDST is defined outside this template; presumably it attaches fault
 * handling to the wrapped store -- confirm against the including file.
 */
.align 3
|
|
$byte_loop_head:
|
|
FIXUP_LDST( stb $17, 0($16) )
|
|
subl $18, 1, $18
|
|
addl $16, 1, $16
|
|
/* Count used up before alignment was reached: finished. */
ble $18, $out
|
|
/* Keep looping while the low three address bits are non-zero. */
and $16, 7, $1
|
|
bne $1, $byte_loop_head
|
|
|
|
/*
 * Destination is 8-byte aligned here. Choose the next stage:
 *   - fewer than 16 bytes left      -> $quad_loop_end (single quad / byte tail)
 *   - already 32-byte aligned       -> $dest_aligned_32
 *   - fewer than 64 bytes left      -> $simd_end (skip the SIMD stage)
 *   - otherwise fall through to $quad_loop_head to reach 32-byte alignment.
 */
$dest_aligned_8:
|
|
cmplt $18, 16, $1
|
|
bne $1, $quad_loop_end
|
|
/* $1 = low five bits of the destination address. */
and $16, 31, $1
|
|
beq $1, $dest_aligned_32
|
|
cmplt $18, 64, $1
|
|
bne $1, $simd_end
|
|
|
|
/*
 * Leading quad loop: store 8 bytes of $17 (stl) per iteration until $16 is
 * 32-byte aligned. There is no count check inside the loop: when reached by
 * fall-through from $dest_aligned_8, at least 64 bytes remain and $16 is
 * 8-byte aligned, so at most three iterations (24 bytes) are needed.
 */
.align 3
|
|
$quad_loop_head:
|
|
FIXUP_LDST( stl $17, 0($16) )
|
|
addl $16, 8, $16
|
|
subl $18, 8, $18
|
|
/* Leave the loop once the low five address bits are zero. */
and $16, 31, $1
|
|
beq $1, $dest_aligned_32
|
|
br $31, $quad_loop_head
|
|
|
|
/*
 * Destination is 32-byte aligned here. The SIMD loops below write 64 bytes
 * per iteration, so bypass them when fewer than 64 bytes remain.
 */
$dest_aligned_32:
|
|
cmplt $18, 64, $1
|
|
bne $1, $simd_end
|
|
|
|
/*
 * SIMD setup: save $f1, load the fill pattern into it, and pick the store
 * variant according to how many bytes remain.
 */
$prep_simd_loop:
|
|
/* Spills $f1 to the stack and sets $7 = 1 (see the macro definition). */
SAVE_SIMD_REGS
|
|
/* Move the 8-byte fill data from integer register $17 into $f1. */
ifmovd $17, $f1
|
|
/* NOTE(review): presumably replicates the value across all lanes of $f1 -- confirm. */
vcpyf $f1, $f1
|
|
/* $1 = NC_STORE_THRESHOLD, then $1 = ($18 <= threshold). */
ldi $1, NC_STORE_THRESHOLD($31)
|
|
cmple $18, $1, $1
|
|
/* At or below the threshold: use the vstd loop; otherwise fall into the vstd_nc loop. */
bne $1, $simd_loop
|
|
|
|
/*
 * SIMD loop using vstd_nc, taken when more than NC_STORE_THRESHOLD bytes
 * remain: two stores at offsets 0 and 32 cover 64 bytes per iteration.
 * Repeats while at least 64 bytes remain, then issues memb and jumps to
 * $simd_loop_end.
 */
.align 3
|
|
$simd_loop_nc:
|
|
FIXUP_LDST( vstd_nc $f1, 0($16) )
|
|
FIXUP_LDST( vstd_nc $f1, 32($16) )
|
|
subl $18, 64, $18
|
|
addl $16, 64, $16
|
|
/* $1 = ($18 < 64); loop again while that is false. */
cmplt $18, 64, $1
|
|
beq $1, $simd_loop_nc
|
|
memb # required for _nc store instructions
|
|
br $31, $simd_loop_end
|
|
|
|
/*
 * SIMD loop using vstd, taken when at most NC_STORE_THRESHOLD bytes remain:
 * two stores at offsets 0 and 32 cover 64 bytes per iteration. Repeats while
 * at least 64 bytes remain, then falls through to $simd_loop_end.
 */
.align 3
|
|
$simd_loop:
|
|
FIXUP_LDST( vstd $f1, 0($16) )
|
|
FIXUP_LDST( vstd $f1, 32($16) )
|
|
subl $18, 64, $18
|
|
addl $16, 64, $16
|
|
/* $1 = ($18 < 64); loop again while that is false. */
cmplt $18, 64, $1
|
|
beq $1, $simd_loop
|
|
|
|
/*
 * After either SIMD loop fewer than 64 bytes remain. If at least 32 are
 * left, write one more 32-byte vector; otherwise go straight to
 * $no_more_simd.
 */
$simd_loop_end:
|
|
cmplt $18, 32, $1
|
|
bne $1, $no_more_simd
|
|
FIXUP_LDST( vstd $f1, 0($16) )
|
|
subl $18, 32, $18
|
|
addl $16, 32, $16
|
|
|
|
/*
 * SIMD stage finished: reload the saved $f1, release the stack area and
 * clear the SIMD status in $7 (see RESTORE_SIMD_REGS).
 */
$no_more_simd:
|
|
RESTORE_SIMD_REGS
|
|
|
|
/*
 * Start of the scalar tail. Reached by fall-through after the SIMD stage, or
 * directly from $dest_aligned_8 / $dest_aligned_32 when fewer than 64 bytes
 * remained.
 *   - nothing left            -> $out
 *   - fewer than 16 bytes     -> $quad_loop_end
 *   - otherwise fall through to $quad_loop_tail.
 */
$simd_end:
|
|
ble $18, $out
|
|
cmplt $18, 16, $1
|
|
bne $1, $quad_loop_end
|
|
|
|
/*
 * Trailing quad loop: two 8-byte stores (stl at offsets 0 and 8) write
 * 16 bytes per iteration. Repeats while at least 16 bytes remain.
 */
.align 3
|
|
$quad_loop_tail:
|
|
FIXUP_LDST( stl $17, 0($16) )
|
|
FIXUP_LDST( stl $17, 8($16) )
|
|
subl $18, 16, $18
|
|
addl $16, 16, $16
|
|
/* $1 = ($18 < 16); loop again while that is false. */
cmplt $18, 16, $1
|
|
beq $1, $quad_loop_tail
|
|
|
|
/*
 * Fewer than 16 bytes remain on every path into this label within this
 * template.
 *   - nothing left           -> $out
 *   - fewer than 8 bytes     -> $byte_loop_tail
 *   - otherwise fall through to $move_one_quad.
 */
$quad_loop_end:
|
|
ble $18, $out
|
|
cmplt $18, 8, $1
|
|
bne $1, $byte_loop_tail
|
|
|
|
/*
 * At least 8 bytes remain: write a single 8-byte quad, then finish if the
 * count reached zero; otherwise fall through to the trailing byte loop.
 */
$move_one_quad:
|
|
FIXUP_LDST( stl $17, 0($16) )
|
|
subl $18, 8, $18
|
|
addl $16, 8, $16
|
|
ble $18, $out
|
|
|
|
/*
 * Trailing byte loop: entered with $18 > 0 on the paths visible in this
 * template. Stores one byte of $17 (stb) per iteration until the count is
 * used up, then branches to $out.
 */
.align 3
|
|
$byte_loop_tail:
|
|
FIXUP_LDST( stb $17, 0($16) )
|
|
subl $18, 1, $18
|
|
addl $16, 1, $16
|
|
/* Loop while bytes remain. */
bgt $18, $byte_loop_tail
|
|
br $31, $out
|