170 lines
2.8 KiB
ArmAsm
170 lines
2.8 KiB
ArmAsm
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
 * Optimized memset() for SW64 with SIMD instructions
 *
 * Copyright (C) Mao Minkai
 * Author: Mao Minkai
 *
 * Fill SIZE bytes pointed to by SRC with CHAR.
 *
 * Input:
 *	$16:	SRC, clobbered
 *	$17:	CHAR, clobbered
 *	$18:	SIZE, clobbered
 *
 * Output:
 *	$0:	SRC
 *
 * Temporaries:
 *	$1:	unaligned parts of addr (0 means aligned addr), tmp data
 *	$2:	tmp data
 *	$3:	tmp data
 *	$4:	tmp data
 *	$5:	compare result
 *	$f10:	32 bytes data (manually saved)
 *
 */
|
|
|
|
#include <asm/export.h>
|
|
|
|
/*
 * Byte count compared against the remaining size once the pointer is
 * 32-byte aligned: larger fills take the vstd_nc store loop, smaller or
 * equal ones take the plain vstd loop.
 */
#define NC_STORE_THRESHOLD	2048

	.set	noat
	.set	noreorder
	.text
	.align	4

	/* Entry points defined or aliased further down in this file. */
	.globl	___memset
	.globl	__constant_c_memset
	.globl	__memsetw
	.globl	__memset
	.globl	memset
|
|
/*
 * ___memset:           fill $18 bytes starting at $16 with the low byte
 *                      of $17.
 * __constant_c_memset: secondary entry; $17 must already hold the fill
 *                      pattern replicated across all 8 bytes (this is
 *                      where __memsetw branches in).
 *
 * Returns the original $16 in $0.
 * Clobbers $1, $4, $5, $16, $17, $18.  $f10 is spilled to a 32-byte
 * aligned stack slot before being used as the vector fill value and is
 * reloaded before the function returns.
 *
 * Phases:
 *   1. byte stores until $16 is 8-byte aligned        ($head_loop)
 *   2. 8-byte stores until $16 is 32-byte aligned     ($mod8_loop)
 *   3. 64 bytes per iteration via two 32-byte stores  ($mod32_loop*)
 *   4. 8-byte stores for what is left                 ($mod32_tail_loop)
 *   5. byte stores for the final < 8 bytes            ($tail_loop)
 *
 * NOTE(review): .frame declares a zero-size frame although phase 3
 * temporarily lowers $sp by 64; confirm this is what the unwinder
 * expects.
 */
	.ent ___memset
___memset:
	.frame $30, 0, $26, 0
	.prologue 0

/* expand 1 byte data to 8 bytes */
	and	$17, 0xff, $17
	sll	$17, 8, $4
	bis	$17, $4, $17
	sll	$17, 16, $4
	bis	$17, $4, $17
	sll	$17, 32, $4
	bis	$17, $4, $17

__constant_c_memset:
	bis	$31, $16, $0		# set return value
	beq	$18, $out		# return if size is 0
	cmplt	$18, 8, $5		# size less than 8, do 1-byte loop
	bne	$5, $tail_loop

/* loop until SRC is 8 bytes aligned */
	.align 5
$head_loop:
	and	$16, 0x7, $1
	beq	$1, $mod8_aligned
	stb	$17, 0($16)
	subl	$18, 1, $18
	beq	$18, $out
	addl	$16, 1, $16
	br	$31, $head_loop

$mod8_aligned:

/* set 8 bytes each time */
	.align 5
$mod8_loop:
	and	$16, 0x1f, $1
	beq	$1, $mod32_aligned
	subl	$18, 8, $18		/* speculative; undone at $tail */
	blt	$18, $tail
	stl	$17, 0($16)
	addl	$16, 8, $16
	br	$31, $mod8_loop

/* expand data to 32 bytes */
$mod32_aligned:
	/*
	 * Reserve 64 bytes so that a 32-byte aligned slot is guaranteed to
	 * fit inside the reservation; $4 keeps the slot address until $f10
	 * is reloaded at $mod32_tail.
	 */
	subl	$sp, 64, $sp
	addl	$sp, 31, $4
	bic	$4, 0x1f, $4
	vstd	$f10, 0($4)
	ifmovd	$17, $f10
	vcpyf	$f10, $f10

	/* remaining size <= NC_STORE_THRESHOLD: use the plain vstd loop */
	ldi	$1, NC_STORE_THRESHOLD($31)
	cmple	$18, $1, $1
	bne	$1, $mod32_loop

/* set 64 bytes each time, using _nc stores */
	.align 5
$mod32_loop_nc:
	subl	$18, 64, $18
	/*
	 * Leave through $mod32_tail_nc so that memb is executed.  The
	 * original placed memb after the unconditional back-branch below,
	 * where it could never be reached.
	 */
	blt	$18, $mod32_tail_nc
	vstd_nc	$f10, 0($16)
	vstd_nc	$f10, 32($16)
	addl	$16, 64, $16
	br	$31, $mod32_loop_nc

/* set 64 bytes each time */
	.align 5
$mod32_loop:
	subl	$18, 64, $18
	blt	$18, $mod32_tail
	vstd	$f10, 0($16)
	vstd	$f10, 32($16)
	addl	$16, 64, $16
	br	$31, $mod32_loop

$mod32_tail_nc:
	memb				# required for _nc store instructions
	/* fall through */
$mod32_tail:
	vldd	$f10, 0($4)		/* restore caller's $f10 */
	addl	$sp, 64, $sp
	addl	$18, 64, $18		/* undo the last subtract of 64 */
	.align 5
$mod32_tail_loop:
	subl	$18, 8, $18
	blt	$18, $tail
	stl	$17, 0($16)
	addl	$16, 8, $16
	br	$31, $mod32_tail_loop

$tail:
	addl	$18, 8, $18		/* undo the last subtract of 8 */

/* set one byte each time */
	.align 5
$tail_loop:
	beq	$18, $out
	stb	$17, 0($16)
	subl	$18, 1, $18
	addl	$16, 1, $16
	br	$31, $tail_loop

/* done, return */
$out:
	ret

	.end ___memset
	EXPORT_SYMBOL(___memset)
|
|
|
|
/*
 * __memsetw: build an 8-byte fill pattern in $17 from the value passed
 * in $17, then continue at __constant_c_memset with $16 and $18
 * untouched.
 *
 * Clobbers $1-$4 in addition to whatever __constant_c_memset clobbers.
 *
 * NOTE(review): presumably inslh places the low 16 bits of $17 at the
 * given byte offset, so the four inserts below cover offsets 0, 2, 4
 * and 6 of the pattern -- confirm against the SW64 ISA manual.
 */
	.align 5
	.ent __memsetw
__memsetw:
	.prologue 0

	/* one copy of the value per 16-bit lane */
	inslh	$17, 0, $1
	inslh	$17, 2, $2
	inslh	$17, 4, $3
	inslh	$17, 6, $4

	/* merge the four lanes into $17 */
	bis	$1, $2, $1
	bis	$1, $3, $1
	bis	$1, $4, $17

	br	$31, __constant_c_memset

	.end __memsetw
	EXPORT_SYMBOL(__memsetw)
|
|
|
|
/* memset and __memset are plain aliases of ___memset; export both. */
memset = ___memset
	EXPORT_SYMBOL(memset)

__memset = ___memset
	EXPORT_SYMBOL(__memset)
|