/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Optimized memset() for SW64 with SIMD instructions
 *
 * Copyright (C) Mao Minkai
 * Author: Mao Minkai
 *
 * Fill SIZE bytes pointed to by SRC with CHAR.
 *
 * Entry points:
 *	___memset, memset, __memset:
 *		$17 holds a single byte; it is replicated into all 8 bytes
 *		of $17 before filling.
 *	__constant_c_memset:
 *		$17 already holds the full 8-byte fill pattern.
 *	__memsetw:
 *		$17 holds a 16-bit pattern; it is replicated into all four
 *		halfwords of $17, then control joins __constant_c_memset.
 *		SIZE is still counted in bytes.
 *
 * Input:
 *	$16:	SRC, clobbered
 *	$17:	CHAR, clobbered
 *	$18:	SIZE, clobbered
 *
 * Output:
 *	$0:	SRC
 *
 * Temporaries:
 *	$1:	unaligned parts of addr (0 means aligned addr), tmp data
 *	$2:	tmp data (__memsetw only)
 *	$3:	tmp data (__memsetw only)
 *	$4:	tmp data; on the SIMD path it holds the 32-byte aligned
 *		stack address where $f10 is saved, and must stay intact
 *		until $f10 is restored
 *	$5:	compare result
 *	$f10:	32 bytes data (manually saved on the stack and restored)
 *
 * Stack:
 *	The SIMD path temporarily reserves 64 bytes below $sp so that a
 *	32-byte aligned, 32-byte wide save slot for $f10 always fits.
 */

/* provides EXPORT_SYMBOL() */
#include <asm/export.h>

/* sizes above this many bytes use non-cacheable vector stores */
#define NC_STORE_THRESHOLD 2048

	.set noat
	.set noreorder
	.text
	.align 4
	.globl memset
	.globl __memset
	.globl ___memset
	.globl __memsetw
	.globl __constant_c_memset
	.ent ___memset
___memset:
	.frame $30, 0, $26, 0
	.prologue 0

/* expand 1 byte data to 8 bytes */
	and	$17, 0xff, $17
	sll	$17, 8, $4
	bis	$17, $4, $17
	sll	$17, 16, $4
	bis	$17, $4, $17
	sll	$17, 32, $4
	bis	$17, $4, $17

__constant_c_memset:
	bis	$31, $16, $0		# set return value
	beq	$18, $out		# return if size is 0
	cmplt	$18, 8, $5		# size less than 8, do 1-byte loop
	bne	$5, $tail_loop

/* loop until SRC is 8 bytes aligned */
	.align 5
$head_loop:
	and	$16, 0x7, $1
	beq	$1, $mod8_aligned
	stb	$17, 0($16)
	subl	$18, 1, $18
	beq	$18, $out
	addl	$16, 1, $16
	br	$31, $head_loop

$mod8_aligned:

/* set 8 bytes each time until SRC is 32 bytes aligned */
	.align 5
$mod8_loop:
	and	$16, 0x1f, $1
	beq	$1, $mod32_aligned
	subl	$18, 8, $18
	blt	$18, $tail		# fewer than 8 bytes left
	stl	$17, 0($16)
	addl	$16, 8, $16
	br	$31, $mod8_loop

/* expand data to 32 bytes */
$mod32_aligned:
	subl	$sp, 64, $sp		# room for an aligned 32-byte slot
	addl	$sp, 31, $4
	bic	$4, 0x1f, $4		# $4 = 32-byte aligned save slot
	vstd	$f10, 0($4)		# save caller's $f10
	ifmovd	$17, $f10
	vcpyf	$f10, $f10
	ldi	$1, NC_STORE_THRESHOLD($31)
	cmple	$18, $1, $1
	bne	$1, $mod32_loop		# small enough: use cached stores

/* set 64 bytes each time, non-cacheable stores */
	.align 5
$mod32_loop_nc:
	subl	$18, 64, $18
	blt	$18, $mod32_tail_nc	# leave through the barrier below
	vstd_nc	$f10, 0($16)
	vstd_nc	$f10, 32($16)
	addl	$16, 64, $16
	br	$31, $mod32_loop_nc

/* set 64 bytes each time, cached stores */
	.align 5
$mod32_loop:
	subl	$18, 64, $18
	blt	$18, $mod32_tail
	vstd	$f10, 0($16)
	vstd	$f10, 32($16)
	addl	$16, 64, $16
	br	$31, $mod32_loop

/*
 * Exit of the non-cacheable loop only. The barrier must actually be
 * executed once the _nc stores are done, so it sits on the loop's exit
 * path and then falls through into the common tail.
 */
$mod32_tail_nc:
	memb				# required for _nc store instructions

$mod32_tail:
	vldd	$f10, 0($4)		# restore caller's $f10
	addl	$sp, 64, $sp
	addl	$18, 64, $18		# undo the last over-subtraction

/* set 8 bytes each time for what is left after the 64-byte loop */
	.align 5
$mod32_tail_loop:
	subl	$18, 8, $18
	blt	$18, $tail
	stl	$17, 0($16)
	addl	$16, 8, $16
	br	$31, $mod32_tail_loop

$tail:
	addl	$18, 8, $18		# undo the last over-subtraction

/* set one byte each time */
	.align 5
$tail_loop:
	beq	$18, $out
	stb	$17, 0($16)
	subl	$18, 1, $18
	addl	$16, 1, $16
	br	$31, $tail_loop

/* done, return */
$out:
	ret

	.end ___memset
	EXPORT_SYMBOL(___memset)

	.align 5
	.ent __memsetw
__memsetw:
	.prologue 0

/* replicate the 16-bit pattern in $17 into all four halfwords */
	inslh	$17, 0, $1
	inslh	$17, 2, $2
	inslh	$17, 4, $3
	bis	$1, $2, $1
	inslh	$17, 6, $4
	bis	$1, $3, $1
	bis	$1, $4, $17

/* pattern is now 8 bytes wide: share the fill code */
	br	$31, __constant_c_memset

	.end __memsetw
	EXPORT_SYMBOL(__memsetw)

memset = ___memset
EXPORT_SYMBOL(memset)
__memset = ___memset
EXPORT_SYMBOL(__memset)