blob: f8f3a9169ce1f9593e9ad781afe4d4c2d9661a13 [file] [log] [blame]
Duane Sandd4a80982012-10-12 14:25:19 -07001/*
2 * Copyright (c) 2009
3 * MIPS Technologies, Inc., California.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14 * contributors may be used to endorse or promote products derived from
15 * this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30/************************************************************************
31 *
32 * memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops
33 * Version: "043009"
34 *
35 ************************************************************************/
36
37
38/************************************************************************
39 * Include files
40 ************************************************************************/
41
42#include "machine/asm.h"
43
44/*
45 * This routine could be optimized for MIPS64. The current code only
46 * uses MIPS32 instructions.
47 */
48
/*
 * SWHI is the partial-word store used by memset_cmips to fill the bytes
 * between an unaligned destination and the next word boundary with a
 * single instruction.  Which of swl/swr does that depends on byte order.
 * Fail the build with a clear message if the byte order is unknown,
 * instead of leaving SWHI undefined and getting an obscure assembler
 * error at its point of use.
 */
#if defined(__MIPSEB__)
# define SWHI	swl		/* high part is left in big-endian	*/
#elif defined(__MIPSEL__)
# define SWHI	swr		/* high part is right in little-endian	*/
#else
# error "memset.S: neither __MIPSEB__ nor __MIPSEL__ is defined, cannot select SWHI"
#endif
56
/*
 * When neither XGPROF nor XPROF is defined, force SETUP_GP to expand to
 * nothing (any previous definition is discarded first).
 */
#if !defined(XGPROF) && !defined(XPROF)
# undef SETUP_GP
# define SETUP_GP
#endif
61
/*
 * memset_cmips -- memset-style fill routine (MIPS32 instructions only).
 *
 * In:    a0 = destination pointer
 *        a1 = fill value (only the low byte is used)
 *        a2 = byte count (treated as unsigned)
 * Out:   v0 = the original destination pointer
 * Uses:  AT, v1, a0-a3, t0, t4-t8.  No stack use, no calls.
 *
 * Outline:
 *   1. Counts below 4 go straight to the byte loop at .Llast4.
 *   2. The fill byte is replicated into all four bytes of a1
 *      (skipped when a1 is already zero).
 *   3. One SWHI partial store brings a0 up to a word boundary.
 *   4. 64-byte chunks are stored, using "pref 30" (the PrepareForStore
 *      hint of the MIPS32 ISA) while at least 96 bytes remain beyond
 *      the chunk, and plain stores for the last one or two chunks.
 *   5. At most one 32-byte chunk, then whole words, then trailing bytes.
 *
 * The whole body is assembled under ".set noreorder": the instruction
 * that follows every branch is its delay slot and executes whether or
 * not the branch is taken.
 *
 * NOTE(review): the 32-byte spacing of the "pref 30" hints and the
 * 96-byte safety margin suggest the code was tuned for 32-byte cache
 * lines -- confirm before using it on cores with larger lines.
 */
LEAF(memset_cmips,0)

	.set	noreorder
	.set	noat

	addu	t0,a0,a2		# t0 is the "past the end" address
	sltiu	AT,a2,4			# unsigned: is the count below 4?
	bne	AT,zero,.Llast4		# if yes, only the byte loop is needed
	move	v0,a0			# delay slot: return value is dst

	beq	a1,zero,.Lset0		# zero fill needs no byte replication
	subu	v1,zero,a0		# delay slot: v1 = -dst (masked below)

	/* Replicate the fill byte into all four bytes of a1. */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a1, a1, 8, 8		# Replicate fill byte into half-word.
	ins	a1, a1, 16, 16		# Replicate half-word into word.
#else
	and	a1,0xff
	sll	AT,a1,8
	or	a1,AT
	sll	AT,a1,16
	or	a1,AT
#endif

.Lset0:
	andi	v1,v1,0x3		# v1 = bytes up to the next word boundary
	beq	v1,zero,.Laligned	# already word-aligned?
	subu	a2,a2,v1		# delay slot: count minus head bytes (v1 may be 0)
	SWHI	a1,0(a0)		# fill the v1 head bytes with one partial store
	addu	a0,a0,v1

/* From here on a0 is word-aligned (until .Llast4). */
.Laligned:
	andi	t8,a2,0x3f		# t8 = byte count past the 64-byte chunks
	beq	a2,t8,.Lchk8w		# a2 == t8 means there is no 64-byte chunk;
					# at most one 32-byte chunk can follow
	subu	a3,a2,t8		# delay slot: a3 = bytes in 64-byte chunks
	addu	a3,a0,a3		# a3 = dst address after the 64-byte chunks

/*
 * Find out whether there are 64-byte chunks after which at least 96 bytes
 * are still left.  The value 96 is the buffer needed for the
 * "pref 30,64(a0)" prefetch, which is issued as "pref 30,0(a0)" after a0
 * has been advanced by 64.  For a2 below 160 no such "pref 30 safe"
 * 64-byte chunk exists.
 */
	sltiu	v1,a2,160
	bgtz	v1,.Lloop16w_nopref30	# too short: skip the prefetching loop
	subu	t7,a2,96		# delay slot: drop the "pref 30 unsafe" tail
	/* Below there is at least one "pref 30 safe" 64-byte chunk. */
	andi	t6,t7,0x3f		# t6 = remainder past the safe 64-byte chunks
	subu	t5,t7,t6		# t5 = bytes in safe 64-byte chunks
	addu	t4,a0,t5		# t4 = dst address after the safe chunks

/*
 * No "pref 30,0(a0)" is issued for the starting a0, because a0 may be in
 * the middle of a cache line.  Inside the loop it is safe to prefetch
 * ahead of the stores.
 */
.Lloop16w:
	addiu	a0,a0,64
	pref	30,-32(a0)		# prepare the line at old a0 + 32
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	nop
	nop				# the extra nop instructions help to balance
	nop				# cycles needed for "store" + "fill" + "evict"
	nop				# For a 64-byte store 8 fill and 8 evict
	nop				# cycles are needed, i.e. at least 32 instr.
	nop
	nop
	pref	30,0(a0)		# prepare the line at old a0 + 64
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	sw	a1,-4(a0)
	nop
	nop
	nop
	nop				# NOTE: 14 nops instead of 12 nops
	nop				# gives better results for "fast" memory
	nop
	bne	a0,t4,.Lloop16w
	nop				# delay slot (the 14th nop)

	/*
	 * NOTE(review): by the arithmetic above t4 is always 64 or 128
	 * bytes short of a3, so this branch looks never taken; it is kept
	 * as a guard.
	 */
	beq	a0,a3,.Lchk8w		# no more 64-byte chunks?
	nop				# delay slot, nothing useful to do

/* The remaining one or two 64-byte chunks, stored without prefetch. */
.Lloop16w_nopref30:
	addiu	a0,a0,64
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	bne	a0,a3,.Lloop16w_nopref30
	sw	a1,-4(a0)		# delay slot: 16th word of the chunk

.Lchk8w:				# t8 = byte count past the 64-byte chunks

	andi	t7,t8,0x1f		# t7 = remainder past a 32-byte chunk
	beq	t8,t7,.Lchk1w		# t8 == t7 means there is no 32-byte chunk
	move	a2,t7			# delay slot: a2 = bytes left after it

	sw	a1,0(a0)
	sw	a1,4(a0)
	sw	a1,8(a0)
	sw	a1,12(a0)
	sw	a1,16(a0)
	sw	a1,20(a0)
	sw	a1,24(a0)
	sw	a1,28(a0)
	addiu	a0,a0,32

.Lchk1w:
	andi	t8,a2,0x3		# t8 = remainder past the whole words
	beq	a2,t8,.Llast4		# no whole word left?
	subu	a3,a2,t8		# delay slot: a3 = bytes in whole words
	addu	a3,a0,a3		# a3 = dst address past the whole words

/* Store one word (4 bytes) per iteration. */
.LwordCopy_loop:
	addiu	a0,a0,4
	bne	a0,a3,.LwordCopy_loop
	sw	a1,-4(a0)		# delay slot: the word just stepped over

/*
 * Trailing bytes, one per iteration, until a0 reaches t0.  The addiu at
 * .Llast4l is also the delay slot of the beq at .Llast4; the extra
 * increment on the taken path is harmless because a0 is not used again.
 */
.Llast4:
	beq	a0,t0,.Llast4e		# nothing left to store?
.Llast4l:
	addiu	a0,a0,1
	bne	a0,t0,.Llast4l
	sb	a1,-1(a0)		# delay slot: the byte just stepped over

.Llast4e:
	jr	ra
	nop

	.set	at
	.set	reorder

END(memset_cmips)
222
223
224/************************************************************************
225 * Implementation : Static functions
226 ************************************************************************/
227