blob: 05eb64f64e829404e80751e21daecaabaeabfbbd [file] [log] [blame]
/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Contributed by: Intel Corporation
 */

/*
 * Generic helper macros.  Every one guarded by #ifndef may be overridden
 * by defining it before this point.
 */

/* L(name) expands to the local label .Lname.  */
#ifndef L
# define L(label)	.L##label
#endif

/* ALIGN(n) pads the location counter to a 2^n-byte boundary.  */
#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

/* Thin wrappers around the assembler's call-frame-information
   directives.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* ENTRY(name) opens a global function symbol aligned to 16 bytes and
   starts its CFI region.  */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name, @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* END(name) closes the CFI region opened by ENTRY and records the
   symbol size.  */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif

/* CFI bookkeeping for a 4-byte push of REG: the CFA moves 4 bytes further
   from ESP and REG is saved at offset 0 from the new ESP.  */
#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* CFI bookkeeping for the matching 4-byte pop of REG.  */
#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push or pop REG and emit the matching CFI annotations.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/*
 * Offsets of the stack arguments relative to ESP.  PARMS (defined below,
 * depending on whether EBX is pushed on entry) is the offset of the first
 * argument.
 *
 *   USE_AS_BZERO32:  (DEST, LEN)        -- no fill value, the buffer is zeroed
 *   otherwise:       (DEST, DWDS, LEN)  -- DWDS is the 32-bit fill value
 */
#ifdef USE_AS_BZERO32
# define DEST		PARMS
# define LEN		DEST+4
#else
# define DEST		PARMS
# define DWDS		DEST+4
# define LEN		DWDS+4
#endif

/* SETRTNVAL loads the return value before each RETURN: the destination
   pointer when built as wmemset32, nothing otherwise.  */
#ifdef USE_AS_WMEMSET32
# define SETRTNVAL	movl DEST(%esp), %eax
#else
# define SETRTNVAL
#endif

/*
 * Entry/exit sequences and jump-table dispatch, in two flavours.
 *
 * PIC builds (SHARED or __PIC__ defined) use EBX as scratch for computing
 * addresses, so EBX is pushed on entry and popped before every return.
 * That push moves the first argument to ESP+8.  Jump tables hold offsets
 * relative to the table itself.
 *
 * Non-PIC builds need no entry/exit code; the first argument is at ESP+4
 * and jump tables hold absolute addresses.
 */
#if (defined SHARED || defined __PIC__)
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
/* Code placed after a RETURN is reached with EBX still pushed, so the CFI
   state undone by POP is re-established after the ret.  */
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define PARMS		8		/* Preserve EBX.  */
# define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  ECX is the entry index.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    /* We first load PC into EBX.  */				\
    call	__i686.get_pc_thunk.bx;				\
    /* Get the address of the jump table.  */			\
    add		$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    add		(%ebx,%ecx,4), %ebx;				\
    /* EBX now holds the absolute address of the target.  Go.  */ \
    jmp		*%ebx

/* PC thunk: returns the caller's return address (the address of the
   instruction following the call) in EBX.  */
	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
	.globl	__i686.get_pc_thunk.bx
	.hidden	__i686.get_pc_thunk.bx
	ALIGN (4)
	.type	__i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
#else
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define PARMS		4
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  ECX is the entry index.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    jmp		*TABLE(,%ecx,4)
#endif

/*
 * sse2_memset32_atom -- fill a buffer with a 32-bit pattern (IA-32, SSE2).
 *
 * Stack arguments (offsets are relative to ESP after ENTRANCE):
 *   DEST  destination pointer
 *   DWDS  32-bit fill value (absent when USE_AS_BZERO32 is defined; the
 *         buffer is then filled with zero)
 *   LEN   number of dwords to store; when USE_AS_ANDROID is defined LEN
 *         is a byte count and is divided by 4 on entry
 *
 * Return value: DEST in EAX when USE_AS_WMEMSET32 is defined; otherwise no
 * return value is set up.
 *
 * Register roles:
 *   EAX   fill pattern (rotated when DEST is not 4-byte aligned)
 *   ECX   remaining length: dwords until L(aligned4bytes), bytes afterwards;
 *         converted back to a dword index just before each table dispatch
 *   EDX   write cursor; the tail code is entered with EDX pointing one
 *         past the last byte to store and writes at negative offsets
 *   XMM0  fill pattern replicated into all four dword lanes
 *   EBX   scratch (PIC base, jump target, shared-cache-size counter);
 *         saved and restored through PUSH/POP
 */

/* The shared-cache-size check below pushes EBX before using it, except in
   a PIC build that reads the size through the GOT (EBX is then already
   saved by ENTRANCE).  Both places that undo that push are keyed on this
   one condition so the stack stays balanced for every combination of
   SHARED_CACHE_SIZE and DATA_CACHE_SIZE.  */
#if defined SHARED_CACHE_SIZE || !(defined SHARED || defined __PIC__)
# define CACHE_CHECK_PUSHES_EBX	1
#else
# define CACHE_CHECK_PUSHES_EBX	0
#endif

	.section .text.sse2,"ax",@progbits
	ALIGN (4)
ENTRY (sse2_memset32_atom)
	ENTRANCE

	movl	LEN(%esp), %ecx
#ifdef USE_AS_ANDROID
	/* LEN is in bytes here; convert it to dwords.  */
	shr	$2, %ecx
#endif
#ifdef USE_AS_BZERO32
	xor	%eax, %eax
#else
	mov	DWDS(%esp), %eax
#endif
	movl	DEST(%esp), %edx
	cmp	$16, %ecx
	jae	L(16dbwordsormore)

/* Fewer than 16 dwords: point EDX past the end and jump into the chain of
   dword stores at the entry that writes exactly ECX dwords.  */
L(write_less16dbwords):
	lea	(%edx, %ecx, 4), %edx
	BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords))

	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_less16dbwords):
	.int	JMPTBL (L(write_0dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_1dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_2dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_3dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_4dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_5dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_6dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_7dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_8dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_9dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_10dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_11dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_12dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_13dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_14dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_15dbwords), L(table_less16dbwords))
	.popsection

	/* Fall-through chain: entering at write_Ndbwords stores the last N
	   dwords before EDX.  */
	ALIGN (4)
L(write_15dbwords):
	movl	%eax, -60(%edx)
L(write_14dbwords):
	movl	%eax, -56(%edx)
L(write_13dbwords):
	movl	%eax, -52(%edx)
L(write_12dbwords):
	movl	%eax, -48(%edx)
L(write_11dbwords):
	movl	%eax, -44(%edx)
L(write_10dbwords):
	movl	%eax, -40(%edx)
L(write_9dbwords):
	movl	%eax, -36(%edx)
L(write_8dbwords):
	movl	%eax, -32(%edx)
L(write_7dbwords):
	movl	%eax, -28(%edx)
L(write_6dbwords):
	movl	%eax, -24(%edx)
L(write_5dbwords):
	movl	%eax, -20(%edx)
L(write_4dbwords):
	movl	%eax, -16(%edx)
L(write_3dbwords):
	movl	%eax, -12(%edx)
L(write_2dbwords):
	movl	%eax, -8(%edx)
L(write_1dbwords):
	movl	%eax, -4(%edx)
L(write_0dbwords):
	SETRTNVAL
	RETURN

	ALIGN (4)
L(16dbwordsormore):
	test	$3, %edx
	jz	L(aligned4bytes)
	/* DEST is not 4-byte aligned.  Store the first and the last dword
	   unaligned, drop one dword from the count, then step EDX forward
	   one byte at a time to the next 4-byte boundary.  Each byte step
	   rotates EAX right by 8 bits (rol $24 is the same rotation) so the
	   pattern bytes stay in phase with the addresses they land on.  */
	mov	%eax, (%edx)
	mov	%eax, -4(%edx, %ecx, 4)
	sub	$1, %ecx
	rol	$24, %eax
	add	$1, %edx
	test	$3, %edx
	jz	L(aligned4bytes)
	ror	$8, %eax
	add	$1, %edx
	test	$3, %edx
	jz	L(aligned4bytes)
	ror	$8, %eax
	add	$1, %edx
L(aligned4bytes):
	/* From here on ECX counts bytes.  */
	shl	$2, %ecx

#ifdef USE_AS_BZERO32
	pxor	%xmm0, %xmm0
#else
	/* Replicate the pattern into all four dword lanes of XMM0.  */
	movd	%eax, %xmm0
	pshufd	$0, %xmm0, %xmm0
#endif
	testl	$0xf, %edx
	jz	L(aligned_16)
/* ECX > 32 and EDX is not 16 byte aligned.  */
L(not_aligned_16):
	/* Cover the head with one unaligned 16-byte store, round EDX up to
	   the next 16-byte boundary, and shrink ECX by the number of bytes
	   skipped.  EAX is borrowed for the arithmetic and then reloaded
	   from XMM0.  */
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	and	$-16, %edx
	add	$16, %edx
	sub	%edx, %eax
	add	%eax, %ecx
	movd	%xmm0, %eax
	ALIGN (4)
L(aligned_16):
	cmp	$128, %ecx
	jae	L(128bytesormore)

L(aligned_16_less128bytes):
	/* Point EDX past the end; the table index is the remaining dword
	   count.  */
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	ALIGN (4)
L(128bytesormore):
	/* Load the shared cache size into EBX and compare the length with
	   it.  */
#ifdef SHARED_CACHE_SIZE
	PUSH (%ebx)
	mov	$SHARED_CACHE_SIZE, %ebx
#else
# if (defined SHARED || defined __PIC__)
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
# else
	PUSH (%ebx)
	mov	__x86_shared_cache_size, %ebx
# endif
#endif
	cmp	%ebx, %ecx
	jae	L(128bytesormore_nt_start)

	/* Length is below the shared cache size.  Undo the EBX push, if one
	   was made, before comparing against the data cache size.
	   RESTORE_EBX_STATE re-establishes the "EBX pushed" CFI state ahead
	   of L(128bytesormore_nt_start), which is entered with the push
	   still in place.  */
#if CACHE_CHECK_PUSHES_EBX
	POP (%ebx)
# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
#else
# define RESTORE_EBX_STATE
#endif
#ifdef DATA_CACHE_SIZE
	cmp	$DATA_CACHE_SIZE, %ecx
#else
# if (defined SHARED || defined __PIC__)
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size, %ecx
# endif
#endif

	jae	L(128bytes_L2_normal)
	/* Bias ECX by -128 so that the borrow out of each "sub $128" in the
	   loop signals that fewer than 128 bytes remain after the block
	   being stored.  The stores and the lea leave the flags alone, so
	   the branch after each block tests that sub.  */
	subl	$128, %ecx
L(128bytesormore_normal):
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jb	L(128bytesless_normal)


	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jae	L(128bytesormore_normal)

L(128bytesless_normal):
	/* Remove the bias: ECX is again the number of bytes left.  */
	lea	128(%ecx), %ecx
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	/* Length is at least the data cache size: same 128-byte blocks,
	   with prefetches issued ahead of the write cursor.  */
	ALIGN (4)
L(128bytes_L2_normal):
	prefetcht0	0x380(%edx)
	prefetcht0	0x3c0(%edx)
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movaps	%xmm0, 0x10(%edx)
	movaps	%xmm0, 0x20(%edx)
	movaps	%xmm0, 0x30(%edx)
	movaps	%xmm0, 0x40(%edx)
	movaps	%xmm0, 0x50(%edx)
	movaps	%xmm0, 0x60(%edx)
	movaps	%xmm0, 0x70(%edx)
	add	$128, %edx
	cmp	$128, %ecx
	jae	L(128bytes_L2_normal)

L(128bytesless_L2_normal):
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	RESTORE_EBX_STATE
L(128bytesormore_nt_start):
	/* Length is at least the shared cache size (held in EBX).  Split the
	   work: the first loop stores the whole 128-byte blocks that fit in
	   EBX bytes with ordinary stores; ECX becomes everything left over
	   (length - EBX, plus EBX's remainder modulo 128).  EAX is borrowed
	   for the arithmetic and then reloaded from XMM0.  */
	sub	%ebx, %ecx
	mov	%ebx, %eax
	and	$0x7f, %eax
	add	%eax, %ecx
	movd	%xmm0, %eax
	ALIGN (4)
L(128bytesormore_shared_cache_loop):
	prefetcht0	0x3c0(%edx)
	prefetcht0	0x380(%edx)
	sub	$0x80, %ebx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ebx
	jae	L(128bytesormore_shared_cache_loop)
	cmp	$0x80, %ecx
	jb	L(shared_cache_loop_end)

	/* Store the remaining whole 128-byte blocks with non-temporal
	   stores, then fence them.  */
	ALIGN (4)
L(128bytesormore_nt):
	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm0, 0x10(%edx)
	movntdq	%xmm0, 0x20(%edx)
	movntdq	%xmm0, 0x30(%edx)
	movntdq	%xmm0, 0x40(%edx)
	movntdq	%xmm0, 0x50(%edx)
	movntdq	%xmm0, 0x60(%edx)
	movntdq	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ecx
	jae	L(128bytesormore_nt)
	sfence
L(shared_cache_loop_end):
	/* Undo the EBX push made for the shared-cache-size check, if any.  */
#if CACHE_CHECK_PUSHES_EBX
	POP (%ebx)
#endif
	add	%ecx, %edx
	shr	$2, %ecx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	/* Tail dispatch table, indexed by the remaining dword count
	   (0..31, i.e. 0..124 bytes).  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_16_128bytes):
	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
	.popsection

	/* Tail chains.  Each chain covers the byte counts that share one
	   remainder modulo 16: a run of 16-byte stores followed by a 0-, 4-,
	   8- or 12-byte finish.  */

	/* Remainder 0.  */
	ALIGN (4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%edx)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%edx)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%edx)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%edx)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%edx)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%edx)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%edx)
L(aligned_16_0bytes):
	SETRTNVAL
	RETURN

	/* Remainder 4: finish with one dword from EAX.  */
	ALIGN (4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%edx)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%edx)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%edx)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%edx)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%edx)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%edx)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%edx)
L(aligned_16_4bytes):
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	/* Remainder 8: finish with the low quadword of XMM0.  */
	ALIGN (4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%edx)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%edx)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%edx)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%edx)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%edx)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%edx)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%edx)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%edx)
	SETRTNVAL
	RETURN

	/* Remainder 12: finish with a quadword and then a dword.  */
	ALIGN (4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%edx)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%edx)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%edx)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%edx)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%edx)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%edx)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%edx)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%edx)
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

END (sse2_memset32_atom)
513END (sse2_memset32_atom)