/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/* Assumes NEON instructions and a cache line size of 32 bytes. */

#include <machine/cpu-features.h>
#include <machine/asm.h>

/*
 * This code assumes it is running on a processor that supports all ARMv7
 * instructions, that supports NEON instructions, and that has a 32 byte
 * cache line.
 */

        .text
        .fpu    neon

/*
 * Cache line size in bytes. memcpy uses it only to express its pld
 * preload distances as a whole number of cache lines.
 */
#define CACHE_LINE_SIZE     32

/*
 * void *memcpy(void *dst, const void *src, size_t n)
 *
 * In:      r0 = dst, r1 = src, r2 = n (number of bytes to copy)
 * Out:     r0 = dst as passed in (pushed on entry, popped on exit)
 * Scratch: r3, ip, lr (lr is pushed and popped together with r0),
 *          d0-d7, condition flags
 * Stack:   8 bytes
 *
 * Outline:
 *   - n < 16: go straight to the 0-15 byte tail.
 *   - Otherwise copy 0-15 bytes so that dst reaches a 16-byte boundary;
 *     the wide stores that follow carry a :128 alignment hint.
 *     src gets no alignment treatment and is always loaded unhinted.
 *   - Copy 64 bytes per loop iteration, preloading ahead of src.
 *   - Finish with one optional 32-byte copy, one optional 16-byte copy
 *     and a final 0-15 byte tail.
 *
 * Bit-test idiom used for the pieces smaller than 16 bytes:
 *   movs ip, rX, lsl #31   ->   N = bit 0 of rX, C = bit 1 of rX
 *   movs ip, rX, lsl #29   ->   N = bit 2 of rX, C = bit 3 of rX
 * so mi/pl test the lower bit of the pair and cs/cc the higher one.
 * The loads and stores executed between a movs and the branches that
 * consume it do not modify the flags.
 */
ENTRY(memcpy)
        /* unwind annotation describing the {r0, lr} push below */
        .save       {r0, lr}
        /* start preloading as early as possible */
        pld         [r1, #(CACHE_LINE_SIZE*4)]
        stmfd       sp!, {r0, lr}

        /* do we have at least 16 bytes to copy (needed for alignment below) */
        cmp         r2, #16
        blo         .Lmemcpy_tail

        /*
         * Align the destination to a 16-byte boundary (not to a full
         * cache line). r3 = (-dst) & 15 is the number of bytes needed to
         * get there; zero means dst is already aligned.
         */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         .Lmemcpy_dst_aligned

        /*
         * Copy up to 15 bytes (count in r3) in the order 1, 2, 4, 8 so
         * that each step leaves dst aligned for the next, wider one.
         * r2 >= 16 > r3 here, so the subtraction cannot wrap.
         */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31
        ldrmib      lr, [r1], #1
        strmib      lr, [r0], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1
        movs        ip, r3, lsl #29
        /*
         * bpl, not bge: bge tests N == V, but movs does not write V, so
         * bge would depend on the V left behind by the cmp above (set
         * for counts in 0x80000000-0x8000000F). bpl looks at N only.
         */
        bpl         .Lmemcpy_align_8
        /* copies 4 bytes, destination 32-bit aligned */
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
.Lmemcpy_align_8:
        bcc         .Lmemcpy_dst_aligned
        /* copies 8 bytes, destination 64-bit aligned */
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!

.Lmemcpy_dst_aligned:
        /*
         * Make sure we have at least 64 bytes to copy. From here until
         * .Lmemcpy_copy32, r2 holds (bytes remaining - 64).
         */
        subs        r2, r2, #64
        blo         .Lmemcpy_copy32

.Lmemcpy_loop64:
        /* The main loop copies 64 bytes at a time */
        vld1.8      {d0 - d3}, [r1]!
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE*2)]
        subs        r2, r2, #64
        vst1.8      {d0 - d3}, [r0, :128]!
        vst1.8      {d4 - d7}, [r0, :128]!
        /* flags still come from the subs above */
        bhs         .Lmemcpy_loop64

.Lmemcpy_copy32:
        /*
         * Fix up the remaining count and make sure we have >= 32 bytes
         * left: r2 becomes (bytes remaining - 32), and the add carries
         * out exactly when at least 32 bytes remain.
         */
        adds        r2, r2, #32
        blo         .Lmemcpy_copy16

        /* Copy 32 bytes. These cache lines were already preloaded */
        vld1.8      {d0 - d3}, [r1]!
        sub         r2, r2, #32
        vst1.8      {d0 - d3}, [r0, :128]!

.Lmemcpy_copy16:
        /* less than 32 left; remove the bias so r2 is the true count */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         .Lmemcpy_tail
        /* copies 16 bytes, destination 128-bit aligned */
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0, :128]!

.Lmemcpy_tail:
        /*
         * Copy up to 15 bytes (low four bits of r2) in the order
         * 8, 4, 1, 2. This path is also taken directly from the entry
         * check with an unaligned dst, so the stores carry no alignment
         * hints.
         */
        movs        ip, r2, lsl #29
        bcc         .Lmemcpy_tail_4
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
.Lmemcpy_tail_4:
        /* bpl, not bge: test N alone instead of relying on a stale V */
        bpl         .Lmemcpy_tail_2_1
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
.Lmemcpy_tail_2_1:
        movs        ip, r2, lsl #31
        ldrmib      r3, [r1], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strmib      r3, [r0], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1

        /* restore the original dst as the return value, and lr */
        ldmfd       sp!, {r0, lr}
        bx          lr
END(memcpy)