Blame - libc/arch-arm/bionic/memcpy.a15.S - android_bionic

blob: d1bfb7c85ed7a252943511635faf0631434c0463 [file] [log] [blame]

Greta Yorsh	5b349fc	2011-10-04 16:02:25 +0000	[diff] [blame^]	1	/*
				2	* Copyright (c) 2013 ARM Ltd
				3	* All rights reserved.
				4	*
				5	* Redistribution and use in source and binary forms, with or without
				6	* modification, are permitted provided that the following conditions
				7	* are met:
				8	* 1. Redistributions of source code must retain the above copyright
				9	* notice, this list of conditions and the following disclaimer.
				10	* 2. Redistributions in binary form must reproduce the above copyright
				11	* notice, this list of conditions and the following disclaimer in the
				12	* documentation and/or other materials provided with the distribution.
				13	* 3. The name of the company may not be used to endorse or promote
				14	* products derived from this software without specific prior written
				15	* permission.
				16	*
				17	* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
				18	* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
				19	* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
				20	* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				21	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
				22	* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
				23	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
				24	* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
				25	* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
				26	* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				27	*/
				28
				29	#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
				30	(!(defined (__ARM_ARCH_7A__))))
				31
				32	/* Do nothing here. See memcpy-stub.c in the same directory. */
				33
				34	#else
				35	/* Prototype: void *memcpy (void *dst, const void *src, size_t count). */
				36
				37	/* Use the version of memcpy implemented using LDRD and STRD.
				38	This version is tuned for Cortex-A15.
				39	This might not be the best for other ARMv7-A CPUs,
				40	but there is no predefine to distinguish between
				41	different CPUs in the same architecture,
				42	and this version is better than the plain memcpy provided in newlib.
				43
				44	Therefore, we use this version for all ARMv7-A CPUS. */
				45
				46	/* To make the same code compile for both ARM and Thumb instruction
				47	sets, switch to unified syntax at the beginning of this function.
				48	However, by using the same code, we may be missing optimization
				49	opportunities. For instance, in LDRD/STRD instructions, the first
				50	destination register must be even and the second consecutive in
				51	ARM state, but not in Thumb state. */
				52
				53	.syntax unified
				54
				55	#if defined (__thumb__)
				56	.thumb
				57	.thumb_func
				58	#endif
				59
				60	.global memcpy
				61	.type memcpy, %function
				62	memcpy:
				63
				64	/* On entry: r0 = dst, r1 = src, r2 = n (byte count). Assumes that n >= 0, and dst, src are valid pointers.
				65	If there are at least 8 bytes to copy, use LDRD/STRD.
				66	If src and dst are misaligned with different offsets,
				67	first copy byte by byte until dst is aligned,
				68	and then copy using LDRD/STRD and shift if needed.
				69	When fewer than 8 bytes are left, copy a word and then byte by byte. */
				70
				71	/* Save registers (r0 holds the return value):
				72	optimized push {r0, r4, r5, lr}.
				73	To try and improve performance, stack layout changed,
				74	i.e., not keeping the stack looking like users expect
				75	(highest numbered register at highest address). Resulting frame, from sp upwards: r4, r5, r0, lr. */
				76	push {r0, lr}
				77	strd r4, r5, [sp, #-8]! /* sp -= 8, then store r4 and r5 at the new sp. */
				78
				79	/* TODO: Add debug frame directives.
				80	We don't need exception unwind directives, because the code below
				81	does not throw any exceptions and does not call any other functions.
				82	Generally, newlib functions like this lack debug information for
				83	assembler source. */
				84
				85	/* Get copying of tiny blocks out of the way first. */
				86	/* Are there at least 4 bytes to copy? After the subtraction r2 holds (bytes remaining - 4). */
				87	subs r2, r2, #4
				88	blt copy_less_than_4 /* If n < 4. */
				89
				90	/* Check word alignment. */
				91	ands ip, r0, #3 /* ip = last 2 bits of dst. */
				92	bne dst_not_word_aligned /* If dst is not word-aligned. */
				93
				94	/* Get here if dst is word-aligned. */
				95	ands ip, r1, #3 /* ip = last 2 bits of src. */
				96	bne src_not_word_aligned /* If src is not word-aligned. */
				97	word_aligned:
				98	/* Get here if source and dst both are word-aligned.
				99	The number of bytes remaining to copy is r2+4. */
				100
				101	/* Are there at least 64 bytes to copy? After the subtraction r2 holds (bytes remaining - 64). */
				102	subs r2, r2, #60
				103	blt copy_less_than_64 /* If r2 + 4 < 64. */
				104
				105	/* First, align the destination buffer to 8-bytes,
				106	to make sure double loads and stores don't cross cache line boundary,
				107	as they are then more expensive even if the data is in the cache
				108	(require two load/store issue cycles instead of one).
				109	If only one of the buffers is not 8-bytes aligned,
				110	then it's more important to align dst than src,
				111	because there is more penalty for stores
				112	than loads that cross cacheline boundary.
				113	This check and realignment are only worth doing
				114	if there is a lot to copy. */
				115
				116	/* Get here if dst is word aligned,
				117	i.e., the 2 least significant bits are 0.
				118	If dst is not 2w aligned (i.e., bit 2, the 3rd least significant bit, is set in dst),
				119	then copy 1 word (4 bytes). */
				120	ands r3, r0, #4
				121	beq 11f /* If dst already two-word aligned. */
				122	ldr r3, [r1], #4
				123	str r3, [r0], #4
				124	subs r2, r2, #4
				125	blt copy_less_than_64 /* Fewer than 64 bytes remain after the alignment word. */
				126
				127	11:
				128	/* TODO: Align to cacheline (useful for PLD optimization). */
				129
				130	/* Every loop iteration copies 64 bytes: eight LDRD/STRD pairs at fixed offsets, then both pointers advance by 64. */
				131	1:
				132	.irp offset, #0, #8, #16, #24, #32, #40, #48, #56
				133	ldrd r4, r5, [r1, \offset]
				134	strd r4, r5, [r0, \offset]
				135	.endr
				136
				137	add r0, r0, #64
				138	add r1, r1, #64
				139	subs r2, r2, #64
				140	bge 1b /* If there is more to copy, i.e., at least another 64 bytes. */
				141
				142	copy_less_than_64:
				143
				144	/* Get here if fewer than 64 bytes to copy, -64 <= r2 < 0.
				145	Re-bias the count so that r2 = (bytes remaining - 8); it is non-negative if at least 8 bytes remain. */
				146	adds r2, r2, #56
				147	blt copy_less_than_8
				148
				149	/* Copy 8 bytes at a time, post-incrementing both pointers. */
				150	2:
				151	ldrd r4, r5, [r1], #8
				152	strd r4, r5, [r0], #8
				153	subs r2, r2, #8
				154	bge 2b /* If there is more to copy, i.e., at least another 8 bytes. */
				155
				156	copy_less_than_8:
				157
				158	/* Get here if fewer than 8 bytes to copy, -8 <= r2 < 0, where r2 = (bytes remaining - 8).
				159	Check if there is more to copy. */
				160	cmn r2, #8
				161	beq return /* If r2 + 8 == 0. */
				162
				163	/* Re-bias the count to r2 = (bytes remaining - 4); it is non-negative if at least 4 bytes remain. */
				164	adds r2, r2, #4
				165	blt copy_less_than_4
				166
				167	/* Copy 4 bytes. r2 (0..3 here) is not adjusted afterwards: copy_less_than_4 adds 4 and then tests only the two low bits, which that addition leaves unchanged. */
				168	ldr r3, [r1], #4
				169	str r3, [r0], #4
				170
				171	copy_less_than_4:
				172	/* Get here if fewer than 4 bytes to copy; when entered by a branch, -4 <= r2 < 0. */
				173
				174	/* Restore the count, check if there is more to copy. */
				175	adds r2, r2, #4
				176	beq return /* If r2 == 0. */
				177
				178	/* Get here with the two low bits of r2 holding the number of bytes left (0 to 3); higher bits of r2 may be set and are ignored. */
				179	/* Shift r2 left by 31: bit 0 moves to bit 31 (result non-zero, i.e., ne) and bit 1 is the last bit shifted out (carry, i.e., cs). */
				180	lsls r2, r2, #31
				181
				182	/* Copy byte by byte.
				183	Condition ne means the last bit of the count is 1, i.e., the count is 1 or 3: copy one byte.
				184	Condition cs means the second to last bit of the count is set,
				185	i.e., the count is 2 or 3: copy two more bytes. */
				186	itt ne
				187	ldrbne r3, [r1], #1
				188	strbne r3, [r0], #1
				189
				190	itttt cs
				191	ldrbcs r4, [r1], #1
				192	ldrbcs r5, [r1]
				193	strbcs r4, [r0], #1
				194	strbcs r5, [r0]
				195
				196	return:
				197	/* Restore registers: optimized pop {r0, r4, r5, pc}. Mirrors the entry sequence: r4/r5 are reloaded first, then r0 (the dst value saved on entry) and the saved lr, which is popped straight into pc. */
				198	ldrd r4, r5, [sp], #8
				199	pop {r0, pc} /* This is the only return point of memcpy. */
				200
				201	#ifndef __ARM_FEATURE_UNALIGNED
				202
				203	/* The following assembly macro implements misaligned copy in software.
				204	Assumes that dst is word aligned, src is at offset "pull" bits from
				205	word, push = 32 - pull, and the number of bytes that remain to copy
				206	is r2 + 4, r2 >= 0. */
				207
				208	/* In the code below, r2 is the number of bytes that remain to be
				209	written. The number of bytes read is always larger, because we have
				210	partial words in the shift queue. Parameters: pull and push are shift amounts in bits; shiftleft and shiftright are the shift mnemonics (lsl or lsr) chosen by the caller according to endianness. Registers: r5 is the shift queue (last source word loaded), r4/r6/r7 are temporaries, r3 is used for the final single word. Every path leaves through the labels return or copy_less_than_4. */
				211
				212	.macro miscopy pull push shiftleft shiftright
				213
				214	/* Align src to the previous word boundary. */
				215	bic r1, r1, #3
				216
				217	/* Initialize the shift queue. */
				218	ldr r5, [r1], #4 /* Load a word from source. */
				219
				220	subs r2, r2, #4
				221	blt 6f /* Go to misaligned copy of less than 8 bytes. */
				222
				223	/* Get here if there are at least 8 bytes to copy.
				224	The number of bytes to copy is r2+8, r2 >= 0. */
				225
				226	/* Save registers: push { r6, r7 }.
				227	We need additional registers for LDRD and STRD, because in ARM state
				228	the first destination register must be even and the second
				229	consecutive. */
				230	strd r6, r7, [sp, #-8]!
				231
				232	subs r2, r2, #56
				233	blt 4f /* Go to misaligned copy of less than 64 bytes. */
				234
				235	3:
				236	/* Get here if there are at least 64 bytes to copy.
				237	The number of bytes to copy is r2+64, r2 >= 0. */
				238
				239	/* Copy 64 bytes in every iteration.
				240	Use a partial word from the shift queue: each output word combines the tail of the previous source word with the head of the next one. */
				241	.irp offset, #0, #8, #16, #24, #32, #40, #48, #56
				242	mov r6, r5, \shiftleft #\pull
				243	ldrd r4, r5, [r1, \offset]
				244	orr r6, r6, r4, \shiftright #\push
				245	mov r7, r4, \shiftleft #\pull
				246	orr r7, r7, r5, \shiftright #\push
				247	strd r6, r7, [r0, \offset]
				248	.endr
				249
				250	add r1, r1, #64
				251	add r0, r0, #64
				252	subs r2, r2, #64
				253	bge 3b
				254
				255	4:
				256	/* Get here if there are fewer than 64 bytes to copy (-64 <= r2 < 0)
				257	and they are misaligned. */
				258
				259	/* Re-bias the count so that r2 = (bytes remaining - 8). */
				260	adds r2, r2, #56
				261
				262	/* If less than 8 bytes to copy,
				263	restore registers saved for this loop: optimized poplt { r6, r7 }. */
				264	itt lt
				265	ldrdlt r6, r7, [sp], #8
				266	blt 6f /* Go to misaligned copy of less than 8 bytes. */
				267
				268	5:
				269	/* Copy 8 bytes at a time.
				270	Use a partial word from the shift queue. */
				271	mov r6, r5, \shiftleft #\pull
				272	ldrd r4, r5, [r1], #8
				273	orr r6, r6, r4, \shiftright #\push
				274	mov r7, r4, \shiftleft #\pull
				275	orr r7, r7, r5, \shiftright #\push
				276	strd r6, r7, [r0], #8
				277
				278	subs r2, r2, #8
				279	bge 5b /* If there is more to copy. */
				280
				281	/* Restore registers saved for this loop: optimized pop { r6, r7 }. */
				282	ldrd r6, r7, [sp], #8
				283
				284	6:
				285	/* Get here if there are fewer than 8 bytes to copy (-8 <= r2 < 0)
				286	and they are misaligned. */
				287
				288	/* Check if there is more to copy. */
				289	cmn r2, #8
				290	beq return
				291
				292	/* Check if there are fewer than 4 bytes to copy. */
				293	cmn r2, #4
				294
				295	itt lt
				296	/* Restore src offset from word-align: r1 is one word past the aligned word held in r5, so the true src is r1 - 4 + pull/8 = r1 - push/8. */
				297	sublt r1, r1, #(\push / 8)
				298	blt copy_less_than_4
				299
				300	/* Use a partial word from the shift queue. */
				301	mov r3, r5, \shiftleft #\pull
				302	/* Load a word from src, but without writeback
				303	(this word is not fully written to dst). */
				304	ldr r5, [r1]
				305
				306	/* Restore src offset from word-align: 4 more bytes are consumed, so the true src becomes (r1 - 4 + pull/8) + 4 = r1 + pull/8. */
				307	add r1, r1, #(\pull / 8)
				308
				309	/* Shift bytes to create one dst word and store it. */
				310	orr r3, r3, r5, \shiftright #\push
				311	str r3, [r0], #4
				312
				313	/* Use single byte copying of the remaining bytes. */
				314	b copy_less_than_4
				315
				316	.endm
				317
				318	#endif /* not __ARM_FEATURE_UNALIGNED */
				319
				320	dst_not_word_aligned:
				321
				322	/* Get here when dst is not aligned and ip has the last 2 bits of dst,
				323	i.e., ip is the offset of dst from word.
				324	The number of bytes that remains to copy is r2 + 4,
				325	i.e., there are at least 4 bytes to copy.
				326	Write a partial word (1 to 3 bytes), such that dst becomes
				327	word-aligned. */
				328
				329	/* If dst is at ip bytes offset from a word (with 0 < ip < 4),
				330	then there are (4 - ip) bytes to fill up to align dst to the next
				331	word. */
				332	rsb ip, ip, #4 /* ip = #4 - ip. */
				333	cmp ip, #2
				334
				335	/* Copy byte by byte with conditionals: gt (ip == 3) copies a first byte, ge (ip >= 2) copies a second, and the last byte is copied unconditionally. lr is usable as a temporary because it was saved on entry. */
				336	itt gt
				337	ldrbgt r3, [r1], #1
				338	strbgt r3, [r0], #1
				339
				340	itt ge
				341	ldrbge r4, [r1], #1
				342	strbge r4, [r0], #1
				343
				344	ldrb lr, [r1], #1
				345	strb lr, [r0], #1
				346
				347	/* Update the count.
				348	ip holds the number of bytes we have just copied. */
				349	subs r2, r2, ip /* r2 = r2 - ip. */
				350	blt copy_less_than_4 /* If r2 < ip. */
				351
				352	/* Get here if there are at least 4 bytes to copy (r2 + 4 remain, r2 >= 0).
				353	Check if src is aligned. If beforehand src and dst were not word
				354	aligned but congruent (same offset), then now they are both
				355	word-aligned, and we can copy the rest efficiently (without
				356	shifting). */
				357	ands ip, r1, #3 /* ip = last 2 bits of src. */
				358	beq word_aligned /* If r1 is word-aligned. */
				359
				360	src_not_word_aligned:
				361	/* Get here when src is not word-aligned, but dst is word-aligned.
				362	The number of bytes that remains to copy is r2+4. */
				363
				364	#ifdef __ARM_FEATURE_UNALIGNED
				365	/* Copy word by word using LDR when alignment can be done in hardware,
				366	i.e., SCTLR.A is clear (alignment checking disabled), so LDR and STR accept unaligned addresses. Only single-word LDR/STR are used on this path. */
				367	subs r2, r2, #60
				368	blt 8f
				369
				370	7:
				371	/* Copy 64 bytes in every loop iteration. */
				372	.irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
				373	ldr r3, [r1, \offset]
				374	str r3, [r0, \offset]
				375	.endr
				376
				377	add r0, r0, #64
				378	add r1, r1, #64
				379	subs r2, r2, #64
				380	bge 7b
				381
				382	8:
				383	/* Get here if fewer than 64 bytes to copy, -64 <= r2 < 0.
				384	Re-bias the count to r2 = (bytes remaining - 4) and check if there are more than 3 bytes to copy. */
				385	adds r2, r2, #60
				386	blt copy_less_than_4
				387
				388	9:
				389	/* Get here if there are fewer than 64 but at least 4 bytes to copy,
				390	where the number of bytes to copy is r2+4. */
				391	ldr r3, [r1], #4
				392	str r3, [r0], #4
				393	subs r2, r2, #4
				394	bge 9b
				395
				396	b copy_less_than_4
				397
				398	#else /* not __ARM_FEATURE_UNALIGNED */
				399
				400	/* ip has last 2 bits of src,
				401	i.e., ip is the offset of src from word, and ip > 0.
				402	Compute shifts needed to copy from src to dst. */
				403	cmp ip, #2
				404	beq miscopy_16_16 /* If ip == 2. */
				405	bge miscopy_24_8 /* If ip == 3. */
				406
				407	/* Get here if ip == 1: fall through into miscopy_8_24, which is expanded first below. */
				408
				409	/* Endian independent macros for shifting bytes within registers. */
				410
				411	#ifndef __ARMEB__
				412	miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsr shiftright=lsl
				413	miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsr shiftright=lsl
				414	miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsr shiftright=lsl
				415	#else /* __ARMEB__ */
				416	miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsl shiftright=lsr
				417	miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsl shiftright=lsr
				418	miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsl shiftright=lsr
				419	#endif /* __ARMEB__ */
				420
				421	#endif /* not __ARM_FEATURE_UNALIGNED */
				422
				423	#endif /* memcpy */