
Commit e7419fb

Add ARM assembly optimized memcpy for RP2350 (#2552)
33% faster for a 4K memcpy, measured with the DMAMemcpy example.

With this assembly:
CPU: 4835 clock cycles for 4K
DMA: 2169 clock cycles for 4K

Using stock Newlib memcpy:
CPU: 7314 clock cycles for 4K
DMA: 2175 clock cycles for 4K
1 parent 060b15f commit e7419fb
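For reference, a comparison like the one above can be timed with a very small cycle-count harness. The sketch below is only an illustrative outline, not the DMAMemcpy example itself; it assumes the Arduino-Pico rp2040.getCycleCount() helper, and the buffer names and sizes are made up for this snippet.

#include <Arduino.h>

// Hypothetical benchmark outline (names and sizes are illustrative only).
static uint8_t srcBuf[4096];
static uint8_t dstBuf[4096];

void setup() {
    Serial.begin(115200);
    delay(2000);

    uint32_t start = rp2040.getCycleCount();   // cycle counter provided by the core
    memcpy(dstBuf, srcBuf, sizeof(dstBuf));    // routed to __wrap_memcpy by the linker
    uint32_t cycles = rp2040.getCycleCount() - start;

    Serial.printf("CPU memcpy of %u bytes took %lu cycles\n",
                  (unsigned)sizeof(dstBuf), (unsigned long)cycles);
}

void loop() {
}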

File tree

2 files changed: 385 additions, 0 deletions


cores/rp2040/rp2350-memcpy.S

+383
@@ -0,0 +1,383 @@
/*
 * Copyright (c) 2011 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#if defined(PICO_RP2350) && defined(__arm__)
/* Prototype: void *memcpy (void *dst, const void *src, size_t count). */
/* Use the version of memcpy implemented using LDRD and STRD.
   This version is tuned for Cortex-A15.
   This might not be the best for other ARMv7-A CPUs,
   but there is no predefine to distinguish between
   different CPUs in the same architecture,
   and this version is better than the plain memcpy provided in newlib.
   Therefore, we use this version for all ARMv7-A CPUS. */
/* To make the same code compile for both ARM and Thumb instruction
   sets, switch to unified syntax at the beginning of this function.
   However, by using the same code, we may be missing optimization
   opportunities. For instance, in LDRD/STRD instructions, the first
   destination register must be even and the second consecutive in
   ARM state, but not in Thumb state. */
    .syntax unified
#if defined (__thumb__)
    .thumb
    .thumb_func
#endif
#ifdef __native_client__
#define SFI_BREG(reg) sfi_breg reg,
#define IT(insn)
#ifdef __thumb__
#error "thumb and native_client are not compatible!"
#endif
    .p2align 4
#else
#define SFI_BREG(reg)
#define IT(insn) insn
#endif
    .global __wrap_memcpy
    .type __wrap_memcpy, %function
// .section .time_critical.memcpy // Actually slows down a bit because RAM and program RAM conflict
__wrap_memcpy:
    /* Assumes that n >= 0, and dst, src are valid pointers.
       If there is at least 8 bytes to copy, use LDRD/STRD.
       If src and dst are misaligned with different offsets,
       first copy byte by byte until dst is aligned,
       and then copy using LDRD/STRD and shift if needed.
       When less than 8 left, copy a word and then byte by byte. */
    /* Save registers (r0 holds the return value):
       optimized push {r0, r4, r5, lr}.
       To try and improve performance, stack layout changed,
       i.e., not keeping the stack looking like users expect
       (highest numbered register at highest address). */
    push {r0, lr}
    strd r4, r5, [sp, #-8]!
    /* TODO: Add debug frame directives.
       We don't need exception unwind directives, because the code below
       does not throw any exceptions and does not call any other functions.
       Generally, newlib functions like this lack debug information for
       assembler source. */
    /* Get copying of tiny blocks out of the way first. */
    /* Is there at least 4 bytes to copy? */
    subs r2, r2, #4
    blt copy_less_than_4 /* If n < 4. */
    /* Check word alignment. */
    ands ip, r0, #3 /* ip = last 2 bits of dst. */
    bne dst_not_word_aligned /* If dst is not word-aligned. */
    /* Get here if dst is word-aligned. */
    ands ip, r1, #3 /* ip = last 2 bits of src. */
    bne src_not_word_aligned /* If src is not word-aligned. */
word_aligned:
    /* Get here if source and dst both are word-aligned.
       The number of bytes remaining to copy is r2+4. */
    /* Is there is at least 64 bytes to copy? */
    subs r2, r2, #60
    blt copy_less_than_64 /* If r2 + 4 < 64. */
    /* First, align the destination buffer to 8-bytes,
       to make sure double loads and stores don't cross cache line boundary,
       as they are then more expensive even if the data is in the cache
       (require two load/store issue cycles instead of one).
       If only one of the buffers is not 8-bytes aligned,
       then it's more important to align dst than src,
       because there is more penalty for stores
       than loads that cross cacheline boundary.
       This check and realignment are only worth doing
       if there is a lot to copy. */
    /* Get here if dst is word aligned,
       i.e., the 2 least significant bits are 0.
       If dst is not 2w aligned (i.e., the 3rd bit is not set in dst),
       then copy 1 word (4 bytes). */
    ands r3, r0, #4
    beq 11f /* If dst already two-word aligned. */
    SFI_BREG(r1) \
    ldr r3, [r1], #4
    SFI_BREG(r0) \
    str r3, [r0], #4
    subs r2, r2, #4
    blt copy_less_than_64
11:
    /* TODO: Align to cacheline (useful for PLD optimization). */
    /* Every loop iteration copies 64 bytes. */
1:
    .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
    SFI_BREG(r1) \
    ldrd r4, r5, [r1, \offset]
    SFI_BREG(r0) \
    strd r4, r5, [r0, \offset]
    .endr
    add r0, r0, #64
    add r1, r1, #64
    subs r2, r2, #64
    bge 1b /* If there is more to copy. */
copy_less_than_64:
    /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
       Restore the count if there is more than 7 bytes to copy. */
    adds r2, r2, #56
    blt copy_less_than_8
    /* Copy 8 bytes at a time. */
2:
    SFI_BREG(r1) \
    ldrd r4, r5, [r1], #8
    SFI_BREG(r0) \
    strd r4, r5, [r0], #8
    subs r2, r2, #8
    bge 2b /* If there is more to copy. */
copy_less_than_8:
    /* Get here if less than 8 bytes to copy, -8 <= r2 < 0.
       Check if there is more to copy. */
    cmn r2, #8
    beq return /* If r2 + 8 == 0. */
    /* Restore the count if there is more than 3 bytes to copy. */
    adds r2, r2, #4
    blt copy_less_than_4
    /* Copy 4 bytes. */
    SFI_BREG(r1) \
    ldr r3, [r1], #4
    SFI_BREG(r0) \
    str r3, [r0], #4
copy_less_than_4:
    /* Get here if less than 4 bytes to copy, -4 <= r2 < 0. */
    /* Restore the count, check if there is more to copy. */
    adds r2, r2, #4
    beq return /* If r2 == 0. */
    /* Get here with r2 is in {1,2,3}={01,10,11}. */
    /* Logical shift left r2, insert 0s, update flags. */
    lsls r2, r2, #31
    /* Copy byte by byte.
       Condition ne means the last bit of r2 is 0.
       Condition cs means the second to last bit of r2 is set,
       i.e., r2 is 1 or 3. */
    IT(itt ne)
    SFI_BREG(r1) \
    ldrbne r3, [r1], #1
    SFI_BREG(r0) \
    strbne r3, [r0], #1
    IT(itttt cs)
    SFI_BREG(r1) \
    ldrbcs r4, [r1], #1
    SFI_BREG(r1) \
    ldrbcs r5, [r1]
    SFI_BREG(r0) \
    strbcs r4, [r0], #1
    SFI_BREG(r0) \
    strbcs r5, [r0]
return:
    /* Restore registers: optimized pop {r0, r4, r5, pc} */
    ldrd r4, r5, [sp], #8
#ifdef __native_client__
    pop {r0, lr}
    sfi_bx lr
#else
    pop {r0, pc} /* This is the only return point of memcpy. */
#endif
#ifndef __ARM_FEATURE_UNALIGNED
    /* The following assembly macro implements misaligned copy in software.
       Assumes that dst is word aligned, src is at offset "pull" bits from
       word, push = 32 - pull, and the number of bytes that remain to copy
       is r2 + 4, r2 >= 0. */
    /* In the code below, r2 is the number of bytes that remain to be
       written. The number of bytes read is always larger, because we have
       partial words in the shift queue. */
    .macro miscopy pull push shiftleft shiftright
    /* Align src to the previous word boundary. */
    bic r1, r1, #3
    /* Initialize the shift queue. */
    SFI_BREG(r1) \
    ldr r5, [r1], #4 /* Load a word from source. */
    subs r2, r2, #4
    blt 6f /* Go to misaligned copy of less than 8 bytes. */
    /* Get here if there is more than 8 bytes to copy.
       The number of bytes to copy is r2+8, r2 >= 0. */
    /* Save registers: push { r6, r7 }.
       We need additional registers for LDRD and STRD, because in ARM state
       the first destination register must be even and the second
       consecutive. */
    strd r6, r7, [sp, #-8]!
    subs r2, r2, #56
    blt 4f /* Go to misaligned copy of less than 64 bytes. */
3:
    /* Get here if there is more than 64 bytes to copy.
       The number of bytes to copy is r2+64, r2 >= 0. */
    /* Copy 64 bytes in every iteration.
       Use a partial word from the shift queue. */
    .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
    mov r6, r5, \shiftleft #\pull
    SFI_BREG(r1) \
    ldrd r4, r5, [r1, \offset]
    orr r6, r6, r4, \shiftright #\push
    mov r7, r4, \shiftleft #\pull
    orr r7, r7, r5, \shiftright #\push
    SFI_BREG(r0) \
    strd r6, r7, [r0, \offset]
    .endr
    add r1, r1, #64
    add r0, r0, #64
    subs r2, r2, #64
    bge 3b
4:
    /* Get here if there is less than 64 bytes to copy (-64 <= r2 < 0)
       and they are misaligned. */
    /* Restore the count if there is more than 7 bytes to copy. */
    adds r2, r2, #56
    /* If less than 8 bytes to copy,
       restore registers saved for this loop: optimized poplt { r6, r7 }. */
    itt lt
    ldrdlt r6, r7, [sp], #8
    blt 6f /* Go to misaligned copy of less than 8 bytes. */
5:
    /* Copy 8 bytes at a time.
       Use a partial word from the shift queue. */
    mov r6, r5, \shiftleft #\pull
    SFI_BREG(r1) \
    ldrd r4, r5, [r1], #8
    orr r6, r6, r4, \shiftright #\push
    mov r7, r4, \shiftleft #\pull
    orr r7, r7, r5, \shiftright #\push
    SFI_BREG(r0) \
    strd r6, r7, [r0], #8
    subs r2, r2, #8
    bge 5b /* If there is more to copy. */
    /* Restore registers saved for this loop: optimized pop { r6, r7 }. */
    ldrd r6, r7, [sp], #8
6:
    /* Get here if there less than 8 bytes to copy (-8 <= r2 < 0)
       and they are misaligned. */
    /* Check if there is more to copy. */
    cmn r2, #8
    beq return
    /* Check if there is less than 4 bytes to copy. */
    cmn r2, #4
    itt lt
    /* Restore src offset from word-align. */
    sublt r1, r1, #(\push / 8)
    blt copy_less_than_4
    /* Use a partial word from the shift queue. */
    mov r3, r5, \shiftleft #\pull
    /* Load a word from src, but without writeback
       (this word is not fully written to dst). */
    SFI_BREG(r1) \
    ldr r5, [r1]
    /* Restore src offset from word-align. */
    add r1, r1, #(\pull / 8)
    /* Shift bytes to create one dst word and store it. */
    orr r3, r3, r5, \shiftright #\push
    SFI_BREG(r0) \
    str r3, [r0], #4
    /* Use single byte copying of the remaining bytes. */
    b copy_less_than_4
    .endm
#endif /* not __ARM_FEATURE_UNALIGNED */
dst_not_word_aligned:
    /* Get here when dst is not aligned and ip has the last 2 bits of dst,
       i.e., ip is the offset of dst from word.
       The number of bytes that remains to copy is r2 + 4,
       i.e., there are at least 4 bytes to copy.
       Write a partial word (0 to 3 bytes), such that dst becomes
       word-aligned. */
    /* If dst is at ip bytes offset from a word (with 0 < ip < 4),
       then there are (4 - ip) bytes to fill up to align dst to the next
       word. */
    rsb ip, ip, #4 /* ip = #4 - ip. */
    cmp ip, #2
    /* Copy byte by byte with conditionals. */
    IT(itt gt)
    SFI_BREG(r1) \
    ldrbgt r3, [r1], #1
    SFI_BREG(r0) \
    strbgt r3, [r0], #1
    IT(itt ge)
    SFI_BREG(r1) \
    ldrbge r4, [r1], #1
    SFI_BREG(r0) \
    strbge r4, [r0], #1
    SFI_BREG(r1) \
    ldrb lr, [r1], #1
    SFI_BREG(r0) \
    strb lr, [r0], #1
    /* Update the count.
       ip holds the number of bytes we have just copied. */
    subs r2, r2, ip /* r2 = r2 - ip. */
    blt copy_less_than_4 /* If r2 < ip. */
    /* Get here if there are more than 4 bytes to copy.
       Check if src is aligned. If beforehand src and dst were not word
       aligned but congruent (same offset), then now they are both
       word-aligned, and we can copy the rest efficiently (without
       shifting). */
    ands ip, r1, #3 /* ip = last 2 bits of src. */
    beq word_aligned /* If r1 is word-aligned. */
src_not_word_aligned:
    /* Get here when src is not word-aligned, but dst is word-aligned.
       The number of bytes that remains to copy is r2+4. */
#ifdef __ARM_FEATURE_UNALIGNED
    /* Copy word by word using LDR when alignment can be done in hardware,
       i.e., SCTLR.A is set, supporting unaligned access in LDR and STR. */
    subs r2, r2, #60
    blt 8f
7:
    /* Copy 64 bytes in every loop iteration. */
    .irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
    SFI_BREG(r1) \
    ldr r3, [r1, \offset]
    SFI_BREG(r0) \
    str r3, [r0, \offset]
    .endr
    add r0, r0, #64
    add r1, r1, #64
    subs r2, r2, #64
    bge 7b
8:
    /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
       Check if there is more than 3 bytes to copy. */
    adds r2, r2, #60
    blt copy_less_than_4
9:
    /* Get here if there is less than 64 but at least 4 bytes to copy,
       where the number of bytes to copy is r2+4. */
    SFI_BREG(r1) \
    ldr r3, [r1], #4
    SFI_BREG(r0) \
    str r3, [r0], #4
    subs r2, r2, #4
    bge 9b
    b copy_less_than_4
#else /* not __ARM_FEATURE_UNALIGNED */
    /* ip has last 2 bits of src,
       i.e., ip is the offset of src from word, and ip > 0.
       Compute shifts needed to copy from src to dst. */
    cmp ip, #2
    beq miscopy_16_16 /* If ip == 2. */
    bge miscopy_24_8 /* If ip == 3. */
    /* Get here if ip == 1. */
    /* Endian independent macros for shifting bytes within registers. */
#ifndef __ARMEB__
miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsr shiftright=lsl
miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsr shiftright=lsl
miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsr shiftright=lsl
#else /* not __ARMEB__ */
miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsl shiftright=lsr
miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsl shiftright=lsr
miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsl shiftright=lsr
#endif /* not __ARMEB__ */
#endif /* not __ARM_FEATURE_UNALIGNED */
#endif /* memcpy */
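For readers who do not follow ARM assembly, the word-aligned fast path above amounts to: align dst to 8 bytes, move 64-byte blocks with LDRD/STRD pairs, then drain the remainder 8 bytes at a time, then one 4-byte word, then byte by byte. The C-style sketch below illustrates only that aligned path (the real routine also handles misaligned src/dst via the miscopy shift-queue macro); the function name is invented for this snippet.

#include <stddef.h>
#include <stdint.h>

// Illustrative sketch of the aligned fast path only; dst and src are assumed
// to be 8-byte aligned already (the assembly fixes up alignment first).
static void *aligned_copy_sketch(void *dst, const void *src, size_t n) {
    uint64_t *d = (uint64_t *)dst;
    const uint64_t *s = (const uint64_t *)src;
    while (n >= 64) {                  // mirrors the unrolled 64-byte LDRD/STRD loop
        for (int i = 0; i < 8; i++) {
            d[i] = s[i];               // one LDRD/STRD pair per 8 bytes
        }
        d += 8; s += 8; n -= 64;
    }
    while (n >= 8) {                   // "copy 8 bytes at a time" loop
        *d++ = *s++;
        n -= 8;
    }
    uint32_t *d4 = (uint32_t *)d;
    const uint32_t *s4 = (const uint32_t *)s;
    if (n >= 4) {                      // single word copy
        *d4++ = *s4++;
        n -= 4;
    }
    uint8_t *db = (uint8_t *)d4;
    const uint8_t *sb = (const uint8_t *)s4;
    while (n--) {                      // final 1 to 3 byte tail
        *db++ = *sb++;
    }
    return dst;
}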

lib/rp2350/platform_wrap.txt

+2
@@ -97,3 +97,5 @@
 -Wl,--wrap=tanhf
 -Wl,--wrap=trunc
 -Wl,--wrap=truncf
+
+-Wl,--wrap=memcpy
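This linker flag is what ties the two changes together: with GNU ld's --wrap=memcpy, every reference to memcpy is resolved to __wrap_memcpy (the assembly routine above), while the original newlib implementation remains reachable as __real_memcpy. A minimal illustration of that routing from the C/C++ side follows; copy_example and its parameters are made up for this snippet.

#include <string.h>
#include <stddef.h>
#include <stdint.h>

// With -Wl,--wrap=memcpy in effect, the linker rewires the symbols like this:
extern "C" void *__real_memcpy(void *dst, const void *src, size_t n);

void copy_example(uint8_t *dst, const uint8_t *src, size_t n) {
    memcpy(dst, src, n);          // resolved to __wrap_memcpy, i.e. rp2350-memcpy.S
    __real_memcpy(dst, src, n);   // bypasses the wrapper and calls newlib's memcpy
}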
