Skip to content

Commit 1c172b0

Browse files
Add RP2040.memcpyDMA for DMA-managed memory copies
RP2040::memcpyDMA implements a DMA-controlled memory copy call identical in function to standard memcpy, but using an onboard DMA engine. For large memory transfers this can be significantly faster than using the CPU-based memcpy. The DMA path is only used when the source, destination, and count are all 4-byte aligned and the count is at least 64 bytes. In every other case standard memcpy is used instead, so the call behaves correctly for any inputs.
1 parent e022d47 commit 1c172b0

File tree

3 files changed

+93
-0
lines changed

3 files changed

+93
-0
lines changed

cores/rp2040/RP2040Support.h

+32
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include <hardware/structs/rosc.h>
2828
#include <hardware/structs/systick.h>
2929
#include <pico/multicore.h>
30+
#include <hardware/dma.h>
3031
#include <pico/rand.h>
3132
#include <pico/util/queue.h>
3233
#include <pico/bootrom.h>
@@ -311,6 +312,36 @@ class RP2040 {
311312
return id;
312313
}
313314

315+
#pragma GCC push_options
316+
#pragma GCC optimize ("Os")
317+
void *memcpyDMA(void *dest, const void *src, size_t n) {
318+
// Allocate a DMA channel on 1st call, reuse it every call after
319+
if (memcpyDMAChannel < 1) {
320+
memcpyDMAChannel = dma_claim_unused_channel(true);
321+
dma_channel_config c = dma_channel_get_default_config(memcpyDMAChannel);
322+
channel_config_set_transfer_data_size(&c, DMA_SIZE_32);
323+
channel_config_set_read_increment(&c, true);
324+
channel_config_set_write_increment(&c, true);
325+
channel_config_set_irq_quiet(&c, true);
326+
dma_channel_set_config(memcpyDMAChannel, &c, false);
327+
}
328+
// If there's any misalignment or too small, use regular memcpy which can handle it
329+
if ((n < 64) || (((uint32_t)dest) | ((uint32_t)src) | n) & 3) {
330+
return memcpy(dest, src, n);
331+
}
332+
333+
int words = n / 4;
334+
dma_channel_set_read_addr(memcpyDMAChannel, src, false);
335+
dma_channel_set_write_addr(memcpyDMAChannel, dest, false);
336+
dma_channel_set_trans_count(memcpyDMAChannel, words, false);
337+
dma_channel_start(memcpyDMAChannel);
338+
while (dma_channel_is_busy(memcpyDMAChannel)) {
339+
/* busy wait dma */
340+
}
341+
return dest;
342+
}
343+
#pragma GCC pop_options
344+
314345
// Multicore comms FIFO
315346
_MFIFO fifo;
316347

@@ -336,4 +367,5 @@ class RP2040 {
336367
PIO _pio;
337368
int _sm;
338369
PIOProgram *_ccountPgm;
370+
int memcpyDMAChannel = -1;
339371
};

docs/rp2040.rst

+10
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,13 @@ bootloader.
9999
void rp2040.rebootToBootloader()
100100
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
101101
Will reboot the RP2040 into USB UF2 upload mode.
102+
103+
DMA-based MEMCPY
104+
----------------
105+
The onboard DMA engines can copy 4-byte aligned quantities faster than the CPU.
106+
107+
void \*rp2040.memcpyDMA(void \*dest, const void \*src, size_t count);
108+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
109+
Uses a DMA engine to transfer data from src to dest in 4-byte chunks without CPU
110+
intervention. If any arguments are not 4-byte aligned, or if the count is not a
111+
multiple of 4 or is less than 64 bytes, then it will fall back to CPU-managed memcpy.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
// Simple speed and functionality test for DMA memcpy
2+
// Released to the public domain by Earle F. Philhower, III, 2024
3+
4+
5+
uint32_t src[1024];
6+
uint32_t dest[1024];
7+
8+
void setup() {
9+
// Set up a simple test pattern in src, verify after each pass
10+
srand(0xc0de);
11+
for (int i = 0; i < 1024; i++) {
12+
src[i] = ((~i) & 1024) | ((i) << 22);
13+
}
14+
}
15+
16+
void verify(const char *name, uint32_t *src) {
17+
for (int i = 0; i < 1024; i++) {
18+
uint32_t expected = ((~i) & 1024) | ((i) << 22);
19+
if (expected != src[i]) {
20+
Serial.printf("ERROR, mismatch @ %d on %s memcpy\n", i, name);
21+
while (true) {
22+
// Idle forever, this is fatal!
23+
}
24+
}
25+
}
26+
}
27+
28+
// Arduino loop hook: times 4K copies done with CPU memcpy and with
// rp2040.memcpyDMA, verifies both buffers after each run, and prints the
// average cycle count per single 4K copy.
void loop() {
    // Every pass copies src->dest and then dest->src, i.e. two 4K copies,
    // so the per-4K average divides by passes * copiesPerPass.
    constexpr int passes = 1000;
    constexpr int copiesPerPass = 2;
    uint64_t start, stop;

    // NOTE(review): if getCycleCount() is a 32-bit counter, stop - start is wrong
    // whenever the counter wraps during a measurement - confirm its return type.
    start = rp2040.getCycleCount();
    for (int i = 0; i < passes; i++) {
        memcpy(dest, src, 4 * 1024);
        memcpy(src, dest, 4 * 1024);
    }
    stop = rp2040.getCycleCount();
    verify("CPU", src);
    verify("CPU", dest);
    // The value is unsigned 64-bit, so print it with %llu rather than %lld
    Serial.printf("CPU: %llu clock cycles for 4K\n",
                  static_cast<unsigned long long>((stop - start) / (passes * copiesPerPass)));

    start = rp2040.getCycleCount();
    for (int i = 0; i < passes; i++) {
        rp2040.memcpyDMA(dest, src, 4 * 1024);
        rp2040.memcpyDMA(src, dest, 4 * 1024);
    }
    stop = rp2040.getCycleCount();
    verify("DMA", src);
    verify("DMA", dest);
    Serial.printf("DMA: %llu clock cycles for 4K\n\n\n",
                  static_cast<unsigned long long>((stop - start) / (passes * copiesPerPass)));
    delay(1000);
}

0 commit comments

Comments
 (0)