Commit 37e4037

committed Jan 12, 2025
add a new MPI program to evaluate MPI_Alltoallw
1 parent 5e95f0e commit 37e4037

File tree

2 files changed: +375 -0 lines changed

 

MPI/alltoallw.c

+304
@@ -0,0 +1,304 @@
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *
 * Copyright (C) 2025, Northwestern University
 * See COPYRIGHT notice in top-level directory.
 *
 * Evaluate performance of all-to-many personalized communication implemented
 * with MPI_Alltoallw() and MPI_Issend()/MPI_Irecv().
 *
 * To compile:
 *   % mpicc -O2 alltoallw.c -o alltoallw
 *
 * Usage:
 *   % ./alltoallw -h
 *   Usage: ./alltoallw [OPTION]
 *      [-h] Print this help message
 *      [-v] Verbose mode (default: no)
 *      [-n num] number of iterations (default: 1)
 *      [-r ratio] every ratio processes is a receiver (default: 1)
 *      [-l len] receive message size per iteration (default: 8MB)
 *
 * Example run command and output on screen:
 *   % mpiexec -n 2048 ./alltoallw -n 253 -r 32
 *
 *   ---- Using MPI_Alltoallw ----
 *   nprocs = 2048
 *   ntimes = 253
 *   num_recvers = 64
 *   individual message len = 4096 bytes
 *   send/recv buffer gap = 4 bytes
 *   Max time: = 53.60 sec
 *   ---- Using MPI_Issend/Irecv ----
 *   nprocs = 2048
 *   ntimes = 253
 *   num_recvers = 64
 *   individual message len = 4096 bytes
 *   send/recv buffer gap = 4 bytes
 *   Max time: = 2.59 sec
 *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>

#include <mpi.h>

static int verbose;

/* gap in bytes left between consecutive messages in the send/recv buffers */
#define GAP 4

#define ERR \
    if (err != MPI_SUCCESS) { \
        int errorStringLen; \
        char errorString[MPI_MAX_ERROR_STRING]; \
        MPI_Error_string(err, errorString, &errorStringLen); \
        printf("Error at line %d: %s\n",__LINE__,errorString); \
        goto err_out; \
    }

void run_async_send_recv(int ntimes,
                         int ratio,
                         int is_receiver,
                         int len,
                         char *sendBuf,
                         char *recvBuf)
{
    char *sendPtr, *recvPtr;
    int i, j, err, nprocs, rank, nreqs, num_recvers;
    MPI_Request *reqs;
    MPI_Status *st;
    double timing, maxt;

    MPI_Barrier(MPI_COMM_WORLD);
    timing = MPI_Wtime();

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    num_recvers = nprocs / ratio;

    /* allocate MPI_Request and MPI_Status arrays */
    reqs = (MPI_Request*) malloc(sizeof(MPI_Request) * (nprocs + num_recvers));
    st = (MPI_Status*) malloc(sizeof(MPI_Status) * (nprocs + num_recvers));

    sendPtr = sendBuf;
    recvPtr = recvBuf;

    for (i=0; i<ntimes; i++) {
        nreqs = 0;

        /* Only receivers post recv requests */
        if (is_receiver) {
            for (j=0; j<nprocs; j++) {
                if (rank != j) { /* skip recv from self */
                    err = MPI_Irecv(recvPtr, len, MPI_BYTE, j, 0, MPI_COMM_WORLD,
                                    &reqs[nreqs++]);
                    ERR
                }
                recvPtr += len + GAP;
            }
        }

        /* all ranks post send requests */
        for (j=0; j<nprocs; j++) {
            if (j % ratio) continue; /* j is not a receiver */
            if (rank != j) { /* skip send to self */
                err = MPI_Issend(sendPtr, len, MPI_BYTE, j, 0, MPI_COMM_WORLD,
                                 &reqs[nreqs++]);
                ERR
            }
            sendPtr += len + GAP;
        }

        err = MPI_Waitall(nreqs, reqs, st); ERR
    }

err_out:
    free(st);
    free(reqs);

    timing = MPI_Wtime() - timing;
    MPI_Reduce(&timing, &maxt, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("---- Using MPI_Issend/Irecv ----\n");
        printf("nprocs = %d\n", nprocs);
        printf("ntimes = %d\n", ntimes);
        printf("num_recvers = %d\n", num_recvers);
        printf("individual message len = %d bytes\n", len);
        printf("send/recv buffer gap = %d bytes\n", GAP);
        printf("Max time: = %.2f sec\n", maxt);
    }
}

void run_alltoallw(int ntimes,
                   int ratio,
                   int is_receiver,
                   int len,
                   char *sendBuf,
                   char *recvBuf)
{
    char *sendPtr, *recvPtr;
    int i, j, err, nprocs, rank, num_recvers;
    int *sendCounts, *recvCounts, *sendDisps, *recvDisps;
    MPI_Datatype *sendTypes, *recvTypes;
    double timing, maxt;

    MPI_Barrier(MPI_COMM_WORLD);
    timing = MPI_Wtime();

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    num_recvers = nprocs / ratio;

    sendTypes = (MPI_Datatype*) malloc(sizeof(MPI_Datatype) * nprocs * 2);
    recvTypes = sendTypes + nprocs;
    for (i=0; i<nprocs * 2; i++) sendTypes[i] = MPI_BYTE;

    sendCounts = (int*) calloc(nprocs * 2, sizeof(int));
    recvCounts = sendCounts + nprocs;
    sendDisps = (int*) calloc(nprocs * 2, sizeof(int));
    recvDisps = sendDisps + nprocs;

    sendPtr = sendBuf;
    recvPtr = recvBuf;

    /* Only receivers have non-zero data to receive */
    if (is_receiver) {
        j = 0;
        for (i=0; i<nprocs; i++) {
            if (i != rank) { /* skip receiving from self */
                recvCounts[i] = len;
                recvDisps[i] = (len + GAP) * j;
            }
            j++;
            if (verbose && i != rank)
                printf("%2d recv from %2d of %d\n",rank,i,recvCounts[i]);
        }
    }

    /* All ranks send to each receiver */
    j = 0;
    for (i=0; i<nprocs; i++) {
        if (i % ratio) continue; /* i is not a receiver */
        if (i != rank) { /* skip sending to self */
            sendCounts[i] = len;
            sendDisps[i] = (len + GAP) * j;
        }
        j++;
        if (verbose && i != rank)
            printf("%2d send to %2d of %d\n",rank,i,sendCounts[i]);
    }

    for (i=0; i<ntimes; i++) {
        err = MPI_Alltoallw(sendPtr, sendCounts, sendDisps, sendTypes,
                            recvPtr, recvCounts, recvDisps, recvTypes,
                            MPI_COMM_WORLD); ERR
        sendPtr += num_recvers * (len + GAP);
        recvPtr += nprocs * (len + GAP);
    }

err_out:
    free(sendTypes);
    free(sendCounts);
    free(sendDisps);

    timing = MPI_Wtime() - timing;
    MPI_Reduce(&timing, &maxt, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("---- Using MPI_Alltoallw ----\n");
        printf("nprocs = %d\n", nprocs);
        printf("ntimes = %d\n", ntimes);
        printf("num_recvers = %d\n", num_recvers);
        printf("individual message len = %d bytes\n", len);
        printf("send/recv buffer gap = %d bytes\n", GAP);
        printf("Max time: = %.2f sec\n", maxt);
    }
}

/*----< usage() >------------------------------------------------------------*/
static void usage (char *argv0) {
    char *help = "Usage: %s [OPTION]\n\
       [-h] Print this help message\n\
       [-v] Verbose mode (default: no)\n\
       [-n num] number of iterations (default: 1)\n\
       [-r ratio] every ratio processes is a receiver (default: 1)\n\
       [-l len] receive message size per iteration (default: 8MB)\n";
    fprintf (stderr, help, argv0);
}

/*----< main() >------------------------------------------------------------*/
int main(int argc, char **argv) {
    extern int optind;
    extern char *optarg;
    char *sendBuf, *recvBuf;
    int i, rank, nprocs;
    int len, block_len, ntimes, ratio, num_recvers, is_receiver;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    verbose = 0;
    block_len = 8 * 1024 * 1024;
    ntimes = 1;
    ratio = 1;

    /* command-line arguments */
    while ((i = getopt (argc, argv, "hvl:n:r:")) != EOF)
        switch (i) {
            case 'v':
                verbose = 1;
                break;
            case 'l':
                block_len = atoi(optarg);
                break;
            case 'n':
                ntimes = atoi(optarg);
                break;
            case 'r':
                ratio = atoi(optarg);
                break;
            case 'h':
            default:
                if (rank == 0) usage(argv[0]);
                goto err_out;
        }

    /* set the number of receivers */
    if (ratio <= 0 || ratio > nprocs) ratio = 1;
    num_recvers = nprocs / ratio;

    /* set whether this rank has non-zero data to receive */
    is_receiver = (rank % ratio == 0) ? 1 : 0;

    /* per message size */
    len = block_len / nprocs;

    if (verbose && rank == 0)
        printf("nprocs=%d ntimes=%d block_len=%d num_recvers=%d len=%d\n",
               nprocs, ntimes, block_len, num_recvers, len);

    if (verbose && is_receiver)
        printf("rank %2d is_receiver\n", rank);

    if (verbose) fflush(stdout);

    /* allocate send and receive buffers */
    sendBuf = (char*) calloc(num_recvers * (len + GAP) * ntimes, 1);
    recvBuf = (char*) calloc(nprocs * (len + GAP) * ntimes, 1);

    run_alltoallw(ntimes, ratio, is_receiver, len, sendBuf, recvBuf);

    run_async_send_recv(ntimes, ratio, is_receiver, len, sendBuf, recvBuf);

    free(recvBuf);
    free(sendBuf);

err_out:
    MPI_Finalize();
    return 0;
}
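
The core idea run_alltoallw() relies on is that MPI_Alltoallw() expresses an all-to-many (or many-to-one) exchange simply by leaving the count at zero for every pair of ranks that does not communicate: only receiver ranks get non-zero recvCounts, and every rank sets non-zero sendCounts only toward the receivers. The following minimal sketch, which is not part of this commit, shows that zero-count pattern in its simplest form, with every rank sending 16 bytes to rank 0 only; the file name, message length, and variable names are illustrative assumptions, not the benchmark's values.

/* sketch_alltoallw.c -- illustrative sketch only, not part of this commit.
 * Every rank sends 16 bytes to rank 0; all other send/recv counts stay 0,
 * which is how MPI_Alltoallw expresses a many-to-one (or many-to-few) pattern.
 * Compile with: mpicc sketch_alltoallw.c -o sketch_alltoallw
 */
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char **argv) {
    int i, rank, nprocs, len = 16;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    char *sendBuf = (char*) calloc(len, 1);
    char *recvBuf = (char*) calloc((size_t)len * nprocs, 1);

    /* counts and byte displacements default to 0, i.e. no message */
    int *sendCounts = (int*) calloc(nprocs, sizeof(int));
    int *recvCounts = (int*) calloc(nprocs, sizeof(int));
    int *sendDisps  = (int*) calloc(nprocs, sizeof(int));
    int *recvDisps  = (int*) calloc(nprocs, sizeof(int));
    MPI_Datatype *types = (MPI_Datatype*) malloc(sizeof(MPI_Datatype) * nprocs);
    for (i=0; i<nprocs; i++) types[i] = MPI_BYTE;

    if (rank != 0) {
        sendCounts[0] = len;            /* send only to rank 0 */
    } else {
        for (i=1; i<nprocs; i++) {
            recvCounts[i] = len;        /* receive len bytes from rank i */
            recvDisps[i]  = len * i;    /* pack messages back to back    */
        }
    }

    MPI_Alltoallw(sendBuf, sendCounts, sendDisps, types,
                  recvBuf, recvCounts, recvDisps, types, MPI_COMM_WORLD);

    if (rank == 0)
        printf("rank 0 received %d bytes from each of %d senders\n",
               len, nprocs - 1);

    free(types); free(recvDisps); free(sendDisps);
    free(recvCounts); free(sendCounts); free(recvBuf); free(sendBuf);
    MPI_Finalize();
    return 0;
}

The benchmark above generalizes this by making every ratio-th rank a receiver and spacing consecutive messages len + GAP bytes apart in the send and receive buffers.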

MPI/sbatch_perlmutter.sh

+71
@@ -0,0 +1,71 @@
#!/bin/bash -l
#SBATCH --constraint=cpu
#SBATCH --qos=regular
#SBATCH -t 00:10:00

#SBATCH --nodes=16
#SBATCH --job-name=alltoallw
#SBATCH -o qout.%x.%j
#SBATCH -e qout.%x.%j
#------------------------------------------------------------------------#
cd $PWD

if test "x$SLURM_NTASKS_PER_NODE" = x ; then
   SLURM_NTASKS_PER_NODE=128
fi
NP=$(($SLURM_JOB_NUM_NODES * $SLURM_NTASKS_PER_NODE))

export FI_MR_CACHE_MONITOR=kdreg2
export FI_CXI_RX_MATCH_MODE=software
export MPICH_OFI_NIC_POLICY=NUMA

echo "------------------------------------------------------"
echo "---- Running on Perlmutter CPU nodes ----"
echo "---- SLURM_CLUSTER_NAME = $SLURM_CLUSTER_NAME"
echo "---- SLURM_JOB_QOS = $SLURM_JOB_QOS"
echo "---- SLURM_JOB_PARTITION = $SLURM_JOB_PARTITION"
echo "---- SLURM_JOB_NAME = $SLURM_JOB_NAME"
echo "---- SBATCH_CONSTRAINT = $SBATCH_CONSTRAINT"
echo "---- SLURM_JOB_NODELIST = $SLURM_JOB_NODELIST"
echo "---- SLURM_JOB_NUM_NODES = $SLURM_JOB_NUM_NODES"
echo "---- SLURM_NTASKS_PER_NODE = $SLURM_NTASKS_PER_NODE"
echo "---- SLURM_JOB_ID = $SLURM_JOB_ID"
echo "---- SLURM out/err file = qout.$SLURM_JOB_NAME.$SLURM_JOB_ID"
echo ""
echo "ENV explicitly set:"
echo "---- FI_MR_CACHE_MONITOR = $FI_MR_CACHE_MONITOR"
echo "---- FI_UNIVERSE_SIZE = $FI_UNIVERSE_SIZE"
echo "---- FI_CXI_DEFAULT_CQ_SIZE = $FI_CXI_DEFAULT_CQ_SIZE"
echo "---- FI_CXI_RX_MATCH_MODE = $FI_CXI_RX_MATCH_MODE"
echo "---- MPICH_COLL_SYNC = $MPICH_COLL_SYNC"
echo "---- MPICH_OFI_NIC_POLICY = $MPICH_OFI_NIC_POLICY"
echo "------------------------------------------------------"
echo ""

# For fast executable loading on Cori and Perlmutter
EXE_FILE=alltoallw
EXE=/tmp/${USER}_${EXE_FILE}
sbcast ${EXE_FILE} ${EXE}

echo ""
echo "========================================================================"
echo ""

NTIMES=3
for ntime in $(seq 1 ${NTIMES}) ; do
    date
    echo "---- iteration $ntime -----------------------------------------------"
    echo ""

    CMD_OPTS="-n 253 -r 32"

    CMD="srun -n $NP ${EXE} $CMD_OPTS"
    echo "CMD=$CMD"
    $CMD

    echo ""
    echo "====================================================================="
done # loop ntimes

date
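
For reference, with this job geometry (16 nodes, and 128 tasks per node when SLURM_NTASKS_PER_NODE is not set) the script launches NP = 16 * 128 = 2048 processes. The default 8 MiB receive block in alltoallw.c then gives 8388608 / 2048 = 4096 bytes per message, and the -r 32 option gives 2048 / 32 = 64 receivers, which matches the example output quoted in the header comment of alltoallw.c.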
