Skip to content

Commit

Permalink
Merge pull request #5 from TomMelt/add-openacc
Browse files Browse the repository at this point in the history
Add openacc directives - port to GPU
  • Loading branch information
TomMelt authored Mar 5, 2023
2 parents 6bec9a5 + 0c5ba79 commit 16326db
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 15 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# added by melt
build/*
build*/*
notebooks/*

# vim related
Expand Down
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -O3 -fopenmp -fcheck=all -Wall -std=f2008")
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -O3 -qopenmp -warn all")
# elseif (CMAKE_CXX_COMPILER_ID STREQUAL "NVHPC")
# set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -O3 -acc -Wall")
endif()

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
Expand Down
31 changes: 21 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ same answer.

## Results

### OpenMP (CPU only)

#### Intel CPU - ifort compiler

For problem size:
* `ugrid[200,200]`
* `tol = 1e-11` (GS only)
Expand All @@ -40,7 +44,7 @@ LAPACK | Gauss-Seidel
------ | ------------
11.6Gb | 22Mb

**TESTS**
#### Arm CPU - GNU compiler

The test was run on an Arm `Neoverse-N1` CPU with 512 GB RAM.

Expand All @@ -49,14 +53,16 @@ number of iterations = 192018

Cores | Time (s)
------|---------
80 | 36.0912
64 | 31.6890
32 | 35.1820
16 | 43.3966
8 | 60.4974
4 | 94.1945
2 | 159.6329
1 | 218.5429
80 | 15.4646
64 | 14.1203
32 | 15.4587
16 | 21.6295
8 | 38.4833
4 | 70.8669
2 | 136.7466
1 | 270.7637

### OpenACC (GPU)

The test was run on an `NVIDIA A100` GPU with 40 GB RAM, attached to the Arm system above.

Expand All @@ -65,4 +71,9 @@ number of iterations = 192018

GPU | Time (s)
------|---------
na | 18.1043
na | 17.7678


## TODO

Continue to profile the OpenACC version. Check for unnecessary memory transfers.
32 changes: 32 additions & 0 deletions src/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Standalone GNU make build of the solver using the NVIDIA HPC Fortran compiler
# with OpenACC offload enabled.

# compiler
FC := nvfortran

# -acc          : enable OpenACC directives
# -Minfo=accel  : print compiler feedback about accelerator code generation
FCFLAGS = -Wall -O3 -acc -Minfo=accel
FLFLAGS =

# source files and objects
OBJ = color_mod.o config_mod.o gauss_seidel_mod.o grid_mod.o poisson_solver.o precisn_mod.o
# preprocessor definitions, applied to preprocessed (.F90) sources and at link time
PRE = -DUSEGS

# program name
PROGRAM = solver

# Neither target names a real file, so keep make from being confused by
# stray files called "all" or "clean".
.PHONY: all clean

all: $(PROGRAM)

# Link step: $^ expands to every object in OBJ.
$(PROGRAM): ${OBJ}
	$(FC) $(FCFLAGS) $(FLFLAGS) ${PRE} -o $@ $^

# Compile steps: "-o $@" names the object to produce. Without "-o" the target
# object would be handed to the compiler as an additional input file.
%.o: %.f90
	$(FC) $(FCFLAGS) -c -o $@ $<

%.o: %.F90
	$(FC) $(FCFLAGS) ${PRE} -c -o $@ $<

# Module dependencies: a source that uses a module must be compiled after the
# source that defines it, because the compiler needs the generated .mod file.
color_mod.o: precisn_mod.o config_mod.o
config_mod.o: precisn_mod.o
gauss_seidel_mod.o: precisn_mod.o config_mod.o color_mod.o grid_mod.o
# NOTE(review): the prerequisites of the two objects below are not visible from
# this Makefile; they are listed conservatively so that parallel builds
# (make -j) cannot compile them before the modules exist. Trim to the actual
# `use` statements once confirmed.
grid_mod.o: precisn_mod.o config_mod.o
poisson_solver.o: precisn_mod.o config_mod.o color_mod.o grid_mod.o gauss_seidel_mod.o

clean:
	rm -f *.o *.mod $(PROGRAM)
36 changes: 32 additions & 4 deletions src/gauss_seidel_mod.f90
Original file line number Diff line number Diff line change
Expand Up @@ -44,41 +44,69 @@ subroutine solve(u_grid, max_diff, n_iter)
real(kind=wp), intent(out) :: max_diff ! max difference between matrix elements
integer, intent(out) :: n_iter ! final number of iterations

! local vars
type(color_group) :: red, blu
real(kind=wp) :: u_grid_old(nr, nc), u_next
integer :: r, c
integer :: i
integer :: i, j

call init_grid(u_grid)
call init_color_groups(red, blu)

!$acc data copyin(red, red%rows, red%cols, &
!$acc blu, blu%rows, blu%cols, &
!$acc u_grid) &
!$acc create(u_next, u_grid_old)

do n_iter = 1, max_iter

u_grid_old = u_grid
!$omp parallel do private(j) collapse(2)
!$acc parallel loop gang collapse(2)
do j = 1, nc
do i = 1, nr
u_grid_old(i, j) = u_grid(i, j)
end do
end do
!$omp end parallel do
!$acc end parallel loop

!$omp parallel do private(i, r, c, u_next)
!$acc parallel loop private(r, c, u_next)
do i = 1, red%num
r = red%rows(i)
c = red%cols(i)
call gs_method(r, c, u_grid, u_next)
u_grid(r, c) = u_next
end do
!$omp end parallel do
!$acc end parallel loop

!$omp parallel do private(i, r, c, u_next)
!$acc parallel loop private(r, c, u_next)
do i = 1, blu%num
r = blu%rows(i)
c = blu%cols(i)
call gs_method(r, c, u_grid, u_next)
u_grid(r, c) = u_next
end do
!$omp end parallel do
!$acc end parallel loop

max_diff = 0._wp
!$omp parallel do private(j) collapse(2) reduction(max:max_diff)
!$acc parallel loop collapse(2) reduction(max:max_diff)
do j = 1, nc
do i = 1, nr
max_diff = max(max_diff, abs(u_grid_old(i, j) - u_grid(i, j)))
end do
end do
!$omp end parallel do
!$acc end parallel loop

max_diff = maxval(abs(u_grid - u_grid_old))
if (max_diff < 1e-11) exit

end do
!$acc update self(u_grid)
!$acc end data

end subroutine solve

Expand Down
2 changes: 2 additions & 0 deletions src/grid_mod.f90
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ module grid

contains
subroutine get_xy_pos(r, c, x, y)
!$acc routine seq
! Get the x and y position in real-space for a given row and column (r, c)
! inputs:
! - r : int : row index
Expand Down Expand Up @@ -44,6 +45,7 @@ subroutine init_grid(u_grid)
end subroutine init_grid

function func(x, y) result(f)
!$acc routine seq
! This function is the RHS of Laplacian D_xy U(x,y) = f(x,y)
! inputs:
! - x : real(kind=wp) : x position in real-space
Expand Down

0 comments on commit 16326db

Please # to comment.