#!/bin/bash
#SBATCH -J gemm_tune_job
#SBATCH --get-user-env
#SBATCH --partition=clx
#SBATCH --nodes=1
#SBATCH --time=23:59:00
#
# gemm_loop_tuner.sh (forked from libxsmm/parlooper)
#
# Slurm batch job that tunes GEMM loop orderings:
#   1. generates loop-permutation configurations with ./loop_permute_generator,
#   2. merges and de-duplicates them into tuner_config.txt,
#   3. runs ./gemm for every configuration with M, N, K in {512, 1024, 2048}
#      and appends the output to <M>_<N>_<K>_bench_results.
#
# The shebang must be the first line of the file and the #SBATCH directives
# must precede the first executable command, otherwise sbatch does not honour
# them. The directives above request: job name gemm_tune_job, the submitting
# user's login environment, partition "clx", one node, and a wall-clock limit
# of 23:59:00.

# Site-specific GCC 9.4.0 environment script; what it sets is not visible here.
source /swtools/gcc/9.4.0/gcc_vars.sh

# Run with 28 OpenMP threads and bind them to CPUs 0-27 (GNU libgomp affinity).
export OMP_NUM_THREADS=28
export GOMP_CPU_AFFINITY="0-27"
# Enumerate the (name, spec) pairs that are later handed to the loop
# permutation generator. Entry N of loop_names belongs to entry N of
# loop_specs:
#   name: gemm<k><m><n>
#   spec: a_<k>_K,b_<m>_M,c_<n>_N
# NOTE(review): the digit in each spec element presumably selects the number
# of additional blocking levels for that dimension -- confirm against
# loop_permute_generator.
loop_names=()
loop_specs=()

# K gets no additional blocking level for now; K blocking is covered by the
# blocking factor (BF) of the BR dimension instead.
k_levels=(0)

for k_lvl in "${k_levels[@]}"; do
  for (( m_lvl = 0; m_lvl <= 2; m_lvl++ )); do
    for (( n_lvl = 0; n_lvl <= 2; n_lvl++ )); do
      # Append at the current length so both arrays stay index-aligned.
      loop_names[${#loop_names[@]}]="gemm${k_lvl}${m_lvl}${n_lvl}"
      loop_specs[${#loop_specs[@]}]="a_${k_lvl}_K,b_${m_lvl}_M,c_${n_lvl}_N"
    done
  done
done
# Build tuner_config.txt, the list of loop configurations to benchmark.
#
# Reads:  loop_names / loop_specs (index-aligned arrays defined earlier).
# Runs:   ./loop_permute_generator <name> <spec> once per pair.
# Writes: tuner_config.txt -- every distinct line found in the files matching
#         *bench_configs.txt, in first-seen order. Those files are removed
#         afterwards.
# NOTE(review): the *bench_configs.txt files are presumably written by
# loop_permute_generator into the current directory -- confirm.
for idx in "${!loop_specs[@]}"; do
  ./loop_permute_generator "${loop_names[$idx]}" "${loop_specs[$idx]}"
done

# Collect the generated files explicitly. The ./ prefix keeps a file name that
# starts with '-' from being parsed as an option by awk or rm.
config_parts=( ./*bench_configs.txt )
if [[ ! -e "${config_parts[0]}" ]]; then
  # Without this check an unmatched glob would silently produce an empty
  # tuner_config.txt and the benchmark sweep would have nothing to run.
  printf 'error: no *bench_configs.txt files found; cannot build tuner_config.txt\n' >&2
  exit 1
fi

# De-duplicate across all files in one pass; only the first occurrence of each
# line is kept, so the original ordering is preserved.
awk '!seen[$0]++' "${config_parts[@]}" > tuner_config.txt
rm -f -- "${config_parts[@]}"
# Benchmark sweep: run ./gemm for every loop configuration in tuner_config.txt
# over all (M, N, K) combinations, trying several blocking factors for the BR
# dimension.

#######################################
# Print the blocking factors to try for a given K, space-separated.
# Arguments: $1 - K dimension
# Outputs:   candidate factors on stdout; nothing for an unlisted K, in which
#            case no benchmark is run for that K.
#######################################
kbf_candidates() {
  case "$1" in
    512 | 1024) echo "1" ;;
    2048) echo "1 2" ;;
    # Not reached with the current K list; kept for when 4096 is added.
    4096) echo "1 2 4" ;;
  esac
}

#######################################
# Run the full sweep.
# Globals:   exports OMP_NUM_THREADS and GOMP_CPU_AFFINITY, unsets
#            LIBXSMM_VERBOSE
# Outputs:   one file <M>_<N>_<K>_bench_results per size combination: a
#            "Performance" header line followed by the appended stdout of
#            every ./gemm run for that size
# Returns:   1 if tuner_config.txt is missing, unreadable or empty; otherwise
#            the status of the last command executed in the sweep
#######################################
run_gemm_sweep() {
  local config_file="tuner_config.txt"
  local -a loop_strings=() kbfs=()
  local m n k kbf loop_string results_file

  if [[ ! -r "${config_file}" ]]; then
    printf 'error: %s is missing or unreadable; nothing to benchmark\n' \
      "${config_file}" >&2
    return 1
  fi

  # The configuration list does not change during the sweep, so read it once.
  # The "|| [[ -n ... ]]" part keeps a final line without a trailing newline.
  while IFS= read -r loop_string || [[ -n "${loop_string}" ]]; do
    loop_strings+=("${loop_string}")
  done < "${config_file}"

  if (( ${#loop_strings[@]} == 0 )); then
    printf 'error: %s contains no loop configurations\n' "${config_file}" >&2
    return 1
  fi

  # The environment is identical for every run; set it once up front.
  export OMP_NUM_THREADS=28
  export GOMP_CPU_AFFINITY="0-27"
  unset LIBXSMM_VERBOSE

  for m in 512 1024 2048; do
    for n in 512 1024 2048; do
      for k in 512 1024 2048; do
        results_file="${m}_${n}_${k}_bench_results"
        # Start a fresh result file for this size with its header line.
        printf 'Performance\n' > "${results_file}"

        read -r -a kbfs <<< "$(kbf_candidates "${k}")"
        for kbf in "${kbfs[@]}"; do
          for loop_string in "${loop_strings[@]}"; do
            # ${loop_string} is deliberately unquoted so that a configuration
            # line is word-split exactly as before.
            # NOTE(review): "32 32 32" are presumably the block sizes and
            # "1 400" further run parameters of ./gemm -- confirm against its
            # usage text.
            # shellcheck disable=SC2086
            ./gemm ${loop_string} "${m}" "${n}" "${k}" 32 32 32 "${kbf}" 1 400 \
              >> "${results_file}"
          done
        done
      done
    done
  done
}

# Last command of the script: its status becomes the job's exit status.
run_gemm_sweep