# sequential-gpu-cpu-benchmark.py
#
# Benchmarks CUDA devices one after another: matrix-multiplication compute
# time, CPU<->GPU transfer time, and GPU-to-GPU transfer time.
import torch
import time
import itertools
def get_max_matrix_size(device_id, memory_fraction=0.7, bytes_per_element=4,
                        free_memory=None):
    """Return the side length of the largest square matrix to benchmark with.

    The size is chosen so that three square matrices (the two operands and
    the product) fit into ``memory_fraction`` of the free device memory.

    Args:
        device_id: Index of the CUDA device to size the matrices for.
        memory_fraction: Share of the free memory to use, in ``(0, 1]``.
            The default of 0.7 leaves headroom for other allocations.
        bytes_per_element: Size of one matrix element in bytes
            (4 for float32).
        free_memory: Free device memory in bytes. When ``None`` (default)
            it is queried from the CUDA driver for ``device_id``.

    Returns:
        The matrix side length as an ``int``; 0 if not even a 1x1 matrix
        fits into the budget.

    Raises:
        ValueError: If ``memory_fraction`` is outside ``(0, 1]`` or
            ``bytes_per_element`` is not positive.
    """
    if not 0 < memory_fraction <= 1:
        raise ValueError("memory_fraction must be in the range (0, 1]")
    if bytes_per_element <= 0:
        raise ValueError("bytes_per_element must be positive")

    if free_memory is None:
        # memory_reserved() - memory_allocated() only measures PyTorch's own
        # cached-but-unused pool, which is 0 bytes before the first
        # allocation. Ask the driver for the real free memory instead.
        free_memory, _total_memory = torch.cuda.mem_get_info(device_id)

    usable_bytes = max(0, int(free_memory * memory_fraction))
    # Divide by 3 for A, B, and the result.
    num_elements = usable_bytes // (bytes_per_element * 3)
    # Square root for a square matrix.
    return int(num_elements ** 0.5)
def compute_test(device_id, num_iterations):
    """Time repeated float32 matrix multiplications on one CUDA device.

    Two random square matrices, sized by ``get_max_matrix_size``, are
    multiplied ``num_iterations`` times and the elapsed wall-clock time is
    measured.

    Args:
        device_id: Index of the CUDA device to benchmark.
        num_iterations: Number of timed matrix multiplications.

    Returns:
        Elapsed time in seconds for the timed multiplications.
    """
    device = torch.device(f'cuda:{device_id}')
    matrix_size = get_max_matrix_size(device_id)
    A = torch.rand(matrix_size, matrix_size, device=device, dtype=torch.float32)
    B = torch.rand(matrix_size, matrix_size, device=device, dtype=torch.float32)
    # Write every product into a separate buffer. Feeding the product back
    # into A makes the values grow each iteration until float32 overflows to
    # inf, so the benchmark would mostly multiply non-finite numbers.
    C = torch.empty_like(A)
    try:
        # Untimed warm-up so one-time initialisation and the random fills
        # above are finished before the clock starts.
        torch.matmul(A, B, out=C)
        torch.cuda.synchronize(device=device)

        start_time = time.perf_counter()
        for _ in range(num_iterations):
            torch.matmul(A, B, out=C)
        # Kernels are launched asynchronously; wait for them to finish.
        torch.cuda.synchronize(device=device)
        return time.perf_counter() - start_time
    finally:
        # Hand the large buffers back to the driver so later tests on this
        # device are not starved by PyTorch's cache.
        del A, B, C
        torch.cuda.empty_cache()
def transfer_test(device_id, data_size_gb):
    """Time a host-to-device and a device-to-host copy for one CUDA device.

    Args:
        device_id: Index of the CUDA device to benchmark.
        data_size_gb: Amount of float32 data to transfer, in GiB.

    Returns:
        A tuple ``(gpu_to_cpu_time, cpu_to_gpu_time)`` in seconds.
    """
    device = torch.device(f'cuda:{device_id}')
    # int() keeps the element count integral even for fractional sizes.
    num_elements = int(data_size_gb * (1024 ** 3)) // 4
    data = torch.randn(num_elements, dtype=torch.float32)

    # Make sure no earlier work on the device leaks into the measurement.
    torch.cuda.synchronize(device=device)

    # Host -> device. The original stored this under the GPU-to-CPU label.
    start_time = time.perf_counter()
    data_gpu = data.to(device)
    torch.cuda.synchronize(device=device)
    cpu_to_gpu_time = time.perf_counter() - start_time

    # The host copy is no longer needed; drop it before the copy back so the
    # host does not have to hold the payload twice.
    del data

    # Device -> host.
    start_time = time.perf_counter()
    data_gpu.to('cpu')
    torch.cuda.synchronize(device=device)
    gpu_to_cpu_time = time.perf_counter() - start_time

    return gpu_to_cpu_time, cpu_to_gpu_time
def gpu_to_gpu_transfer_test(src_device_id, dst_device_id, data_size_gb):
    """Time a copy of float32 data from one CUDA device to another.

    Args:
        src_device_id: Index of the CUDA device that holds the data.
        dst_device_id: Index of the CUDA device that receives the data.
        data_size_gb: Amount of float32 data to transfer, in GiB.

    Returns:
        Elapsed time of the copy in seconds.
    """
    src_device = torch.device(f'cuda:{src_device_id}')
    dst_device = torch.device(f'cuda:{dst_device_id}')
    # int() keeps the element count integral even for fractional sizes.
    num_elements = int(data_size_gb * (1024 ** 3)) // 4
    data = torch.randn(num_elements, dtype=torch.float32, device=src_device)

    # The random fill runs asynchronously; wait for it so that only the copy
    # is timed. Also drain any pending work on the destination.
    torch.cuda.synchronize(src_device)
    torch.cuda.synchronize(dst_device)

    start_time = time.perf_counter()
    data.to(dst_device)
    # Wait on both ends so the copy has finished whichever device drives it.
    torch.cuda.synchronize(src_device)
    torch.cuda.synchronize(dst_device)
    transfer_time = time.perf_counter() - start_time
    return transfer_time
def main():
    """Benchmark every CUDA device and print a summary.

    For each device the compute and CPU<->GPU transfer benchmarks are run
    and printed. When more than one device is present, the GPU-to-GPU
    transfer is timed for every device pair. Finally the fastest and slowest
    devices for compute and for transfer are reported. If CUDA is not
    available, a message is printed and nothing is benchmarked.
    """
    num_iterations = 50  # Iterations for compute test
    data_size_gb = 10  # Data size for transfer test
    compute_results = {}
    transfer_results = {}
    gpu_to_gpu_transfer_results = {}

    device_count = torch.cuda.device_count() if torch.cuda.is_available() else 0
    if device_count == 0:
        print("CUDA is not available; no GPUs to benchmark.")
        return

    for device_id in range(device_count):
        compute_time = compute_test(device_id, num_iterations)
        compute_results[device_id] = compute_time
        gpu_to_cpu_time, cpu_to_gpu_time = transfer_test(device_id, data_size_gb)
        transfer_results[device_id] = (gpu_to_cpu_time, cpu_to_gpu_time)
        print(f"Device {device_id} - Compute Time: {compute_time:.4f}s, "
              f"GPU to CPU Transfer Time: {gpu_to_cpu_time:.4f}s, "
              f"CPU to GPU Transfer Time: {cpu_to_gpu_time:.4f}s")

    # The per-device benchmarks above are meaningful with a single GPU; only
    # the pairwise transfer test needs at least two devices.
    if device_count > 1:
        # GPU-to-GPU transfer test for all pairs
        for src_id, dst_id in itertools.combinations(range(device_count), 2):
            transfer_time = gpu_to_gpu_transfer_test(src_id, dst_id, 1)  # 1 GB for quick test
            gpu_to_gpu_transfer_results[(src_id, dst_id)] = transfer_time
            print(f"Transfer from GPU {src_id} to GPU {dst_id}: {transfer_time:.4f}s")
    else:
        print("Only one GPU detected; skipping GPU-to-GPU transfer test.")

    # Displaying fastest and slowest GPUs for compute and transfer
    fastest_gpu_compute = min(compute_results, key=compute_results.get)
    slowest_gpu_compute = max(compute_results, key=compute_results.get)
    print(f"\nFastest GPU (Compute): Device {fastest_gpu_compute}")
    print(f"Slowest GPU (Compute): Device {slowest_gpu_compute}")

    # Rank transfer speed by the sum of both directions.
    fastest_gpu_transfer = min(transfer_results, key=lambda x: sum(transfer_results[x]))
    slowest_gpu_transfer = max(transfer_results, key=lambda x: sum(transfer_results[x]))
    print(f"Fastest GPU (Transfer): Device {fastest_gpu_transfer}")
    print(f"Slowest GPU (Transfer): Device {slowest_gpu_transfer}")
if __name__ == "__main__":
    # Run the benchmark only when executed as a script, not when imported.
    main()