-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_gemm_table.py
More file actions
125 lines (106 loc) · 4.59 KB
/
test_gemm_table.py
File metadata and controls
125 lines (106 loc) · 4.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# MIOpen logging is configured through environment variables. They are set
# before `import torch` below -- presumably so the backend sees them when it
# is loaded; confirm before reordering these lines.
import os
os.environ["MIOPEN_ENABLE_LOGGING"] = "1"
os.environ["MIOPEN_ENABLE_LOGGING_CMD"] = "1"
os.environ["MIOPEN_LOG_LEVEL"] = "5"
# Optional MIOpen switches, currently disabled. Uncomment to experiment.
#os.environ["MIOPEN_ENABLE_LOGGING_ELAPSED_TIME"] = "1"
#os.environ["MIOPEN_DEBUG_CONV_WINOGRAD"] = "1"
#os.environ["MIOPEN_DEBUG_CONV_GEMM"] = "0"
#os.environ["MIOPEN_DEBUG_CONV_DIRECT"] = "0"
#os.environ["MIOPEN_FIND_MODE"] = "2"
import torch
import torch.nn.functional as F
import numpy as np
import time
# ===== Settings =====
device = "cuda"  # GPU

# ===== Convolution configurations =====
# Each entry is unpacked by the benchmark loop as
# (N, C_in, C_out, H, W, kernel_size, stride, padding).
configs = [
    (1, 3, 8, 16, 16, 3, 1, 1),
    (1, 8, 16, 32, 32, 3, 1, 1),
    (2, 3, 8, 32, 32, 5, 2, 2),
    (1, 16, 32, 64, 64, 3, 1, 1),
    (1, 32, 64, 64, 64, 3, 2, 1),
    (2, 32, 64, 80, 80, 3, 1, 1),
    (2, 32, 128, 80, 80, 3, 2, 1),
    (4, 32, 64, 56, 56, 5, 1, 2),
    (1, 64, 128, 128, 128, 3, 1, 1),
    (1, 64, 128, 128, 128, 3, 2, 1),
    (2, 64, 256, 128, 128, 3, 2, 1),
    (2, 128, 256, 64, 64, 3, 1, 1),
    # (4, 128, 256, 128, 128, 5, 2, 2),  # disabled
    (1, 1, 1, 1, 1, 1, 1, 0),
    (1, 32, 64, 7, 7, 7, 1, 3),
    (1, 32, 64, 3, 3, 3, 2, 0),
    (2, 3, 64, 224, 224, 3, 2, 1),
]

# ===== Result rows, one list per precision =====
results_fp32, results_fp16 = [], []
# ===== Benchmark loop =====
# For every configuration and precision: run conv2d on the CPU and on the GPU
# with identical random inputs, time both, compare the outputs, and append one
# result row to the matching table.

# Absolute tolerance on max |GPU - CPU| for each precision.
_MAX_DIFF_TOL = {torch.float32: 1e-3, torch.float16: 1e-1}

for idx, (N, C_in, C_out, H, W, kernel_size, stride, padding) in enumerate(configs):
    print(f"\n=== Config {idx+1} ===")
    print(f"N={N}, C_in={C_in}, C_out={C_out}, H={H}, W={W}, kernel_size={kernel_size}, stride={stride}, padding={padding}")

    for dtype, table in [(torch.float32, results_fp32), (torch.float16, results_fp16)]:
        dtype_name = "FP32" if dtype == torch.float32 else "FP16"

        # Random input and weights, created on the CPU so that both devices
        # operate on exactly the same data.
        x_cpu = torch.randn(N, C_in, H, W, dtype=dtype)
        weight_cpu = torch.randn(C_out, C_in, kernel_size, kernel_size, dtype=dtype)

        # CPU reference. perf_counter() is monotonic and high-resolution,
        # unlike time.time(), which can jump with wall-clock adjustments.
        # NOTE(review): half-precision conv2d on the CPU may not be available
        # in every PyTorch build -- confirm for the version in use.
        start_cpu = time.perf_counter()
        out_cpu = F.conv2d(x_cpu, weight_cpu, stride=stride, padding=padding)
        cpu_time = time.perf_counter() - start_cpu

        # GPU run. Synchronize before and after so the measured interval
        # covers only this convolution (host-to-device copies excluded).
        x_gpu = x_cpu.to(device)
        weight_gpu = weight_cpu.to(device)
        torch.cuda.synchronize()
        start_gpu = time.perf_counter()
        out_gpu = F.conv2d(x_gpu, weight_gpu, stride=stride, padding=padding)
        torch.cuda.synchronize()
        gpu_time = time.perf_counter() - start_gpu

        # NaN / Inf check on the GPU output.
        gpu_has_nan = torch.isnan(out_gpu).any().item()
        gpu_has_inf = torch.isinf(out_gpu).any().item()
        print(f"{dtype_name} → GPU NaN: {gpu_has_nan}, Inf: {gpu_has_inf}")
        print(f"GPU Time:{gpu_time}")

        # Element-wise CPU vs GPU comparison. Both sides are upcast to
        # float64 so the subtraction itself adds no float16 rounding error.
        gpu_np = out_gpu.cpu().numpy().astype(np.float64)
        cpu_np = out_cpu.numpy().astype(np.float64)
        with np.errstate(invalid="ignore"):  # inf - inf yields NaN; handled below
            diff = np.abs(gpu_np - cpu_np)
        valid_mask = np.isfinite(diff)

        if not np.any(valid_mask):
            print(f"{dtype_name} ❌ All diff values invalid")
            max_diff = float("nan")
            mean_diff = float("nan")
        else:
            max_diff = float(np.max(diff[valid_mask]))
            mean_diff = float(np.mean(diff[valid_mask]))
            print(f"{dtype_name} max_diff = {max_diff:.3e}, mean_diff = {mean_diff:.3e}")
            # Any non-finite diff entry counts as a deviation. Previously such
            # entries were masked out and the result could still be reported
            # as matching.
            if not valid_mask.all() or max_diff > _MAX_DIFF_TOL[dtype]:
                print(f"{dtype_name} ⚠️ GPU result deviates from CPU")
            else:
                print(f"{dtype_name} ✅ GPU matches CPU")

        # Store the result row.
        table.append({
            "idx": idx+1,
            "N": N, "C_in": C_in, "C_out": C_out, "H": H, "W": W,
            "kernel": kernel_size, "stride": stride, "padding": padding,
            "CPU_time": cpu_time, "GPU_time": gpu_time,
            "max_diff": max_diff, "mean_diff": mean_diff,
            "NaN": gpu_has_nan, "Inf": gpu_has_inf
        })
# ===== Summary table output =====
def print_table(results, title):
    """Print a fixed-width summary table of benchmark results to stdout.

    Args:
        results: Iterable of row dicts. Each row must provide the keys
            "idx", "N", "C_in", "C_out", "H", "W", "kernel", "stride",
            "padding" (integers), "CPU_time", "GPU_time", "max_diff",
            "mean_diff" (floats; NaN is allowed) and "NaN", "Inf"
            (truthy/falsy flags, printed as 1/0).
        title: Label shown in the table heading.
    """
    print(f"\n\n=== Summary Table ({title}) ===")
    header = f"{'Idx':>3} | {'N':>2} | {'Cin':>3} | {'Cout':>4} | {'H':>3} | {'W':>3} | {'K':>2} | {'S':>2} | {'P':>2} | {'CPU(s)':>7} | {'GPU(s)':>7} | {'max_diff':>10} | {'mean_diff':>10} | {'NaN':>3} | {'Inf':>3}"
    print(header)
    # Derive the rule width from the header so it always spans the full table
    # (a fixed width of 100 was shorter than the 106-character header).
    print("-" * len(header))
    for r in results:
        # The flags are converted explicitly: a bool formatted with a width
        # spec is rendered as an integer anyway, so the output is unchanged.
        print(f"{r['idx']:3d} | {r['N']:2d} | {r['C_in']:3d} | {r['C_out']:4d} | {r['H']:3d} | {r['W']:3d} | {r['kernel']:2d} | {r['stride']:2d} | {r['padding']:2d} | {r['CPU_time']:7.4f} | {r['GPU_time']:7.4f} | {r['max_diff']:10.3e} | {r['mean_diff']:10.3e} | {int(r['NaN']):3d} | {int(r['Inf']):3d}")
# Emit one summary table per precision, FP32 first.
for title, rows in (("FP32", results_fp32), ("FP16", results_fp16)):
    print_table(rows, title)