Commit 6a086d7

committed
[NEW]add conv cuda support
1 parent 95990e6 commit 6a086d7

2 files changed

Lines changed: 727 additions & 5 deletions

File tree

cuda_mat/conv_cu_grad.cu

Lines changed: 300 additions & 0 deletions
@@ -0,0 +1,300 @@
// Test function for convolution
#include <iostream>
#include <cstdlib>
#include <chrono>
#include <cuda_runtime.h>
#include <cassert>
#include <cstring>
#include <cmath>
#include <functional>
#include <vector>    // needed for std::vector in the CPU validation test
#include <algorithm> // needed for std::max in the error comparison
#include "mat_grad.cu"
void testConvolution() {
    std::cout << "\n====== Convolution Test ======\n";

    // Configuration
    int batch_size = 2;
    int in_channels = 3;
    int in_height = 5;
    int in_width = 5;
    int out_channels = 2;
    int kernel_size = 3;
    int stride = 1;
    int padding = 1;

    // Expected output dimensions: out = (in + 2 * pad - kernel) / stride + 1
    int out_height = ((in_height + 2 * padding - kernel_size) / stride) + 1; // Should be 5
    int out_width = ((in_width + 2 * padding - kernel_size) / stride) + 1;   // Should be 5

    std::cout << "Configuration:\n";
    std::cout << "  Input: [" << batch_size << ", " << in_channels << ", " << in_height << ", " << in_width << "]\n";
    std::cout << "  Kernel: [" << out_channels << ", " << in_channels << ", " << kernel_size << ", " << kernel_size << "]\n";
    std::cout << "  Stride: [" << stride << ", " << stride << "], Padding: [" << padding << ", " << padding << "]\n";
    std::cout << "  Expected Output: [" << batch_size << ", " << out_channels << ", " << out_height << ", " << out_width << "]\n";

    ComputeGraph graph;

    // Create input tensor: [batch_size * in_channels, in_height * in_width]
    ComputeNode* input = graph.addInput(batch_size * in_channels, in_height * in_width);
    input->batchSize = batch_size;

    // Create kernel tensor: [out_channels, in_channels * kernel_size * kernel_size]
    ComputeNode* kernel = graph.addInput(out_channels, in_channels * kernel_size * kernel_size);

    // Initialize input with a pattern that increases along width, then height, then channel
    for (int b = 0; b < batch_size; b++) {
        for (int c = 0; c < in_channels; c++) {
            for (int h = 0; h < in_height; h++) {
                for (int w = 0; w < in_width; w++) {
                    int flat_idx = (b * in_channels + c) * (in_height * in_width) + (h * in_width + w);
                    // Simple pattern: value = (c+1) * 0.1 + h * 0.01 + w * 0.001
                    input->value.data[flat_idx] = (c + 1) * 0.1f + h * 0.01f + w * 0.001f;
                }
            }
        }
    }

    // Initialize kernel with an identity-like pattern for testing
    for (int oc = 0; oc < out_channels; oc++) {
        for (int ic = 0; ic < in_channels; ic++) {
            for (int kh = 0; kh < kernel_size; kh++) {
                for (int kw = 0; kw < kernel_size; kw++) {
                    int flat_idx = oc * (in_channels * kernel_size * kernel_size) +
                                   (ic * kernel_size * kernel_size + kh * kernel_size + kw);

                    // 1 at the kernel center of one matching input channel, 0 elsewhere.
                    // For convolution, this should preserve the input features with some edge effects.
                    if (kh == kernel_size / 2 && kw == kernel_size / 2 && ic == oc % in_channels) {
                        kernel->value.data[flat_idx] = 1.0f;
                    } else {
                        kernel->value.data[flat_idx] = 0.0f;
                    }
                }
            }
        }
    }

    // Create convolution node
    ComputeNode* conv = graph.addConv2D(input, kernel, in_height, in_width, in_channels,
                                        stride, stride, padding, padding);

    // Forward pass
    auto start = std::chrono::high_resolution_clock::now();
    graph.forward();
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> forward_duration = end - start;

    std::cout << "\nForward Pass Results:\n";
    std::cout << "  Convolution output shape: [" << conv->value.row << ", " << conv->value.col << "]\n";
    std::cout << "  Time: " << forward_duration.count() << " ms\n";

    // Print a sample of the output (first batch, first channel)
    std::cout << "\nOutput sample (first batch, first channel):\n";
    for (int h = 0; h < out_height; h++) {
        std::cout << "  ";
        for (int w = 0; w < out_width; w++) {
            int flat_idx = 0 * (out_height * out_width) + (h * out_width + w);
            printf("%.4f ", conv->value.data[flat_idx]);
        }
        std::cout << std::endl;
    }

    // Backward pass
    start = std::chrono::high_resolution_clock::now();
    graph.backward(conv);
    end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> backward_duration = end - start;

    std::cout << "\nBackward Pass Results:\n";
    std::cout << "  Time: " << backward_duration.count() << " ms\n";

    // Print a sample of the input gradients (first batch, first channel)
    std::cout << "\nInput gradient sample (first batch, first channel):\n";
    for (int h = 0; h < 3; h++) {
        std::cout << "  ";
        for (int w = 0; w < 3; w++) {
            int flat_idx = 0 * (in_height * in_width) + (h * in_width + w);
            printf("%.4f ", input->grad.data[flat_idx]);
        }
        std::cout << std::endl;
    }

    // Print a sample of the kernel gradients (first output channel, first input channel)
    std::cout << "\nKernel gradient sample (first output channel, first input channel):\n";
    for (int kh = 0; kh < kernel_size; kh++) {
        std::cout << "  ";
        for (int kw = 0; kw < kernel_size; kw++) {
            int flat_idx = 0 * (in_channels * kernel_size * kernel_size) +
                           (0 * kernel_size * kernel_size + kh * kernel_size + kw);
            printf("%.4f ", kernel->grad.data[flat_idx]);
        }
        std::cout << std::endl;
    }

    std::cout << "\nConvolution Test Completed\n";
}

// CPU reference implementation for verification
void conv2dCPU(const float* input, const float* kernel, float* output,
               int batch_size, int in_channels, int out_channels,
               int in_height, int in_width, int kernel_height, int kernel_width,
               int out_height, int out_width, int stride_height, int stride_width,
               int pad_height, int pad_width) {
    for (int n = 0; n < batch_size; n++) {
        for (int c = 0; c < out_channels; c++) {
            for (int h = 0; h < out_height; h++) {
                for (int w = 0; w < out_width; w++) {
                    float sum = 0.0f;

                    for (int ic = 0; ic < in_channels; ic++) {
                        for (int kh = 0; kh < kernel_height; kh++) {
                            for (int kw = 0; kw < kernel_width; kw++) {
                                int ih = h * stride_height + kh - pad_height;
                                int iw = w * stride_width + kw - pad_width;

                                if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
                                    int input_idx = ((n * in_channels + ic) * in_height + ih) * in_width + iw;
                                    int kernel_idx = ((c * in_channels + ic) * kernel_height + kh) * kernel_width + kw;
                                    sum += input[input_idx] * kernel[kernel_idx];
                                }
                            }
                        }
                    }

                    int output_idx = ((n * out_channels + c) * out_height + h) * out_width + w;
                    output[output_idx] = sum;
                }
            }
        }
    }
}

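// Layout note (not stated in the original commit; it follows from the index
// arithmetic): the graph stores activations as row-major
// [batch * channels, height * width] matrices, while conv2dCPU above expects a
// contiguous NCHW buffer. In row-major order,
//   (n*C + c) * (H*W) + (h*W + w)  ==  ((n*C + c) * H + h) * W + w,
// so the "reshape" loops in the validation test below are element-for-element
// copies between the two views.
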
// Test function with CPU validation
void testConvolutionWithValidation() {
    std::cout << "\n====== Convolution Test with CPU Validation ======\n";

    // Configuration (smaller for CPU comparison)
    int batch_size = 2;
    int in_channels = 2;
    int in_height = 4;
    int in_width = 4;
    int out_channels = 2;
    int kernel_size = 3;
    int stride = 1;
    int padding = 1;

    // Expected output dimensions
    int out_height = ((in_height + 2 * padding - kernel_size) / stride) + 1;
    int out_width = ((in_width + 2 * padding - kernel_size) / stride) + 1;

    ComputeGraph graph;

    // Create input tensor
    ComputeNode* input = graph.addInput(batch_size * in_channels, in_height * in_width);
    input->batchSize = batch_size;

    // Create kernel tensor
    ComputeNode* kernel = graph.addInput(out_channels, in_channels * kernel_size * kernel_size);

    // Initialize with random values for a more thorough test
    for (int i = 0; i < input->value.row * input->value.col; i++) {
        input->value.data[i] = static_cast<float>(rand()) / RAND_MAX;
    }

    for (int i = 0; i < kernel->value.row * kernel->value.col; i++) {
        kernel->value.data[i] = static_cast<float>(rand()) / RAND_MAX * 0.1f;
    }

    // Create convolution node
    ComputeNode* conv = graph.addConv2D(input, kernel, in_height, in_width, in_channels,
                                        stride, stride, padding, padding);

    // Forward pass
    graph.forward();

    // CPU reference calculation
    std::vector<float> input_reshaped(batch_size * in_channels * in_height * in_width);
    std::vector<float> kernel_reshaped(out_channels * in_channels * kernel_size * kernel_size);
    std::vector<float> output_cpu(batch_size * out_channels * out_height * out_width, 0.0f);

    // Reshape input for the CPU calculation
    for (int n = 0; n < batch_size; n++) {
        for (int c = 0; c < in_channels; c++) {
            for (int h = 0; h < in_height; h++) {
                for (int w = 0; w < in_width; w++) {
                    int flat_idx = (n * in_channels + c) * (in_height * in_width) + (h * in_width + w);
                    int tensor_idx = ((n * in_channels + c) * in_height + h) * in_width + w;
                    input_reshaped[tensor_idx] = input->value.data[flat_idx];
                }
            }
        }
    }

    // Reshape kernel for the CPU calculation
    for (int oc = 0; oc < out_channels; oc++) {
        for (int ic = 0; ic < in_channels; ic++) {
            for (int kh = 0; kh < kernel_size; kh++) {
                for (int kw = 0; kw < kernel_size; kw++) {
                    int flat_idx = oc * (in_channels * kernel_size * kernel_size) +
                                   (ic * kernel_size * kernel_size + kh * kernel_size + kw);
                    int tensor_idx = ((oc * in_channels + ic) * kernel_size + kh) * kernel_size + kw;
                    kernel_reshaped[tensor_idx] = kernel->value.data[flat_idx];
                }
            }
        }
    }

    // CPU convolution
    conv2dCPU(input_reshaped.data(), kernel_reshaped.data(), output_cpu.data(),
              batch_size, in_channels, out_channels,
              in_height, in_width, kernel_size, kernel_size,
              out_height, out_width, stride, stride,
              padding, padding);

    // Compare results
    std::vector<float> output_gpu(batch_size * out_channels * out_height * out_width);
    for (int n = 0; n < batch_size; n++) {
        for (int c = 0; c < out_channels; c++) {
            for (int h = 0; h < out_height; h++) {
                for (int w = 0; w < out_width; w++) {
                    int flat_idx = (n * out_channels + c) * (out_height * out_width) + (h * out_width + w);
                    int tensor_idx = ((n * out_channels + c) * out_height + h) * out_width + w;
                    output_gpu[tensor_idx] = conv->value.data[flat_idx];
                }
            }
        }
    }

    // Calculate error
    float max_diff = 0.0f;
    float avg_diff = 0.0f;

    for (size_t i = 0; i < output_cpu.size(); i++) { // size_t avoids a signed/unsigned comparison
        float diff = std::abs(output_cpu[i] - output_gpu[i]);
        max_diff = std::max(max_diff, diff);
        avg_diff += diff;
    }
    avg_diff /= output_cpu.size();

    std::cout << "Validation Results:\n";
    std::cout << "  Maximum absolute difference: " << max_diff << std::endl;
    std::cout << "  Average absolute difference: " << avg_diff << std::endl;
    std::cout << "  Validation " << (max_diff < 1e-4 ? "PASSED" : "FAILED") << std::endl;

    std::cout << "\nConvolution Validation Completed\n";
}

// Update main function to test convolution
int main() {
    // Test convolution
    testConvolution();

    // Test with CPU validation
    testConvolutionWithValidation();

    // Original code can follow here
    // ...

    return 0;
}
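Build note (an assumption, not stated in the commit): because the test #includes mat_grad.cu directly into a single translation unit, it should compile standalone along the lines of `nvcc -std=c++14 cuda_mat/conv_cu_grad.cu -o conv_test`, provided mat_grad.cu defines ComputeGraph, ComputeNode, and addConv2D and does not define its own main().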
