// Test functions for the custom CUDA convolution implementation
#include <iostream>
#include <cstdlib>
#include <chrono>
#include <cuda_runtime.h>
#include <cassert>
#include <cstring>
#include <cmath>
#include <functional>
#include <vector>

#include "mat_grad.cu"
void testConvolution() {
    std::cout << "\n====== Convolution Test ======\n";

    // Configuration
    int batch_size = 2;
    int in_channels = 3;
    int in_height = 5;
    int in_width = 5;
    int out_channels = 2;
    int kernel_size = 3;
    int stride = 1;
    int padding = 1;

    // Expected output dimensions
    int out_height = ((in_height + 2 * padding - kernel_size) / stride) + 1;  // Should be 5
    int out_width = ((in_width + 2 * padding - kernel_size) / stride) + 1;    // Should be 5
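    // In general out = (in + 2*pad - kernel) / stride + 1; with kernel 3,
    // stride 1, and padding 1 this is a "same" convolution, so the spatial
    // size is preserved: (5 + 2*1 - 3) / 1 + 1 = 5.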

    std::cout << "Configuration:\n";
    std::cout << "  Input: [" << batch_size << ", " << in_channels << ", " << in_height << ", " << in_width << "]\n";
    std::cout << "  Kernel: [" << out_channels << ", " << in_channels << ", " << kernel_size << ", " << kernel_size << "]\n";
    std::cout << "  Stride: [" << stride << ", " << stride << "], Padding: [" << padding << ", " << padding << "]\n";
    std::cout << "  Expected Output: [" << batch_size << ", " << out_channels << ", " << out_height << ", " << out_width << "]\n";

    ComputeGraph graph;

    // Create input tensor: [batch_size * in_channels, in_height * in_width]
    ComputeNode* input = graph.addInput(batch_size * in_channels, in_height * in_width);
    input->batchSize = batch_size;

    // Create kernel tensor: [out_channels, in_channels * kernel_size * kernel_size]
    ComputeNode* kernel = graph.addInput(out_channels, in_channels * kernel_size * kernel_size);

    // Initialize input with a pattern that increases along width, then height, then channel
    for (int b = 0; b < batch_size; b++) {
        for (int c = 0; c < in_channels; c++) {
            for (int h = 0; h < in_height; h++) {
                for (int w = 0; w < in_width; w++) {
                    int flat_idx = (b * in_channels + c) * (in_height * in_width) + (h * in_width + w);
                    // Simple pattern: value = (c+1) * 0.1 + h * 0.01 + w * 0.001
                    input->value.data[flat_idx] = (c + 1) * 0.1f + h * 0.01f + w * 0.001f;
                }
            }
        }
    }
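    // Example: b = 0, c = 0, h = 1, w = 2 gives 0.1 + 0.01 + 0.002 = 0.112.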

    // Initialize kernel with identity-like pattern for testing
    for (int oc = 0; oc < out_channels; oc++) {
        for (int ic = 0; ic < in_channels; ic++) {
            for (int kh = 0; kh < kernel_size; kh++) {
                for (int kw = 0; kw < kernel_size; kw++) {
                    int flat_idx = oc * (in_channels * kernel_size * kernel_size) +
                                   (ic * kernel_size * kernel_size + kh * kernel_size + kw);

                    // Identity-like kernel: 1 at the center tap of input channel
                    // (oc % in_channels), zeros elsewhere. With stride 1 and padding 1
                    // the center tap always lands in-bounds, so each output channel
                    // should be an exact copy of one input channel (no edge effects).
                    if (kh == kernel_size / 2 && kw == kernel_size / 2 && ic == oc % in_channels) {
                        kernel->value.data[flat_idx] = 1.0f;
                    } else {
                        kernel->value.data[flat_idx] = 0.0f;
                    }
                }
            }
        }
    }

    // Create convolution node
    ComputeNode* conv = graph.addConv2D(input, kernel, in_height, in_width, in_channels,
                                        stride, stride, padding, padding);

    // Forward pass
    auto start = std::chrono::high_resolution_clock::now();
    graph.forward();
    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> forward_duration = end - start;

    std::cout << "\nForward Pass Results:\n";
    std::cout << "  Convolution output shape: [" << conv->value.row << ", " << conv->value.col << "]\n";
    std::cout << "  Time: " << forward_duration.count() << " ms\n";

    // Print sample of the output (first batch, first channel)
    std::cout << "\nOutput sample (first batch, first channel):\n";
    for (int h = 0; h < out_height; h++) {
        std::cout << "  ";
        for (int w = 0; w < out_width; w++) {
            int flat_idx = 0 * (out_height * out_width) + (h * out_width + w);
            printf("%.4f ", conv->value.data[flat_idx]);
        }
        std::cout << std::endl;
    }
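    // With the center-tap kernel above, output channel oc is expected to be an
    // exact copy of input channel oc % in_channels, so this block should print
    // 0.1000 0.1010 0.1020 ... matching the input pattern for channel 0.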

    // Backward pass
    start = std::chrono::high_resolution_clock::now();
    graph.backward(conv);
    end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> backward_duration = end - start;

    std::cout << "\nBackward Pass Results:\n";
    std::cout << "  Time: " << backward_duration.count() << " ms\n";

    // Print sample of input gradients (first batch, first channel)
    std::cout << "\nInput gradient sample (first batch, first channel):\n";
    for (int h = 0; h < 3; h++) {
        std::cout << "  ";
        for (int w = 0; w < 3; w++) {
            int flat_idx = 0 * (in_height * in_width) + (h * in_width + w);
            printf("%.4f ", input->grad.data[flat_idx]);
        }
        std::cout << std::endl;
    }

    // Print sample of kernel gradients (first output channel, first input channel)
    std::cout << "\nKernel gradient sample (first output channel, first input channel):\n";
    for (int kh = 0; kh < kernel_size; kh++) {
        std::cout << "  ";
        for (int kw = 0; kw < kernel_size; kw++) {
            int flat_idx = 0 * (in_channels * kernel_size * kernel_size) +
                           (0 * kernel_size * kernel_size + kh * kernel_size + kw);
            printf("%.4f ", kernel->grad.data[flat_idx]);
        }
        std::cout << std::endl;
    }

    std::cout << "\nConvolution Test Completed\n";
}

// CPU reference implementation for verification
void conv2dCPU(const float* input, const float* kernel, float* output,
               int batch_size, int in_channels, int out_channels,
               int in_height, int in_width, int kernel_height, int kernel_width,
               int out_height, int out_width, int stride_height, int stride_width,
               int pad_height, int pad_width) {

    for (int n = 0; n < batch_size; n++) {
        for (int c = 0; c < out_channels; c++) {
            for (int h = 0; h < out_height; h++) {
                for (int w = 0; w < out_width; w++) {
                    float sum = 0.0f;

                    for (int ic = 0; ic < in_channels; ic++) {
                        for (int kh = 0; kh < kernel_height; kh++) {
                            for (int kw = 0; kw < kernel_width; kw++) {
                                int ih = h * stride_height + kh - pad_height;
                                int iw = w * stride_width + kw - pad_width;

                                // Skip taps that fall into the zero padding
                                if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) {
                                    int input_idx = ((n * in_channels + ic) * in_height + ih) * in_width + iw;
                                    int kernel_idx = ((c * in_channels + ic) * kernel_height + kh) * kernel_width + kw;

                                    sum += input[input_idx] * kernel[kernel_idx];
                                }
                            }
                        }
                    }

                    int output_idx = ((n * out_channels + c) * out_height + h) * out_width + w;
                    output[output_idx] = sum;
                }
            }
        }
    }
}

// Test function with CPU validation
void testConvolutionWithValidation() {
    std::cout << "\n====== Convolution Test with CPU Validation ======\n";

    // Configuration (smaller for CPU comparison)
    int batch_size = 2;
    int in_channels = 2;
    int in_height = 4;
    int in_width = 4;
    int out_channels = 2;
    int kernel_size = 3;
    int stride = 1;
    int padding = 1;

    // Expected output dimensions
    int out_height = ((in_height + 2 * padding - kernel_size) / stride) + 1;
    int out_width = ((in_width + 2 * padding - kernel_size) / stride) + 1;
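    // Same-convolution again: (4 + 2*1 - 3) / 1 + 1 = 4, so the output is 4x4.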

    ComputeGraph graph;

    // Create input tensor
    ComputeNode* input = graph.addInput(batch_size * in_channels, in_height * in_width);
    input->batchSize = batch_size;

    // Create kernel tensor
    ComputeNode* kernel = graph.addInput(out_channels, in_channels * kernel_size * kernel_size);

    // Initialize with random values for a better test
    for (int i = 0; i < input->value.row * input->value.col; i++) {
        input->value.data[i] = static_cast<float>(rand()) / RAND_MAX;
    }

    for (int i = 0; i < kernel->value.row * kernel->value.col; i++) {
        kernel->value.data[i] = static_cast<float>(rand()) / RAND_MAX * 0.1f;
    }

    // Create convolution node
    ComputeNode* conv = graph.addConv2D(input, kernel, in_height, in_width, in_channels,
                                        stride, stride, padding, padding);

    // Forward pass
    graph.forward();

    // CPU reference calculation
    std::vector<float> input_reshaped(batch_size * in_channels * in_height * in_width);
    std::vector<float> kernel_reshaped(out_channels * in_channels * kernel_size * kernel_size);
    std::vector<float> output_cpu(batch_size * out_channels * out_height * out_width, 0.0f);

    // Reshape input for CPU calculation
    for (int n = 0; n < batch_size; n++) {
        for (int c = 0; c < in_channels; c++) {
            for (int h = 0; h < in_height; h++) {
                for (int w = 0; w < in_width; w++) {
                    int flat_idx = (n * in_channels + c) * (in_height * in_width) + (h * in_width + w);
                    int tensor_idx = ((n * in_channels + c) * in_height + h) * in_width + w;
                    input_reshaped[tensor_idx] = input->value.data[flat_idx];
                }
            }
        }
    }
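    // Note: flat_idx and tensor_idx above expand to the same row-major offset,
    // so this copy (and the kernel/output copies below) is an identity; the
    // loops are kept explicit to document the two layouts side by side.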

    // Reshape kernel for CPU calculation
    for (int oc = 0; oc < out_channels; oc++) {
        for (int ic = 0; ic < in_channels; ic++) {
            for (int kh = 0; kh < kernel_size; kh++) {
                for (int kw = 0; kw < kernel_size; kw++) {
                    int flat_idx = oc * (in_channels * kernel_size * kernel_size) +
                                   (ic * kernel_size * kernel_size + kh * kernel_size + kw);
                    int tensor_idx = ((oc * in_channels + ic) * kernel_size + kh) * kernel_size + kw;
                    kernel_reshaped[tensor_idx] = kernel->value.data[flat_idx];
                }
            }
        }
    }

    // CPU convolution
    conv2dCPU(input_reshaped.data(), kernel_reshaped.data(), output_cpu.data(),
              batch_size, in_channels, out_channels,
              in_height, in_width, kernel_size, kernel_size,
              out_height, out_width, stride, stride,
              padding, padding);

    // Compare results
    std::vector<float> output_gpu(batch_size * out_channels * out_height * out_width);
    for (int n = 0; n < batch_size; n++) {
        for (int c = 0; c < out_channels; c++) {
            for (int h = 0; h < out_height; h++) {
                for (int w = 0; w < out_width; w++) {
                    int flat_idx = (n * out_channels + c) * (out_height * out_width) + (h * out_width + w);
                    int tensor_idx = ((n * out_channels + c) * out_height + h) * out_width + w;
                    output_gpu[tensor_idx] = conv->value.data[flat_idx];
                }
            }
        }
    }

    // Calculate error
    float max_diff = 0.0f;
    float avg_diff = 0.0f;

    for (size_t i = 0; i < output_cpu.size(); i++) {
        float diff = std::abs(output_cpu[i] - output_gpu[i]);
        max_diff = std::max(max_diff, diff);
        avg_diff += diff;
    }
    avg_diff /= output_cpu.size();

    std::cout << "Validation Results:\n";
    std::cout << "  Maximum absolute difference: " << max_diff << std::endl;
    std::cout << "  Average absolute difference: " << avg_diff << std::endl;
    std::cout << "  Validation " << (max_diff < 1e-4f ? "PASSED" : "FAILED") << std::endl;

    std::cout << "\nConvolution Validation Completed\n";
}
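
// The gradient samples printed in testConvolution() are only eyeballed. A cheap
// way to actually exercise the backward pass is a finite-difference check. The
// sketch below is an assumption-laden addition, not part of the original tests:
// it reuses the ComputeGraph/ComputeNode API exactly as called above, and
// assumes graph.backward(conv) seeds the output gradient with ones and that
// forward() can be re-run after backward(); adjust to match mat_grad.cu.
bool checkConvGradientFD(float epsilon = 1e-3f, float tolerance = 1e-2f) {
    ComputeGraph graph;

    // Tiny case: 1 batch, 1 input channel, 3x3 input, one 3x3 kernel, same padding
    ComputeNode* input = graph.addInput(1, 3 * 3);
    input->batchSize = 1;
    ComputeNode* kernel = graph.addInput(1, 1 * 3 * 3);
    for (int i = 0; i < 9; i++) {
        input->value.data[i] = 0.1f * (i + 1);
        kernel->value.data[i] = 0.05f * (i + 1);
    }
    ComputeNode* conv = graph.addConv2D(input, kernel, 3, 3, 1, 1, 1, 1, 1);

    // Analytic gradient of sum(output) with respect to input element 0
    graph.forward();
    graph.backward(conv);
    float analytic = input->grad.data[0];

    // Numeric gradient: central difference on the same element
    auto sumOutput = [&]() {
        graph.forward();
        float s = 0.0f;
        for (int i = 0; i < conv->value.row * conv->value.col; i++) {
            s += conv->value.data[i];
        }
        return s;
    };
    input->value.data[0] += epsilon;
    float plus = sumOutput();
    input->value.data[0] -= 2.0f * epsilon;
    float minus = sumOutput();
    input->value.data[0] += epsilon;  // restore the original value

    float numeric = (plus - minus) / (2.0f * epsilon);
    return std::abs(analytic - numeric) < tolerance;
}
// Possible usage from main():
//   std::cout << "Gradient check " << (checkConvGradientFD() ? "PASSED" : "FAILED") << "\n";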

// Run the convolution tests
int main() {
    // Basic forward/backward smoke test
    testConvolution();

    // Forward-pass validation against the CPU reference
    testConvolutionWithValidation();

    // Original code can follow here
    // ...

    return 0;
}