// Naive O(N^3) CPU matrix multiplication: ans = x * y.
// x, y, and ans are N*N matrices stored as row-major 1D arrays of length N*N.
// Fix: the original accumulated into ans with `+=` without ever zeroing it,
// so the result was garbage unless the caller pre-zeroed the buffer. A local
// accumulator makes the function correct regardless of ans's initial contents
// (and stays backward-compatible for callers that did pre-zero it).
void cpuMatMul(int N, double *x, double *y, double *ans)
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            double sum = 0.0;  // dot product of row i of x and column j of y
            for (int k = 0; k < N; k++)
            {
                sum += x[i * N + k] * y[k * N + j];
            }
            ans[i * N + j] = sum;
        }
    }
}
CUDA Matrix Multiplication
In this tutorial, we will tackle a problem that is well suited to parallel
programming — and quite a useful one: matrix multiplication.
Problem: Given two N*N
matrices X
and Y
of floating-point numbers, perform matrix multiplication and store the result in an N*N
matrix ans.
Below is code for matrix multiplication in C++. It is the standard O(N³)
procedure. Here x, y, and ans are three N²-sized matrices
(each an N²-element 1D array — we use 1D arrays as if they were 2D).
For a matrix of size 1024 × 1024, it takes about 10
seconds to complete the task. For a matrix of size 4096 × 4096, it takes about 2500
seconds (larger matrices add extra overhead, so it does not scale exactly as O(N³)
beyond 1024). Above 4096 × 4096, it becomes infeasible to run the experiment.
Parallel Matrix Multiplication
Following is the parallel version of the same code we have seen above. Now let's execute the whole code:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define BLOCK_SIZE 16
// GPU kernel computing C = A * B, where A is m x n, B is n x k, and C is
// m x k, all stored row-major. Launch with one thread per element of C:
// a 2D grid whose x dimension covers columns of C and y dimension covers
// rows of C.
__global__ void gpu_matrix_mult(int *a, int *b, int *c, int m, int n, int k)
{
    int r = blockIdx.y * blockDim.y + threadIdx.y;    // row of C
    int col = blockIdx.x * blockDim.x + threadIdx.x;  // column of C

    // The grid is rounded up to full blocks, so edge threads may land
    // outside the output matrix — bail out before touching memory.
    if (r >= m || col >= k)
        return;

    // Dot product of row r of A with column col of B.
    int acc = 0;
    for (int t = 0; t < n; ++t)
        acc += a[r * n + t] * b[t * k + col];
    c[r * k + col] = acc;
}
// CPU reference implementation: h_result = h_a * h_b, where h_a is m x n,
// h_b is n x k, and h_result is m x k, all stored row-major. Used to
// validate the GPU kernel's output.
// Fix: the accumulator was initialized with `int tmp = 0.0;` — a double
// literal silently truncated into an int; use the integer literal 0.
void cpu_matrix_mult(int *h_a, int *h_b, int *h_result, int m, int n, int k) {
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            int tmp = 0;  // dot product of row i of h_a and column j of h_b
            for (int h = 0; h < n; ++h)
            {
                tmp += h_a[i * n + h] * h_b[h * k + j];
            }
            h_result[i * k + j] = tmp;
        }
    }
}
// Driver: multiplies two random 16x16 integer matrices on the GPU, then
// validates the result against the CPU reference implementation.
// Fixes: replaced the deprecated cudaThreadSynchronize() (which was also
// called AFTER the device-to-host copy, where it accomplished nothing) with
// cudaDeviceSynchronize() before the copy; added a launch-error check
// (kernel launches do not return an error code directly); added the missing
// newline on the success message.
int main(int argc, char const *argv[])
{
    // Matrix dimensions: A is m x n, B is n x k, C is m x k.
    int m = 16, n = 16, k = 16;
    srand(3333);  // fixed seed for reproducible inputs

    // Allocate pinned host memory (faster transfers); h_cc stores the CPU
    // reference result used for validation.
    int *h_a, *h_b, *h_c, *h_cc;
    cudaMallocHost((void **) &h_a, sizeof(int) * m * n);
    cudaMallocHost((void **) &h_b, sizeof(int) * n * k);
    cudaMallocHost((void **) &h_c, sizeof(int) * m * k);
    cudaMallocHost((void **) &h_cc, sizeof(int) * m * k);

    // Randomly initialize matrix A.
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            h_a[i * n + j] = rand() % 1024;
        }
    }
    // Randomly initialize matrix B.
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < k; ++j) {
            h_b[i * k + j] = rand() % 1024;
        }
    }

    // Allocate memory space on the device.
    int *d_a, *d_b, *d_c;
    cudaMalloc((void **) &d_a, sizeof(int) * m * n);
    cudaMalloc((void **) &d_b, sizeof(int) * n * k);
    cudaMalloc((void **) &d_c, sizeof(int) * m * k);

    // Copy matrices A and B from host to device memory.
    cudaMemcpy(d_a, h_a, sizeof(int) * m * n, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, sizeof(int) * n * k, cudaMemcpyHostToDevice);

    // One thread per output element; ceil-divide so the grid covers
    // dimensions that are not multiples of BLOCK_SIZE.
    unsigned int grid_rows = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (k + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    // Kernel call; catch launch-configuration errors immediately.
    gpu_matrix_mult<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, m, n, k);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Wait for the kernel to finish (also surfaces asynchronous execution
    // errors) before reading its output.
    cudaDeviceSynchronize();

    // Transfer results from device to host.
    cudaMemcpy(h_c, d_c, sizeof(int) * m * k, cudaMemcpyDeviceToHost);

    // CPU version call — reference result for validation.
    cpu_matrix_mult(h_a, h_b, h_cc, m, n, k);

    // Validate results computed by the GPU. Exact integer comparison is
    // valid here: both paths use int arithmetic on identical inputs.
    int all_ok = 1;
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < k; ++j)
        {
            if (h_cc[i * k + j] != h_c[i * k + j])
            {
                all_ok = 0;
            }
        }
    }
    if (all_ok)
    {
        printf("All results are correct!!!\n");
    }
    else
    {
        printf("Incorrect results\n");
    }

    // Free device and pinned host memory.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    cudaFreeHost(h_c);
    cudaFreeHost(h_cc);
    return 0;
}