#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Abort with a diagnostic if a CUDA runtime call fails. Kernel launches do not
// return an error directly, so the launch is followed by cudaGetLastError().
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            abort();                                                          \
        }                                                                     \
    } while (0)

// Element-wise vector add: a[i] += b[i] for i in [0, N).
// Expects a 1D grid of 1D blocks; works for any grid/block combination that
// covers N threads, since the global index is bounds-checked.
__global__ void vect_add(float *a, float *b, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // flat global index
    if (idx < N) a[idx] = a[idx] + b[idx];
}

// Fortran-callable wrapper (hence the trailing underscore and pointer args):
// copies a and b to the GPU, computes a = a + b there, and copies a back.
// b is unchanged on return. Np points to the element count N.
//
// Fix vs. previous version: the kernel was launched as <<<1, N>>>, which
// silently fails for N > 1024 (the per-block thread limit) and performed no
// error checking, leaving 'a' unmodified with no diagnostic. Now uses a
// ceil-divided grid of fixed-size blocks and checks every API call.
extern "C" void kernel_wrapper_(float *a, float *b, int *Np)
{
    float *a_d, *b_d;                 // device copies of a and b
    int N = *Np;

    if (N <= 0) return;               // nothing to do; avoids a 0-block launch

    size_t bytes = sizeof(float) * (size_t)N;

    // Allocate device memory
    CUDA_CHECK(cudaMalloc((void **)&a_d, bytes));
    CUDA_CHECK(cudaMalloc((void **)&b_d, bytes));

    // Copy inputs host -> device
    CUDA_CHECK(cudaMemcpy(a_d, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_d, b, bytes, cudaMemcpyHostToDevice));

    // Launch: multiple of 32 threads per block, ceil-div grid to cover N
    int threads = 256;
    int blocks = (N + threads - 1) / threads;
    vect_add<<<blocks, threads>>>(a_d, b_d, N);
    CUDA_CHECK(cudaGetLastError());   // catch bad launch configuration

    // Copy result device -> host. b_d was never modified by the kernel, so
    // the old copy-back of b (same bytes it sent) is omitted.
    // This blocking memcpy also synchronizes with the kernel.
    CUDA_CHECK(cudaMemcpy(a, a_d, bytes, cudaMemcpyDeviceToHost));

    // Release device memory
    CUDA_CHECK(cudaFree(a_d));
    CUDA_CHECK(cudaFree(b_d));
}