C++ 简单的 CUDA 测试总是因“遇到非法内存访问”错误而失败
声明:本页面是StackOverFlow热门问题的中英对照翻译,遵循CC BY-SA 4.0协议,如果您需要使用它,必须同样遵循CC BY-SA许可,注明原文地址和作者信息,同时你必须将它归于原作者(不是我):StackOverFlow
原文地址: http://stackoverflow.com/questions/25702573/
Warning: these are provided under cc-by-sa 4.0 license. You are free to use/share it, But you must attribute it to the original authors (not me):
StackOverFlow
Simple CUDA Test always fails with "an illegal memory access was encountered" error
提问by Henrik
If I run this program I get "an illegal memory access was encountered in matrixMulti.cu at line 48" error. I searched and tried a lot. So I hope somebody can help me.
如果我运行这个程序,我会收到“在第 48 行的 matrixMulti.cu 中遇到非法内存访问”错误。我搜索并尝试了很多。所以我希望有人可以帮助我。
Line 48 : HANDLE_ERROR ( cudaMemcpy(array, devarray, N*N*sizeof(int), cudaMemcpyDeviceToHost) );
第 48 行: HANDLE_ERROR ( cudaMemcpy(array, devarray, N*N*sizeof(int), cudaMemcpyDeviceToHost) );
The program is just to get into CUDA. I tried to implement a matrix multiplication.
这个程序只是为了入门 CUDA。我尝试实现一个矩阵乘法。
#include <iostream>
#include<cuda.h>
#include <stdio.h>
using namespace std;
#define HANDLE_ERROR( err ) ( HandleError( err, __FILE__, __LINE__ ) )
void printVec(int** a, int n);
// Terminates the program with a diagnostic when a CUDA runtime call failed.
//   err  - status code returned by the CUDA call being checked
//   file - source file of the call site (HANDLE_ERROR passes __FILE__)
//   line - line number of the call site (HANDLE_ERROR passes __LINE__)
// Returns normally only when err is cudaSuccess; otherwise prints
// "<error string> in <file> at line <line>" and exits with EXIT_FAILURE.
static void HandleError( cudaError_t err, const char *file, int line )
{
    if (cudaSuccess == err)
        return;   // call succeeded, nothing to report

    const char *reason = cudaGetErrorString( err );
    printf( "%s in %s at line %d\n", reason, file, line );
    exit( EXIT_FAILURE );
}
// Queries the CUDA runtime for its last recorded error (cudaGetLastError)
// and aborts the program if one is pending.
//   msg - caller-supplied context string placed in front of the CUDA error text
// On error, writes "Cuda error: <msg>: <error string>." to stderr and exits
// with EXIT_FAILURE; otherwise returns without side effects of its own.
void checkCUDAError(const char *msg)
{
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        return;

    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}
// Kernel from the question: stores the value 4 into b[0][0]. Parameter a is
// never used.
// b is dereferenced twice, so it would have to point at device memory that
// itself holds valid device row pointers. The main() in this listing passes
// a flat buffer obtained from a single cudaMalloc instead, so b[0] is not a
// valid device pointer and the write is an illegal memory access.
__global__ void MatrixMulti(int** a, int** b) {
b[0][0]=4;
}
// Host driver of the question's failing program. Kept exactly as asked;
// the comments mark where host and device layouts stop matching.
int main() {
int N =10;
int** array, **devarray;
// Host matrix: a table of N row pointers, each row allocated separately,
// so the N*N ints are not one contiguous buffer.
array = new int*[N];
for(int i = 0; i < N; i++) {
array[i] = new int[N];
}
// Device side: one flat allocation of N*N*sizeof(int) bytes, stored in an int** variable.
HANDLE_ERROR ( cudaMalloc((void**)&devarray, N*N*sizeof(int) ) );
// Copies N*N*sizeof(int) bytes starting at `array`, i.e. from the table of
// N host row pointers, not from the int data the rows point to.
HANDLE_ERROR ( cudaMemcpy(devarray, array, N*N*sizeof(int), cudaMemcpyHostToDevice) );
// One block of one thread. The first argument is the host pointer `array`;
// no error check follows the launch.
MatrixMulti<<<1,1>>>(array,devarray);
// This is the call at which the question reports "an illegal memory access
// was encountered"; it is the first checked CUDA call after the kernel launch.
// Its destination is again the host row-pointer table.
HANDLE_ERROR ( cudaMemcpy(array, devarray, N*N*sizeof(int), cudaMemcpyDeviceToHost) );
HANDLE_ERROR ( cudaFree(devarray) );
printVec(array,N);
return 0;
}
// Writes an n-by-n matrix to standard output, one row per line.
//   a - table of n row pointers, each addressing at least n ints
//   n - number of rows and of columns
// Every element is followed by one space; every row ends with an extra
// space and endl (newline plus flush).
void printVec(int** a , int n) {
    int r = 0;
    while (r < n) {
        const int* rowData = a[r];
        for (int c = 0; c < n; ++c)
            cout << rowData[c] << " ";
        cout << " " << endl;
        ++r;
    }
}
回答by Robert Crovella
In general, your method of allocating and copying a doubly-subscripted C array won't work. cudaMemcpy
expects flat, contiguously allocated, single-pointer, single-subscript arrays.
通常,您分配和复制双下标 C 数组的方法不起作用。 cudaMemcpy
期望平面、连续分配、单指针、单下标数组。
As a result of this confusion, the pointers being passed to your kernel (int** a, int** b
) cannot be properly (safely) dereferenced twice:
由于这种混淆,传递给内核 ( int** a, int** b
)的指针无法正确(安全地)取消引用两次:
b[0][0]=4;
When you try to do the above in kernel code, you get an illegal memory access, because you have not properly allocated a pointer-to-pointer style allocation on the device.
当您尝试在内核代码中执行上述操作时,您会获得非法内存访问,因为您没有在设备上正确分配指针到指针样式的分配。
If you ran your code with cuda-memcheck
, you would get another indication of the illegal memory access in the kernel code.
如果你用 cuda-memcheck 运行你的代码,你会得到内核代码中存在非法内存访问的另一个提示。
The usual suggestion in these cases is to "flatten" your 2D arrays to a single dimension, and use appropriate pointer or index arithmetic to simulate 2D access. It is possible to allocate 2D arrays (i.e. double-subscript, double-pointer), but it is fairly involved (due in part to the need for a "deep copy"). If you'd like to learn more about that just search on the upper right hand corner for CUDA 2D array
.
在这些情况下,通常的建议是将 2D 数组“展平”为一维,并使用适当的指针或索引运算来模拟 2D 访问。也可以分配 2D 数组(即双下标、双指针),但这相当复杂(部分原因是需要“深拷贝”)。如果您想了解更多相关信息,只需在右上角搜索CUDA 2D array
.
Here's a version of your code that has the array flattening for the device-side array:
这是您的代码的一个版本,它具有设备端数组的数组扁平化:
$ cat t60.cu
#include <iostream>
#include <cuda.h>
#include <stdio.h>
using namespace std;
#define HANDLE_ERROR( err ) ( HandleError( err, __FILE__, __LINE__ ) )
void printVec(int** a, int n);
// Terminates the program with a diagnostic when a CUDA runtime call failed.
//   err  - status code returned by the CUDA call being checked
//   file - source file of the call site (HANDLE_ERROR passes __FILE__)
//   line - line number of the call site (HANDLE_ERROR passes __LINE__)
// Returns normally only when err is cudaSuccess; otherwise prints
// "<error string> in <file> at line <line>" and exits with EXIT_FAILURE.
static void HandleError( cudaError_t err, const char *file, int line )
{
    if (cudaSuccess == err)
        return;   // call succeeded, nothing to report

    const char *reason = cudaGetErrorString( err );
    printf( "%s in %s at line %d\n", reason, file, line );
    exit( EXIT_FAILURE );
}
// Queries the CUDA runtime for its last recorded error (cudaGetLastError)
// and aborts the program if one is pending.
//   msg - caller-supplied context string placed in front of the CUDA error text
// On error, writes "Cuda error: <msg>: <error string>." to stderr and exits
// with EXIT_FAILURE; otherwise returns without side effects of its own.
void checkCUDAError(const char *msg)
{
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        return;

    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}
// Fills an n-by-n row-major matrix, stored in one flat buffer, so that every
// element holds its own column index.
//   b - device pointer to at least n*n ints
//   n - matrix dimension (rows == columns)
// Launch layout: any 1D grid of 1D blocks. Elements are spread over the
// launched threads with a grid-stride loop, so a <<<1,1>>> launch still fills
// the whole matrix, and larger launches split the work instead of repeating it.
// Indices are size_t so that n*n cannot wrap in 32-bit arithmetic and no
// signed/unsigned comparison occurs. No shared memory is used.
__global__ void MatrixMulti(int* b, unsigned n) {
    const size_t total  = (size_t)n * n;
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < total; idx += stride) {
        const size_t row = idx / n;   // never evaluated when n == 0 (total is 0)
        const size_t col = idx % n;
        b[(row*n)+col] = (int)col; //simulate 2D access in kernel code
    }
}
// Host driver for the flattened version: builds an N-by-N host matrix on top
// of one contiguous allocation, round-trips it through a flat device buffer
// that the kernel fills, prints the result, and releases all memory.
// Every CUDA call and the kernel launch are checked; any failure terminates
// the program through HandleError / checkCUDAError.
int main() {
    const int N = 10;
    int** array, *devarray; // flatten device-side array
    const size_t bytes = (size_t)N * N * sizeof(int);

    array = new int*[N];
    // host allocation needs to be contiguous; value-initialised to zero so the
    // host-to-device copy below never reads indeterminate values
    array[0] = new int[N*N]();
    for (int i = 1; i < N; i++) array[i] = array[i-1]+N; //2D on top of contiguous allocation

    HANDLE_ERROR ( cudaMalloc((void**)&devarray, bytes ) );
    HANDLE_ERROR ( cudaMemcpy(devarray, array[0], bytes, cudaMemcpyHostToDevice) );

    MatrixMulti<<<1,1>>>(devarray, N);
    // A kernel launch returns no status itself; launch errors are only
    // visible through cudaGetLastError, which checkCUDAError queries.
    checkCUDAError("MatrixMulti launch");

    // Blocking copy: in-kernel faults from the launch above are reported here.
    HANDLE_ERROR ( cudaMemcpy(array[0], devarray, bytes, cudaMemcpyDeviceToHost) );
    HANDLE_ERROR ( cudaFree(devarray) );

    printVec(array,N);

    // Release the contiguous data block first, then the row-pointer table.
    delete[] array[0];
    delete[] array;
    return 0;
}
// Writes an n-by-n matrix to standard output, one row per line.
//   a - table of n row pointers, each addressing at least n ints
//   n - number of rows and of columns
// Every element is followed by one space; every row ends with an extra
// space and endl (newline plus flush).
void printVec(int** a , int n) {
    int r = 0;
    while (r < n) {
        const int* rowData = a[r];
        for (int c = 0; c < n; ++c)
            cout << rowData[c] << " ";
        cout << " " << endl;
        ++r;
    }
}
$ nvcc -arch=sm_20 -o t60 t60.cu
$ ./t60
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
$