| 查看: 931 | 回复: 0 | ||
cln116捐助贵宾 (知名作家)
|
[求助]
CUDA核函数计算结果与C++循环计算结果又差别,求助。
|
|
使用CUDA时间不长,遇到了棘手的问题,希望高手指点,谢谢。 用CUDA核函数计算的结果与用C++循环的结果有差别,主要是float最后2位。程序中,h_spec 数组用于存储C++循环的计算结果,d_spec用于存储核函数计算结果,将d_spec拷贝到数组h_d_spec,比较h_spec与h_d_spec。当h_spec与h_d_spec均有16641(129*129)个元素时,h_spec与h_d_spec中不相等的元素数为6723。不相等的元素差值很小,例如: h_d_spec[0] = 4.3036454e-008, h_spec[0] = 4.3036462e-008, 差值: -7.1054274e-015 h_d_spec[1] = 4.5101640e-008, h_spec[1] = 4.5101658e-008, 差值: -1.7763568e-014 不知道这种错误产生的原因到底是什么。原因一,可能是程序写得有问题;原因二,CPU和GPU浮点精度有差别。 程序所用GPU: Geforce GTX 460, Geforce GT430。程序代码如下: // complex_test.cpp #include "stdafx.h" #include <math.h> #include <cuda_runtime.h> #include "iostream" #include "stdlib.h" #define _PI 3.14159265358979323846 #define _PI2 _PI*2.0 #define _GRAVITY 9.81 #define _GRAVITY2 96.2361 extern "C" void DEVICE_spec_func(float * spec, float A, float L, float Speed, float2 Dir_vector, int dim, float damping); float spec_func(float A, float2 Dir, float Speed, float2 K, float reflDamping) { float k2 = K.x * K.x + K.y * K.y; float Dir_len2 = Dir.x * Dir.x + Dir.y *Dir.y; if (k2 == 0.f) return 0.f; float k4 = k2 * k2; float KdotW =K.x * Dir.x + K.y * Dir.y; if (KdotW < 0.f) return 0.0f; float KdotWhat = KdotW*KdotW/(k2*Dir_len2); float Speed4 = pow(Speed, 4.0f); float eterm = exp( -1.0f * _GRAVITY2 / (k2*Speed4) ) / k4; float specResult = A * eterm * KdotWhat ; return specResult; } void test_spec() { float L = 256.0f; float speed = 20.0f; float2 dir_vector; dir_vector.x = cos(_PI/4); dir_vector.x = sin(_PI/4); int _N = 128; float _A = pow(_PI2/L, 2.0) * 3.48/1000.0; float2 K; float * h_spec = (float *) malloc((_N+1)*(_N+1)*sizeof(float)); //host array float * d_spec = NULL; //device array cudaMalloc((void**)&d_spec,(_N+1)*(_N+1)*sizeof(float)); float * h_d_spec = (float *) malloc((_N+1)*(_N+1)*sizeof(float)); //copy device array to the array float * differ = (float *) malloc((_N+1)*(_N+1)*sizeof(float)); //difference int num = 0; int h_nonezero_num = 0; int d_nonezero_num = 0; int h_d_zero = 0; for(int y=0; y<=_N; y++) { K.y = _PI2 / L * (float)(-_N/2 + y); for(int x=0; x<=_N; x++) { K.x = _PI2 / L * (float)(-_N/2 + x); int offset = y * (_N+1) + x; h_spec[offset] = spec_func(_A, dir_vector, speed, K, 0.0); if(h_spec[offset]!=0.0) { h_nonezero_num+=1; } } } DEVICE_spec_func(d_spec, _A, L, speed, dir_vector, _N+1, 0.0f); cudaMemcpy(h_d_spec, d_spec, (_N+1)*(_N+1)*sizeof(float), cudaMemcpyDeviceToHost); for(int y=0; y<=_N; y++) { for(int x=0; x<=_N; x++) { int offset = y * (_N+1) + x; differ[offset] = h_d_spec[offset] - h_spec[offset]; if(h_d_spec[offset]!=0.0) { d_nonezero_num += 1; } if(differ[offset] != 0.0) { num += 1; } if(h_spec[offset]!=0.0 && h_d_spec[offset]==0.0 ) { h_d_zero += 1; } } } printf("Not equal: %d \n", num); } int _tmain(int argc, _TCHAR* argv[]) { test_spec(); return 0; } //kernel.cu #include "cuda_runtime.h" #include "device_functions_decls.h" #include "cufft.h" #include "cufftw.h" #include "curand.h" //#include "math.h" #define _PI 3.14159265358979323846 #define _PI2 _PI*2.0 #define _GRAVITY 9.81 #define _GRAVITY2 96.2361 __device__ float d_spec_func(const float A, const float2 Dir, const float Speed, const float2 K, const float reflDamping) { float k2 = K.x * K.x + K.y * K.y; float Dir_len2 = Dir.x * Dir.x + Dir.y *Dir.y; if (k2 == 0.f) return 0.f; float k4 = k2 * k2; float KdotW =K.x * Dir.x + K.y * Dir.y; if (KdotW < 0.f) return 0.0f; float KdotWhat = KdotW*KdotW/(k2*Dir_len2); float Speed4 = pow(Speed, 4.0f); float eterm = exp( -1.0f * _GRAVITY2 / (k2*Speed4) ) / k4; float specResult = A * eterm * KdotWhat ; //float specResult = KdotWhat; return specResult; } __global__ void kernel_spec(float * spec, float A, float L, float Speed, float2 Dir_vector, int dim, float damping) { int x = threadIdx.x + blockIdx.x * blockDim.x; int y = threadIdx.y + blockIdx.y * blockDim.y; int offset = x + y * dim; float2 K; K.x = _PI2 / L * (float)(-(dim-1)/2 + x); K.y = _PI2 / L * (float)(-(dim-1)/2 + y); spec[offset] = d_spec_func(A, Dir_vector, Speed, K, damping); } extern "C" void DEVICE_spec_func(float * spec, float A, float L, float Speed, float2 Dir_vector, int dim, float damping) { dim3 threads(1,1,1); dim3 blocks(dim, dim, 1); kernel_spec<<<blocks, threads>>>(spec, A, L, Speed, Dir_vector, dim, damping); } |
» 猜你喜欢
拟解决的关键科学问题还要不要写
已经有8人回复
请教限项目规定
已经有5人回复
最失望的一年
已经有16人回复
存款400万可以在学校里躺平吗
已经有33人回复
求助一下有机合成大神
已经有3人回复
求推荐英文EI期刊
已经有5人回复
26申博
已经有3人回复
基金委咋了?2026年的指南还没有出来?
已经有10人回复
基金申报
已经有6人回复
疑惑?
已经有5人回复














回复此楼