24小时热门版块排行榜    

北京石油化工学院2026年研究生招生接收调剂公告
查看: 661  |  回复: 2

木子化十文武

金虫 (小有名气)

[求助] GPU加速倍数只有12倍,怎么提高加速倍数

本人是做量子Monte Carlo模拟的,最近在用GPU跑程序,发现相对于CPU来说,只加速了12倍,想了很久不知道怎么解决,求高手指教,只要有帮助,金币可以增加!
(主要应该就是调整DimGrid和DimBlock吧!)

// Implementation of the different kernels in the GPU using CUDA

#include "blaslapack.h"
#include "profile.h"
#include "kernel_gpu.h"

#include <cstdio>   /* NOTE(review): the original angle-bracket header names were
                       stripped by the forum software; <cstdio> is needed for
                       snprintf — confirm the second header against the repo */
#include <cstdlib>

// Thread block size for all kernels
#define K 64

#ifdef DQMC_PROFILE

// Profiling variants: synchronize the device before taking timestamps so that
// asynchronous kernel launches and copies are attributed to the interval being
// measured, rather than to whatever later call happens to synchronize first.
// NOTE(review): each macro expands to TWO statements, so they must never be
// used as the sole body of an unbraced if/else; all uses in this file are safe.
// PROFILE_BEGIN/PROFILE_END presumably come from profile.h — confirm pairing.
#define GPU_PROFILE_BEGIN() if (profile_enabled) cudaDeviceSynchronize(); \
                            PROFILE_BEGIN()
#define GPU_PROFILE_END(i, n) if (profile_enabled) cudaDeviceSynchronize(); \
                              PROFILE_END(i, n)
#else

// Profiling disabled: the hooks compile away to nothing.
#define GPU_PROFILE_BEGIN()
#define GPU_PROFILE_END(i, n)

#endif

// Format a diagnostic for a failed CUDA call: raw error code plus call site.
// `message` is a fixed-size member buffer; snprintf guards against overflow.
cuda_exception::cuda_exception(const char *file, int line, int code) {
  snprintf(message, sizeof(message), "CUDA error #%i at %s:%i\n",
           code, file, line);
}

// Standard exception accessor: returns the preformatted message.
const char* cuda_exception::what() const throw() { return message; }

// static stuff for the CUBLAS library

// Shared cuBLAS context: created lazily in gpu_init(), released in gpu_shutdown().
static cublasHandle_t handle = NULL;

// Extract the main diagonal of the n x n device matrix A into the vector D.
// In column-major storage consecutive diagonal entries are n + 1 apart, so a
// single strided cublasDcopy does the whole job.
void gpu_diag(int n, double *A, double *D)
{
  GPU_PROFILE_BEGIN();
  const int diag_stride = n + 1; // step from A(i,i) to A(i+1,i+1)
  CUDACHECK(cublasDcopy(handle, n, A, diag_stride, D, 1));
  GPU_PROFILE_END(profile_dcopy, 16.0 * n);
}

// Scale column blockIdx.x of the n x n matrix A in place by D[blockIdx.x] and
// store the squared 2-norm of the scaled column in c[blockIdx.x].
// Launch: one block per column, K threads per block (K-strided down the column).
// Fix: the final intra-warp phase of the reduction relied on implicit
// warp-synchronous execution over volatile shared memory; with independent
// thread scheduling (SM70+) that is undefined, so each shared-memory read is
// now separated from the following write by __syncwarp().  Threads 0..31 form
// exactly warp 0 of the K(>=64)-thread block, so the default full mask is valid.
__global__ void kernel_normcol(int n, double *A, double *D, double *c)
{
  int i, tid = threadIdx.x, j = blockIdx.x * n; // working column
  __shared__ volatile double shared[K];
  double t, w = 0.0, d = D[blockIdx.x];
  double *p = A + j + tid;
  // Strided pass: scale the column in place and accumulate a partial sum of squares.
  for (i = tid; i < n; i += K) {
    t = *p * d;
    *p = t;
    w += t * t;
    p += K;
  }
  shared[tid] = w;
  __syncthreads();
  // Block-level tree reduction; the K >= ... tests fold away at compile time.
  if (K >= 512) { if (tid < 256) { shared[tid] = w = w + shared[tid + 256]; } __syncthreads(); }
  if (K >= 256) { if (tid < 128) { shared[tid] = w = w + shared[tid + 128]; } __syncthreads(); }
  if (K >= 128) { if (tid <  64) { shared[tid] = w = w + shared[tid +  64]; } __syncthreads(); }
  if (tid < 32) {
    if (K >=  64) { w += shared[tid + 32]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=  32) { w += shared[tid + 16]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=  16) { w += shared[tid +  8]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=   8) { w += shared[tid +  4]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=   4) { w += shared[tid +  2]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=   2) { w += shared[tid +  1]; }
  }
  if (tid == 0)
    c[blockIdx.x] = w;
}

// Host wrapper: scale each column of A by the matching entry of D and write
// the squared column norms to c.  One block per column, K threads per block.
void gpu_normcol(int n, double *A, double *D, double *c)
{
  GPU_PROFILE_BEGIN();
  dim3 grid(n), block(K);
  kernel_normcol <<< grid, block >>> (n, A, D, c);
  CUDACHECK(cudaGetLastError());
  GPU_PROFILE_END(profile_normcol, 2.0 * n * n);
}

// Column gather: copy column ipiv[blockIdx.x] of A into column blockIdx.x of Q.
// Launch: one block per destination column, K threads striding down it
// (adjacent threads touch adjacent rows, so accesses coalesce).
__global__ void kernel_permute(int n, int *ipiv, double *A, double *Q)
{
  int row;
  const int dst = blockIdx.x * n;       // start of destination column in Q
  const int src = ipiv[blockIdx.x] * n; // start of source column in A
  for (row = threadIdx.x; row < n; row += K)
    Q[dst + row] = A[src + row];
}

// Host wrapper for kernel_permute: Q(:,j) = A(:,ipiv[j]) for j = 0..n-1.
void gpu_permute(int n, int *ipiv, double *A, double *Q)
{
  GPU_PROFILE_BEGIN();
  dim3 grid(n), block(K);
  kernel_permute <<< grid, block >>> (n, ipiv, A, Q);
  CUDACHECK(cudaGetLastError());
  GPU_PROFILE_END(profile_permute, 16.0 * n * n);
}

// T = D^-1 * R * P: take source column j of Q, scale rows 0..j by 1/D[i],
// write the result into column ipiv[j] of T, and zero rows j+1..n-1
// (R is upper triangular, so the strictly lower part is exactly zero).
// Launch: one block per column j = blockIdx.x, K threads striding the rows.
__global__ void kernel_scalerowperm(int n, double *D, double *Q, int *ipiv, double *T)
{
  int i;
  const int j = blockIdx.x;      // source column of Q
  const int dst = ipiv[j] * n;   // destination column offset in T
  const int src = j * n;         // source column offset in Q
  for (i = threadIdx.x; i <= j; i += K)
    T[dst + i] = Q[src + i] / D[i];  // diagonal block: scale row i by 1/D[i]
  for (; i < n; i += K)
    T[dst + i] = 0.0;                // below the diagonal: zero fill
}

// Host wrapper for kernel_scalerowperm (T = D^-1 * R * P).
void gpu_scalerowperm(int n, double *D, double *Q, int *ipiv, double *T)
{
  GPU_PROFILE_BEGIN();
  dim3 grid(n), block(K);
  kernel_scalerowperm <<< grid, block >>> (n, D, Q, ipiv, T);
  CUDACHECK(cudaGetLastError());
  GPU_PROFILE_END(profile_scalerowperm, 0.5 * n * n);
}

// M = diag(h) * B: scale row i of B by h[i], one block per column.
// Launch: n blocks of K threads; adjacent threads touch adjacent rows.
__global__ void kernel_scalerow(int n, double *h, double *B, double *M)
{
  int row;
  const int col = blockIdx.x * n; // offset of this block's column
  for (row = threadIdx.x; row < n; row += K)
    M[col + row] = h[row] * B[col + row];
}

// Host wrapper for kernel_scalerow (M = diag(h) * B).
void gpu_scalerow(int n, double *h, double *B, double *M)
{
  GPU_PROFILE_BEGIN();
  dim3 grid(n), block(K);
  kernel_scalerow <<< grid, block >>> (n, h, B, M);
  CUDACHECK(cudaGetLastError());
  GPU_PROFILE_END(profile_scalerow, 1.0 * n * n);
}

// Similarity scaling in place: G = diag(h) * G * diag(h)^-1.
// Entry (i, col) becomes h[i] * G(i, col) / h[col]; the per-column divisor is
// hoisted into one reciprocal so the inner loop is multiply-only.
// Launch: one block per column, K threads striding the rows.
__global__ void kernel_scalerowcol(int n, double *h, double *G)
{
  int row;
  const int col = blockIdx.x * n;          // offset of this block's column
  const double inv = 1.0 / h[blockIdx.x];  // 1 / h[col], shared by the column
  for (row = threadIdx.x; row < n; row += K)
    G[col + row] = h[row] * G[col + row] * inv;
}

// Host wrapper for kernel_scalerowcol (G = diag(h) * G * diag(h)^-1, in place).
void gpu_scalerowcol(int n, double *h, double *G)
{
  GPU_PROFILE_BEGIN();
  dim3 grid(n), block(K);
  kernel_scalerowcol <<< grid, block >>> (n, h, G);
  CUDACHECK(cudaGetLastError());
  GPU_PROFILE_END(profile_scalerowcol, 2.0 * n * n);
}

// cuBLAS DGEMM wrapper: C = alpha * op(A) * op(B) + beta * C.
// trans is a two-character string ("NN", "NT", ...); for each operand a
// character other than 'N' selects the transpose.
void gpu_dgemm(const char *trans, int m, int n, int k, double alpha, double *a,
               int lda, double *b, int ldb, double beta, double *c, int ldc)
{
  GPU_PROFILE_BEGIN();
  cublasOperation_t opA = (trans[0] == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t opB = (trans[1] == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUDACHECK(cublasDgemm(handle, opA, opB, m, n, k,
                        &alpha, a, lda, b, ldb, &beta, c, ldc));
  GPU_PROFILE_END(profile_dgemm, 2.0 * k * m * n);
}

// QR factorization of the m x n device matrix dA via MAGMA (dgeqrf3 variant).
// tau receives the Householder scalars, dT the blocked T factors.
// Throws lapack_exception on a nonzero MAGMA info code.
void gpu_dgeqrf(int m, int n, double *dA, int ldda, double *tau, double *dT)
{
  int status = 0;
  GPU_PROFILE_BEGIN();
  magma_dgeqrf3_gpu(m, n, dA, ldda, tau, dT, &status);
  if (status != 0)
    throw lapack_exception("GPU DGEQRF", status);
  GPU_PROFILE_END(profile_dgeqrf, 2.0*m*n*n - 2.0*n*n*n/3.0 + m*n + n*n +
                  14.0*n/3.0);
}

// Form the explicit m x n orthogonal factor Q from the k Householder
// reflectors produced by gpu_dgeqrf (MAGMA dorgqr; nb is the block size).
// Throws lapack_exception on a nonzero MAGMA info code.
void gpu_dorgqr(int m, int n, int k, double *da, int ldda, double *tau,
                double *dT, int nb)
{
  int status = 0;
  GPU_PROFILE_BEGIN();
  magma_dorgqr_gpu(m, n, k, da, ldda, tau, dT, nb, &status);
  if (status != 0)
    throw lapack_exception("GPU DORGQR", status);
  GPU_PROFILE_END(profile_dorgqr, 4.0*m*n*k - 2.0*(m+n)*k*k + 4.0*k*k*k/3.0 +
                                  3.0*n*k - m*k - k*k - 4.0*k/3.0);
}

// LU factorization with partial pivoting of dA (MAGMA dgetrf); the pivot
// indices land in ipiv.  Throws lapack_exception on a nonzero info code.
void gpu_dgetrf(int m, int n, double *dA, int ldda, int *ipiv)
{
  int status = 0;
  GPU_PROFILE_BEGIN();
  magma_dgetrf_gpu(m, n, dA, ldda, ipiv, &status);
  if (status != 0)
    throw lapack_exception("GPU DGETRF", status);
  GPU_PROFILE_END(profile_dgetrf, m*n*n - n*n*n/3.0 - n*n/2.0 + 5.0*n/6.0);
}

// Solve op(A) * X = B for nrhs right-hand sides using the LU factors in dA
// (MAGMA dgetrs); trans[0] selects op as in LAPACK.  The solution overwrites
// dB.  Throws lapack_exception on a nonzero info code.
void gpu_dgetrs(const char *trans, int n, int nrhs, double *dA, int ldda,
                int *ipiv, double *dB, int lddb)
{
  int status = 0;
  GPU_PROFILE_BEGIN();
  magma_dgetrs_gpu(trans[0], n, nrhs, dA, ldda, ipiv, dB, lddb, &status);
  if (status != 0)
    throw lapack_exception("GPU DGETRS", status);
  GPU_PROFILE_END(profile_dgetrs, nrhs * (2.0*n*n - n));
}

// Upload n contiguous elements of `size` bytes each from host (src) to device (dst).
void gpu_setvector(int n, int size, void *src, void *dst)
{
  GPU_PROFILE_BEGIN();
  const int incr = 1; // contiguous on both sides
  CUDACHECK(cublasSetVector(n, size, src, incr, dst, incr));
  GPU_PROFILE_END(profile_transfer, 2.0 * n * size);
}

// Download n contiguous elements of `size` bytes each from device (src) to host (dst).
void gpu_getvector(int n, int size, void *src, void *dst)
{
  GPU_PROFILE_BEGIN();
  const int incr = 1; // contiguous on both sides
  CUDACHECK(cublasGetVector(n, size, src, incr, dst, incr));
  GPU_PROFILE_END(profile_transfer, 2.0 * n * size);
}

// Upload an m x n matrix of `size`-byte elements to the device; both sides
// are stored densely with leading dimension m.
void gpu_setmatrix(int m, int n, int size, void *src, void *dst)
{
  GPU_PROFILE_BEGIN();
  const int ld = m; // dense storage, no padding
  CUDACHECK(cublasSetMatrix(m, n, size, src, ld, dst, ld));
  GPU_PROFILE_END(profile_transfer, 2.0 * m * n * size);
}

// Download an m x n matrix of `size`-byte elements from the device; both
// sides are stored densely with leading dimension m.
void gpu_getmatrix(int m, int n, int size, void *src, void *dst)
{
  GPU_PROFILE_BEGIN();
  const int ld = m; // dense storage, no padding
  CUDACHECK(cublasGetMatrix(m, n, size, src, ld, dst, ld));
  GPU_PROFILE_END(profile_transfer, 2.0 * m * n * size);
}

// Device-to-device copy of `size` bytes from src to dst.
// Fix: the CUDACHECK(...) statement lacked its terminating semicolon and only
// parsed because the following macro expansion happened to absorb it; the
// semicolon is restored for consistency with every other call in this file.
void gpu_copy(void *dst, void *src, int size)
{
  GPU_PROFILE_BEGIN();
  CUDACHECK(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice));
  GPU_PROFILE_END(profile_dcopy, 2.0 * size);
}

// Rank sort (descending): for element idx = blockIdx.x, count how many of the
// `length` values are strictly larger (ties broken toward the larger index),
// then write idx at that rank in ind.  One block per element, K threads
// counting in parallel followed by a block reduction of the partial counts.
// Fixes:
//  * the comparison read the POINTER `val` instead of the element `val[i]`;
//  * the final intra-warp reduction assumed implicit warp-synchronous
//    execution over volatile shared memory, which SM70+ no longer guarantees —
//    each read is now separated from the following write by __syncwarp()
//    (threads 0..31 are exactly warp 0, so the default full mask is valid).
__global__ void kernel_sort(int length, double *val, int *ind)
{
  const unsigned int idx = blockIdx.x;
  int i, tid = threadIdx.x;
  __shared__ volatile int shared[K];
  int w = 0;
  double myValue = val[idx];
  for (i = tid; i < length; i += K) {
    // element i outranks idx when its value is larger, or equal with larger index
    if (myValue < val[i] || (val[i] == myValue && i > idx)) {
      w++;
    }
  }
  shared[tid] = w;
  __syncthreads();
  // Block-level tree reduction; the K >= ... tests fold away at compile time.
  if (K >= 512) { if (tid < 256) { shared[tid] = w = w + shared[tid + 256]; } __syncthreads(); }
  if (K >= 256) { if (tid < 128) { shared[tid] = w = w + shared[tid + 128]; } __syncthreads(); }
  if (K >= 128) { if (tid <  64) { shared[tid] = w = w + shared[tid +  64]; } __syncthreads(); }
  if (tid < 32) {
    if (K >=  64) { w += shared[tid + 32]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=  32) { w += shared[tid + 16]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=  16) { w += shared[tid +  8]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=   8) { w += shared[tid +  4]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=   4) { w += shared[tid +  2]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=   2) { w += shared[tid +  1]; }
  }
  if (tid == 0)
    ind[w] = idx;
}

// Compute the descending-rank permutation of the n values in Db into ipiv.
// Fix: a `blocks = ceil(n / K)` value was computed but never used — the kernel
// needs exactly one block PER ELEMENT (blockIdx.x indexes the element whose
// rank is computed), so <<< n, K >>> is the correct launch configuration and
// the dead code is removed.
void gpu_sort(int n, double *Db, int *ipiv)
{
  GPU_PROFILE_BEGIN();
  kernel_sort <<< n , K >>> (n, Db, ipiv);
  CUDACHECK(cudaGetLastError());
  GPU_PROFILE_END(profile_sort, 8.0 * n * n);
}

// One-time GPU setup: create the shared cuBLAS handle (idempotent) and request
// the larger L1-cache split for every hand-written kernel, none of which uses
// more than K elements of shared memory.
void gpu_init()
{
  if (handle == NULL)
    CUDACHECK(cublasCreate(&handle));
  CUDACHECK(cudaFuncSetCacheConfig(kernel_normcol,      cudaFuncCachePreferL1));
  CUDACHECK(cudaFuncSetCacheConfig(kernel_permute,      cudaFuncCachePreferL1));
  CUDACHECK(cudaFuncSetCacheConfig(kernel_scalerowperm, cudaFuncCachePreferL1));
  CUDACHECK(cudaFuncSetCacheConfig(kernel_scalerow,     cudaFuncCachePreferL1));
  CUDACHECK(cudaFuncSetCacheConfig(kernel_scalerowcol,  cudaFuncCachePreferL1));
  CUDACHECK(cudaFuncSetCacheConfig(kernel_sort,         cudaFuncCachePreferL1));
}

// Tear down the shared cuBLAS handle; safe to call repeatedly.
void gpu_shutdown()
{
  if (handle != NULL) {
    CUDACHECK(cublasDestroy(handle));
    handle = NULL;
  }
}
回复此楼
平和!谦逊!
已阅   回复此楼   关注TA 给TA发消息 送TA红花 TA的回帖

rlin198110

捐助贵宾 (正式写手)

【答案】应助回帖


csgt0: 金币+1, 应助指数+1, 谢谢 2013-02-26 10:32:08
GPU并不是更快而是更宽。如果程序的数据密集程度趋高,加速就越明显,也就是说同时处理的数据越多,加速比就趋高。我对于程序的算法不了解,但是上述原因是制约提速的关键之一。我计算谱图时提速能达到单cpu核的2E3倍(i5 pk tesla c2050)。当然与算法密切相关。
务实自立
2楼2013-02-25 13:38:12
已阅   回复此楼   关注TA 给TA发消息 送TA红花 TA的回帖

rlin198110

捐助贵宾 (正式写手)


csgt0: 金币+1, 欢迎交流 2013-02-26 10:32:16
我现在从事求解偏微分方程(反应扩散系统斑图模拟),CUDA很适用。如有兴趣多联系。
务实自立
3楼2013-02-25 13:41:37
已阅   回复此楼   关注TA 给TA发消息 送TA红花 TA的回帖
相关版块跳转 我要订阅楼主 木子化十文武 的主题更新
最具人气热帖推荐 [查看全部] 作者 回/看 最后发表
[考研] 求调剂 +4 Hll胡 2026-04-04 4/200 2026-04-04 13:17 by ms629
[考研] 085600,专业课化工原理,321分求调剂 +10 大馋小子 2026-03-28 10/500 2026-04-04 12:28 by 小小树2024
[考研] 材料专业383求调剂 +8 郭阳阳阳成 2026-04-03 8/400 2026-04-04 10:29 by Rednal.
[基金申请] 同年申报,是不是就不会派给你函评本子了? +4 redcom 2026-03-29 4/200 2026-04-04 10:08 by kudofaye
[考研] 265求调剂 +17 林深温澜 2026-04-01 20/1000 2026-04-04 01:09 by userper
[考研] 材料调剂 +11 一样YWY 2026-04-02 11/550 2026-04-03 23:46 by hemengdong
[考研] 总分328生物与医药考数学求调剂 +7 aaadim 2026-04-02 9/450 2026-04-03 22:53 by syh9288
[考研] 五邑大学土木工程招调剂生2026 +3 wyutj 2026-03-31 4/200 2026-04-03 18:21 by zengxj_7201
[考研] 293求调剂 +5 末未mm 2026-04-02 6/300 2026-04-03 15:20 by 王保杰33
[考研] 274求调剂 +9 顺理成张 2026-04-03 10/500 2026-04-03 15:10 by 啊俊!
[考研] 326分求调剂 +3 于是乎呢 2026-04-01 5/250 2026-04-03 14:23 by 于是乎呢
[考研] 286求调剂 +7 Faune 2026-03-30 7/350 2026-04-03 10:14 by linyelide
[考研] 材料调剂 +7 一样YWY 2026-04-02 7/350 2026-04-02 21:49 by dongzh2009
[考研] 材料专硕322分 +10 哈哈哈吼吼吼哈 2026-04-02 10/500 2026-04-02 21:46 by dongzh2009
[考研] 318求调剂,计算材料方向 +10 吸喵有害笙命 2026-04-01 11/550 2026-04-02 16:29 by oooqiao
[考研] 一志愿西安交大材料学硕(英一数二)347,求调剂到高分子/材料相关专业 +7 zju51 2026-03-31 9/450 2026-04-01 19:35 by CFQZAFU
[考研] 0703求调剂 +4 zizimo 2026-03-31 4/200 2026-04-01 16:04 by yanflower7133
[考研] 省双一流重点一本大学招收调剂 +4 wwwwffffff 2026-03-31 7/350 2026-04-01 15:23 by wwwwffffff
[考研] 291求调剂 +3 迷蒙木木 2026-04-01 4/200 2026-04-01 11:07 by 逆水乘风
[考研] 哈尔滨工业大学材料与化工专硕378求调剂 +3 塔比乌斯 2026-03-30 3/150 2026-03-30 22:55 by 无际的草原
信息提示
请填处理意见