24小时热门版块排行榜    

北京石油化工学院2026年研究生招生接收调剂公告
查看: 661  |  回复: 2

木子化十文武

金虫 (小有名气)

[求助] GPU加速倍数只有12倍,怎么提高加速倍数

本人是做量子Monte Carlo模拟的,最近在用GPU跑程序,发现相对于CPU来说,只加速了12倍,想了很久不知道怎么解决,求高手指教,只要有帮助,金币可以增加!
(主要应该就是调整DimGrid和DimBlock吧!)

// Implementation of the different kernels in the GPU using CUDA

#include "blaslapack.h"
#include "profile.h"
#include "kernel_gpu.h"

#include <cstdio>   /* NOTE(review): the original angle-bracket header names were
                       stripped by the forum software; <cstdio> is needed for
                       snprintf — confirm the second header against the repo */
#include <cstdlib>

// Thread block size for all kernels
#define K 64

#ifdef DQMC_PROFILE

// Profiling variants: synchronize the device before taking timestamps so that
// asynchronous kernel launches and copies are attributed to the interval being
// measured, rather than to whatever later call happens to synchronize first.
// NOTE(review): each macro expands to TWO statements, so they must never be
// used as the sole body of an unbraced if/else; all uses in this file are safe.
// PROFILE_BEGIN/PROFILE_END presumably come from profile.h — confirm pairing.
#define GPU_PROFILE_BEGIN() if (profile_enabled) cudaDeviceSynchronize(); \
                            PROFILE_BEGIN()
#define GPU_PROFILE_END(i, n) if (profile_enabled) cudaDeviceSynchronize(); \
                              PROFILE_END(i, n)
#else

// Profiling disabled: the hooks compile away to nothing.
#define GPU_PROFILE_BEGIN()
#define GPU_PROFILE_END(i, n)

#endif

// Format a diagnostic for a failed CUDA call: raw error code plus call site.
// `message` is a fixed-size member buffer; snprintf guards against overflow.
cuda_exception::cuda_exception(const char *file, int line, int code) {
  snprintf(message, sizeof(message), "CUDA error #%i at %s:%i\n",
           code, file, line);
}

// Standard exception accessor: returns the preformatted message.
const char* cuda_exception::what() const throw() { return message; }

// static stuff for the CUBLAS library

// Shared cuBLAS context: created lazily in gpu_init(), released in gpu_shutdown().
static cublasHandle_t handle = NULL;

// Extract the main diagonal of the n x n device matrix A into the vector D.
// In column-major storage consecutive diagonal entries are n + 1 apart, so a
// single strided cublasDcopy does the whole job.
void gpu_diag(int n, double *A, double *D)
{
  GPU_PROFILE_BEGIN();
  const int diag_stride = n + 1; // step from A(i,i) to A(i+1,i+1)
  CUDACHECK(cublasDcopy(handle, n, A, diag_stride, D, 1));
  GPU_PROFILE_END(profile_dcopy, 16.0 * n);
}

// Scale column blockIdx.x of the n x n matrix A in place by D[blockIdx.x] and
// store the squared 2-norm of the scaled column in c[blockIdx.x].
// Launch: one block per column, K threads per block (K-strided down the column).
// Fix: the final intra-warp phase of the reduction relied on implicit
// warp-synchronous execution over volatile shared memory; with independent
// thread scheduling (SM70+) that is undefined, so each shared-memory read is
// now separated from the following write by __syncwarp().  Threads 0..31 form
// exactly warp 0 of the K(>=64)-thread block, so the default full mask is valid.
__global__ void kernel_normcol(int n, double *A, double *D, double *c)
{
  int i, tid = threadIdx.x, j = blockIdx.x * n; // working column
  __shared__ volatile double shared[K];
  double t, w = 0.0, d = D[blockIdx.x];
  double *p = A + j + tid;
  // Strided pass: scale the column in place and accumulate a partial sum of squares.
  for (i = tid; i < n; i += K) {
    t = *p * d;
    *p = t;
    w += t * t;
    p += K;
  }
  shared[tid] = w;
  __syncthreads();
  // Block-level tree reduction; the K >= ... tests fold away at compile time.
  if (K >= 512) { if (tid < 256) { shared[tid] = w = w + shared[tid + 256]; } __syncthreads(); }
  if (K >= 256) { if (tid < 128) { shared[tid] = w = w + shared[tid + 128]; } __syncthreads(); }
  if (K >= 128) { if (tid <  64) { shared[tid] = w = w + shared[tid +  64]; } __syncthreads(); }
  if (tid < 32) {
    if (K >=  64) { w += shared[tid + 32]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=  32) { w += shared[tid + 16]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=  16) { w += shared[tid +  8]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=   8) { w += shared[tid +  4]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=   4) { w += shared[tid +  2]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=   2) { w += shared[tid +  1]; }
  }
  if (tid == 0)
    c[blockIdx.x] = w;
}

// Host wrapper: scale each column of A by the matching entry of D and write
// the squared column norms to c.  One block per column, K threads per block.
void gpu_normcol(int n, double *A, double *D, double *c)
{
  GPU_PROFILE_BEGIN();
  dim3 grid(n), block(K);
  kernel_normcol <<< grid, block >>> (n, A, D, c);
  CUDACHECK(cudaGetLastError());
  GPU_PROFILE_END(profile_normcol, 2.0 * n * n);
}

// Column gather: copy column ipiv[blockIdx.x] of A into column blockIdx.x of Q.
// Launch: one block per destination column, K threads striding down it
// (adjacent threads touch adjacent rows, so accesses coalesce).
__global__ void kernel_permute(int n, int *ipiv, double *A, double *Q)
{
  int row;
  const int dst = blockIdx.x * n;       // start of destination column in Q
  const int src = ipiv[blockIdx.x] * n; // start of source column in A
  for (row = threadIdx.x; row < n; row += K)
    Q[dst + row] = A[src + row];
}

// Host wrapper for kernel_permute: Q(:,j) = A(:,ipiv[j]) for j = 0..n-1.
void gpu_permute(int n, int *ipiv, double *A, double *Q)
{
  GPU_PROFILE_BEGIN();
  dim3 grid(n), block(K);
  kernel_permute <<< grid, block >>> (n, ipiv, A, Q);
  CUDACHECK(cudaGetLastError());
  GPU_PROFILE_END(profile_permute, 16.0 * n * n);
}

// T = D^-1 * R * P: take source column j of Q, scale rows 0..j by 1/D[i],
// write the result into column ipiv[j] of T, and zero rows j+1..n-1
// (R is upper triangular, so the strictly lower part is exactly zero).
// Launch: one block per column j = blockIdx.x, K threads striding the rows.
__global__ void kernel_scalerowperm(int n, double *D, double *Q, int *ipiv, double *T)
{
  int i;
  const int j = blockIdx.x;      // source column of Q
  const int dst = ipiv[j] * n;   // destination column offset in T
  const int src = j * n;         // source column offset in Q
  for (i = threadIdx.x; i <= j; i += K)
    T[dst + i] = Q[src + i] / D[i];  // diagonal block: scale row i by 1/D[i]
  for (; i < n; i += K)
    T[dst + i] = 0.0;                // below the diagonal: zero fill
}

// Host wrapper for kernel_scalerowperm (T = D^-1 * R * P).
void gpu_scalerowperm(int n, double *D, double *Q, int *ipiv, double *T)
{
  GPU_PROFILE_BEGIN();
  dim3 grid(n), block(K);
  kernel_scalerowperm <<< grid, block >>> (n, D, Q, ipiv, T);
  CUDACHECK(cudaGetLastError());
  GPU_PROFILE_END(profile_scalerowperm, 0.5 * n * n);
}

// M = diag(h) * B: scale row i of B by h[i], one block per column.
// Launch: n blocks of K threads; adjacent threads touch adjacent rows.
__global__ void kernel_scalerow(int n, double *h, double *B, double *M)
{
  int row;
  const int col = blockIdx.x * n; // offset of this block's column
  for (row = threadIdx.x; row < n; row += K)
    M[col + row] = h[row] * B[col + row];
}

// Host wrapper for kernel_scalerow (M = diag(h) * B).
void gpu_scalerow(int n, double *h, double *B, double *M)
{
  GPU_PROFILE_BEGIN();
  dim3 grid(n), block(K);
  kernel_scalerow <<< grid, block >>> (n, h, B, M);
  CUDACHECK(cudaGetLastError());
  GPU_PROFILE_END(profile_scalerow, 1.0 * n * n);
}

// Similarity scaling in place: G = diag(h) * G * diag(h)^-1.
// Entry (i, col) becomes h[i] * G(i, col) / h[col]; the per-column divisor is
// hoisted into one reciprocal so the inner loop is multiply-only.
// Launch: one block per column, K threads striding the rows.
__global__ void kernel_scalerowcol(int n, double *h, double *G)
{
  int row;
  const int col = blockIdx.x * n;          // offset of this block's column
  const double inv = 1.0 / h[blockIdx.x];  // 1 / h[col], shared by the column
  for (row = threadIdx.x; row < n; row += K)
    G[col + row] = h[row] * G[col + row] * inv;
}

// Host wrapper for kernel_scalerowcol (G = diag(h) * G * diag(h)^-1, in place).
void gpu_scalerowcol(int n, double *h, double *G)
{
  GPU_PROFILE_BEGIN();
  dim3 grid(n), block(K);
  kernel_scalerowcol <<< grid, block >>> (n, h, G);
  CUDACHECK(cudaGetLastError());
  GPU_PROFILE_END(profile_scalerowcol, 2.0 * n * n);
}

// cuBLAS DGEMM wrapper: C = alpha * op(A) * op(B) + beta * C.
// trans is a two-character string ("NN", "NT", ...); for each operand a
// character other than 'N' selects the transpose.
void gpu_dgemm(const char *trans, int m, int n, int k, double alpha, double *a,
               int lda, double *b, int ldb, double beta, double *c, int ldc)
{
  GPU_PROFILE_BEGIN();
  cublasOperation_t opA = (trans[0] == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t opB = (trans[1] == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUDACHECK(cublasDgemm(handle, opA, opB, m, n, k,
                        &alpha, a, lda, b, ldb, &beta, c, ldc));
  GPU_PROFILE_END(profile_dgemm, 2.0 * k * m * n);
}

// QR factorization of the m x n device matrix dA via MAGMA (dgeqrf3 variant).
// tau receives the Householder scalars, dT the blocked T factors.
// Throws lapack_exception on a nonzero MAGMA info code.
void gpu_dgeqrf(int m, int n, double *dA, int ldda, double *tau, double *dT)
{
  int status = 0;
  GPU_PROFILE_BEGIN();
  magma_dgeqrf3_gpu(m, n, dA, ldda, tau, dT, &status);
  if (status != 0)
    throw lapack_exception("GPU DGEQRF", status);
  GPU_PROFILE_END(profile_dgeqrf, 2.0*m*n*n - 2.0*n*n*n/3.0 + m*n + n*n +
                  14.0*n/3.0);
}

// Form the explicit m x n orthogonal factor Q from the k Householder
// reflectors produced by gpu_dgeqrf (MAGMA dorgqr; nb is the block size).
// Throws lapack_exception on a nonzero MAGMA info code.
void gpu_dorgqr(int m, int n, int k, double *da, int ldda, double *tau,
                double *dT, int nb)
{
  int status = 0;
  GPU_PROFILE_BEGIN();
  magma_dorgqr_gpu(m, n, k, da, ldda, tau, dT, nb, &status);
  if (status != 0)
    throw lapack_exception("GPU DORGQR", status);
  GPU_PROFILE_END(profile_dorgqr, 4.0*m*n*k - 2.0*(m+n)*k*k + 4.0*k*k*k/3.0 +
                                  3.0*n*k - m*k - k*k - 4.0*k/3.0);
}

// LU factorization with partial pivoting of dA (MAGMA dgetrf); the pivot
// indices land in ipiv.  Throws lapack_exception on a nonzero info code.
void gpu_dgetrf(int m, int n, double *dA, int ldda, int *ipiv)
{
  int status = 0;
  GPU_PROFILE_BEGIN();
  magma_dgetrf_gpu(m, n, dA, ldda, ipiv, &status);
  if (status != 0)
    throw lapack_exception("GPU DGETRF", status);
  GPU_PROFILE_END(profile_dgetrf, m*n*n - n*n*n/3.0 - n*n/2.0 + 5.0*n/6.0);
}

// Solve op(A) * X = B for nrhs right-hand sides using the LU factors in dA
// (MAGMA dgetrs); trans[0] selects op as in LAPACK.  The solution overwrites
// dB.  Throws lapack_exception on a nonzero info code.
void gpu_dgetrs(const char *trans, int n, int nrhs, double *dA, int ldda,
                int *ipiv, double *dB, int lddb)
{
  int status = 0;
  GPU_PROFILE_BEGIN();
  magma_dgetrs_gpu(trans[0], n, nrhs, dA, ldda, ipiv, dB, lddb, &status);
  if (status != 0)
    throw lapack_exception("GPU DGETRS", status);
  GPU_PROFILE_END(profile_dgetrs, nrhs * (2.0*n*n - n));
}

// Upload n contiguous elements of `size` bytes each from host (src) to device (dst).
void gpu_setvector(int n, int size, void *src, void *dst)
{
  GPU_PROFILE_BEGIN();
  const int incr = 1; // contiguous on both sides
  CUDACHECK(cublasSetVector(n, size, src, incr, dst, incr));
  GPU_PROFILE_END(profile_transfer, 2.0 * n * size);
}

// Download n contiguous elements of `size` bytes each from device (src) to host (dst).
void gpu_getvector(int n, int size, void *src, void *dst)
{
  GPU_PROFILE_BEGIN();
  const int incr = 1; // contiguous on both sides
  CUDACHECK(cublasGetVector(n, size, src, incr, dst, incr));
  GPU_PROFILE_END(profile_transfer, 2.0 * n * size);
}

// Upload an m x n matrix of `size`-byte elements to the device; both sides
// are stored densely with leading dimension m.
void gpu_setmatrix(int m, int n, int size, void *src, void *dst)
{
  GPU_PROFILE_BEGIN();
  const int ld = m; // dense storage, no padding
  CUDACHECK(cublasSetMatrix(m, n, size, src, ld, dst, ld));
  GPU_PROFILE_END(profile_transfer, 2.0 * m * n * size);
}

// Download an m x n matrix of `size`-byte elements from the device; both
// sides are stored densely with leading dimension m.
void gpu_getmatrix(int m, int n, int size, void *src, void *dst)
{
  GPU_PROFILE_BEGIN();
  const int ld = m; // dense storage, no padding
  CUDACHECK(cublasGetMatrix(m, n, size, src, ld, dst, ld));
  GPU_PROFILE_END(profile_transfer, 2.0 * m * n * size);
}

// Device-to-device copy of `size` bytes from src to dst.
// Fix: the CUDACHECK(...) statement lacked its terminating semicolon and only
// parsed because the following macro expansion happened to absorb it; the
// semicolon is restored for consistency with every other call in this file.
void gpu_copy(void *dst, void *src, int size)
{
  GPU_PROFILE_BEGIN();
  CUDACHECK(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice));
  GPU_PROFILE_END(profile_dcopy, 2.0 * size);
}

// Rank sort (descending): for element idx = blockIdx.x, count how many of the
// `length` values are strictly larger (ties broken toward the larger index),
// then write idx at that rank in ind.  One block per element, K threads
// counting in parallel followed by a block reduction of the partial counts.
// Fixes:
//  * the comparison read the POINTER `val` instead of the element `val[i]`;
//  * the final intra-warp reduction assumed implicit warp-synchronous
//    execution over volatile shared memory, which SM70+ no longer guarantees —
//    each read is now separated from the following write by __syncwarp()
//    (threads 0..31 are exactly warp 0, so the default full mask is valid).
__global__ void kernel_sort(int length, double *val, int *ind)
{
  const unsigned int idx = blockIdx.x;
  int i, tid = threadIdx.x;
  __shared__ volatile int shared[K];
  int w = 0;
  double myValue = val[idx];
  for (i = tid; i < length; i += K) {
    // element i outranks idx when its value is larger, or equal with larger index
    if (myValue < val[i] || (val[i] == myValue && i > idx)) {
      w++;
    }
  }
  shared[tid] = w;
  __syncthreads();
  // Block-level tree reduction; the K >= ... tests fold away at compile time.
  if (K >= 512) { if (tid < 256) { shared[tid] = w = w + shared[tid + 256]; } __syncthreads(); }
  if (K >= 256) { if (tid < 128) { shared[tid] = w = w + shared[tid + 128]; } __syncthreads(); }
  if (K >= 128) { if (tid <  64) { shared[tid] = w = w + shared[tid +  64]; } __syncthreads(); }
  if (tid < 32) {
    if (K >=  64) { w += shared[tid + 32]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=  32) { w += shared[tid + 16]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=  16) { w += shared[tid +  8]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=   8) { w += shared[tid +  4]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=   4) { w += shared[tid +  2]; __syncwarp(); shared[tid] = w; __syncwarp(); }
    if (K >=   2) { w += shared[tid +  1]; }
  }
  if (tid == 0)
    ind[w] = idx;
}

// Compute the descending-rank permutation of the n values in Db into ipiv.
// Fix: a `blocks = ceil(n / K)` value was computed but never used — the kernel
// needs exactly one block PER ELEMENT (blockIdx.x indexes the element whose
// rank is computed), so <<< n, K >>> is the correct launch configuration and
// the dead code is removed.
void gpu_sort(int n, double *Db, int *ipiv)
{
  GPU_PROFILE_BEGIN();
  kernel_sort <<< n , K >>> (n, Db, ipiv);
  CUDACHECK(cudaGetLastError());
  GPU_PROFILE_END(profile_sort, 8.0 * n * n);
}

// One-time GPU setup: create the shared cuBLAS handle (idempotent) and request
// the larger L1-cache split for every hand-written kernel, none of which uses
// more than K elements of shared memory.
void gpu_init()
{
  if (handle == NULL)
    CUDACHECK(cublasCreate(&handle));
  CUDACHECK(cudaFuncSetCacheConfig(kernel_normcol,      cudaFuncCachePreferL1));
  CUDACHECK(cudaFuncSetCacheConfig(kernel_permute,      cudaFuncCachePreferL1));
  CUDACHECK(cudaFuncSetCacheConfig(kernel_scalerowperm, cudaFuncCachePreferL1));
  CUDACHECK(cudaFuncSetCacheConfig(kernel_scalerow,     cudaFuncCachePreferL1));
  CUDACHECK(cudaFuncSetCacheConfig(kernel_scalerowcol,  cudaFuncCachePreferL1));
  CUDACHECK(cudaFuncSetCacheConfig(kernel_sort,         cudaFuncCachePreferL1));
}

// Tear down the shared cuBLAS handle; safe to call repeatedly.
void gpu_shutdown()
{
  if (handle != NULL) {
    CUDACHECK(cublasDestroy(handle));
    handle = NULL;
  }
}
回复此楼
平和!谦逊!
已阅   回复此楼   关注TA 给TA发消息 送TA红花 TA的回帖

rlin198110

捐助贵宾 (正式写手)

【答案】应助回帖


csgt0: 金币+1, 应助指数+1, 谢谢 2013-02-26 10:32:08
GPU并不是更快而是更宽。如果程序的数据密集程度趋高,加速就越明显,也就是说同时处理的数据越多,加速比就趋高。我对于程序的算法不了解,但是上述原因是制约提速的关键之一。我计算谱图时提速能达到单cpu核的2E3倍(i5 pk tesla c2050)。当然与算法密切相关。
务实自立
2楼2013-02-25 13:38:12
已阅   回复此楼   关注TA 给TA发消息 送TA红花 TA的回帖

rlin198110

捐助贵宾 (正式写手)


csgt0: 金币+1, 欢迎交流 2013-02-26 10:32:16
我现在从事求解偏微分方程(反应扩散系统斑图模拟),CUDA很适用。如有兴趣多联系。
务实自立
3楼2013-02-25 13:41:37
已阅   回复此楼   关注TA 给TA发消息 送TA红花 TA的回帖
相关版块跳转 我要订阅楼主 木子化十文武 的主题更新
最具人气热帖推荐 [查看全部] 作者 回/看 最后发表
[考研] 求调剂 +4 Hll胡 2026-04-04 4/200 2026-04-04 13:17 by ms629
[考研] 085600,专业课化工原理,321分求调剂 +10 大馋小子 2026-03-28 10/500 2026-04-04 12:28 by 小小树2024
[考研] 材料专业383求调剂 +8 郭阳阳阳成 2026-04-03 8/400 2026-04-04 10:29 by Rednal.
[基金申请] 同年申报,是不是就不会派给你函评本子了? +4 redcom 2026-03-29 4/200 2026-04-04 10:08 by kudofaye
[考研] 265求调剂 +17 林深温澜 2026-04-01 20/1000 2026-04-04 01:09 by userper
[考研] 材料调剂 +11 一样YWY 2026-04-02 11/550 2026-04-03 23:46 by hemengdong
[考研] 总分328生物与医药考数学求调剂 +7 aaadim 2026-04-02 9/450 2026-04-03 22:53 by syh9288
[考研] 五邑大学土木工程招调剂生2026 +3 wyutj 2026-03-31 4/200 2026-04-03 18:21 by zengxj_7201
[考研] 293求调剂 +5 末未mm 2026-04-02 6/300 2026-04-03 15:20 by 王保杰33
[考研] 274求调剂 +9 顺理成张 2026-04-03 10/500 2026-04-03 15:10 by 啊俊!
[考研] 326分求调剂 +3 于是乎呢 2026-04-01 5/250 2026-04-03 14:23 by 于是乎呢
[考研] 286求调剂 +7 Faune 2026-03-30 7/350 2026-04-03 10:14 by linyelide
[考研] 材料调剂 +7 一样YWY 2026-04-02 7/350 2026-04-02 21:49 by dongzh2009
[考研] 材料专硕322分 +10 哈哈哈吼吼吼哈 2026-04-02 10/500 2026-04-02 21:46 by dongzh2009
[考研] 318求调剂,计算材料方向 +10 吸喵有害笙命 2026-04-01 11/550 2026-04-02 16:29 by oooqiao
[考研] 一志愿西安交大材料学硕(英一数二)347,求调剂到高分子/材料相关专业 +7 zju51 2026-03-31 9/450 2026-04-01 19:35 by CFQZAFU
[考研] 0703求调剂 +4 zizimo 2026-03-31 4/200 2026-04-01 16:04 by yanflower7133
[考研] 省双一流重点一本大学招收调剂 +4 wwwwffffff 2026-03-31 7/350 2026-04-01 15:23 by wwwwffffff
[考研] 291求调剂 +3 迷蒙木木 2026-04-01 4/200 2026-04-01 11:07 by 逆水乘风
[考研] 哈尔滨工业大学材料与化工专硕378求调剂 +3 塔比乌斯 2026-03-30 3/150 2026-03-30 22:55 by 无际的草原
信息提示
请填处理意见