
The effect of GPU coalesced memory access and modulo operations on speed

November 9, 2018
#include <cuda_runtime.h>
#include <sys/time.h>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <iostream>
#include <math.h>
using namespace std;

#define IDX2C(i,j,rows) (((j)*(rows)+(i)))
#define IDX2R(i,j,cols) (((i)*(cols)+(j)))
#define BLOCK_SIZE 32
#define CHECK_EQ1(a,b) do { \
	if ((a) != (b)) { \
		cout <<__FILE__<<" : "<< __LINE__<<" : check failed because "<<a<<"!="<<b<<endl;\
		cout << cudaGetErrorString(a) <<endl;\
		exit(1);\
	}\
} while(0)

#define CUDA_CHECK(condition)\
do {\
	cudaError_t error = condition;\
	CHECK_EQ1(error, cudaSuccess);\
} while(0)



template<class T>
inline void printMtx(T *mtx, int row, int col) {
	for (int i = 0; i < row; ++i) {
		for (int j = 0; j < col; ++j) {
			cout << mtx[IDX2C(i,j,row)] << " ";
		}
		cout << endl;
	}
}

// If mtx is a sub-matrix:
// 1. if its elements are stored contiguously, row and col are the sub-matrix dimensions
// 2. if its elements are not contiguous, row must be the full matrix's row count (the leading dimension)
template<class T>
inline void printMtxg(T *mtx, int row, int col) {
	T *c = (T*)malloc(sizeof(T)*row*col);
	CUDA_CHECK(cudaMemcpy(c, mtx, sizeof(T)*row*col, cudaMemcpyDeviceToHost));
	cudaDeviceSynchronize();
	printMtx(c,row,col);
	free(c);
}

template<class T>
inline void printVec(T *vec, int len) {
	for (int i = 0; i < len; ++i) cout <<vec[i] << " ";
	cout << endl;
}

template<class T>
inline void printVecg(T *gvec, int len) {
	T *vec = (T*)malloc(sizeof(T)*len);
	CUDA_CHECK(cudaMemcpy(vec,gvec,sizeof(T)*len,cudaMemcpyDeviceToHost));
	printVec(vec,len);
	free(vec);
}

bool validate(double *rst, double *grst, int row, int col) {
	//cout << "cpu rst\n";
	//printMtxt(rst, row, col);
	//cout << "gpu rst\n";
	//printMtxgt(grst, row, col);
	double *crst = (double *)malloc(sizeof(double)*row*col);
	CUDA_CHECK(cudaMemcpy(crst, grst, sizeof(double)*row*col, cudaMemcpyDeviceToHost));
	bool flag = true;
	for (int i = 0; i < row; ++i) {
		for (int j = 0; j < col; ++j) {
			if (rst[IDX2C(i,j, row)] != crst[IDX2C(i,j,row)]){
				//return false;
				flag = false;
				cout <<i<<","<<j<<" "<<rst[IDX2C(i,j, row)] << "<-->"<<crst[IDX2C(i,j,row)]<<endl;
			}
		}
	}
	return flag;
}

// One block per column (blockIdx.x = column index); the block's 1D threads stride down
// the column, so consecutive threads in a warp touch consecutive elements (coalesced).
__global__ void addMinusMtx0(double *mat, int row, int col, int *arr, int len) {
	int colId = blockIdx.x;
	//int rowId = threadIdx.y + blockIdx.y * blockDim.y;
	int thdId = threadIdx.x;
	if (colId < col ) {
		for (int i = thdId; i < row; i += blockDim.x) {
			mat[IDX2C(i,colId, row)] = arr[i] - mat[IDX2C(i,colId, row)];
		}
		
	}
}

// Like addMinusMtx0, but each column is additionally split across gridDim.y blocks;
// access remains coalesced.
__global__ void addMinusMtx2(double *mat, int row, int col, int *arr, int len) {
	int colId = blockIdx.x;
	//int rowId = threadIdx.y + blockIdx.y * blockDim.y;
	int thdId = threadIdx.x + blockIdx.y * blockDim.x;
	int step = blockDim.x * gridDim.y;
	if (colId < col ) {
		for (int i = thdId; i < row; i += step) {
			mat[IDX2C(i,colId, row)] = arr[i] - mat[IDX2C(i,colId, row)];
		}
		
	}
}

// Treats the matrix as a flat array: one thread per element, with a modulo to recover
// the row index into arr. Access is coalesced.
__global__ void addMinusMtx3(double *mat, int row, int col, int *arr, int len) {
	int id = threadIdx.x + blockIdx.x * blockDim.x;
	int length = row * col;
	if (id < length ) {
		int i = id%row;
		mat[id] = arr[i] - mat[id];
	}
}

// Variant of addMinusMtx3 with the modulo removed (always uses arr[0]); its result is
// wrong on purpose -- it exists only to measure the cost of the modulo in addMinusMtx3.
__global__ void addMinusMtx4(double *mat, int row, int col, int *arr, int len) {
	int id = threadIdx.x + blockIdx.x * blockDim.x;
	int length = row * col;
	if (id < length) {
		mat[id] = arr[0] - mat[id];
	}
}

// One thread per element, with threadIdx.x mapped to the column index. With column-major
// storage, consecutive threads in a warp are a full column apart (stride = row doubles),
// so global-memory accesses are not coalesced.
__global__ void addMinusMtx1(double *mat, int row, int col, int *arr, int len) {
	int colId = threadIdx.x + blockIdx.x * blockDim.x;
	int rowId = threadIdx.y + blockIdx.y * blockDim.y;
	
	if (colId < col && rowId < row) {
		mat[IDX2C(rowId, colId, row)] = arr[rowId] -  mat[IDX2C(rowId,colId, row)];
	}
}

void ts(double *mat, int row, int col, int *arr, int len) {
	for (int i = 0; i < row; ++i) {
		for (int j = 0; j < col; ++j) {
			mat[IDX2C(i,j,row)] = arr[i] - mat[IDX2C(i,j,row)];
		}
	}
}

void ts1(double *mat, int row, int col, int *arr, int len) {
	int length = row*col;
	for (int i = 0; i < length; ++i) {
		int j = i % row;
		mat[i] = arr[j] - mat[i];
	}
}

void test(int argc, char *argv[]) {
	if (argc != 3) {
		cout << "row col\n";
		return ;
	}
	int row = atoi(argv[1]);
	int col = atoi(argv[2]);
	int len = row;
	
	int *arr = (int*)malloc(sizeof(int)*len);
	double *mat = (double*)malloc(sizeof(double)*row*col);
	for (int i = 0; i < len; ++i) arr[i] = len - i;
	for (int i = 0; i < row; ++i) {
		for (int j = 0; j < col; ++j) {
			mat[IDX2C(i,j,row)] = row - i - 1;
		}
	}
	
	double *mat1 = (double*)malloc(sizeof(double)*row*col);
	memcpy(mat1, mat, sizeof(double)*row*col);
	
	int *garr;
	double *gmat;
	CUDA_CHECK(cudaMalloc((void**)&garr, sizeof(int)*len));
	CUDA_CHECK(cudaMalloc((void**)&gmat, sizeof(double)*row*col));
	CUDA_CHECK(cudaMemcpy(garr, arr, sizeof(int)*len, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(gmat, mat, sizeof(double)*row*col, cudaMemcpyHostToDevice));
	
	/*int len = 3099121;//2097152;
	int *mat;
	CUDA_CHECK(cudaMalloc((void**)&mat, sizeof(int)*len*BLOCK_SIZE));
	CUDA_CHECK(cudaMemset(mat,0,sizeof(int)*len*BLOCK_SIZE));
	dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
	dim3 dimGrid((len + BLOCK_SIZE - 1)/BLOCK_SIZE,(BLOCK_SIZE + BLOCK_SIZE - 1)/BLOCK_SIZE);*/
	struct timeval beg, end, b1, e1;
	
	
	gettimeofday(&b1, NULL);
	addMinusMtx0<<<col, 1024>>>(gmat, row, col, garr, len); // one block per column; gridDim.x's limit is very large, so col blocks in x are fine; memory access is coalesced
	CUDA_CHECK(cudaPeekAtLastError());
	CUDA_CHECK(cudaDeviceSynchronize());
	gettimeofday(&e1, NULL);
	cout << "gpu0 real time used: " << e1.tv_sec-b1.tv_sec + (double)(e1.tv_usec-b1.tv_usec)/1000000 <<endl;
	
	CUDA_CHECK(cudaMemcpy(gmat, mat, sizeof(double)*row*col, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaDeviceSynchronize());
	
	gettimeofday(&b1, NULL);
	dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
	dim3 dimGrid((col + BLOCK_SIZE - 1)/BLOCK_SIZE,(row + BLOCK_SIZE - 1)/BLOCK_SIZE);
	addMinusMtx1<<<dimGrid, dimBlock>>>(gmat, row, col, garr, len); // one thread per element of gmat; the access pattern is not coalesced
	CUDA_CHECK(cudaPeekAtLastError());
	CUDA_CHECK(cudaDeviceSynchronize());
	gettimeofday(&e1, NULL);
	cout << "gpu1 real time used: " << e1.tv_sec-b1.tv_sec + (double)(e1.tv_usec-b1.tv_usec)/1000000 <<endl;
	
	CUDA_CHECK(cudaMemcpy(gmat, mat, sizeof(double)*row*col, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaDeviceSynchronize());
	
	gettimeofday(&b1, NULL);
	int y = (row + 1024 -1)/1024;
	if (y > 65535) y = 65535;
	dimGrid.x = col;
	dimGrid.y = y;
	//dim3 dimGrid(col,y);
	addMinusMtx2<<<dimGrid, 1024>>>(gmat, row, col, garr, len); // almost the same as addMinusMtx0; the only difference is that there are also blocks in the y direction
	CUDA_CHECK(cudaPeekAtLastError());
	CUDA_CHECK(cudaDeviceSynchronize());
	gettimeofday(&e1, NULL);
	cout << "gpu2 real time used: " << e1.tv_sec-b1.tv_sec + (double)(e1.tv_usec-b1.tv_usec)/1000000 <<endl;
	
	CUDA_CHECK(cudaMemcpy(gmat, mat, sizeof(double)*row*col, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaDeviceSynchronize());
	
	gettimeofday(&b1, NULL);
	
	addMinusMtx3<<<(row*col + 1023)/1024, 1024>>>(gmat, row, col, garr, len); // process gmat as a flat array, which needs a modulo. On the GPU, integer division and modulo are costly: tens of instructions on compute capability 1.x, but below 20 instructions on 2.x and higher.
	CUDA_CHECK(cudaPeekAtLastError());
	CUDA_CHECK(cudaDeviceSynchronize());
	gettimeofday(&e1, NULL);
	cout << "gpu3 real time used: " << e1.tv_sec-b1.tv_sec + (double)(e1.tv_usec-b1.tv_usec)/1000000 <<endl;
	
	
	
	gettimeofday(&beg, NULL);
	ts(mat, row, col, arr,len);
	gettimeofday(&end, NULL);
	cout << "cpu real time used: " << end.tv_sec-beg.tv_sec + (double)(end.tv_usec-beg.tv_usec)/1000000 <<endl;
	
	gettimeofday(&beg, NULL);
	ts1(mat1, row, col, arr,len);
	gettimeofday(&end, NULL);
	cout << "cpu1 real time used: " << end.tv_sec-beg.tv_sec + (double)(end.tv_usec-beg.tv_usec)/1000000 <<endl;
	
	if (validate(mat1, gmat, row, col)) {
		cout << "yes\n";
	}
	else {
		cout << "no\n";
	}
	
	
	
}

int main(int argc, char *argv[] ) {
	test(argc, argv);
	return 0;
}

nvcc  -arch=sm_35  mtxOp.cu -o mtxOp

./mtxOp 30000 8000

gpu0 real time used: 0.028922
gpu1 real time used: 0.106911
gpu2 real time used: 0.028231
gpu3 real time used: 0.027024

Because the matrix is stored column-major, method 1 is the slowest: its global-memory accesses cannot be coalesced, since consecutive threads within a warp do not access consecutive data.
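
To make the two access patterns concrete, the host-side sketch below (an illustration added for this write-up, not part of the program above; it reuses the IDX2C macro and assumes row = 30000 as in the test run) prints the flat indices touched by the first warp of each kernel: addMinusMtx0's warp reads 32 consecutive doubles, while addMinusMtx1's warp reads elements a full column apart.

#include <cstdio>

#define IDX2C(i,j,rows) (((j)*(rows)+(i)))

int main() {
	const int row = 30000;
	// addMinusMtx0, warp 0 of block 0: colId = blockIdx.x = 0, i = threadIdx.x = 0..31
	printf("addMinusMtx0 warp 0:");
	for (int t = 0; t < 32; ++t) printf(" %d", IDX2C(t, 0, row));   // 0 1 2 ... 31 (coalesced)
	// addMinusMtx1, warp 0 of block (0,0): rowId = threadIdx.y = 0, colId = threadIdx.x = 0..31
	printf("\naddMinusMtx1 warp 0:");
	for (int t = 0; t < 32; ++t) printf(" %d", IDX2C(0, t, row));   // 0 30000 60000 ... (strided)
	printf("\n");
	return 0;
}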

The idea of method 0 is one block per column, with blocks only in the x direction: gridDim.x can be as large as 2147483647, and for any matrix that fits in device memory the number of columns will not exceed that limit.

Each block has 1024 threads, also only in the x direction; these 1024 threads loop over the column, which guarantees coalesced access.
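
Rather than hard-coding 2147483647 (and the 65535 cap on gridDim.y used later in test()), these limits can be read from the device itself; a minimal sketch using the standard CUDA runtime API, separate from the benchmark code:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, 0);   // query device 0
	// on compute capability 3.x this prints 2147483647 x 65535 x 65535
	printf("maxGridSize: %d x %d x %d\n",
	       prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
	printf("maxThreadsPerBlock: %d\n", prop.maxThreadsPerBlock);
	return 0;
}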


Method 2 goes one step further than method 0 by also using blocks in the y direction.


Method 3 treats the matrix as a flat vector, but this requires a modulo operation. On compute capability 1.0 the modulo is very slow; compute capability 3.5 handles it reasonably well, so this version turns out to be the fastest here.
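
If row happens to be a power of two (which is not the case for the 30000 x 8000 run above), the modulo can be avoided entirely with a bitwise AND; a sketch of that alternative kernel, not used in this benchmark:

// Assumes row is a power of two, so id % row == id & (row - 1).
__global__ void addMinusMtxPow2(double *mat, int row, int col, int *arr, int len) {
	int id = threadIdx.x + blockIdx.x * blockDim.x;
	int length = row * col;
	if (id < length) {
		int i = id & (row - 1);   // row index without a division/modulo
		mat[id] = arr[i] - mat[id];
	}
}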



If the modulo is removed from method 3 (kernel addMinusMtx4), we can see the impact of the modulo itself: the time drops to 0.025607 s, an improvement of about 0.0014 s.
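
The launch for that modulo-free variant (addMinusMtx4) is not shown in test(); it would presumably mirror the addMinusMtx3 launch, along the lines of the fragment below, placed right after the gpu3 timing block (the 0.025607 s figure is the author's measurement, not reproduced here):

	CUDA_CHECK(cudaMemcpy(gmat, mat, sizeof(double)*row*col, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaDeviceSynchronize());
	
	gettimeofday(&b1, NULL);
	addMinusMtx4<<<(row*col + 1023)/1024, 1024>>>(gmat, row, col, garr, len);
	CUDA_CHECK(cudaPeekAtLastError());
	CUDA_CHECK(cudaDeviceSynchronize());
	gettimeofday(&e1, NULL);
	cout << "gpu4 real time used: " << e1.tv_sec-b1.tv_sec + (double)(e1.tv_usec-b1.tv_usec)/1000000 <<endl;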
