|
- #include <cublas_v2.h>
- #include <cuda_runtime.h>
- #include <gflags/gflags.h>
- #include <omp.h>
-
- #include <Eigen/Core>
- #include <ctime>
- #include <iostream>
- #include <utility>
-
- #define declGemmFn(name) \
- void name(const float *deviceAPtr, const float *deviceBPtr, \
- float *deviceCPtr, float alpha, float beta, unsigned M, \
- unsigned N, unsigned K)
-
- declGemmFn(gemmFinal);
- declGemmFn(gemmUse128);
- declGemmFn(gemmUseTile);
- declGemmFn(gemmNaive);
- declGemmFn(gemmHideSmemLatency);
- declGemmFn(gemmTransposeSmem);
- declGemmFn(gemmUseSmem);
-
- class GemmTester {
- class cuTimer {
- cudaEvent_t startEvent{}, stopEvent{};
-
- public:
- cuTimer() {
- cudaEventCreate(&startEvent);
- cudaEventCreate(&stopEvent);
- }
- ~cuTimer() {
- cudaEventDestroy(stopEvent);
- cudaEventDestroy(startEvent);
- }
-
- void start() { cudaEventRecord(startEvent); }
-
- float end() {
- cudaEventRecord(stopEvent);
- auto error = cudaEventSynchronize(stopEvent);
- if (error != cudaSuccess) {
- throw std::runtime_error(cudaGetErrorString(error));
- }
- float milliseconds = 0;
- cudaEventElapsedTime(&milliseconds, startEvent, stopEvent);
-
- return milliseconds;
- }
- };
-
- cuTimer timer{};
- Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> hostC;
- Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>
- deviceCCopied;
- const float *deviceAPtr, *deviceBPtr;
- float *deviceCPtr;
- const float *deviceCInitPtr;
- float alpha, beta;
- unsigned M, N, K;
- int iteration;
-
- void tearUp() {
- cudaMemcpy(deviceCPtr, deviceCInitPtr, M * N * sizeof(float),
- cudaMemcpyDeviceToDevice);
- }
-
- void checkValue() const {
- printf("Max Error: %f\n", (hostC - deviceCCopied).cwiseAbs().maxCoeff());
- }
-
- template <typename Function>
- void profile(Function &&gemmFunction) {
- double elapsedTime = 0;
- for (int i = 0; i < iteration; ++i) {
- tearUp();
- timer.start();
- gemmFunction(deviceAPtr, deviceBPtr, deviceCPtr, alpha, beta, M, N, K);
- elapsedTime += timer.end();
- }
- elapsedTime /= iteration;
- double GFLOPS = 2 * 1e-9 * M * N * K / (elapsedTime * 1e-3);
- printf("Average Time: %.3f ms, Throughput: %.3f GFLOPS\n", elapsedTime,
- GFLOPS);
- }
-
- public:
- explicit GemmTester(float alpha, float beta, unsigned M, unsigned N,
- unsigned K, int iteration)
- : hostC{M, N},
- deviceCCopied{M, N},
- alpha(alpha),
- beta(beta),
- M(M),
- N(N),
- K(K),
- iteration{iteration} {
- Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> A{M,
- K};
- Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> B{K,
- N};
- A.setRandom();
- B.setRandom();
- hostC.setRandom();
-
- float *_deviceCPtr, *_deviceCInitPtr;
- cudaMalloc(&_deviceCPtr, M * N * sizeof(float));
- cudaMalloc(&_deviceCInitPtr, M * N * sizeof(float));
- deviceCPtr = _deviceCPtr;
- deviceCInitPtr = _deviceCInitPtr;
- cudaMemcpy(_deviceCInitPtr, hostC.data(), M * N * sizeof(float),
- cudaMemcpyHostToDevice);
- cudaDeviceSynchronize();
-
- clock_t begin, end;
- begin = clock();
- hostC = alpha * (A * B) + beta * hostC;
- end = clock();
- printf("CPU use: %.3f ms\n", double(end - begin) / CLOCKS_PER_SEC * 1e3);
-
- float *_deviceAPtr, *_deviceBPtr;
- cudaMalloc(&_deviceAPtr, M * K * sizeof(float));
- cudaMalloc(&_deviceBPtr, K * N * sizeof(float));
- cudaMemcpy(_deviceAPtr, A.data(), M * K * sizeof(float),
- cudaMemcpyHostToDevice);
- cudaMemcpy(_deviceBPtr, B.data(), K * N * sizeof(float),
- cudaMemcpyHostToDevice);
- cudaDeviceSynchronize();
-
- deviceAPtr = _deviceAPtr;
- deviceBPtr = _deviceBPtr;
- }
- ~GemmTester() {
- cudaFree((void *)deviceAPtr);
- cudaFree((void *)deviceBPtr);
- cudaFree(deviceCPtr);
- }
-
- template <typename Function>
- void evaluate(Function &&gemmFunction, const char *name) {
- tearUp();
- printf("-----------------------------------\n");
- printf("Evaluating %s\n", name);
- gemmFunction(deviceAPtr, deviceBPtr, deviceCPtr, alpha, beta, M, N, K);
- cudaMemcpy(deviceCCopied.data(), deviceCPtr, M * N * sizeof(float),
- cudaMemcpyDeviceToHost);
- cudaDeviceSynchronize();
- checkValue();
- profile(std::forward<Function>(gemmFunction));
- printf("-----------------------------------\n");
- }
- };
-
- class gemmCuBlas {
- cublasHandle_t handle{nullptr};
-
- public:
- gemmCuBlas() { cublasCreate(&handle); }
- ~gemmCuBlas() { cublasDestroy(handle); }
-
- void operator()(const float *A, const float *B, float *C, float &alpha,
- float &beta, int M, int N, int K) const {
- int lda = N, ldb = K, ldc = N;
- cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, B, lda, A,
- ldb, &beta, C, ldc);
- }
- };
-
- int getSPcores(cudaDeviceProp devProp) {
- int cores = 0;
- int mp = devProp.multiProcessorCount;
- switch (devProp.major) {
- case 2: // Fermi
- if (devProp.minor == 1)
- cores = mp * 48;
- else
- cores = mp * 32;
- break;
- case 3: // Kepler
- cores = mp * 192;
- break;
- case 5: // Maxwell
- cores = mp * 128;
- break;
- case 6: // Pascal
- if ((devProp.minor == 1) || (devProp.minor == 2))
- cores = mp * 128;
- else if (devProp.minor == 0)
- cores = mp * 64;
- else
- throw std::runtime_error("Unknown device type");
- break;
- case 7: // Volta and Turing
- if ((devProp.minor == 0) || (devProp.minor == 5))
- cores = mp * 64;
- else
- throw std::runtime_error("Unknown device type");
- break;
- case 8: // Ampere
- if (devProp.minor == 0)
- cores = mp * 64;
- else if (devProp.minor == 6)
- cores = mp * 128;
- else
- throw std::runtime_error("Unknown device type");
- break;
- default:
- throw std::runtime_error("Unknown device type");
- }
- return cores;
- }
-
- DEFINE_int32(cpu_procs, omp_get_num_procs(), "processor num used of CPU");
- DEFINE_int32(gpu_rank, 0, "the used GPU rank");
- DEFINE_int32(repeat_iterations, 10,
- "repeat iteration numbers and average the result");
- DEFINE_double(alpha, 1., "alpha");
- DEFINE_double(beta, 1., "beta");
- DEFINE_uint32(M, 2048, "M");
- DEFINE_uint32(N, 2048, "N");
- DEFINE_uint32(K, 1024, "K");
-
- int main(int argc, char *argv[]) {
- GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
-
- printf("Program start with %d CPU processes on the %d-th GPU\n",
- FLAGS_cpu_procs, FLAGS_gpu_rank);
- omp_set_num_threads(FLAGS_cpu_procs);
- cudaDeviceProp deviceProp{};
- cudaGetDeviceProperties(&deviceProp, FLAGS_gpu_rank);
- cudaSetDevice(FLAGS_gpu_rank);
- printf("GPU %s status: ", deviceProp.name);
- double boostFrequency = deviceProp.clockRate / 1e6;
- int fp32CoresNum = getSPcores(deviceProp);
- double peakPerformance = boostFrequency * fp32CoresNum * 2;
- printf(
- "clock rate %.3f GHz, FP32 cores num %d, FP32 peak throughput %.3f "
- "GFLOPS\n",
- boostFrequency, fp32CoresNum, peakPerformance);
- printf("A: %d x %d, B: %d x %d, C: %d x %d\n", FLAGS_M, FLAGS_K, FLAGS_K,
- FLAGS_N, FLAGS_M, FLAGS_N);
-
- GemmTester tester{
- (float)FLAGS_alpha, (float)FLAGS_beta, FLAGS_M, FLAGS_N, FLAGS_K,
- FLAGS_repeat_iterations};
- tester.evaluate(gemmCuBlas{}, "cuBlas");
- tester.evaluate(gemmNaive, "Naive");
- tester.evaluate(gemmUse128, "Use128");
- tester.evaluate(gemmUseTile, "UseTile");
- tester.evaluate(gemmUseSmem, "UseSmem");
- tester.evaluate(gemmTransposeSmem, "TransposeSmem");
- tester.evaluate(gemmHideSmemLatency, "HideSmemLatency");
- tester.evaluate(gemmFinal, "Final");
-
- GFLAGS_NAMESPACE::ShutDownCommandLineFlags();
- return 0;
- }
|