~/samples/NVIDIA_CUDA-9.1_Samples/1_Utilities/deviceQuery$ make
/usr/local/cuda-9.1/bin/nvcc -ccbin g++ -I../../common/inc -m64 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o deviceQuery.o -c deviceQuery.cpp
/usr/local/cuda-9.1/bin/nvcc -ccbin g++ -m64 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o deviceQuery deviceQuery.o
mkdir -p ../../bin/x86_64/linux/release
cp deviceQuery ../../bin/x86_64/linux/release
~/samples/NVIDIA_CUDA-9.1_Samples/1_Utilities/deviceQuery$ ls
deviceQuery  deviceQuery.cpp  deviceQuery.o  Makefile  NsightEclipse.xml  readme.txt
~/samples/NVIDIA_CUDA-9.1_Samples/1_Utilities/deviceQuery$ ./deviceQuery
./deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GTX 1050"
  CUDA Driver Version / Runtime Version          10.1 / 9.1
  CUDA Capability Major/Minor version number:    6.1
  Total amount of global memory:                 1998 MBytes (2095382528 bytes)
  ( 5) Multiprocessors, (128) CUDA Cores/MP:     640 CUDA Cores
  GPU Max Clock rate:                            1455 MHz (1.46 GHz)
  Memory Clock rate:                             3504 Mhz
  Memory Bus Width:                              128-bit
  L2 Cache Size:                                 1048576 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
  Run time limit on kernels:                     No
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  Device supports Unified Addressing (UVA):      Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      Yes
  Device PCI Domain ID / Bus ID / location ID:   0 / 2 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 10.1, CUDA Runtime Version = 9.1, NumDevs = 1
Result = PASS
~/samples/NVIDIA_CUDA-9.1_Samples/1_Utilities/deviceQuery$ cat deviceQuery.cpp
/*
* Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* This sample queries the properties of the CUDA devices present in the system via CUDA Runtime API. */
// Shared Utilities (QA Testing)
// std::system includes
#include <memory>
#include <iostream>
#include <cuda_runtime.h>
#include <helper_cuda.h>
int *pArgc = NULL;
char **pArgv = NULL;
#if CUDART_VERSION < 5000
// CUDA-C includes
#include <cuda.h>
// This function wraps the CUDA Driver API into a template function
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
    if (CUDA_SUCCESS != error)
    {
        fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
                error, __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
}
#endif /* CUDART_VERSION < 5000 */
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;
    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
    if (error_id != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
        printf("Result = FAIL\n");
        exit(EXIT_FAILURE);
    }
    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }
    int dev, driverVersion = 0, runtimeVersion = 0;
    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, (driverVersion%100)/10, runtimeVersion/1000, (runtimeVersion%100)/10);
        printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);
        char msg[256];
        SPRINTF(msg, "  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n",
                (float)deviceProp.totalGlobalMem/1048576.0f, (unsigned long long) deviceProp.totalGlobalMem);
        printf("%s", msg);
        printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf("  GPU Max Clock rate:                            %.0f MHz (%0.2f GHz)\n", deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf("  Memory Clock rate:                             %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
        printf("  Memory Bus Width:                              %d-bit\n", deviceProp.memoryBusWidth);
        if (deviceProp.l2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", deviceProp.l2CacheSize);
        }
#else
        // This is only available in CUDA 4.0-4.2 (but these were only exposed in the CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf("  Memory Clock rate:                             %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
        if (L2CacheSize)
        {
            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
        }
#endif
? ? ? ? printf("? Maximum Texture Dimension Size (x,y,z)? ? ? ? 1D=(%d), 2D=(%d, %d), 3D=(%d, %d, %d)\n",
? ? ? ? ? ? ? deviceProp.maxTexture1D? , deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
? ? ? ? ? ? ? deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
? ? ? ? printf("? Maximum Layered 1D Texture Size, (num) layers? 1D=(%d), %d layers\n",
? ? ? ? ? ? ? deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
? ? ? ? printf("? Maximum Layered 2D Texture Size, (num) layers? 2D=(%d, %d), %d layers\n",
? ? ? ? ? ? ? deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);
? ? ? ? printf("? Total amount of constant memory:? ? ? ? ? ? ? %lu bytes\n", deviceProp.totalConstMem);
? ? ? ? printf("? Total amount of shared memory per block:? ? ? %lu bytes\n", deviceProp.sharedMemPerBlock);
? ? ? ? printf("? Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
? ? ? ? printf("? Warp size:? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? %d\n", deviceProp.warpSize);
? ? ? ? printf("? Maximum number of threads per multiprocessor:? %d\n", deviceProp.maxThreadsPerMultiProcessor);
? ? ? ? printf("? Maximum number of threads per block:? ? ? ? ? %d\n", deviceProp.maxThreadsPerBlock);
? ? ? ? printf("? Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
? ? ? ? ? ? ? deviceProp.maxThreadsDim[0],
? ? ? ? ? ? ? deviceProp.maxThreadsDim[1],
? ? ? ? ? ? ? deviceProp.maxThreadsDim[2]);
? ? ? ? printf("? Max dimension size of a grid size? ? (x,y,z): (%d, %d, %d)\n",
? ? ? ? ? ? ? deviceProp.maxGridSize[0],
? ? ? ? ? ? ? deviceProp.maxGridSize[1],
? ? ? ? ? ? ? deviceProp.maxGridSize[2]);
? ? ? ? printf("? Maximum memory pitch:? ? ? ? ? ? ? ? ? ? ? ? ? %lu bytes\n", deviceProp.memPitch);
? ? ? ? printf("? Texture alignment:? ? ? ? ? ? ? ? ? ? ? ? ? ? %lu bytes\n", deviceProp.textureAlignment);
? ? ? ? printf("? Concurrent copy and kernel execution:? ? ? ? ? %s with %d copy engine(s)\n", (deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
? ? ? ? printf("? Run time limit on kernels:? ? ? ? ? ? ? ? ? ? %s\n", deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
? ? ? ? printf("? Integrated GPU sharing Host Memory:? ? ? ? ? ? %s\n", deviceProp.integrated ? "Yes" : "No");
? ? ? ? printf("? Support host page-locked memory mapping:? ? ? %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
? ? ? ? printf("? Alignment requirement for Surfaces:? ? ? ? ? ? %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
? ? ? ? printf("? Device has ECC support:? ? ? ? ? ? ? ? ? ? ? ? %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
? ? ? ? printf("? CUDA Device Driver Mode (TCC or WDDM):? ? ? ? %s\n", deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
? ? ? ? printf("? Device supports Unified Addressing (UVA):? ? ? %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
? ? ? ? printf("? Supports Cooperative Kernel Launch:? ? ? ? ? ? %s\n", deviceProp.cooperativeLaunch ? "Yes" : "No");
? ? ? ? printf("? Supports MultiDevice Co-op Kernel Launch:? ? ? %s\n", deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
? ? ? ? printf("? Device PCI Domain ID / Bus ID / location ID:? %d / %d / %d\n", deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
? ? ? ? const char *sComputeMode[] =
? ? ? ? {
? ? ? ? ? ? "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)",
? ? ? ? ? ? "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)",
? ? ? ? ? ? "Prohibited (no host thread can use ::cudaSetDevice() with this device)",
? ? ? ? ? ? "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)",
? ? ? ? ? ? "Unknown",
? ? ? ? ? ? NULL
? ? ? ? };
? ? ? ? printf("? Compute Mode:\n");
? ? ? ? printf("? ? < %s >\n", sComputeMode[deviceProp.computeMode]);
? ? }
? ? // If there are 2 or more GPUs, query to determine whether RDMA is supported
? ? if (deviceCount >= 2)
? ? {
? ? ? ? cudaDeviceProp prop[64];
? ? ? ? int gpuid[64]; // we want to find the first two GPUs that can support P2P
? ? ? ? int gpu_p2p_count = 0;
? ? ? ? for (int i=0; i < deviceCount; i++)
? ? ? ? {
? ? ? ? ? ? checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
? ? ? ? ? ? // Only boards based on Fermi or later can support P2P
? ? ? ? ? ? if ((prop[i].major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
                // on Windows (64-bit), the Tesla Compute Cluster driver for windows must be enabled to support this
                && prop[i].tccDriver
#endif
               )
            {
                // This is an array of P2P capable GPUs
                gpuid[gpu_p2p_count++] = i;
            }
        }
        // Show all the combinations of supported P2P GPUs
        int can_access_peer;
        if (gpu_p2p_count >= 2)
        {
            for (int i = 0; i < gpu_p2p_count; i++)
            {
                for (int j = 0; j < gpu_p2p_count; j++)
                {
                    if (gpuid[i] == gpuid[j])
                    {
                        continue;
                    }
                    checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
                    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid[i]].name, gpuid[i],
                           prop[gpuid[j]].name, gpuid[j],
                           can_access_peer ? "Yes" : "No");
                }
            }
        }
    }
    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[16];
    // driver version
    sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", driverVersion/1000, (driverVersion%100)/10);
#endif
    sProfileString += cTemp;
    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#else
    sprintf(cTemp, "%d.%d", runtimeVersion/1000, (runtimeVersion%100)/10);
#endif
    sProfileString += cTemp;
    // Device count
    sProfileString += ", NumDevs = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    sprintf(cTemp, "%d", deviceCount);
#endif
    sProfileString += cTemp;
    sProfileString += "\n";
    printf("%s", sProfileString.c_str());
    printf("Result = PASS\n");
    // finish
    exit(EXIT_SUCCESS);
}
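
Helper functions such as checkCudaErrors, _ConvertSMVer2Cores, and SPRINTF come from the samples' common headers (helper_cuda.h and the headers it pulls in), which is why the Makefile adds -I../../common/inc. If you only need a handful of device properties in your own project and would rather not depend on the samples tree, the core of deviceQuery reduces to two Runtime API calls: cudaGetDeviceCount and cudaGetDeviceProperties. Below is a minimal sketch of that idea; the file name deviceQuery_min.cu and the particular fields printed are illustrative choices, not part of the NVIDIA sample, and it assumes the nvcc from the same toolkit is on your PATH.

// deviceQuery_min.cu -- hypothetical, trimmed-down sketch (not the NVIDIA sample)
// Build: nvcc -o deviceQuery_min deviceQuery_min.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main()
{
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }
    for (int dev = 0; dev < deviceCount; ++dev)
    {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, dev) != cudaSuccess)
        {
            continue; // skip devices we cannot query
        }
        // Print a few commonly needed fields; cudaDeviceProp has many more.
        printf("Device %d: %s (compute %d.%d, %d SMs, %zu MB global memory)\n",
               dev, prop.name, prop.major, prop.minor,
               prop.multiProcessorCount, prop.totalGlobalMem / (1024 * 1024));
    }
    return EXIT_SUCCESS;
}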