[v0.04] Fix bug in which NVML was using the wrong GPU id
This commit is contained in:
@@ -62,6 +62,7 @@ struct gpu_info {
|
|||||||
struct memory* mem;
|
struct memory* mem;
|
||||||
struct cache* cach;
|
struct cache* cach;
|
||||||
int64_t peak_performance;
|
int64_t peak_performance;
|
||||||
|
int32_t idx;
|
||||||
};
|
};
|
||||||
|
|
||||||
VENDOR get_gpu_vendor(struct gpu_info* gpu);
|
VENDOR get_gpu_vendor(struct gpu_info* gpu);
|
||||||
|
|||||||
@@ -67,8 +67,9 @@ int64_t get_peak_performance(struct gpu_info* gpu) {
|
|||||||
struct gpu_info* get_gpu_info(int gpu_idx) {
|
struct gpu_info* get_gpu_info(int gpu_idx) {
|
||||||
struct gpu_info* gpu = (struct gpu_info*) emalloc(sizeof(struct gpu_info));
|
struct gpu_info* gpu = (struct gpu_info*) emalloc(sizeof(struct gpu_info));
|
||||||
gpu->pci = NULL;
|
gpu->pci = NULL;
|
||||||
|
gpu->idx = gpu_idx;
|
||||||
|
|
||||||
if(gpu_idx < 0) {
|
if(gpu->idx < 0) {
|
||||||
printErr("GPU index must be equal or greater than zero");
|
printErr("GPU index must be equal or greater than zero");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
@@ -89,13 +90,13 @@ struct gpu_info* get_gpu_info(int gpu_idx) {
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(gpu_idx+1 > num_gpus) {
|
if(gpu->idx+1 > num_gpus) {
|
||||||
printErr("Requested GPU index %d in a system with %d GPUs", gpu_idx, num_gpus);
|
printErr("Requested GPU index %d in a system with %d GPUs", gpu->idx, num_gpus);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
cudaDeviceProp deviceProp;
|
cudaDeviceProp deviceProp;
|
||||||
if ((err = cudaGetDeviceProperties(&deviceProp, gpu_idx)) != cudaSuccess) {
|
if ((err = cudaGetDeviceProperties(&deviceProp, gpu->idx)) != cudaSuccess) {
|
||||||
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
|
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
@@ -106,7 +107,7 @@ struct gpu_info* get_gpu_info(int gpu_idx) {
|
|||||||
strcpy(gpu->name, deviceProp.name);
|
strcpy(gpu->name, deviceProp.name);
|
||||||
|
|
||||||
gpu->nvmld = nvml_init();
|
gpu->nvmld = nvml_init();
|
||||||
if(nvml_get_pci_info(gpu_idx, gpu->nvmld)) {
|
if(nvml_get_pci_info(gpu->idx, gpu->nvmld)) {
|
||||||
gpu->pci = get_pci_from_nvml(gpu->nvmld);
|
gpu->pci = get_pci_from_nvml(gpu->nvmld);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -252,9 +252,12 @@ void map_chip_to_uarch(struct uarch* arch) {
|
|||||||
struct uarch* get_uarch_from_cuda(struct gpu_info* gpu) {
|
struct uarch* get_uarch_from_cuda(struct gpu_info* gpu) {
|
||||||
struct uarch* arch = (struct uarch*) emalloc(sizeof(struct uarch));
|
struct uarch* arch = (struct uarch*) emalloc(sizeof(struct uarch));
|
||||||
|
|
||||||
int dev = 0;
|
cudaError_t err = cudaSuccess;
|
||||||
cudaDeviceProp deviceProp;
|
cudaDeviceProp deviceProp;
|
||||||
cudaGetDeviceProperties(&deviceProp, dev);
|
if ((err = cudaGetDeviceProperties(&deviceProp, gpu->idx)) != cudaSuccess) {
|
||||||
|
printErr("%s: %s", cudaGetErrorName(err), cudaGetErrorString(err));
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
arch->chip_str = NULL;
|
arch->chip_str = NULL;
|
||||||
arch->cc_major = deviceProp.major;
|
arch->cc_major = deviceProp.major;
|
||||||
|
|||||||
Reference in New Issue
Block a user