milvus/pkg/util/hardware/gpu_mem_info_cuda.go
congqixia 57e5652f1a
enhance: Log error instead of panicking if load lock wait timeout (#39308)
Related to #39205
Previous PR #39206

This PR changes the wait-timeout behavior to log an error and return, to avoid
causing read failures in other collections when only some collections have a
deadlock.

Signed-off-by: Congqi Xia <congqi.xia@zilliz.com>
2025-01-16 02:31:02 +08:00

93 lines
2.6 KiB
Go

//go:build cuda
// +build cuda
package hardware
/*
#cgo CFLAGS: -I/usr/local/cuda/include
#cgo LDFLAGS: -L/usr/local/cuda/lib64 -lcudart
#include <cuda_runtime.h>
#include <stdlib.h>
// Per-device memory snapshot filled in by getAllGPUMemoryInfo.
// Both fields are set to 0 for a device that could not be selected or queried.
typedef struct {
// Total device memory, as reported by cudaMemGetInfo.
size_t totalMemory;
// Currently free device memory, as reported by cudaMemGetInfo.
size_t freeMemory;
} GPUMemoryInfo;
// Collects total/free memory for every CUDA device visible to the process.
//
// On success, *infos points to a heap-allocated array holding one entry per
// device and the device count is returned; the caller owns the array and must
// release it with free(). On failure (CUDA error, no devices, NULL argument or
// allocation failure) 0 is returned and *infos is set to NULL.
//
// A device that cannot be selected or queried is still counted; its entry has
// both fields set to 0.
//
// NOTE: only line comments are used here because this code lives inside a Go
// block comment (cgo preamble).
int getAllGPUMemoryInfo(GPUMemoryInfo** infos) {
    if (infos == NULL) {
        return 0;
    }
    // Give the caller a well-defined value on every failure path.
    *infos = NULL;

    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess || deviceCount <= 0) {
        return 0; // No GPUs found or error occurred
    }

    // The current device is per-host-thread state in the CUDA runtime.
    // Remember it so that probing every GPU below does not leave the calling
    // thread bound to the last enumerated device.
    int previousDevice = 0;
    int havePreviousDevice = (cudaGetDevice(&previousDevice) == cudaSuccess);

    // calloc zero-fills the array, so entries for devices that fail below are
    // already reported as 0/0.
    GPUMemoryInfo* result = (GPUMemoryInfo*)calloc((size_t)deviceCount, sizeof(GPUMemoryInfo));
    if (result == NULL) {
        return 0; // Memory allocation failed
    }

    for (int i = 0; i < deviceCount; ++i) {
        if (cudaSetDevice(i) != cudaSuccess) {
            continue; // Device cannot be selected; keep the zeroed entry
        }
        size_t freeMem = 0, totalMem = 0;
        if (cudaMemGetInfo(&freeMem, &totalMem) != cudaSuccess) {
            continue; // Memory info cannot be fetched; keep the zeroed entry
        }
        result[i].totalMemory = totalMem;
        result[i].freeMemory = freeMem;
    }

    if (havePreviousDevice) {
        // Best effort: nothing useful can be done here if restoring fails.
        (void)cudaSetDevice(previousDevice);
    }

    *infos = result;
    return deviceCount; // Number of entries in *infos
}
*/
import "C"
import (
"unsafe"
"github.com/cockroachdb/errors"
)
// GPUMemoryInfo describes the memory state of one GPU device.
type GPUMemoryInfo struct {
	// TotalMemory is the total amount of device memory, in bytes.
	TotalMemory uint64
	// FreeMemory is the amount of device memory currently free, in bytes.
	FreeMemory uint64
}
// GetAllGPUMemoryInfo retrieves the memory information for all available GPUs.
//
// It returns one GPUMemoryInfo per device reported by the C helper, in device
// index order. A device whose memory could not be queried is still present in
// the result with both fields set to 0. An error is returned when no GPU is
// found or the C helper fails to produce a result.
func GetAllGPUMemoryInfo() ([]GPUMemoryInfo, error) {
	var infos *C.GPUMemoryInfo

	// The C helper allocates the array with malloc and hands ownership to us.
	deviceCount := int(C.getAllGPUMemoryInfo(&infos))
	// free(NULL) is a no-op, so deferring unconditionally guarantees the
	// buffer is released on every return path.
	defer C.free(unsafe.Pointer(infos))

	// Reject non-positive counts and a nil buffer up front; slicing either
	// would panic.
	if deviceCount <= 0 || infos == nil {
		return nil, errors.New("failed to retrieve GPU memory info or no GPUs found")
	}

	// View the C array as a Go slice without copying. unsafe.Slice avoids the
	// legacy huge-array cast, which declares a multi-GiB array type and does
	// not compile on 32-bit targets.
	cInfos := unsafe.Slice(infos, deviceCount)

	// Copy into Go-owned memory because the C buffer is freed on return.
	gpuInfos := make([]GPUMemoryInfo, 0, deviceCount)
	for _, info := range cInfos {
		gpuInfos = append(gpuInfos, GPUMemoryInfo{
			TotalMemory: uint64(info.totalMemory),
			FreeMemory:  uint64(info.freeMemory),
		})
	}
	return gpuInfos, nil
}