diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d0c5b1545..f0e66c3d9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ Please mark all change in change log and use the issue from GitHub - \#2054 Check if CPU instruction sets are illegal - \#2059 Add lock file avoid multiple instances modifying data at the same time - \#2064 Warn when use SQLite as metadata management +- \#2111 Check GPU environment before start server ## Improvement - \#221 Refactor LOG macro diff --git a/core/src/server/Server.cpp b/core/src/server/Server.cpp index 5f945a5074..931f2b294e 100644 --- a/core/src/server/Server.cpp +++ b/core/src/server/Server.cpp @@ -24,6 +24,7 @@ #include "server/DBWrapper.h" #include "server/grpc_impl/GrpcServer.h" #include "server/init/CpuChecker.h" +#include "server/init/GpuChecker.h" #include "server/web_impl/WebServer.h" #include "src/version.h" //#include "storage/s3/S3ClientWrapper.h" @@ -237,6 +238,13 @@ Server::Start() { if (!s.ok()) { return s; } + +#ifdef MILVUS_GPU_VERSION + s = GpuChecker::CheckGpuEnvironment(); + if (!s.ok()) { + return s; + } +#endif /* record config and hardware information into log */ LogConfigInFile(config_filename_); LogCpuInfo(); diff --git a/core/src/server/init/CpuChecker.cpp b/core/src/server/init/CpuChecker.cpp index cfdcf26ddb..24c7620a5d 100644 --- a/core/src/server/init/CpuChecker.cpp +++ b/core/src/server/init/CpuChecker.cpp @@ -15,6 +15,8 @@ #include #include +#include + #include "faiss/FaissHook.h" #include "faiss/utils/instruction_set.h" #include "utils/Log.h" @@ -28,16 +30,26 @@ CpuChecker::CheckCpuInstructionSet() { std::vector instruction_sets; auto& instruction_set_inst = faiss::InstructionSet::GetInstance(); - if (faiss::support_avx512()) { + + bool support_avx512 = faiss::support_avx512(); + fiu_do_on("CpuChecker.CheckCpuInstructionSet.not_support_avx512", support_avx512 = false); + if (support_avx512) { instruction_sets.emplace_back("avx512"); } - if (instruction_set_inst.AVX2()) { + + bool 
support_axv2 = instruction_set_inst.AVX2(); + fiu_do_on("CpuChecker.CheckCpuInstructionSet.not_support_avx2", support_axv2 = false); + if (support_axv2) { instruction_sets.emplace_back("avx2"); } - if (instruction_set_inst.SSE42()) { + + bool support_sse4_2 = instruction_set_inst.SSE42(); + fiu_do_on("CpuChecker.CheckCpuInstructionSet.not_support_sse4_2", support_sse4_2 = false); + if (support_sse4_2) { instruction_sets.emplace_back("sse4_2"); } + fiu_do_on("CpuChecker.CheckCpuInstructionSet.instruction_sets_empty", instruction_sets.clear()); if (instruction_sets.empty()) { std::string msg = "CPU instruction sets are not supported. Ensure the CPU supports at least one of the following instruction " diff --git a/core/src/server/init/GpuChecker.cpp b/core/src/server/init/GpuChecker.cpp new file mode 100644 index 0000000000..15e0cbe1c7 --- /dev/null +++ b/core/src/server/init/GpuChecker.cpp @@ -0,0 +1,274 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License. + +#ifdef MILVUS_GPU_VERSION +#include "server/init/GpuChecker.h" + +#include +#include +#include + +#include + +#include "config/Config.h" +#include "utils/Log.h" + +namespace milvus { +namespace server { + +namespace { +std::string +ConvertCudaVersion(int version) { + return std::to_string(version / 1000) + "." 
+ std::to_string((version % 100) / 10); +} +} // namespace + +const int CUDA_MIN_VERSION = 10000; // 10.0 +const float GPU_MIN_COMPUTE_CAPACITY = 6.0; +const char* NVIDIA_MIN_DRIVER_VERSION = "418.00"; + +std::string +GpuChecker::NvmlErrorString(nvmlReturn_t error_no) { + return "code: " + std::to_string(error_no) + ", message: " + nvmlErrorString(error_no); +} + +std::string +GpuChecker::CudaErrorString(cudaError_t error_no) { + return "code: " + std::to_string(error_no) + ", message: " + cudaGetErrorString(error_no); +} + +Status +GpuChecker::GetGpuComputeCapacity(nvmlDevice_t device, int& major, int& minor) { + nvmlReturn_t code = nvmlDeviceGetCudaComputeCapability(device, &major, &minor); + if (NVML_SUCCESS != code) { + return Status(SERVER_UNEXPECTED_ERROR, NvmlErrorString(code)); + } + + return Status::OK(); +} + +Status +GpuChecker::GetGpuNvidiaDriverVersion(std::string& version) { + char driver_version[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; + memset(driver_version, 0, NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE); + auto nvml_code = nvmlSystemGetDriverVersion(driver_version, NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE); + if (NVML_SUCCESS != nvml_code) { + return Status(SERVER_UNEXPECTED_ERROR, NvmlErrorString(nvml_code)); + } + + version = std::string(driver_version); + return Status::OK(); +} + +Status +GpuChecker::GetGpuCudaDriverVersion(int& version) { + auto cuda_code = cudaDriverGetVersion(&version); + if (cudaSuccess != cuda_code) { + std::string error_msg = "Check cuda driver version failed. " + CudaErrorString(cuda_code); + return Status(SERVER_UNEXPECTED_ERROR, error_msg); + } + return Status::OK(); +} + +Status +GpuChecker::GetGpuCudaRuntimeVersion(int& version) { + auto cuda_code = cudaRuntimeGetVersion(&version); + if (cudaSuccess != cuda_code) { + std::string error_msg = "Check cuda runtime version failed. 
" + CudaErrorString(cuda_code); + return Status(SERVER_UNEXPECTED_ERROR, error_msg); + } + return Status::OK(); +} + +Status +GpuChecker::CheckGpuEnvironment() { + std::string err_msg; + + auto& config = Config::GetInstance(); + bool gpu_enable = true; + auto status = config.GetGpuResourceConfigEnable(gpu_enable); + if (!status.ok()) { + err_msg = "Cannot check if GPUs are enable from configuration. " + status.message(); + LOG_SERVER_FATAL_ << err_msg; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + if (!gpu_enable) { + return Status::OK(); + } + + std::vector build_gpus; + status = config.GetGpuResourceConfigBuildIndexResources(build_gpus); + if (!status.ok()) { + err_msg = "Get GPU resources of building index failed. " + status.message(); + LOG_SERVER_FATAL_ << err_msg; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + + std::vector search_gpus; + status = config.GetGpuResourceConfigSearchResources(search_gpus); + if (!status.ok()) { + err_msg = "Get GPU resources of search failed. " + status.message(); + LOG_SERVER_FATAL_ << err_msg; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + + std::set gpu_sets(build_gpus.begin(), build_gpus.end()); + gpu_sets.insert(search_gpus.begin(), search_gpus.end()); + + nvmlReturn_t nvmlresult = nvmlInit(); + fiu_do_on("GpuChecker.CheckGpuEnvironment.nvml_init_fail", nvmlresult = NVML_ERROR_UNKNOWN); + if (NVML_SUCCESS != nvmlresult) { + err_msg = "nvml initialize failed. " + NvmlErrorString(nvmlresult); + LOG_SERVER_FATAL_ << err_msg; + std::cerr << err_msg << std::endl; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + + /* Check nvidia driver version */ + std::string nvidia_version; + status = GetGpuNvidiaDriverVersion(nvidia_version); + fiu_do_on("GpuChecker.CheckGpuEnvironment.get_nvidia_driver_fail", status = Status(SERVER_UNEXPECTED_ERROR, "")); + if (!status.ok()) { + err_msg = " Check nvidia driver failed. 
" + status.message(); + LOG_SERVER_FATAL_ << err_msg; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + + fiu_do_on("GpuChecker.CheckGpuEnvironment.nvidia_driver_too_slow", + nvidia_version = std::to_string(std::stof(NVIDIA_MIN_DRIVER_VERSION) - 1)); + if (nvidia_version.compare(NVIDIA_MIN_DRIVER_VERSION) < 0) { + err_msg = "Nvidia driver version " + std::string(nvidia_version) + " is slower than " + + std::string(NVIDIA_MIN_DRIVER_VERSION); + LOG_SERVER_FATAL_ << err_msg; + std::cerr << err_msg << std::endl; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + + /* Check Cuda version */ + int cuda_driver_version = 0; + status = GetGpuCudaDriverVersion(cuda_driver_version); + fiu_do_on("GpuChecker.CheckGpuEnvironment.cuda_driver_fail", status = Status(SERVER_UNEXPECTED_ERROR, "")); + if (!status.ok()) { + err_msg = " Check Cuda driver failed. " + status.message(); + LOG_SERVER_FATAL_ << err_msg; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + fiu_do_on("GpuChecker.CheckGpuEnvironment.cuda_driver_too_slow", cuda_driver_version = CUDA_MIN_VERSION - 1); + if (cuda_driver_version < CUDA_MIN_VERSION) { + err_msg = "Cuda driver version is " + ConvertCudaVersion(cuda_driver_version) + + ", slower than minimum required version " + ConvertCudaVersion(CUDA_MIN_VERSION); + LOG_SERVER_FATAL_ << err_msg; + std::cerr << err_msg << std::endl; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + + int cuda_runtime_version = 0; + status = GetGpuCudaRuntimeVersion(cuda_runtime_version); + fiu_do_on("GpuChecker.CheckGpuEnvironment.cuda_runtime_driver_fail", status = Status(SERVER_UNEXPECTED_ERROR, "")); + if (!status.ok()) { + err_msg = " Check Cuda runtime driver failed. 
" + status.message(); + LOG_SERVER_FATAL_ << err_msg; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + fiu_do_on("GpuChecker.CheckGpuEnvironment.cuda_runtime_driver_too_slow", + cuda_runtime_version = CUDA_MIN_VERSION - 1); + if (cuda_runtime_version < CUDA_MIN_VERSION) { + err_msg = "Cuda runtime version is " + ConvertCudaVersion(cuda_runtime_version) + + ", slow than minimum required version " + ConvertCudaVersion(CUDA_MIN_VERSION); + LOG_SERVER_FATAL_ << err_msg; + std::cerr << err_msg << std::endl; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + + /* Compute capacity */ + uint32_t device_count = 0; + nvmlresult = nvmlDeviceGetCount(&device_count); + fiu_do_on("GpuChecker.CheckGpuEnvironment.nvml_get_device_count_fail", nvmlresult = NVML_ERROR_UNKNOWN); + if (NVML_SUCCESS != nvmlresult) { + err_msg = "Obtain GPU count failed. " + NvmlErrorString(nvmlresult); + LOG_SERVER_FATAL_ << err_msg; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + + fiu_do_on("GpuChecker.CheckGpuEnvironment.nvml_device_count_zero", device_count = 0); + if (device_count == 0) { + err_msg = "GPU count is zero. Make sure there are available GPUs in host machine"; + LOG_SERVER_FATAL_ << err_msg; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + + char device_name[NVML_DEVICE_NAME_BUFFER_SIZE]; + int major, minor; + for (uint32_t i = 0; i < device_count; i++) { + if (gpu_sets.find(i) == gpu_sets.end()) { + continue; + } + + nvmlDevice_t device; + nvmlresult = nvmlDeviceGetHandleByIndex(i, &device); + fiu_do_on("GpuChecker.CheckGpuEnvironment.nvml_get_device_handle_fail", nvmlresult = NVML_ERROR_UNKNOWN); + if (NVML_SUCCESS != nvmlresult) { + err_msg = "Obtain GPU " + std::to_string(i) + " handle failed. 
" + NvmlErrorString(nvmlresult); + LOG_SERVER_FATAL_ << err_msg; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + memset(device_name, 0, NVML_DEVICE_NAME_BUFFER_SIZE); + nvmlresult = nvmlDeviceGetName(device, device_name, NVML_DEVICE_NAME_BUFFER_SIZE); + fiu_do_on("GpuChecker.CheckGpuEnvironment.nvml_get_device_name_fail", nvmlresult = NVML_ERROR_UNKNOWN); + if (NVML_SUCCESS != nvmlresult) { + err_msg = "Obtain GPU " + std::to_string(i) + " name failed. " + NvmlErrorString(nvmlresult); + LOG_SERVER_FATAL_ << err_msg; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + + major = 0; + minor = 0; + status = GetGpuComputeCapacity(device, major, minor); + fiu_do_on("GpuChecker.CheckGpuEnvironment.device_compute_capacity_fail", + status = Status(SERVER_UNEXPECTED_ERROR, "")); + if (!status.ok()) { + err_msg = "Obtain GPU " + std::to_string(i) + " compute capacity failed. " + status.message(); + LOG_SERVER_FATAL_ << err_msg; + std::cerr << err_msg << std::endl; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + float cc = major + minor / 1.0f; + fiu_do_on("GpuChecker.CheckGpuEnvironment.device_compute_capacity_too_weak", cc = GPU_MIN_COMPUTE_CAPACITY - 1); + if (cc < GPU_MIN_COMPUTE_CAPACITY) { + err_msg = "GPU " + std::to_string(i) + " compute capability " + std::to_string(cc) + + " is too weak. Required least GPU compute capability is " + + std::to_string(GPU_MIN_COMPUTE_CAPACITY); + LOG_SERVER_FATAL_ << err_msg; + std::cerr << err_msg << std::endl; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + + LOG_SERVER_INFO_ << "GPU" << i << ": name=" << device_name << ", compute capacity=" << cc; + } + + nvmlresult = nvmlShutdown(); + fiu_do_on("GpuChecker.CheckGpuEnvironment.nvml_shutdown_fail", nvmlresult = NVML_ERROR_UNKNOWN); + if (NVML_SUCCESS != nvmlresult) { + err_msg = "nvml shutdown handle failed. 
" + NvmlErrorString(nvmlresult); + LOG_SERVER_FATAL_ << err_msg; + return Status(SERVER_UNEXPECTED_ERROR, err_msg); + } + + std::cout << "Nvidia driver version: " << nvidia_version << "\n" + << "CUDA Driver Version / Runtime Version : " << ConvertCudaVersion(cuda_driver_version) << " / " + << ConvertCudaVersion(cuda_runtime_version) << std::endl; + + return Status::OK(); +} + +} // namespace server +} // namespace milvus +#endif diff --git a/core/src/server/init/GpuChecker.h b/core/src/server/init/GpuChecker.h new file mode 100644 index 0000000000..0c3ec078d3 --- /dev/null +++ b/core/src/server/init/GpuChecker.h @@ -0,0 +1,58 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License. 
+ +#ifdef MILVUS_GPU_VERSION + +#pragma once + +#include + +#include +#include + +#include "utils/Status.h" + +namespace milvus { +namespace server { + +extern const int CUDA_MIN_VERSION; +extern const float GPU_MIN_COMPUTE_CAPACITY; +extern const char* NVIDIA_MIN_DRIVER_VERSION; + +class GpuChecker { + private: + static std::string + NvmlErrorString(nvmlReturn_t error_no); + + static std::string + CudaErrorString(cudaError_t error_no); + + private: + static Status + GetGpuComputeCapacity(nvmlDevice_t device, int& major, int& minor); + + static Status + GetGpuNvidiaDriverVersion(std::string& version); + + static Status + GetGpuCudaDriverVersion(int& version); + + static Status + GetGpuCudaRuntimeVersion(int& version); + + public: + static Status + CheckGpuEnvironment(); +}; + +} // namespace server +} // namespace milvus +#endif diff --git a/core/unittest/server/CMakeLists.txt b/core/unittest/server/CMakeLists.txt index fc2b10ed01..abd80750de 100644 --- a/core/unittest/server/CMakeLists.txt +++ b/core/unittest/server/CMakeLists.txt @@ -13,6 +13,7 @@ set(test_files ${CMAKE_CURRENT_SOURCE_DIR}/test_cache.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_check.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_config.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_rpc.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_web.cpp diff --git a/core/unittest/server/test_check.cpp b/core/unittest/server/test_check.cpp new file mode 100644 index 0000000000..95131708da --- /dev/null +++ b/core/unittest/server/test_check.cpp @@ -0,0 +1,140 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License.
+
+#include <fiu-control.h>
+#include <fiu-local.h>
+#include <gtest/gtest.h>
+
+#include "config/Config.h"
+#include "server/init/CpuChecker.h"
+#ifdef MILVUS_GPU_VERSION
+#include "server/init/GpuChecker.h"
+#endif
+
+namespace ms = milvus::server;
+
+class ServerCheckerTest : public testing::Test {
+ protected:
+    void
+    SetUp() override {
+    }
+
+    void
+    TearDown() override {
+    }
+};
+
+TEST_F(ServerCheckerTest, CPU_TEST) {
+    auto status = ms::CpuChecker::CheckCpuInstructionSet();
+    ASSERT_TRUE(status.ok());
+}
+
+TEST_F(ServerCheckerTest, CPU_FAIL_TEST) {
+    fiu_enable("CpuChecker.CheckCpuInstructionSet.instruction_sets_empty", 1, nullptr, 0);
+    ASSERT_FALSE(ms::CpuChecker::CheckCpuInstructionSet().ok());
+    fiu_disable("CpuChecker.CheckCpuInstructionSet.instruction_sets_empty");
+
+    fiu_enable("CpuChecker.CheckCpuInstructionSet.not_support_avx512", 1, nullptr, 0);
+    // avx512 is reported as unsupported; avx2 and sse4_2 are still reported by the host
+    ASSERT_TRUE(ms::CpuChecker::CheckCpuInstructionSet().ok());
+
+    // only sse4_2 is left
+    fiu_enable("CpuChecker.CheckCpuInstructionSet.not_support_avx2", 1, nullptr, 0);
+    ASSERT_TRUE(ms::CpuChecker::CheckCpuInstructionSet().ok());
+
+    // none of sse4_2, avx2, avx512 is reported as supported
+    fiu_enable("CpuChecker.CheckCpuInstructionSet.not_support_sse4_2", 1, nullptr, 0);
+    ASSERT_FALSE(ms::CpuChecker::CheckCpuInstructionSet().ok());
+
+    fiu_disable("CpuChecker.CheckCpuInstructionSet.not_support_sse4_2");
+    fiu_disable("CpuChecker.CheckCpuInstructionSet.not_support_avx2");
+    fiu_disable("CpuChecker.CheckCpuInstructionSet.not_support_avx512");
+}
+
+#ifdef MILVUS_GPU_VERSION
+TEST_F(ServerCheckerTest, GPU_TEST) {
+    auto& config = ms::Config::GetInstance();
+    auto status = config.SetGpuResourceConfigEnable("true");
+    ASSERT_TRUE(status.ok()) << status.message();
+
+    status = ms::GpuChecker::CheckGpuEnvironment();
+    ASSERT_TRUE(status.ok()) << status.message();
+
+    status = config.SetGpuResourceConfigEnable("false");
+    ASSERT_TRUE(status.ok()) << status.message();
+
+    status = ms::GpuChecker::CheckGpuEnvironment();
+    ASSERT_TRUE(status.ok()) << status.message();
+}
+
+TEST_F(ServerCheckerTest, GPU_FAIL_TEST) {
+    auto& config = ms::Config::GetInstance();
+    auto status = config.SetGpuResourceConfigEnable("true");
+    ASSERT_TRUE(status.ok()) << status.message();
+
+    fiu_enable("GpuChecker.CheckGpuEnvironment.nvml_init_fail", 1, nullptr, 0);
+    ASSERT_FALSE(ms::GpuChecker::CheckGpuEnvironment().ok());
+    fiu_disable("GpuChecker.CheckGpuEnvironment.nvml_init_fail");
+
+    fiu_enable("GpuChecker.CheckGpuEnvironment.get_nvidia_driver_fail", 1, nullptr, 0);
+    ASSERT_FALSE(ms::GpuChecker::CheckGpuEnvironment().ok());
+    fiu_disable("GpuChecker.CheckGpuEnvironment.get_nvidia_driver_fail");
+
+    fiu_enable("GpuChecker.CheckGpuEnvironment.nvidia_driver_too_slow", 1, nullptr, 0);
+    ASSERT_FALSE(ms::GpuChecker::CheckGpuEnvironment().ok());
+    fiu_disable("GpuChecker.CheckGpuEnvironment.nvidia_driver_too_slow");
+
+    fiu_enable("GpuChecker.CheckGpuEnvironment.cuda_driver_fail", 1, nullptr, 0);
+    ASSERT_FALSE(ms::GpuChecker::CheckGpuEnvironment().ok());
+    fiu_disable("GpuChecker.CheckGpuEnvironment.cuda_driver_fail");
+
+    fiu_enable("GpuChecker.CheckGpuEnvironment.cuda_driver_too_slow", 1, nullptr, 0);
+    ASSERT_FALSE(ms::GpuChecker::CheckGpuEnvironment().ok());
+    fiu_disable("GpuChecker.CheckGpuEnvironment.cuda_driver_too_slow");
+
+    fiu_enable("GpuChecker.CheckGpuEnvironment.cuda_runtime_driver_fail", 1, nullptr, 0);
+    ASSERT_FALSE(ms::GpuChecker::CheckGpuEnvironment().ok());
+    fiu_disable("GpuChecker.CheckGpuEnvironment.cuda_runtime_driver_fail");
+
+    fiu_enable("GpuChecker.CheckGpuEnvironment.cuda_runtime_driver_too_slow", 1, nullptr, 0);
+    ASSERT_FALSE(ms::GpuChecker::CheckGpuEnvironment().ok());
+    fiu_disable("GpuChecker.CheckGpuEnvironment.cuda_runtime_driver_too_slow");
+
+    fiu_enable("GpuChecker.CheckGpuEnvironment.nvml_get_device_count_fail", 1, nullptr, 0);
+    ASSERT_FALSE(ms::GpuChecker::CheckGpuEnvironment().ok());
+    fiu_disable("GpuChecker.CheckGpuEnvironment.nvml_get_device_count_fail");
+
+    fiu_enable("GpuChecker.CheckGpuEnvironment.nvml_device_count_zero", 1, nullptr, 0);
+    ASSERT_FALSE(ms::GpuChecker::CheckGpuEnvironment().ok());
+    fiu_disable("GpuChecker.CheckGpuEnvironment.nvml_device_count_zero");
+
+    fiu_enable("GpuChecker.CheckGpuEnvironment.nvml_get_device_handle_fail", 1, nullptr, 0);
+    ASSERT_FALSE(ms::GpuChecker::CheckGpuEnvironment().ok());
+    fiu_disable("GpuChecker.CheckGpuEnvironment.nvml_get_device_handle_fail");
+
+    fiu_enable("GpuChecker.CheckGpuEnvironment.nvml_get_device_name_fail", 1, nullptr, 0);
+    ASSERT_FALSE(ms::GpuChecker::CheckGpuEnvironment().ok());
+    fiu_disable("GpuChecker.CheckGpuEnvironment.nvml_get_device_name_fail");
+
+    fiu_enable("GpuChecker.CheckGpuEnvironment.device_compute_capacity_fail", 1, nullptr, 0);
+    ASSERT_FALSE(ms::GpuChecker::CheckGpuEnvironment().ok());
+    fiu_disable("GpuChecker.CheckGpuEnvironment.device_compute_capacity_fail");
+
+    fiu_enable("GpuChecker.CheckGpuEnvironment.device_compute_capacity_too_weak", 1, nullptr, 0);
+    ASSERT_FALSE(ms::GpuChecker::CheckGpuEnvironment().ok());
+    fiu_disable("GpuChecker.CheckGpuEnvironment.device_compute_capacity_too_weak");
+
+    fiu_enable("GpuChecker.CheckGpuEnvironment.nvml_shutdown_fail", 1, nullptr, 0);
+    ASSERT_FALSE(ms::GpuChecker::CheckGpuEnvironment().ok());
+    fiu_disable("GpuChecker.CheckGpuEnvironment.nvml_shutdown_fail");
+}
+
+#endif