From 6d41eb56505ec62533f30d66d1ba090a6fea5038 Mon Sep 17 00:00:00 2001 From: kun yu Date: Tue, 6 Aug 2019 11:27:23 +0800 Subject: [PATCH] add temperature and pcpu Former-commit-id: 79e80d6e4ee42e05a5f9e60021fc4cc7ff9cc04d --- cpp/CHANGELOG.md | 2 ++ cpp/src/db/DBImpl.cpp | 3 ++- cpp/src/metrics/MetricBase.h | 2 ++ cpp/src/metrics/PrometheusMetrics.cpp | 35 +++++++++++++++++------- cpp/src/metrics/PrometheusMetrics.h | 15 +++++++++-- cpp/src/metrics/SystemInfo.cpp | 38 ++++++++++++++++++++++++--- cpp/src/metrics/SystemInfo.h | 5 +++- 7 files changed, 83 insertions(+), 17 deletions(-) diff --git a/cpp/CHANGELOG.md b/cpp/CHANGELOG.md index 6e5fcc2c96..80df09ad28 100644 --- a/cpp/CHANGELOG.md +++ b/cpp/CHANGELOG.md @@ -40,6 +40,8 @@ Please mark all change in change log and use the ticket from JIRA. - MS-266 - Improve topk reduce time by using multi-threads - MS-275 - Avoid sqlite logic error excetion - MS-278 - add IndexStatsHelper +- MS-305 - add CPU core percent metric +- MS-310 - add milvus CPU utilization ratio and CPU/GPU temperature metrics ## New Feature - MS-180 - Add new mem manager diff --git a/cpp/src/db/DBImpl.cpp b/cpp/src/db/DBImpl.cpp index a79c918b9a..f7c8e986a2 100644 --- a/cpp/src/db/DBImpl.cpp +++ b/cpp/src/db/DBImpl.cpp @@ -319,7 +319,8 @@ void DBImpl::StartMetricTask() { server::Metrics::GetInstance().OctetsSet(); server::Metrics::GetInstance().CPUCoreUsagePercentSet(); - + server::Metrics::GetInstance().GPUTemperature(); + server::Metrics::GetInstance().CPUTemperature(); ENGINE_LOG_TRACE << "Metric task finished"; } diff --git a/cpp/src/metrics/MetricBase.h b/cpp/src/metrics/MetricBase.h index 61e9e7680f..23a2427b35 100644 --- a/cpp/src/metrics/MetricBase.h +++ b/cpp/src/metrics/MetricBase.h @@ -66,6 +66,8 @@ class MetricsBase{ virtual void OctetsSet() {}; virtual void CPUCoreUsagePercentSet() {}; + virtual void GPUTemperature() {}; + virtual void CPUTemperature() {}; }; diff --git a/cpp/src/metrics/PrometheusMetrics.cpp 
b/cpp/src/metrics/PrometheusMetrics.cpp index 3d83bff864..c7729ffdbc 100644 --- a/cpp/src/metrics/PrometheusMetrics.cpp +++ b/cpp/src/metrics/PrometheusMetrics.cpp @@ -34,8 +34,6 @@ PrometheusMetrics::Init() { return SERVER_UNEXPECTED_ERROR; } - // - return SERVER_SUCCESS; } @@ -44,8 +42,6 @@ PrometheusMetrics::Init() { void PrometheusMetrics::CPUUsagePercentSet() { if(!startup_) return ; - int numProcessor = server::SystemInfo::GetInstance().num_processor(); - double usage_percent = server::SystemInfo::GetInstance().CPUPercent(); CPU_usage_percent_.Set(usage_percent); } @@ -64,13 +60,11 @@ PrometheusMetrics::GPUPercentGaugeSet() { std::vector used_total = server::SystemInfo::GetInstance().GPUMemoryTotal(); std::vector used_memory = server::SystemInfo::GetInstance().GPUMemoryUsed(); - - for (int i = 0; i < numDevice; i++) { + for (int i = 0; i < numDevice; ++i) { prometheus::Gauge &GPU_percent = GPU_percent_.Add({{"DeviceNum", std::to_string(i)}}); double percent = (double)used_memory[i] / (double)used_total[i]; GPU_percent.Set(percent * 100); } - } void PrometheusMetrics::GPUMemoryUsageGaugeSet() { @@ -79,7 +73,7 @@ void PrometheusMetrics::GPUMemoryUsageGaugeSet() { constexpr unsigned long long MtoB = 1024*1024; int numDevice = server::SystemInfo::GetInstance().num_device(); - for (int i = 0; i < numDevice; i++) { + for (int i = 0; i < numDevice; ++i) { prometheus::Gauge &GPU_memory = GPU_memory_usage_.Add({{"DeviceNum", std::to_string(i)}}); GPU_memory.Set(values[i] / MtoB); } @@ -142,12 +136,35 @@ void PrometheusMetrics::CPUCoreUsagePercentSet() { std::vector cpu_core_percent = server::SystemInfo::GetInstance().CPUCorePercent(); - for (int i = 0; i < cpu_core_percent.size(); i++) { + for (int i = 0; i < cpu_core_percent.size(); ++i) { prometheus::Gauge &core_percent = CPU_.Add({{"CPU", std::to_string(i)}}); core_percent.Set(cpu_core_percent[i]); } } +void PrometheusMetrics::GPUTemperature() { + if (!startup_) + return; + + std::vector GPU_temperatures = 
server::SystemInfo::GetInstance().GPUTemperature(); + + for (int i = 0; i < GPU_temperatures.size(); ++i) { + prometheus::Gauge &gpu_temp = GPU_temperature_.Add({{"GPU", std::to_string(i)}}); + gpu_temp.Set(GPU_temperatures[i]); + } +} + +void PrometheusMetrics::CPUTemperature() { + if (!startup_) + return; + + std::vector CPU_temperatures = server::SystemInfo::GetInstance().CPUTemperature(); + + for (int i = 0; i < CPU_temperatures.size(); ++i) { + prometheus::Gauge &cpu_temp = CPU_temperature_.Add({{"CPU", std::to_string(i)}}); + cpu_temp.Set(CPU_temperatures[i]); + } +} } } diff --git a/cpp/src/metrics/PrometheusMetrics.h b/cpp/src/metrics/PrometheusMetrics.h index 590130f444..282c58800c 100644 --- a/cpp/src/metrics/PrometheusMetrics.h +++ b/cpp/src/metrics/PrometheusMetrics.h @@ -79,7 +79,6 @@ class PrometheusMetrics: public MetricsBase { void QueryVectorResponseSummaryObserve(double value, int count = 1) override { if (startup_) for(int i = 0 ; i < count ; ++i) query_vector_response_summary_.Observe(value);}; void QueryVectorResponsePerSecondGaugeSet(double value) override {if (startup_) query_vector_response_per_second_gauge_.Set(value);}; void CPUUsagePercentSet() override ; - void CPUCoreUsagePercentSet() override; void RAMUsagePercentSet() override ; @@ -93,6 +92,9 @@ class PrometheusMetrics: public MetricsBase { void KeepingAliveCounterIncrement(double value = 1) override {if(startup_) keeping_alive_counter_.Increment(value);}; void OctetsSet() override ; + void GPUTemperature() override; + void CPUTemperature() override; + @@ -396,7 +398,7 @@ class PrometheusMetrics: public MetricsBase { .Name("CPU_usage_percent") .Help("CPU usage percent by this this process") .Register(*registry_); - prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({{"CPU", "0"}}); + prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({{"CPU", "avg"}}); prometheus::Family &RAM_ = prometheus::BuildGauge() @@ -444,6 +446,15 @@ class PrometheusMetrics: public MetricsBase { prometheus::Gauge 
&outoctets_gauge_ = octets_.Add({{"type", "outoctets"}}); + prometheus::Family &GPU_temperature_ = prometheus::BuildGauge() + .Name("GPU_temperature") + .Help("GPU temperature") + .Register(*registry_); + + prometheus::Family &CPU_temperature_ = prometheus::BuildGauge() + .Name("CPU_temperature") + .Help("CPU temperature") + .Register(*registry_); }; diff --git a/cpp/src/metrics/SystemInfo.cpp b/cpp/src/metrics/SystemInfo.cpp index 7628db78bb..3b6698d42b 100644 --- a/cpp/src/metrics/SystemInfo.cpp +++ b/cpp/src/metrics/SystemInfo.cpp @@ -36,6 +36,9 @@ void SystemInfo::Init() { num_processors_ = 0; while(fgets(line, 128, file) != NULL){ if (strncmp(line, "processor", 9) == 0) num_processors_++; + if (strncmp(line, "physical", 8) == 0) { + num_physical_processors_ = ParseLine(line); + } } total_ram_ = GetPhysicalMemory(); fclose(file); @@ -108,8 +111,6 @@ SystemInfo::MemoryPercent() { return (double)(GetProcessUsedMemory()*100)/(double)total_ram_; } - - std::vector SystemInfo::CPUCorePercent() { std::vector prev_work_time_array; @@ -119,7 +120,7 @@ SystemInfo::CPUCorePercent() { std::vector cur_total_time_array = getTotalCpuTime(cur_work_time_array); std::vector cpu_core_percent; - for (int i = 0; i < num_processors_; i++) { + for (int i = 1; i < num_processors_; i++) { double total_cpu_time = cur_total_time_array[i] - prev_total_time_array[i]; double cpu_work_time = cur_work_time_array[i] - prev_work_time_array[i]; cpu_core_percent.push_back((cpu_work_time / total_cpu_time) * 100); @@ -181,7 +182,6 @@ SystemInfo::CPUPercent() { percent = (time_sample.tms_stime - last_sys_cpu_) + (time_sample.tms_utime - last_user_cpu_); percent /= (now - last_cpu_); - percent /= num_processors_; percent *= 100; } last_cpu_ = now; @@ -207,6 +207,36 @@ SystemInfo::GPUMemoryTotal() { return result; } +std::vector +SystemInfo::GPUTemperature(){ + if(!initialized_) Init(); + std::vector result; + for (int i = 0; i < num_device_; i++) { + nvmlDevice_t device; + 
nvmlDeviceGetHandleByIndex(i, &device); + unsigned int temp; + nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU,&temp); + result.push_back(temp); + } + return result; +} +std::vector +SystemInfo::CPUTemperature(){ + std::vector result; /* one entry per thermal zone that could be read; raw sysfs value divided by 1000 (presumably millidegrees -> degrees Celsius, confirm) */ + for (int i = 0; i <= num_physical_processors_; ++i) { + std::string path = "/sys/class/thermal/thermal_zone" + std::to_string(i) + "/temp"; + FILE *file = fopen(path.data(), "r"); + if (file == NULL) { + perror("Could not open thermal file"); + return result; + } + float temp = 0; + if (fscanf(file, "%f", &temp) == 1) result.push_back(temp / 1000); /* skip zones whose file does not parse instead of pushing an uninitialised value */ + fclose(file); /* close every zone file we open so repeated calls do not leak descriptors */ + } + return result; /* normal exit path: without this the function flows off the end of a non-void function */ +} + std::vector SystemInfo::GPUMemoryUsed() { // get GPU memory used diff --git a/cpp/src/metrics/SystemInfo.h b/cpp/src/metrics/SystemInfo.h index 629aaf7220..ab27375c73 100644 --- a/cpp/src/metrics/SystemInfo.h +++ b/cpp/src/metrics/SystemInfo.h @@ -32,6 +32,7 @@ class SystemInfo { clock_t last_user_cpu_ = clock_t(); std::chrono::system_clock::time_point net_time_ = std::chrono::system_clock::now(); int num_processors_ = 0; + int num_physical_processors_ = 0; //number of GPU unsigned int num_device_ = 0; unsigned long long in_octets_ = 0; @@ -47,6 +48,7 @@ class SystemInfo { void Init(); int num_processor() const { return num_processors_;}; + int num_physical_processors() const { return num_physical_processors_; }; int num_device() const {return num_device_;}; unsigned long long get_inoctets() { return in_octets_;}; unsigned long long get_octets() { return out_octets_;}; @@ -65,7 +67,8 @@ class SystemInfo { std::vector CPUCorePercent(); std::vector getTotalCpuTime(std::vector &workTime); - + std::vector GPUTemperature(); + std::vector CPUTemperature(); };