From 691df6ed18f80cffd94e1fb7286b68c62a225861 Mon Sep 17 00:00:00 2001 From: kun yu Date: Sat, 27 Jul 2019 16:19:34 +0800 Subject: [PATCH 1/4] add CPU_usage_rate Metrics Former-commit-id: a2a914159d759f9724b70954758cd22d7d5c98ff --- cpp/src/metrics/MetricBase.h | 2 + cpp/src/metrics/PrometheusMetrics.cpp | 59 +++++++++++++++++++-------- cpp/src/metrics/PrometheusMetrics.h | 45 +++++++++++--------- cpp/src/metrics/SystemInfo.cpp | 58 +++++++++++++++++++++++++- cpp/src/metrics/SystemInfo.h | 5 +++ 5 files changed, 131 insertions(+), 38 deletions(-) diff --git a/cpp/src/metrics/MetricBase.h b/cpp/src/metrics/MetricBase.h index fe9b246503..61e9e7680f 100644 --- a/cpp/src/metrics/MetricBase.h +++ b/cpp/src/metrics/MetricBase.h @@ -64,6 +64,8 @@ class MetricsBase{ virtual void ConnectionGaugeDecrement() {}; virtual void KeepingAliveCounterIncrement(double value = 1) {}; virtual void OctetsSet() {}; + + virtual void CPUCoreUsagePercentSet() {}; }; diff --git a/cpp/src/metrics/PrometheusMetrics.cpp b/cpp/src/metrics/PrometheusMetrics.cpp index df0516344e..c8a09b8ea6 100644 --- a/cpp/src/metrics/PrometheusMetrics.cpp +++ b/cpp/src/metrics/PrometheusMetrics.cpp @@ -44,6 +44,8 @@ PrometheusMetrics::Init() { void PrometheusMetrics::CPUUsagePercentSet() { if(!startup_) return ; + int numProcessor = server::SystemInfo::GetInstance().num_processor(); + double usage_percent = server::SystemInfo::GetInstance().CPUPercent(); CPU_usage_percent_.Set(usage_percent); } @@ -60,14 +62,20 @@ PrometheusMetrics::GPUPercentGaugeSet() { if(!startup_) return; int numDevide = server::SystemInfo::GetInstance().num_device(); std::vector values = server::SystemInfo::GetInstance().GPUPercent(); - if(numDevide >= 1) GPU0_percent_gauge_.Set(static_cast(values[0])); - if(numDevide >= 2) GPU1_percent_gauge_.Set(static_cast(values[1])); - if(numDevide >= 3) GPU2_percent_gauge_.Set(static_cast(values[2])); - if(numDevide >= 4) GPU3_percent_gauge_.Set(static_cast(values[3])); - if(numDevide >= 5) GPU4_percent_gauge_.Set(static_cast(values[4])); - if(numDevide >= 6) GPU5_percent_gauge_.Set(static_cast(values[5])); - if(numDevide >= 7) GPU6_percent_gauge_.Set(static_cast(values[6])); - if(numDevide >= 8) GPU7_percent_gauge_.Set(static_cast(values[7])); + + for (int i = 0; i < values.size(); i++) { + prometheus::Gauge &GPU_percent = GPU_percent_.Add({{"DeviceNum", std::to_string(i)}}); + GPU_percent.Set(static_cast(values[i])); + } + +// if(numDevide >= 1) GPU0_percent_gauge_.Set(static_cast(values[0])); +// if(numDevide >= 2) GPU1_percent_gauge_.Set(static_cast(values[1])); +// if(numDevide >= 3) GPU2_percent_gauge_.Set(static_cast(values[2])); +// if(numDevide >= 4) GPU3_percent_gauge_.Set(static_cast(values[3])); +// if(numDevide >= 5) GPU4_percent_gauge_.Set(static_cast(values[4])); +// if(numDevide >= 6) GPU5_percent_gauge_.Set(static_cast(values[5])); +// if(numDevide >= 7) GPU6_percent_gauge_.Set(static_cast(values[6])); +// if(numDevide >= 8) GPU7_percent_gauge_.Set(static_cast(values[7])); // to do } @@ -78,16 +86,21 @@ void PrometheusMetrics::GPUMemoryUsageGaugeSet() { constexpr unsigned long long MtoB = 1024*1024; int numDevice = values.size(); - if(numDevice >=1) GPU0_memory_usage_gauge_.Set(values[0]/MtoB); - if(numDevice >=2) GPU1_memory_usage_gauge_.Set(values[1]/MtoB); - if(numDevice >=3) GPU2_memory_usage_gauge_.Set(values[2]/MtoB); - if(numDevice >=4) GPU3_memory_usage_gauge_.Set(values[3]/MtoB); - if(numDevice >=5) GPU4_memory_usage_gauge_.Set(values[4]/MtoB); - if(numDevice >=6) GPU5_memory_usage_gauge_.Set(values[5]/MtoB); - if(numDevice >=7) GPU6_memory_usage_gauge_.Set(values[6]/MtoB); - if(numDevice >=8) GPU7_memory_usage_gauge_.Set(values[7]/MtoB); + for (int i = 0; i < numDevice; i++) { + prometheus::Gauge &GPU_memory = GPU_memory_usage_.Add({{"DeviceNum", std::to_string(i)}}); + GPU_memory.Set(values[i] / MtoB); + } + + +// if(numDevice >=1) GPU0_memory_usage_gauge_.Set(values[0]/MtoB); +// if(numDevice >=2) GPU1_memory_usage_gauge_.Set(values[1]/MtoB); +// if(numDevice >=3) GPU2_memory_usage_gauge_.Set(values[2]/MtoB); +// if(numDevice >=4) GPU3_memory_usage_gauge_.Set(values[3]/MtoB); +// if(numDevice >=5) GPU4_memory_usage_gauge_.Set(values[4]/MtoB); +// if(numDevice >=6) GPU5_memory_usage_gauge_.Set(values[5]/MtoB); +// if(numDevice >=7) GPU6_memory_usage_gauge_.Set(values[6]/MtoB); +// if(numDevice >=8) GPU7_memory_usage_gauge_.Set(values[7]/MtoB); - // to do } void PrometheusMetrics::AddVectorsPerSecondGaugeSet(int num_vector, int dim, double time) { // MB/s @@ -140,6 +153,18 @@ void PrometheusMetrics::OctetsSet() { outoctets_gauge_.Set((in_and_out_octets.second-old_outoctets)/total_second); } +void PrometheusMetrics::CPUCoreUsagePercentSet() { + if (!startup_) + return; + + std::vector cpu_core_percent = server::SystemInfo::GetInstance().CPUCorePercent(); + + for (int i = 0; i < cpu_core_percent.size(); i++) { + prometheus::Gauge &core_percent = CPU_.Add({{"CPU", std::to_string(i)}}); + core_percent.Set(cpu_core_percent[i]); +// std::cout << cpu_core_percent[i] << "+"; + } +} } diff --git a/cpp/src/metrics/PrometheusMetrics.h b/cpp/src/metrics/PrometheusMetrics.h index 5b651ec14f..be73585310 100644 --- a/cpp/src/metrics/PrometheusMetrics.h +++ b/cpp/src/metrics/PrometheusMetrics.h @@ -12,6 +12,7 @@ #include #include +#include #include "server/ServerConfig.h" #include "MetricBase.h" @@ -78,6 +79,9 @@ class PrometheusMetrics: public MetricsBase { void QueryVectorResponseSummaryObserve(double value, int count = 1) override { if (startup_) for(int i = 0 ; i < count ; ++i) query_vector_response_summary_.Observe(value);}; void QueryVectorResponsePerSecondGaugeSet(double value) override {if (startup_) query_vector_response_per_second_gauge_.Set(value);}; void CPUUsagePercentSet() override ; + + void CPUCoreUsagePercentSet() override; + void RAMUsagePercentSet() override ; void QueryResponsePerSecondGaugeSet(double value) override {if(startup_) query_response_per_second_gauge.Set(value);}; void GPUPercentGaugeSet() override ; @@ -322,7 +326,7 @@ class PrometheusMetrics: public MetricsBase { prometheus::Gauge &faiss_disk_load_IO_speed_gauge_ = faiss_disk_load_IO_speed_.Add({{"DB","Faiss"}}); - ////all from CacheMgr.cpp + ////all from CacheMgr.cpp //record cache access count prometheus::Family &cache_access_ = prometheus::BuildCounter() .Name("cache_access_total") @@ -392,7 +396,9 @@ class PrometheusMetrics: public MetricsBase { .Name("CPU_usage_percent") .Help("CPU usage percent by this this process") .Register(*registry_); - prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({}); + prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({{"CPU", "0"}}); +// std::vector &CPU_usage_percent_array; + prometheus::Family &RAM_ = prometheus::BuildGauge() .Name("RAM_usage_percent") @@ -405,15 +411,14 @@ class PrometheusMetrics: public MetricsBase { .Name("Gpu_usage_percent") .Help("GPU_usage_percent ") .Register(*registry_); - prometheus::Gauge &GPU0_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "0"}}); - prometheus::Gauge &GPU1_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "1"}}); - prometheus::Gauge &GPU2_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "2"}}); - prometheus::Gauge &GPU3_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "3"}}); - prometheus::Gauge &GPU4_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "4"}}); - prometheus::Gauge &GPU5_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "5"}}); - prometheus::Gauge &GPU6_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "6"}}); - prometheus::Gauge &GPU7_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "7"}}); -// std::vector GPU_percent_gauges_; +// prometheus::Gauge &GPU0_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "0"}}); +// prometheus::Gauge &GPU1_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "1"}}); +// prometheus::Gauge &GPU2_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "2"}}); +// prometheus::Gauge &GPU3_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "3"}}); +// prometheus::Gauge &GPU4_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "4"}}); +// prometheus::Gauge &GPU5_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "5"}}); +// prometheus::Gauge &GPU6_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "6"}}); +// prometheus::Gauge &GPU7_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "7"}}); @@ -423,15 +428,15 @@ class PrometheusMetrics: public MetricsBase { .Name("GPU_memory_usage_total") .Help("GPU memory usage total ") .Register(*registry_); - prometheus::Gauge &GPU0_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "0"}}); - prometheus::Gauge &GPU1_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "1"}}); - prometheus::Gauge &GPU2_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "2"}}); - prometheus::Gauge &GPU3_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "3"}}); - prometheus::Gauge &GPU4_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "4"}}); - prometheus::Gauge &GPU5_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "5"}}); - prometheus::Gauge &GPU6_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "6"}}); - prometheus::Gauge &GPU7_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "7"}}); -// std::vector GPU_memory_usage_gauges_; +// prometheus::Gauge &GPU0_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "0"}}); +// prometheus::Gauge &GPU1_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "1"}}); +// prometheus::Gauge &GPU2_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "2"}}); +// prometheus::Gauge &GPU3_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "3"}}); +// prometheus::Gauge &GPU4_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "4"}}); +// prometheus::Gauge &GPU5_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "5"}}); +// prometheus::Gauge &GPU6_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "6"}}); +// prometheus::Gauge &GPU7_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "7"}}); + prometheus::Family &query_index_type_per_second_ = prometheus::BuildGauge() .Name("query_index_throughtout_per_microsecond") diff --git a/cpp/src/metrics/SystemInfo.cpp b/cpp/src/metrics/SystemInfo.cpp index a64cbc4992..9348b0d9c6 100644 --- a/cpp/src/metrics/SystemInfo.cpp +++ b/cpp/src/metrics/SystemInfo.cpp @@ -105,9 +105,65 @@ SystemInfo::GetProcessUsedMemory() { double SystemInfo::MemoryPercent() { if (!initialized_) Init(); - return GetProcessUsedMemory()*100/total_ram_; + return (double)(GetProcessUsedMemory()*100)/(double)total_ram_; } + + +std::vector +SystemInfo::CPUCorePercent() { + std::vector prev_work_time_array; + std::vector prev_total_time_array = getTotalCpuTime(prev_work_time_array); + usleep(100000); + std::vector cur_work_time_array; + std::vector cur_total_time_array = getTotalCpuTime(cur_work_time_array); + + std::vector cpu_core_percent; + for (int i = 0; i < num_processors_; i++) { + double total_cpu_time = cur_total_time_array[i] - prev_total_time_array[i]; + double cpu_work_time = cur_work_time_array[i] - prev_work_time_array[i]; + cpu_core_percent.push_back((cpu_work_time / total_cpu_time) * 100); + } + return cpu_core_percent; +} + +std::vector +SystemInfo::getTotalCpuTime(std::vector &work_time_array) +{ + std::vector total_time_array; + FILE* file = fopen("/proc/stat", "r"); + if (file == NULL) { + perror("Could not open stat file"); + return total_time_array; + } + + unsigned long long user = 0, nice = 0, system = 0, idle = 0; + unsigned long long iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guestnice = 0; + + for (int i = 0; i < num_processors_; i++) { + char buffer[1024]; + char* ret = fgets(buffer, sizeof(buffer) - 1, file); + if (ret == NULL) { + perror("Could not read stat file"); + fclose(file); + return total_time_array; + } + + sscanf(buffer, + "cpu %16llu %16llu %16llu %16llu %16llu %16llu %16llu %16llu %16llu %16llu", + &user, &nice, &system, &idle, &iowait, &irq, &softirq, &steal, &guest, &guestnice); + + work_time_array.push_back(user + nice + system); + total_time_array.push_back(user + nice + system + idle + iowait + irq + softirq + steal); + } + + fclose(file); + return total_time_array; +} + + + + double SystemInfo::CPUPercent() { if (!initialized_) Init(); diff --git a/cpp/src/metrics/SystemInfo.h b/cpp/src/metrics/SystemInfo.h index 2562e316e4..5ffb2c773f 100644 --- a/cpp/src/metrics/SystemInfo.h +++ b/cpp/src/metrics/SystemInfo.h @@ -46,6 +46,7 @@ class SystemInfo { } void Init(); + int num_processor() const { return num_processors_;}; int num_device() const {return num_device_;}; unsigned long long get_inoctets() { return in_octets_;}; unsigned long long get_octets() { return out_octets_;}; @@ -62,6 +63,10 @@ class SystemInfo { std::vector GPUPercent(); std::vector GPUMemoryUsed(); + std::vector CPUCorePercent(); + std::vector getTotalCpuTime(std::vector &workTime); + + }; } From 54a71a9fa07e884494bfda9bb3aba86a6187c771 Mon Sep 17 00:00:00 2001 From: kun yu Date: Sat, 27 Jul 2019 16:48:04 +0800 Subject: [PATCH 2/4] add CPU metrics Former-commit-id: bb2dffc5abb87340a58a4791a23e55858073531f --- cpp/src/db/DBImpl.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/db/DBImpl.cpp b/cpp/src/db/DBImpl.cpp index 9a27f09b3d..272c4ff598 100644 --- a/cpp/src/db/DBImpl.cpp +++ b/cpp/src/db/DBImpl.cpp @@ -312,6 +312,9 @@ void DBImpl::StartMetricTask() { server::Metrics::GetInstance().GPUMemoryUsageGaugeSet(); server::Metrics::GetInstance().OctetsSet(); + server::Metrics::GetInstance().CPUCoreUsagePercentSet(); + + ENGINE_LOG_TRACE << "Metric task finished"; } From af86fb92f3aa1629ab6a436c3b6db3dbfcf48506 Mon Sep 17 00:00:00 2001 From: kun yu Date: Thu, 1 Aug 2019 14:30:00 +0800 Subject: [PATCH 3/4] modify prometheus Former-commit-id: 95f2999b8627673954e16436ec07a890ba5cd22b --- cpp/src/metrics/PrometheusMetrics.cpp | 38 +++++++++------------------ cpp/src/metrics/PrometheusMetrics.h | 21 --------------- 2 files changed, 12 insertions(+), 47 deletions(-) diff --git a/cpp/src/metrics/PrometheusMetrics.cpp b/cpp/src/metrics/PrometheusMetrics.cpp index c8a09b8ea6..a730091a46 100644 --- a/cpp/src/metrics/PrometheusMetrics.cpp +++ b/cpp/src/metrics/PrometheusMetrics.cpp @@ -60,47 +60,34 @@ PrometheusMetrics::RAMUsagePercentSet() { void PrometheusMetrics::GPUPercentGaugeSet() { if(!startup_) return; - int numDevide = server::SystemInfo::GetInstance().num_device(); - std::vector values = server::SystemInfo::GetInstance().GPUPercent(); + int numDevice = server::SystemInfo::GetInstance().num_device(); +// std::vector values = server::SystemInfo::GetInstance().GPUPercent(); + std::vector used_memory = server::SystemInfo::GetInstance().GPUMemoryUsed(); + constexpr unsigned long long MtoB = 1024*1024; - for (int i = 0; i < values.size(); i++) { + + for (int i = 0; i < numDevice; i++) { prometheus::Gauge &GPU_percent = GPU_percent_.Add({{"DeviceNum", std::to_string(i)}}); - GPU_percent.Set(static_cast(values[i])); +// std::cout << "nvmlDeviceGetUtilizationRates: " << values[i] << std::endl; +// GPU_percent.Set(static_cast(values[i])); + double percent = (double)used_memory[i] / (double)MtoB; + double res = (percent / 6078) * 100; + GPU_percent.Set(res); } -// if(numDevide >= 1) GPU0_percent_gauge_.Set(static_cast(values[0])); -// if(numDevide >= 2) GPU1_percent_gauge_.Set(static_cast(values[1])); -// if(numDevide >= 3) GPU2_percent_gauge_.Set(static_cast(values[2])); -// if(numDevide >= 4) GPU3_percent_gauge_.Set(static_cast(values[3])); -// if(numDevide >= 5) GPU4_percent_gauge_.Set(static_cast(values[4])); -// if(numDevide >= 6) GPU5_percent_gauge_.Set(static_cast(values[5])); -// if(numDevide >= 7) GPU6_percent_gauge_.Set(static_cast(values[6])); -// if(numDevide >= 8) GPU7_percent_gauge_.Set(static_cast(values[7])); - - // to do } void PrometheusMetrics::GPUMemoryUsageGaugeSet() { if(!startup_) return; std::vector values = server::SystemInfo::GetInstance().GPUMemoryUsed(); constexpr unsigned long long MtoB = 1024*1024; - int numDevice = values.size(); + int numDevice = server::SystemInfo::GetInstance().num_device(); for (int i = 0; i < numDevice; i++) { prometheus::Gauge &GPU_memory = GPU_memory_usage_.Add({{"DeviceNum", std::to_string(i)}}); GPU_memory.Set(values[i] / MtoB); } - -// if(numDevice >=1) GPU0_memory_usage_gauge_.Set(values[0]/MtoB); -// if(numDevice >=2) GPU1_memory_usage_gauge_.Set(values[1]/MtoB); -// if(numDevice >=3) GPU2_memory_usage_gauge_.Set(values[2]/MtoB); -// if(numDevice >=4) GPU3_memory_usage_gauge_.Set(values[3]/MtoB); -// if(numDevice >=5) GPU4_memory_usage_gauge_.Set(values[4]/MtoB); -// if(numDevice >=6) GPU5_memory_usage_gauge_.Set(values[5]/MtoB); -// if(numDevice >=7) GPU6_memory_usage_gauge_.Set(values[6]/MtoB); -// if(numDevice >=8) GPU7_memory_usage_gauge_.Set(values[7]/MtoB); - } void PrometheusMetrics::AddVectorsPerSecondGaugeSet(int num_vector, int dim, double time) { // MB/s @@ -162,7 +149,6 @@ void PrometheusMetrics::CPUCoreUsagePercentSet() { for (int i = 0; i < cpu_core_percent.size(); i++) { prometheus::Gauge &core_percent = CPU_.Add({{"CPU", std::to_string(i)}}); core_percent.Set(cpu_core_percent[i]); -// std::cout << cpu_core_percent[i] << "+"; } } diff --git a/cpp/src/metrics/PrometheusMetrics.h b/cpp/src/metrics/PrometheusMetrics.h index be73585310..590130f444 100644 --- a/cpp/src/metrics/PrometheusMetrics.h +++ b/cpp/src/metrics/PrometheusMetrics.h @@ -397,7 +397,6 @@ class PrometheusMetrics: public MetricsBase { .Help("CPU usage percent by this this process") .Register(*registry_); prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({{"CPU", "0"}}); -// std::vector &CPU_usage_percent_array; prometheus::Family &RAM_ = prometheus::BuildGauge() @@ -411,32 +410,12 @@ class PrometheusMetrics: public MetricsBase { .Name("Gpu_usage_percent") .Help("GPU_usage_percent ") .Register(*registry_); -// prometheus::Gauge &GPU0_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "0"}}); -// prometheus::Gauge &GPU1_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "1"}}); -// prometheus::Gauge &GPU2_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "2"}}); -// prometheus::Gauge &GPU3_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "3"}}); -// prometheus::Gauge &GPU4_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "4"}}); -// prometheus::Gauge &GPU5_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "5"}}); -// prometheus::Gauge &GPU6_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "6"}}); -// prometheus::Gauge &GPU7_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "7"}}); - - - //GPU Mempry used prometheus::Family &GPU_memory_usage_ = prometheus::BuildGauge() .Name("GPU_memory_usage_total") .Help("GPU memory usage total ") .Register(*registry_); -// prometheus::Gauge &GPU0_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "0"}}); -// prometheus::Gauge &GPU1_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "1"}}); -// prometheus::Gauge &GPU2_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "2"}}); -// prometheus::Gauge &GPU3_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "3"}}); -// prometheus::Gauge &GPU4_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "4"}}); -// prometheus::Gauge &GPU5_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "5"}}); -// prometheus::Gauge &GPU6_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "6"}}); -// prometheus::Gauge &GPU7_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "7"}}); - prometheus::Family &query_index_type_per_second_ = prometheus::BuildGauge() .Name("query_index_throughtout_per_microsecond") From 07de60005efa148746785a17e8acc5fa7df90cc3 Mon Sep 17 00:00:00 2001 From: kun yu Date: Thu, 1 Aug 2019 14:42:20 +0800 Subject: [PATCH 4/4] fix GPU Percent bug Former-commit-id: 0afa5d7a6a50c6a545c6fde296702580885d27b5 --- cpp/src/metrics/PrometheusMetrics.cpp | 10 +++------- cpp/src/metrics/SystemInfo.cpp | 12 ++++++------ cpp/src/metrics/SystemInfo.h | 2 +- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/cpp/src/metrics/PrometheusMetrics.cpp b/cpp/src/metrics/PrometheusMetrics.cpp index a730091a46..3d83bff864 100644 --- a/cpp/src/metrics/PrometheusMetrics.cpp +++ b/cpp/src/metrics/PrometheusMetrics.cpp @@ -61,18 +61,14 @@ void PrometheusMetrics::GPUPercentGaugeSet() { if(!startup_) return; int numDevice = server::SystemInfo::GetInstance().num_device(); -// std::vector values = server::SystemInfo::GetInstance().GPUPercent(); + std::vector used_total = server::SystemInfo::GetInstance().GPUMemoryTotal(); std::vector used_memory = server::SystemInfo::GetInstance().GPUMemoryUsed(); - constexpr unsigned long long MtoB = 1024*1024; for (int i = 0; i < numDevice; i++) { prometheus::Gauge &GPU_percent = GPU_percent_.Add({{"DeviceNum", std::to_string(i)}}); -// std::cout << "nvmlDeviceGetUtilizationRates: " << values[i] << std::endl; -// GPU_percent.Set(static_cast(values[i])); - double percent = (double)used_memory[i] / (double)MtoB; - double res = (percent / 6078) * 100; - GPU_percent.Set(res); + double percent = (double)used_memory[i] / (double)used_total[i]; + GPU_percent.Set(percent * 100); } } diff --git a/cpp/src/metrics/SystemInfo.cpp b/cpp/src/metrics/SystemInfo.cpp index 9348b0d9c6..7628db78bb 100644 --- a/cpp/src/metrics/SystemInfo.cpp +++ b/cpp/src/metrics/SystemInfo.cpp @@ -192,17 +192,17 @@ SystemInfo::CPUPercent() { } -std::vector -SystemInfo::GPUPercent() { +std::vector +SystemInfo::GPUMemoryTotal() { // get GPU usage percent if(!initialized_) Init(); - std::vector result; - nvmlUtilization_t utilization; + std::vector result; + nvmlMemory_t nvmlMemory; for (int i = 0; i < num_device_; ++i) { nvmlDevice_t device; nvmlDeviceGetHandleByIndex(i, &device); - nvmlDeviceGetUtilizationRates(device, &utilization); - result.push_back(utilization.gpu); + nvmlDeviceGetMemoryInfo(device, &nvmlMemory); + result.push_back(nvmlMemory.total); } return result; } diff --git a/cpp/src/metrics/SystemInfo.h b/cpp/src/metrics/SystemInfo.h index 5ffb2c773f..629aaf7220 100644 --- a/cpp/src/metrics/SystemInfo.h +++ b/cpp/src/metrics/SystemInfo.h @@ -60,7 +60,7 @@ class SystemInfo { double MemoryPercent(); double CPUPercent(); std::pair Octets(); - std::vector GPUPercent(); + std::vector GPUMemoryTotal(); std::vector GPUMemoryUsed(); std::vector CPUCorePercent();