// Review notes for this patch. These comment lines sit before the first
// "diff --git" header, outside every hunk; the patch text after them is unchanged.
//
// What the patch does, as visible in the hunks below:
//  * MetricsBase gains a no-op virtual CPUCoreUsagePercentSet(); PrometheusMetrics
//    overrides it, and DBImpl::StartMetricTask() calls it and then logs
//    "Metric task finished" through ENGINE_LOG_TRACE.
//  * PrometheusMetrics drops the eight fixed GPU0..GPU7 gauge members (percent and
//    memory usage). GPUPercentGaugeSet() / GPUMemoryUsageGaugeSet() now loop over
//    SystemInfo::num_device() and fetch a gauge per device with
//    Family.Add({{"DeviceNum", std::to_string(i)}}) on every call.
//  * GPUPercentGaugeSet() now publishes used_memory[i] / used_total[i] * 100, taken
//    from GPUMemoryUsed() and the new GPUMemoryTotal(). SystemInfo::GPUPercent()
//    (nvmlDeviceGetUtilizationRates) is replaced by GPUMemoryTotal()
//    (nvmlDeviceGetMemoryInfo, field .total).
//  * SystemInfo gains num_processor(), CPUCorePercent() and getTotalCpuTime().
//    CPUCorePercent() samples getTotalCpuTime() twice with usleep(100000) in between
//    and returns (work delta / total delta) * 100 for each index below num_processors_.
//    getTotalCpuTime() reads num_processors_ lines from /proc/stat and stores
//    user+nice+system as "work" and user+nice+system+idle+iowait+irq+softirq+steal
//    as "total".
//  * MemoryPercent() casts numerator and denominator to double before dividing.
//  * CPU_usage_percent_ is now created with the label {"CPU", "0"} instead of {}.
//
// NOTE(review): points to confirm before merging
//  * CPUUsagePercentSet(): the added local numProcessor is not used in the visible body.
//  * GPUPercentGaugeSet(): used_total[i] is used as a divisor without a zero check, and
//    both vectors are indexed up to num_device() without checking their sizes.
//    GPUMemoryUsageGaugeSet() likewise switches its bound from values.size() to
//    num_device().
//  * CPUCoreUsagePercentSet() adds gauges labelled {"CPU", "<i>"} to the same CPU_ family
//    whose process-level gauge is now labelled {"CPU", "0"}; presumably core 0 and the
//    process gauge end up sharing one time series -- verify against prometheus-cpp
//    Family::Add.
//  * getTotalCpuTime(): the sscanf format starts with "cpu " and the loop starts at the
//    first line of /proc/stat. Presumably that first line is the aggregate "cpu" row and
//    per-core rows are named "cpuN", in which case the core number would be consumed by
//    the first %16llu and every field shifted -- confirm against proc(5).
//  * getTotalCpuTime() returns early (shorter vectors) when fopen/fgets fails, while
//    CPUCorePercent() still indexes all four vectors up to num_processors_; a zero
//    total delta is also divided by without a check.
//  * CPUCorePercent() does not perform the "if (!initialized_) Init();" check that
//    MemoryPercent() and GPUMemoryTotal() perform.
//  * GPUMemoryTotal() keeps the context comment "// get GPU usage percent", which no
//    longer describes what the function returns.
//  * Template arguments and #include targets appear to be missing throughout (e.g.
//    "std::vector values", "static_cast(values[0])"); this looks like an extraction
//    artifact of this copy of the patch rather than part of the change -- TODO confirm.
diff --git a/cpp/src/db/DBImpl.cpp b/cpp/src/db/DBImpl.cpp index af23543ccb..22aef26daa 100644 --- a/cpp/src/db/DBImpl.cpp +++ b/cpp/src/db/DBImpl.cpp @@ -318,6 +318,9 @@ void DBImpl::StartMetricTask() { server::Metrics::GetInstance().GPUMemoryUsageGaugeSet(); server::Metrics::GetInstance().OctetsSet(); + server::Metrics::GetInstance().CPUCoreUsagePercentSet(); + + ENGINE_LOG_TRACE << "Metric task finished"; } diff --git a/cpp/src/metrics/MetricBase.h b/cpp/src/metrics/MetricBase.h index fe9b246503..61e9e7680f 100644 --- a/cpp/src/metrics/MetricBase.h +++ b/cpp/src/metrics/MetricBase.h @@ -64,6 +64,8 @@ class MetricsBase{ virtual void ConnectionGaugeDecrement() {}; virtual void KeepingAliveCounterIncrement(double value = 1) {}; virtual void OctetsSet() {}; + + virtual void CPUCoreUsagePercentSet() {}; }; diff --git a/cpp/src/metrics/PrometheusMetrics.cpp b/cpp/src/metrics/PrometheusMetrics.cpp index df0516344e..3d83bff864 100644 --- a/cpp/src/metrics/PrometheusMetrics.cpp +++ b/cpp/src/metrics/PrometheusMetrics.cpp @@ -44,6 +44,8 @@ PrometheusMetrics::Init() { void PrometheusMetrics::CPUUsagePercentSet() { if(!startup_) return ; + int numProcessor = server::SystemInfo::GetInstance().num_processor(); + double usage_percent = server::SystemInfo::GetInstance().CPUPercent(); CPU_usage_percent_.Set(usage_percent); } @@ -58,36 +60,30 @@ PrometheusMetrics::RAMUsagePercentSet() { void PrometheusMetrics::GPUPercentGaugeSet() { if(!startup_) return; - int numDevide = server::SystemInfo::GetInstance().num_device(); - std::vector values = server::SystemInfo::GetInstance().GPUPercent(); - if(numDevide >= 1) GPU0_percent_gauge_.Set(static_cast(values[0])); - if(numDevide >= 2) GPU1_percent_gauge_.Set(static_cast(values[1])); - if(numDevide >= 3) GPU2_percent_gauge_.Set(static_cast(values[2])); - if(numDevide >= 4) GPU3_percent_gauge_.Set(static_cast(values[3])); - if(numDevide >= 5) GPU4_percent_gauge_.Set(static_cast(values[4])); - if(numDevide >= 6) 
GPU5_percent_gauge_.Set(static_cast(values[5])); - if(numDevide >= 7) GPU6_percent_gauge_.Set(static_cast(values[6])); - if(numDevide >= 8) GPU7_percent_gauge_.Set(static_cast(values[7])); + int numDevice = server::SystemInfo::GetInstance().num_device(); + std::vector used_total = server::SystemInfo::GetInstance().GPUMemoryTotal(); + std::vector used_memory = server::SystemInfo::GetInstance().GPUMemoryUsed(); + + + for (int i = 0; i < numDevice; i++) { + prometheus::Gauge &GPU_percent = GPU_percent_.Add({{"DeviceNum", std::to_string(i)}}); + double percent = (double)used_memory[i] / (double)used_total[i]; + GPU_percent.Set(percent * 100); + } - // to do } void PrometheusMetrics::GPUMemoryUsageGaugeSet() { if(!startup_) return; std::vector values = server::SystemInfo::GetInstance().GPUMemoryUsed(); constexpr unsigned long long MtoB = 1024*1024; - int numDevice = values.size(); + int numDevice = server::SystemInfo::GetInstance().num_device(); - if(numDevice >=1) GPU0_memory_usage_gauge_.Set(values[0]/MtoB); - if(numDevice >=2) GPU1_memory_usage_gauge_.Set(values[1]/MtoB); - if(numDevice >=3) GPU2_memory_usage_gauge_.Set(values[2]/MtoB); - if(numDevice >=4) GPU3_memory_usage_gauge_.Set(values[3]/MtoB); - if(numDevice >=5) GPU4_memory_usage_gauge_.Set(values[4]/MtoB); - if(numDevice >=6) GPU5_memory_usage_gauge_.Set(values[5]/MtoB); - if(numDevice >=7) GPU6_memory_usage_gauge_.Set(values[6]/MtoB); - if(numDevice >=8) GPU7_memory_usage_gauge_.Set(values[7]/MtoB); + for (int i = 0; i < numDevice; i++) { + prometheus::Gauge &GPU_memory = GPU_memory_usage_.Add({{"DeviceNum", std::to_string(i)}}); + GPU_memory.Set(values[i] / MtoB); + } - // to do } void PrometheusMetrics::AddVectorsPerSecondGaugeSet(int num_vector, int dim, double time) { // MB/s @@ -140,6 +136,17 @@ void PrometheusMetrics::OctetsSet() { outoctets_gauge_.Set((in_and_out_octets.second-old_outoctets)/total_second); } +void PrometheusMetrics::CPUCoreUsagePercentSet() { + if (!startup_) + return; + + 
std::vector cpu_core_percent = server::SystemInfo::GetInstance().CPUCorePercent(); + + for (int i = 0; i < cpu_core_percent.size(); i++) { + prometheus::Gauge &core_percent = CPU_.Add({{"CPU", std::to_string(i)}}); + core_percent.Set(cpu_core_percent[i]); + } +} } diff --git a/cpp/src/metrics/PrometheusMetrics.h b/cpp/src/metrics/PrometheusMetrics.h index 5b651ec14f..590130f444 100644 --- a/cpp/src/metrics/PrometheusMetrics.h +++ b/cpp/src/metrics/PrometheusMetrics.h @@ -12,6 +12,7 @@ #include #include +#include #include "server/ServerConfig.h" #include "MetricBase.h" @@ -78,6 +79,9 @@ class PrometheusMetrics: public MetricsBase { void QueryVectorResponseSummaryObserve(double value, int count = 1) override { if (startup_) for(int i = 0 ; i < count ; ++i) query_vector_response_summary_.Observe(value);}; void QueryVectorResponsePerSecondGaugeSet(double value) override {if (startup_) query_vector_response_per_second_gauge_.Set(value);}; void CPUUsagePercentSet() override ; + + void CPUCoreUsagePercentSet() override; + void RAMUsagePercentSet() override ; void QueryResponsePerSecondGaugeSet(double value) override {if(startup_) query_response_per_second_gauge.Set(value);}; void GPUPercentGaugeSet() override ; @@ -322,7 +326,7 @@ class PrometheusMetrics: public MetricsBase { prometheus::Gauge &faiss_disk_load_IO_speed_gauge_ = faiss_disk_load_IO_speed_.Add({{"DB","Faiss"}}); - ////all from CacheMgr.cpp + ////all from CacheMgr.cpp //record cache access count prometheus::Family &cache_access_ = prometheus::BuildCounter() .Name("cache_access_total") @@ -392,7 +396,8 @@ class PrometheusMetrics: public MetricsBase { .Name("CPU_usage_percent") .Help("CPU usage percent by this this process") .Register(*registry_); - prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({}); + prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({{"CPU", "0"}}); + prometheus::Family &RAM_ = prometheus::BuildGauge() .Name("RAM_usage_percent") @@ -405,33 +410,12 @@ class PrometheusMetrics: public 
MetricsBase { .Name("Gpu_usage_percent") .Help("GPU_usage_percent ") .Register(*registry_); - prometheus::Gauge &GPU0_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "0"}}); - prometheus::Gauge &GPU1_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "1"}}); - prometheus::Gauge &GPU2_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "2"}}); - prometheus::Gauge &GPU3_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "3"}}); - prometheus::Gauge &GPU4_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "4"}}); - prometheus::Gauge &GPU5_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "5"}}); - prometheus::Gauge &GPU6_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "6"}}); - prometheus::Gauge &GPU7_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "7"}}); -// std::vector GPU_percent_gauges_; - - - //GPU Mempry used prometheus::Family &GPU_memory_usage_ = prometheus::BuildGauge() .Name("GPU_memory_usage_total") .Help("GPU memory usage total ") .Register(*registry_); - prometheus::Gauge &GPU0_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "0"}}); - prometheus::Gauge &GPU1_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "1"}}); - prometheus::Gauge &GPU2_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "2"}}); - prometheus::Gauge &GPU3_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "3"}}); - prometheus::Gauge &GPU4_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "4"}}); - prometheus::Gauge &GPU5_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "5"}}); - prometheus::Gauge &GPU6_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "6"}}); - prometheus::Gauge &GPU7_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "7"}}); -// std::vector GPU_memory_usage_gauges_; prometheus::Family &query_index_type_per_second_ = prometheus::BuildGauge() .Name("query_index_throughtout_per_microsecond") diff --git a/cpp/src/metrics/SystemInfo.cpp b/cpp/src/metrics/SystemInfo.cpp index a64cbc4992..7628db78bb 100644 --- 
a/cpp/src/metrics/SystemInfo.cpp +++ b/cpp/src/metrics/SystemInfo.cpp @@ -105,9 +105,65 @@ SystemInfo::GetProcessUsedMemory() { double SystemInfo::MemoryPercent() { if (!initialized_) Init(); - return GetProcessUsedMemory()*100/total_ram_; + return (double)(GetProcessUsedMemory()*100)/(double)total_ram_; } + + +std::vector +SystemInfo::CPUCorePercent() { + std::vector prev_work_time_array; + std::vector prev_total_time_array = getTotalCpuTime(prev_work_time_array); + usleep(100000); + std::vector cur_work_time_array; + std::vector cur_total_time_array = getTotalCpuTime(cur_work_time_array); + + std::vector cpu_core_percent; + for (int i = 0; i < num_processors_; i++) { + double total_cpu_time = cur_total_time_array[i] - prev_total_time_array[i]; + double cpu_work_time = cur_work_time_array[i] - prev_work_time_array[i]; + cpu_core_percent.push_back((cpu_work_time / total_cpu_time) * 100); + } + return cpu_core_percent; +} + +std::vector +SystemInfo::getTotalCpuTime(std::vector &work_time_array) +{ + std::vector total_time_array; + FILE* file = fopen("/proc/stat", "r"); + if (file == NULL) { + perror("Could not open stat file"); + return total_time_array; + } + + unsigned long long user = 0, nice = 0, system = 0, idle = 0; + unsigned long long iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guestnice = 0; + + for (int i = 0; i < num_processors_; i++) { + char buffer[1024]; + char* ret = fgets(buffer, sizeof(buffer) - 1, file); + if (ret == NULL) { + perror("Could not read stat file"); + fclose(file); + return total_time_array; + } + + sscanf(buffer, + "cpu %16llu %16llu %16llu %16llu %16llu %16llu %16llu %16llu %16llu %16llu", + &user, &nice, &system, &idle, &iowait, &irq, &softirq, &steal, &guest, &guestnice); + + work_time_array.push_back(user + nice + system); + total_time_array.push_back(user + nice + system + idle + iowait + irq + softirq + steal); + } + + fclose(file); + return total_time_array; +} + + + + double SystemInfo::CPUPercent() { if 
(!initialized_) Init(); @@ -136,17 +192,17 @@ SystemInfo::CPUPercent() { } -std::vector -SystemInfo::GPUPercent() { +std::vector +SystemInfo::GPUMemoryTotal() { // get GPU usage percent if(!initialized_) Init(); - std::vector result; - nvmlUtilization_t utilization; + std::vector result; + nvmlMemory_t nvmlMemory; for (int i = 0; i < num_device_; ++i) { nvmlDevice_t device; nvmlDeviceGetHandleByIndex(i, &device); - nvmlDeviceGetUtilizationRates(device, &utilization); - result.push_back(utilization.gpu); + nvmlDeviceGetMemoryInfo(device, &nvmlMemory); + result.push_back(nvmlMemory.total); } return result; } diff --git a/cpp/src/metrics/SystemInfo.h b/cpp/src/metrics/SystemInfo.h index 2562e316e4..629aaf7220 100644 --- a/cpp/src/metrics/SystemInfo.h +++ b/cpp/src/metrics/SystemInfo.h @@ -46,6 +46,7 @@ class SystemInfo { } void Init(); + int num_processor() const { return num_processors_;}; int num_device() const {return num_device_;}; unsigned long long get_inoctets() { return in_octets_;}; unsigned long long get_octets() { return out_octets_;}; @@ -59,9 +60,13 @@ class SystemInfo { double MemoryPercent(); double CPUPercent(); std::pair Octets(); - std::vector GPUPercent(); + std::vector GPUMemoryTotal(); std::vector GPUMemoryUsed(); + std::vector CPUCorePercent(); + std::vector getTotalCpuTime(std::vector &workTime); + + }; }