diff --git a/cpp/CHANGELOG.md b/cpp/CHANGELOG.md index 605e100457..29539d929b 100644 --- a/cpp/CHANGELOG.md +++ b/cpp/CHANGELOG.md @@ -15,6 +15,7 @@ Please mark all change in change log and use the ticket from JIRA. - MS-64 - Different table can have different index type - MS-52 - Return search score - MS-66 - Support time range query +- MS-68 - Remove rocksdb from third-party - MS-70 - cmake: remove redundant libs in src ## Task @@ -45,6 +46,7 @@ Please mark all change in change log and use the ticket from JIRA. - MS-37 - Add query, cache usage, disk write speed and file data size metrics - MS-30 - Use faiss v1.5.2 - MS-54 - cmake: Change Thrift third party URL to github.com +- MS-69 - prometheus: add all proposed metrics ## Task diff --git a/cpp/cmake/DefineOptions.cmake b/cpp/cmake/DefineOptions.cmake index ce2e4ae6be..cc358e7f1e 100644 --- a/cpp/cmake/DefineOptions.cmake +++ b/cpp/cmake/DefineOptions.cmake @@ -81,7 +81,7 @@ define_option(MEGASEARCH_WITH_OPENBLAS "Build with OpenBLAS library" ON) define_option(MEGASEARCH_WITH_PROMETHEUS "Build with PROMETHEUS library" ON) -define_option(MEGASEARCH_WITH_ROCKSDB "Build with RocksDB library" ON) +define_option(MEGASEARCH_WITH_ROCKSDB "Build with RocksDB library" OFF) define_option(MEGASEARCH_WITH_SNAPPY "Build with Snappy compression" ON) diff --git a/cpp/src/CMakeLists.txt b/cpp/src/CMakeLists.txt index 6cd7db71ca..5cd78f8047 100644 --- a/cpp/src/CMakeLists.txt +++ b/cpp/src/CMakeLists.txt @@ -34,6 +34,10 @@ set(service_files thrift/gen-cpp/MegasearchService.cpp thrift/gen-cpp/megasearch_constants.cpp thrift/gen-cpp/megasearch_types.cpp + metrics/SystemInfo.cpp + metrics/SystemInfo.h + server/MegasearchThreadPoolServer.cpp + server/MegasearchThreadPoolServer.h ) set(vecwise_engine_files @@ -73,6 +77,7 @@ set(third_party_libs snappy zlib zstd + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so ) if (GPU_VERSION STREQUAL "ON") @@ -84,6 +89,7 @@ if (GPU_VERSION STREQUAL "ON") libquadmath.a cudart cublas + 
${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so
             )
 else()
     set(engine_libs
@@ -91,6 +97,7 @@ else()
             libgomp.a
             libgfortran.a
             libquadmath.a
+            ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so
             )
 endif ()
 
@@ -131,7 +138,6 @@ set(server_libs
         pthread
         dl
         metrics
-
         )
 
 add_executable(vecwise_server
diff --git a/cpp/src/db/DBImpl.cpp b/cpp/src/db/DBImpl.cpp
index 6211c688fb..068e2a8d24 100644
--- a/cpp/src/db/DBImpl.cpp
+++ b/cpp/src/db/DBImpl.cpp
@@ -108,6 +108,7 @@ Status DBImpl::InsertVectors(const std::string& table_id_,
 
     CollectInsertMetrics(total_time, n, status.ok());
     return status;
+
 }
 
 Status DBImpl::Query(const std::string &table_id, size_t k, size_t nq,
@@ -119,6 +120,7 @@ Status DBImpl::Query(const std::string &table_id, size_t k, size_t nq,
     auto total_time = METRICS_MICROSECONDS(start_time,end_time);
 
     CollectQueryMetrics(total_time, nq);
+
     return result;
 }
 
@@ -304,17 +306,29 @@ void DBImpl::StartTimerTasks(int interval) {
 
 void DBImpl::BackgroundTimerTask(int interval) {
     Status status;
+    server::SystemInfo::GetInstance().Init();
     while (true) {
         if (!bg_error_.ok()) break;
         if (shutting_down_.load(std::memory_order_acquire)) break;
 
         std::this_thread::sleep_for(std::chrono::seconds(interval));
-        int64_t cache_total = cache::CpuCacheMgr::GetInstance()->CacheUsage();
-        LOG(DEBUG) << "Cache usage " << cache_total;
-        server::Metrics::GetInstance().CacheUsageGaugeSet(static_cast(cache_total));
+        server::Metrics::GetInstance().KeepingAliveCounterIncrement(interval);
+        int64_t cache_usage = cache::CpuCacheMgr::GetInstance()->CacheUsage();
+        int64_t cache_total = cache::CpuCacheMgr::GetInstance()->CacheCapacity();
+        // Percentage of capacity in use. Guard the division: with a zero capacity
+        // the integer expression usage*100/capacity would divide by zero.
+        double cache_percent = 0.0;
+        if (cache_total > 0) {
+            cache_percent = static_cast<double>(cache_usage) * 100.0 / static_cast<double>(cache_total);
+        }
+        server::Metrics::GetInstance().CacheUsageGaugeSet(cache_percent);
         long size;
         Size(size);
         server::Metrics::GetInstance().DataFileSizeGaugeSet(size);
+        server::Metrics::GetInstance().CPUUsagePercentSet();
+        server::Metrics::GetInstance().RAMUsagePercentSet();
+        server::Metrics::GetInstance().GPUPercentGaugeSet();
+        server::Metrics::GetInstance().GPUMemoryUsageGaugeSet();
 
         TrySchedule();
     }
 }
diff --git a/cpp/src/db/FaissExecutionEngine.cpp b/cpp/src/db/FaissExecutionEngine.cpp
index b25a3150ed..65bdeead1f 100644
--- a/cpp/src/db/FaissExecutionEngine.cpp
+++ b/cpp/src/db/FaissExecutionEngine.cpp
@@ -131,8 +131,16 @@ Status FaissExecutionEngine::Search(long n,
                                     long k,
                                     float *distances,
                                     long *labels) const {
-
+    auto start_time = METRICS_NOW_TIME;
     pIndex_->search(n, data, k, distances, labels);
+    auto end_time = METRICS_NOW_TIME;
+    auto total_time = METRICS_MICROSECONDS(start_time,end_time);
+    // Queries per unit of elapsed time. Skip the sample when the timer did not
+    // advance, otherwise the gauge would be set to inf/NaN.
+    if (total_time > 0) {
+        server::Metrics::GetInstance().QueryIndexTypePerSecondSet(
+            build_index_type_, static_cast<double>(n) / static_cast<double>(total_time));
+    }
     return Status::OK();
 }
 
diff --git a/cpp/src/db/MemManager.cpp b/cpp/src/db/MemManager.cpp
index 33d7dc81f3..9bae4c9f21 100644
--- a/cpp/src/db/MemManager.cpp
+++ b/cpp/src/db/MemManager.cpp
@@ -27,9 +27,14 @@ MemVectors::MemVectors(const std::shared_ptr& meta_ptr,
     pEE_(EngineFactory::Build(schema_.dimension_, schema_.location_, (EngineType)schema_.engine_type_)) {
 }
 
+
 void MemVectors::Add(size_t n_, const float* vectors_, IDNumbers& vector_ids_) {
+    auto start_time = METRICS_NOW_TIME;
     pIdGenerator_->GetNextIDNumbers(n_, vector_ids_);
     pEE_->AddWithIds(n_, vectors_, vector_ids_.data());
+    auto end_time = METRICS_NOW_TIME;
+    auto total_time = METRICS_MICROSECONDS(start_time, end_time);
+    server::Metrics::GetInstance().AddVectorsPerSecondGaugeSet(static_cast<int>(n_), static_cast<int>(schema_.dimension_), total_time);
 }
 
 size_t MemVectors::Total() const {
@@ -97,6 +102,7 @@ Status MemManager::InsertVectors(const std::string& table_id_,
         const float* vectors_,
         IDNumbers& vector_ids_) {
     std::unique_lock lock(mutex_);
+
     return InsertVectorsNoLock(table_id_, n_, vectors_, vector_ids_);
 }
 
diff --git a/cpp/src/metrics/MetricBase.h b/cpp/src/metrics/MetricBase.h
index fae4b084e3..96dcf22ed6 100644
--- a/cpp/src/metrics/MetricBase.h
+++ b/cpp/src/metrics/MetricBase.h
@@ -8,6 +8,7 @@
 #include "utils/Error.h"
#include "server/ServerConfig.h" +#include "SystemInfo.h" namespace zilliz { namespace vecwise { @@ -71,6 +72,16 @@ class MetricsBase{ virtual void AddVectorsFailGaugeSet(double value) {}; virtual void QueryVectorResponseSummaryObserve(double value, int count = 1) {}; virtual void QueryVectorResponsePerSecondGaugeSet(double value) {}; + virtual void CPUUsagePercentSet() {}; + virtual void RAMUsagePercentSet() {}; + virtual void QueryResponsePerSecondGaugeSet(double value) {}; + virtual void GPUPercentGaugeSet() {}; + virtual void GPUMemoryUsageGaugeSet() {}; + virtual void AddVectorsPerSecondGaugeSet(int num_vector, int dim, double time) {}; + virtual void QueryIndexTypePerSecondSet(std::string type, double value) {}; + virtual void ConnectionGaugeIncrement() {}; + virtual void ConnectionGaugeDecrement() {}; + virtual void KeepingAliveCounterIncrement(double value = 1) {}; }; diff --git a/cpp/src/metrics/PrometheusMetrics.cpp b/cpp/src/metrics/PrometheusMetrics.cpp index 693051f52c..8672817428 100644 --- a/cpp/src/metrics/PrometheusMetrics.cpp +++ b/cpp/src/metrics/PrometheusMetrics.cpp @@ -6,6 +6,8 @@ #include "PrometheusMetrics.h" #include "utils/Log.h" +#include "SystemInfo.h" + namespace zilliz { namespace vecwise { @@ -32,8 +34,108 @@ PrometheusMetrics::Init() { } return SERVER_SUCCESS; + } + +void +PrometheusMetrics::CPUUsagePercentSet() { + if(!startup_) return ; + double usage_percent = server::SystemInfo::GetInstance().CPUPercent(); + CPU_usage_percent_.Set(usage_percent); +} + +void +PrometheusMetrics::RAMUsagePercentSet() { + if(!startup_) return ; + double usage_percent = server::SystemInfo::GetInstance().MemoryPercent(); + RAM_usage_percent_.Set(usage_percent); +} + +void +PrometheusMetrics::GPUPercentGaugeSet() { + if(!startup_) return; + int numDevide = server::SystemInfo::GetInstance().num_device(); + std::vector values = server::SystemInfo::GetInstance().GPUPercent(); +// for (int i = 0; i < numDevide; ++i) { +// 
GPU_percent_gauges_[i].Set(static_cast(values[i])); +// } + if(numDevide >= 1) GPU0_percent_gauge_.Set(static_cast(values[0])); + if(numDevide >= 2) GPU1_percent_gauge_.Set(static_cast(values[1])); + if(numDevide >= 3) GPU2_percent_gauge_.Set(static_cast(values[2])); + if(numDevide >= 4) GPU3_percent_gauge_.Set(static_cast(values[3])); + if(numDevide >= 5) GPU4_percent_gauge_.Set(static_cast(values[4])); + if(numDevide >= 6) GPU5_percent_gauge_.Set(static_cast(values[5])); + if(numDevide >= 7) GPU6_percent_gauge_.Set(static_cast(values[6])); + if(numDevide >= 8) GPU7_percent_gauge_.Set(static_cast(values[7])); + + // to do +} + +void PrometheusMetrics::GPUMemoryUsageGaugeSet() { + if(!startup_) return; + int numDevide = server::SystemInfo::GetInstance().num_device(); + std::vector values = server::SystemInfo::GetInstance().GPUMemoryUsed(); + constexpr unsigned long long MtoB = 1024*1024; + int numDevice = values.size(); +// for (int i = 0; i < numDevice; ++i) { +// GPU_memory_usage_gauges_[i].Set(values[i]/MtoB); +// } + if(numDevice >=1) GPU0_memory_usage_gauge_.Set(values[0]/MtoB); + if(numDevice >=2) GPU1_memory_usage_gauge_.Set(values[1]/MtoB); + if(numDevice >=3) GPU2_memory_usage_gauge_.Set(values[2]/MtoB); + if(numDevice >=4) GPU3_memory_usage_gauge_.Set(values[3]/MtoB); + if(numDevice >=5) GPU4_memory_usage_gauge_.Set(values[4]/MtoB); + if(numDevice >=6) GPU5_memory_usage_gauge_.Set(values[5]/MtoB); + if(numDevice >=7) GPU6_memory_usage_gauge_.Set(values[6]/MtoB); + if(numDevice >=8) GPU7_memory_usage_gauge_.Set(values[7]/MtoB); + + // to do +} +void PrometheusMetrics::AddVectorsPerSecondGaugeSet(int num_vector, int dim, double time) { + // MB/s + if(!startup_) return; + + long long MtoB = 1024*1024; + long long size = num_vector * dim * 4; + add_vectors_per_second_gauge_.Set(size/time/MtoB); + +} +void PrometheusMetrics::QueryIndexTypePerSecondSet(std::string type, double value) { + if(!startup_) return; + if(type == "IVF"){ + 
query_index_IVF_type_per_second_gauge_.Set(value); + } else if(type == "IDMap"){ + query_index_IDMAP_type_per_second_gauge_.Set(value); + } + +} +void PrometheusMetrics::ConnectionGaugeIncrement() { + if(!startup_) return; + connection_gauge_.Increment(); +} +void PrometheusMetrics::ConnectionGaugeDecrement() { + if(!startup_) return; + connection_gauge_.Decrement(); +} + +//void PrometheusMetrics::GpuPercentInit() { +// int num_device = SystemInfo::GetInstance().num_device(); +// constexpr char device_number[] = "DeviceNum"; +// for(int i = 0; i < num_device; ++ i) { +// GPU_percent_gauges_.emplace_back(GPU_percent_.Add({{device_number,std::to_string(i)}})); +// } +// +//} +//void PrometheusMetrics::GpuMemoryInit() { +// int num_device = SystemInfo::GetInstance().num_device(); +// constexpr char device_number[] = "DeviceNum"; +// for(int i = 0; i < num_device; ++ i) { +// GPU_memory_usage_gauges_.emplace_back(GPU_memory_usage_.Add({{device_number,std::to_string(i)}})); +// } +//} + + } } } diff --git a/cpp/src/metrics/PrometheusMetrics.h b/cpp/src/metrics/PrometheusMetrics.h index cfb127968f..fc2bef6f60 100644 --- a/cpp/src/metrics/PrometheusMetrics.h +++ b/cpp/src/metrics/PrometheusMetrics.h @@ -49,6 +49,8 @@ class PrometheusMetrics: public MetricsBase { std::shared_ptr exposer_ptr_; std::shared_ptr registry_ = std::make_shared(); bool startup_ = false; +// void GpuPercentInit(); +// void GpuMemoryInit(); public: void AddGroupSuccessTotalIncrement(double value = 1.0) override { if(startup_) add_group_success_total_.Increment(value);}; @@ -104,51 +106,20 @@ class PrometheusMetrics: public MetricsBase { void AddVectorsFailGaugeSet(double value) override { if(startup_) add_vectors_fail_gauge_.Set(value);}; void QueryVectorResponseSummaryObserve(double value, int count = 1) override { if (startup_) for(int i = 0 ; i < count ; ++i) query_vector_response_summary_.Observe(value);}; void QueryVectorResponsePerSecondGaugeSet(double value) override {if (startup_) 
query_vector_response_per_second_gauge_.Set(value);}; + void CPUUsagePercentSet() override ; + void RAMUsagePercentSet() override ; + void QueryResponsePerSecondGaugeSet(double value) override {if(startup_) query_response_per_second_gauge.Set(value);}; + void GPUPercentGaugeSet() override ; + void GPUMemoryUsageGaugeSet() override ; + void AddVectorsPerSecondGaugeSet(int num_vector, int dim, double time) override ; + void QueryIndexTypePerSecondSet(std::string type, double value) override ; + void ConnectionGaugeIncrement() override ; + void ConnectionGaugeDecrement() override ; + void KeepingAliveCounterIncrement(double value = 1) override {if(startup_) keeping_alive_counter_.Increment(value);}; -// prometheus::Counter &connection_total() {return connection_total_; } -// -// prometheus::Counter &add_group_success_total() { return add_group_success_total_; } -// prometheus::Counter &add_group_fail_total() { return add_group_fail_total_; } -// -// prometheus::Counter &get_group_success_total() { return get_group_success_total_;} -// prometheus::Counter &get_group_fail_total() { return get_group_fail_total_;} -// -// prometheus::Counter &has_group_success_total() { return has_group_success_total_;} -// prometheus::Counter &has_group_fail_total() { return has_group_fail_total_;} -// -// prometheus::Counter &get_group_files_success_total() { return get_group_files_success_total_;}; -// prometheus::Counter &get_group_files_fail_total() { return get_group_files_fail_total_;} -// -// prometheus::Counter &add_vectors_success_total() { return add_vectors_success_total_; } -// prometheus::Counter &add_vectors_fail_total() { return add_vectors_fail_total_; } -// -// prometheus::Histogram &add_vectors_duration_histogram() { return add_vectors_duration_histogram_;} -// -// prometheus::Counter &search_success_total() { return search_success_total_; } -// prometheus::Counter &search_fail_total() { return search_fail_total_; } -// -// prometheus::Histogram 
&search_duration_histogram() { return search_duration_histogram_; } -// prometheus::Histogram &raw_files_size_histogram() { return raw_files_size_histogram_; } -// prometheus::Histogram &index_files_size_histogram() { return index_files_size_histogram_; } -// -// prometheus::Histogram &build_index_duration_seconds_histogram() { return build_index_duration_seconds_histogram_; } -// -// prometheus::Histogram &all_build_index_duration_seconds_histogram() { return all_build_index_duration_seconds_histogram_; } -// -// prometheus::Gauge &cache_usage_gauge() { return cache_usage_gauge_; } -// -// prometheus::Counter &meta_visit_total() { return meta_visit_total_; } -// -// prometheus::Histogram &meta_visit_duration_seconds_histogram() { return meta_visit_duration_seconds_histogram_; } -// -// prometheus::Gauge &mem_usage_percent_gauge() { return mem_usage_percent_gauge_; } -// -// prometheus::Gauge &mem_usage_total_gauge() { return mem_usage_total_gauge_; } - - std::shared_ptr &exposer_ptr() {return exposer_ptr_; } @@ -273,7 +244,7 @@ class PrometheusMetrics: public MetricsBase { .Name("build_index_duration_microseconds") .Help("histogram of processing time for building index") .Register(*registry_); - prometheus::Histogram &build_index_duration_seconds_histogram_ = build_index_duration_seconds_.Add({}, BucketBoundaries{2e6, 4e6, 6e6, 8e6, 1e7}); + prometheus::Histogram &build_index_duration_seconds_histogram_ = build_index_duration_seconds_.Add({}, BucketBoundaries{5e5, 2e6, 4e6, 6e6, 8e6, 1e7}); //record processing time for all building index @@ -414,6 +385,12 @@ class PrometheusMetrics: public MetricsBase { .Register(*registry_); prometheus::Gauge &query_vector_response_per_second_gauge_ = query_vector_response_per_second_.Add({}); + prometheus::Family &query_response_per_second_ = prometheus::BuildGauge() + .Name("query_response_per_microsecond") + .Help("the number of queries can be processed every microsecond") + .Register(*registry_); + prometheus::Gauge 
&query_response_per_second_gauge = query_response_per_second_.Add({}); + prometheus::Family &disk_store_IO_speed_ = prometheus::BuildGauge() .Name("disk_store_IO_speed_bytes_per_microseconds") .Help("disk_store_IO_speed") @@ -433,6 +410,77 @@ class PrometheusMetrics: public MetricsBase { prometheus::Gauge &add_vectors_success_gauge_ = add_vectors_.Add({{"outcome", "success"}}); prometheus::Gauge &add_vectors_fail_gauge_ = add_vectors_.Add({{"outcome", "fail"}}); + prometheus::Family &add_vectors_per_second_ = prometheus::BuildGauge() + .Name("add_vectors_throughput_per_microsecond") + .Help("add vectors throughput per microsecond") + .Register(*registry_); + prometheus::Gauge &add_vectors_per_second_gauge_ = add_vectors_per_second_.Add({}); + + prometheus::Family &CPU_ = prometheus::BuildGauge() + .Name("CPU_usage_percent") + .Help("CPU usage percent by this this process") + .Register(*registry_); + prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({}); + + prometheus::Family &RAM_ = prometheus::BuildGauge() + .Name("RAM_usage_percent") + .Help("RAM usage percent by this process") + .Register(*registry_); + prometheus::Gauge &RAM_usage_percent_ = RAM_.Add({}); + + //GPU Usage Percent + prometheus::Family &GPU_percent_ = prometheus::BuildGauge() + .Name("Gpu_usage_percent") + .Help("GPU_usage_percent ") + .Register(*registry_); + prometheus::Gauge &GPU0_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "0"}}); + prometheus::Gauge &GPU1_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "1"}}); + prometheus::Gauge &GPU2_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "2"}}); + prometheus::Gauge &GPU3_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "3"}}); + prometheus::Gauge &GPU4_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "4"}}); + prometheus::Gauge &GPU5_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "5"}}); + prometheus::Gauge &GPU6_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "6"}}); + prometheus::Gauge &GPU7_percent_gauge_ = 
GPU_percent_.Add({{"DeviceNum", "7"}}); +// std::vector GPU_percent_gauges_; + + + + + //GPU Mempry used + prometheus::Family &GPU_memory_usage_ = prometheus::BuildGauge() + .Name("GPU_memory_usage_total") + .Help("GPU memory usage total ") + .Register(*registry_); + prometheus::Gauge &GPU0_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "0"}}); + prometheus::Gauge &GPU1_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "1"}}); + prometheus::Gauge &GPU2_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "2"}}); + prometheus::Gauge &GPU3_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "3"}}); + prometheus::Gauge &GPU4_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "4"}}); + prometheus::Gauge &GPU5_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "5"}}); + prometheus::Gauge &GPU6_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "6"}}); + prometheus::Gauge &GPU7_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "7"}}); +// std::vector GPU_memory_usage_gauges_; + + prometheus::Family &query_index_type_per_second_ = prometheus::BuildGauge() + .Name("query_index_throughtout_per_microsecond") + .Help("query index throughtout per microsecond") + .Register(*registry_); + prometheus::Gauge &query_index_IVF_type_per_second_gauge_ = query_index_type_per_second_.Add({{"IndexType","IVF"}}); + prometheus::Gauge &query_index_IDMAP_type_per_second_gauge_ = query_index_type_per_second_.Add({{"IndexType","IDMAP"}}); + + prometheus::Family &connection_ = prometheus::BuildGauge() + .Name("connection_number") + .Help("the number of connections") + .Register(*registry_); + prometheus::Gauge &connection_gauge_ = connection_.Add({}); + + prometheus::Family &keeping_alive_ = prometheus::BuildCounter() + .Name("keeping_alive_seconds_total") + .Help("total seconds of the serve alive") + .Register(*registry_); + prometheus::Counter &keeping_alive_counter_ = keeping_alive_.Add({}); + + }; diff --git 
a/cpp/src/metrics/SystemInfo.cpp b/cpp/src/metrics/SystemInfo.cpp
new file mode 100644
index 0000000000..210817f856
--- /dev/null
+++ b/cpp/src/metrics/SystemInfo.cpp
@@ -0,0 +1,192 @@
+/*******************************************************************************
+ * Copyright 上海赜睿信息科技有限公司(Zilliz) - All Rights Reserved
+ * Unauthorized copying of this file, via any medium is strictly prohibited.
+ * Proprietary and confidential.
+ ******************************************************************************/
+
+#include "SystemInfo.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include "nvml.h"
+
+
+namespace zilliz {
+namespace vecwise {
+namespace server {
+
+// Collects the processor count, total RAM and GPU count and takes the first
+// CPU-time sample used by CPUPercent(). Only the first call does any work.
+void SystemInfo::Init() {
+    if(initialized_) return;
+
+    initialized_ = true;
+
+    // initialize CPU information
+    struct tms time_sample;
+    last_cpu_ = times(&time_sample);
+    last_sys_cpu_ = time_sample.tms_stime;
+    last_user_cpu_ = time_sample.tms_utime;
+    num_processors_ = 0;
+    // fopen can fail (e.g. /proc not mounted); never pass NULL to fgets/fclose.
+    FILE* file = fopen("/proc/cpuinfo", "r");
+    if (file != NULL) {
+        char line[128];
+        while(fgets(line, sizeof(line), file) != NULL){
+            if (strncmp(line, "processor", 9) == 0) num_processors_++;
+        }
+        fclose(file);
+    }
+    total_ram_ = GetPhysicalMemory();
+
+    //initialize GPU information
+    nvmlReturn_t nvmlresult;
+    nvmlresult = nvmlInit();
+    if(NVML_SUCCESS != nvmlresult) {
+        printf("System information initilization failed");
+        return ;
+    }
+    nvmlresult = nvmlDeviceGetCount(&num_device_);
+    if(NVML_SUCCESS != nvmlresult) {
+        printf("Unable to get devidce number");
+        // Do not trust the out-parameter of a failed call.
+        num_device_ = 0;
+        return ;
+    }
+}
+
+// Returns the first decimal number found in `line`, or -1 when `line` is NULL
+// or contains no digit. The buffer is not modified.
+long long
+SystemInfo::ParseLine(char *line) {
+    if (line == NULL) return -1;
+    const char *p = line;
+    // Stop at the terminator so a digit-free line cannot be read out of bounds.
+    while (*p != '\0' && (*p < '0' || *p > '9')) p++;
+    if (*p == '\0') return -1;
+    // strtoll stops at the first non-digit, so the trailing unit needs no trimming.
+    return strtoll(p, NULL, 10);
+}
+
+// Total physical RAM (totalram * mem_unit from sysinfo), or 0 if sysinfo() fails.
+unsigned long
+SystemInfo::GetPhysicalMemory() {
+    struct sysinfo memInfo;
+    if (sysinfo(&memInfo) != 0) return 0;
+    unsigned long totalPhysMem = memInfo.totalram;
+    //Multiply in next statement to avoid int overflow on right hand side...
+    totalPhysMem *= memInfo.mem_unit;
+    return totalPhysMem;
+}
+
+// Value of the VmRSS entry of /proc/self/status converted to bytes, or 0 when
+// the file cannot be opened or has no parsable VmRSS entry.
+unsigned long
+SystemInfo::GetProcessUsedMemory() {
+    FILE* file = fopen("/proc/self/status", "r");
+    if (file == NULL) return 0;
+
+    constexpr int line_length = 128;
+    constexpr unsigned long KB_SIZE = 1024;
+    long long result = -1;
+    char line[line_length];
+
+    while (fgets(line, line_length, file) != NULL){
+        if (strncmp(line, "VmRSS:", 6) == 0){
+            //Note: this value is in KB!
+            result = ParseLine(line);
+            break;
+        }
+    }
+    fclose(file);
+    // A negative "not found" result would wrap to a huge unsigned value.
+    if (result < 0) return 0;
+    // return value in Byte
+    return static_cast<unsigned long>(result) * KB_SIZE;
+}
+
+// Share of physical RAM used by this process, in percent (0.0 when total RAM is unknown).
+double
+SystemInfo::MemoryPercent() {
+    if (!initialized_) Init();
+    if (total_ram_ == 0) return 0.0;
+    // Divide in floating point; integer division truncated the percentage.
+    return static_cast<double>(GetProcessUsedMemory()) * 100.0 / static_cast<double>(total_ram_);
+}
+
+// CPU time used by this process since the previous call, as a percentage of
+// all processors; -1.0 when no valid sample is available.
+double
+SystemInfo::CPUPercent() {
+    if (!initialized_) Init();
+    struct tms time_sample;
+    clock_t now;
+    double percent;
+
+    now = times(&time_sample);
+    if (now <= last_cpu_ || time_sample.tms_stime < last_sys_cpu_ ||
+        time_sample.tms_utime < last_user_cpu_ || num_processors_ <= 0){
+        //Overflow detection (or unknown processor count). Just skip this value.
+        percent = -1.0;
+    }
+    else{
+        percent = (time_sample.tms_stime - last_sys_cpu_) +
+            (time_sample.tms_utime - last_user_cpu_);
+        percent /= (now - last_cpu_);
+        percent /= num_processors_;
+        percent *= 100;
+    }
+    last_cpu_ = now;
+    last_sys_cpu_ = time_sample.tms_stime;
+    last_user_cpu_ = time_sample.tms_utime;
+
+    return percent;
+}
+
+// GPU utilization of every device, indexed by device number. A device that
+// cannot be queried reports 0 so the indices stay aligned.
+std::vector<unsigned int>
+SystemInfo::GPUPercent() {
+    // get GPU usage percent
+    if(!initialized_) Init();
+    std::vector<unsigned int> result;
+    result.reserve(num_device_);
+    for (unsigned int i = 0; i < num_device_; ++i) {
+        nvmlDevice_t device;
+        nvmlUtilization_t utilization;
+        unsigned int gpu = 0;
+        if (NVML_SUCCESS == nvmlDeviceGetHandleByIndex(i, &device) &&
+            NVML_SUCCESS == nvmlDeviceGetUtilizationRates(device, &utilization)) {
+            gpu = utilization.gpu;
+        }
+        result.push_back(gpu);
+    }
+    return result;
+}
+
+// Used GPU memory of every device as reported by NVML, indexed by device
+// number. A device that cannot be queried reports 0.
+std::vector<unsigned long long>
+SystemInfo::GPUMemoryUsed() {
+    // get GPU memory used
+    if(!initialized_) Init();
+    std::vector<unsigned long long> result;
+    result.reserve(num_device_);
+    for (unsigned int i = 0; i < num_device_; ++i) {
+        nvmlDevice_t device;
+        nvmlMemory_t nvmlMemory;
+        unsigned long long used = 0;
+        if (NVML_SUCCESS == nvmlDeviceGetHandleByIndex(i, &device) &&
+            NVML_SUCCESS == nvmlDeviceGetMemoryInfo(device, &nvmlMemory)) {
+            used = nvmlMemory.used;
+        }
+        result.push_back(used);
+    }
+    return result;
+}
+
+}
+}
+}
diff --git a/cpp/src/metrics/SystemInfo.h b/cpp/src/metrics/SystemInfo.h
new file mode 100644
index 0000000000..042358c3df
--- /dev/null
+++ b/cpp/src/metrics/SystemInfo.h
@@ -0,0 +1,61 @@
+/*******************************************************************************
+ * Copyright 上海赜睿信息科技有限公司(Zilliz) - All Rights Reserved
+ * Unauthorized copying of this file, via any medium is strictly prohibited.
+ * Proprietary and confidential.
+ ******************************************************************************/
+
+#pragma once
+
+#include "sys/types.h"
+#include "sys/sysinfo.h"
+#include "stdlib.h"
+#include "stdio.h"
+#include "string.h"
+#include "sys/times.h"
+#include "sys/vtimes.h"
+
+#include <unordered_map>
+#include <vector>
+
+
+
+namespace zilliz {
+namespace vecwise {
+namespace server {
+
+// Singleton that samples CPU, RAM and GPU (NVML) usage of the host/process.
+// Init() and CPUPercent() update members without locking.
+class SystemInfo {
+ private:
+    unsigned long total_ram_ = 0;
+    clock_t last_cpu_ = clock_t();
+    clock_t last_sys_cpu_ = clock_t();
+    clock_t last_user_cpu_ = clock_t();
+    int num_processors_ = 0;
+    //number of GPU
+    unsigned int num_device_ = 0;
+    bool initialized_ = false;
+
+ public:
+    static SystemInfo &
+    GetInstance(){
+        static SystemInfo instance;
+        return instance;
+    }
+
+    void Init();
+    int num_device() const {return static_cast<int>(num_device_);}
+    long long ParseLine(char* line);
+    unsigned long GetPhysicalMemory();
+    unsigned long GetProcessUsedMemory();
+    double MemoryPercent();
+    double CPUPercent();
+    // Both vectors hold one entry per GPU, indexed by device number.
+    std::vector<unsigned int> GPUPercent();
+    std::vector<unsigned long long> GPUMemoryUsed();
+
+};
+
+}
+}
+}
diff --git a/cpp/src/server/MegasearchServer.cpp b/cpp/src/server/MegasearchServer.cpp
index f771fc4dd8..459402c879 100644
--- a/cpp/src/server/MegasearchServer.cpp
+++ b/cpp/src/server/MegasearchServer.cpp
@@ -8,6 +8,7 @@
 #include 
"megasearch_types.h" #include "megasearch_constants.h" #include "ServerConfig.h" +#include "MegasearchThreadPoolServer.h" #include #include @@ -76,7 +77,7 @@ MegasearchServer::StartService() { threadManager->threadFactory(threadFactory); threadManager->start(); - s_server.reset(new TThreadPoolServer(processor, + s_server.reset(new MegasearchThreadPoolServer(processor, server_transport, transport_factory, protocol_factory, diff --git a/cpp/src/server/MegasearchThreadPoolServer.cpp b/cpp/src/server/MegasearchThreadPoolServer.cpp new file mode 100644 index 0000000000..d227442a45 --- /dev/null +++ b/cpp/src/server/MegasearchThreadPoolServer.cpp @@ -0,0 +1,34 @@ +/******************************************************************************* + * Copyright 上海赜睿信息科技有限公司(Zilliz) - All Rights Reserved + * Unauthorized copying of this file, via any medium is strictly prohibited. + * Proprietary and confidential. + ******************************************************************************/ +#include "metrics/Metrics.h" +#include "MegasearchThreadPoolServer.h" + +namespace zilliz { +namespace vecwise { +namespace server { + +void +MegasearchThreadPoolServer::onClientConnected(const std::shared_ptr &pClient) { + server::Metrics::GetInstance().ConnectionGaugeIncrement(); + TThreadPoolServer::onClientConnected(pClient); +} + +void +MegasearchThreadPoolServer::onClientDisconnected(apache::thrift::server::TConnectedClient *pClient) { + server::Metrics::GetInstance().ConnectionGaugeDecrement(); + TThreadPoolServer::onClientDisconnected(pClient); +} +zilliz::vecwise::server::MegasearchThreadPoolServer::MegasearchThreadPoolServer(const std::shared_ptr &processor, + const std::shared_ptr &serverTransport, + const std::shared_ptr &transportFactory, + const std::shared_ptr &protocolFactory, + const std::shared_ptr &threadManager) + : TThreadPoolServer(processor, serverTransport, transportFactory, protocolFactory, threadManager) { + +} +} +} +} \ No newline at end of file diff --git 
a/cpp/src/server/MegasearchThreadPoolServer.h b/cpp/src/server/MegasearchThreadPoolServer.h new file mode 100644 index 0000000000..309c17ef3f --- /dev/null +++ b/cpp/src/server/MegasearchThreadPoolServer.h @@ -0,0 +1,33 @@ +/******************************************************************************* + * Copyright 上海赜睿信息科技有限公司(Zilliz) - All Rights Reserved + * Unauthorized copying of this file, via any medium is strictly prohibited. + * Proprietary and confidential. + ******************************************************************************/ + +#pragma once + +#include + + +namespace zilliz { +namespace vecwise { +namespace server { + +class MegasearchThreadPoolServer : public apache::thrift::server::TThreadPoolServer { + public: + MegasearchThreadPoolServer( + const std::shared_ptr& processor, + const std::shared_ptr& serverTransport, + const std::shared_ptr& transportFactory, + const std::shared_ptr& protocolFactory, + const std::shared_ptr& threadManager + = apache::thrift::concurrency::ThreadManager::newSimpleThreadManager()); + + protected: + void onClientConnected(const std::shared_ptr& pClient) override ; + void onClientDisconnected(apache::thrift::server::TConnectedClient* pClient) override ; +}; + +} +} +} \ No newline at end of file diff --git a/cpp/src/server/Server.cpp b/cpp/src/server/Server.cpp index 8480c2450c..fa093c1bca 100644 --- a/cpp/src/server/Server.cpp +++ b/cpp/src/server/Server.cpp @@ -173,6 +173,7 @@ Server::Start() { signal(SIGHUP, SignalUtil::HandleSignal); signal(SIGTERM, SignalUtil::HandleSignal); server::Metrics::GetInstance().Init(); + server::SystemInfo::GetInstance().Init(); SERVER_LOG_INFO << "Vecwise server is running..."; StartService(); diff --git a/cpp/unittest/CMakeLists.txt b/cpp/unittest/CMakeLists.txt index addce57da1..4db6674272 100644 --- a/cpp/unittest/CMakeLists.txt +++ b/cpp/unittest/CMakeLists.txt @@ -32,6 +32,7 @@ set(unittest_libs civetweb dl z + ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so ) 
add_subdirectory(server)