mirror of
https://gitee.com/milvus-io/milvus.git
synced 2026-01-01 08:28:10 +08:00
fix merge conflicts
Former-commit-id: c6d304468fb8e4ac35a8b16b5090ac0c78d38af7
This commit is contained in:
commit
4e1d514c73
@ -15,6 +15,7 @@ Please mark all change in change log and use the ticket from JIRA.
|
||||
- MS-64 - Different table can have different index type
|
||||
- MS-52 - Return search score
|
||||
- MS-66 - Support time range query
|
||||
- MS-68 - Remove rocksdb from third-party
|
||||
- MS-70 - cmake: remove redundant libs in src
|
||||
|
||||
## Task
|
||||
@ -45,6 +46,7 @@ Please mark all change in change log and use the ticket from JIRA.
|
||||
- MS-37 - Add query, cache usage, disk write speed and file data size metrics
|
||||
- MS-30 - Use faiss v1.5.2
|
||||
- MS-54 - cmake: Change Thrift third party URL to github.com
|
||||
- MS-69 - prometheus: add all proposed metrics
|
||||
|
||||
## Task
|
||||
|
||||
|
||||
@ -81,7 +81,7 @@ define_option(MEGASEARCH_WITH_OPENBLAS "Build with OpenBLAS library" ON)
|
||||
|
||||
define_option(MEGASEARCH_WITH_PROMETHEUS "Build with PROMETHEUS library" ON)
|
||||
|
||||
define_option(MEGASEARCH_WITH_ROCKSDB "Build with RocksDB library" ON)
|
||||
define_option(MEGASEARCH_WITH_ROCKSDB "Build with RocksDB library" OFF)
|
||||
|
||||
define_option(MEGASEARCH_WITH_SNAPPY "Build with Snappy compression" ON)
|
||||
|
||||
|
||||
@ -34,6 +34,10 @@ set(service_files
|
||||
thrift/gen-cpp/MegasearchService.cpp
|
||||
thrift/gen-cpp/megasearch_constants.cpp
|
||||
thrift/gen-cpp/megasearch_types.cpp
|
||||
metrics/SystemInfo.cpp
|
||||
metrics/SystemInfo.h
|
||||
server/MegasearchThreadPoolServer.cpp
|
||||
server/MegasearchThreadPoolServer.h
|
||||
)
|
||||
|
||||
set(vecwise_engine_files
|
||||
@ -73,6 +77,7 @@ set(third_party_libs
|
||||
snappy
|
||||
zlib
|
||||
zstd
|
||||
${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so
|
||||
)
|
||||
|
||||
if (GPU_VERSION STREQUAL "ON")
|
||||
@ -84,6 +89,7 @@ if (GPU_VERSION STREQUAL "ON")
|
||||
libquadmath.a
|
||||
cudart
|
||||
cublas
|
||||
${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so
|
||||
)
|
||||
else()
|
||||
set(engine_libs
|
||||
@ -91,6 +97,7 @@ else()
|
||||
libgomp.a
|
||||
libgfortran.a
|
||||
libquadmath.a
|
||||
${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so
|
||||
)
|
||||
endif ()
|
||||
|
||||
@ -131,7 +138,6 @@ set(server_libs
|
||||
pthread
|
||||
dl
|
||||
metrics
|
||||
|
||||
)
|
||||
|
||||
add_executable(vecwise_server
|
||||
|
||||
@ -108,6 +108,7 @@ Status DBImpl::InsertVectors(const std::string& table_id_,
|
||||
|
||||
CollectInsertMetrics(total_time, n, status.ok());
|
||||
return status;
|
||||
|
||||
}
|
||||
|
||||
Status DBImpl::Query(const std::string &table_id, size_t k, size_t nq,
|
||||
@ -119,6 +120,7 @@ Status DBImpl::Query(const std::string &table_id, size_t k, size_t nq,
|
||||
auto total_time = METRICS_MICROSECONDS(start_time,end_time);
|
||||
|
||||
CollectQueryMetrics(total_time, nq);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -304,17 +306,23 @@ void DBImpl::StartTimerTasks(int interval) {
|
||||
|
||||
void DBImpl::BackgroundTimerTask(int interval) {
|
||||
Status status;
|
||||
server::SystemInfo::GetInstance().Init();
|
||||
while (true) {
|
||||
if (!bg_error_.ok()) break;
|
||||
if (shutting_down_.load(std::memory_order_acquire)) break;
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::seconds(interval));
|
||||
int64_t cache_total = cache::CpuCacheMgr::GetInstance()->CacheUsage();
|
||||
LOG(DEBUG) << "Cache usage " << cache_total;
|
||||
server::Metrics::GetInstance().CacheUsageGaugeSet(static_cast<double>(cache_total));
|
||||
server::Metrics::GetInstance().KeepingAliveCounterIncrement(interval);
|
||||
int64_t cache_usage = cache::CpuCacheMgr::GetInstance()->CacheUsage();
|
||||
int64_t cache_total = cache::CpuCacheMgr::GetInstance()->CacheCapacity();
|
||||
server::Metrics::GetInstance().CacheUsageGaugeSet(cache_usage*100/cache_total);
|
||||
long size;
|
||||
Size(size);
|
||||
server::Metrics::GetInstance().DataFileSizeGaugeSet(size);
|
||||
server::Metrics::GetInstance().CPUUsagePercentSet();
|
||||
server::Metrics::GetInstance().RAMUsagePercentSet();
|
||||
server::Metrics::GetInstance().GPUPercentGaugeSet();
|
||||
server::Metrics::GetInstance().GPUMemoryUsageGaugeSet();
|
||||
TrySchedule();
|
||||
}
|
||||
}
|
||||
|
||||
@ -131,8 +131,11 @@ Status FaissExecutionEngine::Search(long n,
|
||||
long k,
|
||||
float *distances,
|
||||
long *labels) const {
|
||||
|
||||
auto start_time = METRICS_NOW_TIME;
|
||||
pIndex_->search(n, data, k, distances, labels);
|
||||
auto end_time = METRICS_NOW_TIME;
|
||||
auto total_time = METRICS_MICROSECONDS(start_time,end_time);
|
||||
server::Metrics::GetInstance().QueryIndexTypePerSecondSet(build_index_type_, double(n)/double(total_time));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
||||
@ -27,9 +27,14 @@ MemVectors::MemVectors(const std::shared_ptr<meta::Meta>& meta_ptr,
|
||||
pEE_(EngineFactory::Build(schema_.dimension_, schema_.location_, (EngineType)schema_.engine_type_)) {
|
||||
}
|
||||
|
||||
|
||||
// Generates IDs for `n_` vectors, hands the vectors plus IDs to the execution
// engine, and reports the elapsed time of both steps to the metrics singleton.
//   n_          number of vectors in `vectors_`
//   vectors_    vector data passed through to the engine
//   vector_ids_ output: filled by the ID generator, then passed to the engine
void MemVectors::Add(size_t n_, const float* vectors_, IDNumbers& vector_ids_) {
    const auto begin = METRICS_NOW_TIME;

    // IDs are produced first because AddWithIds reads the freshly filled buffer.
    pIdGenerator_->GetNextIDNumbers(n_, vector_ids_);
    pEE_->AddWithIds(n_, vectors_, vector_ids_.data());

    const auto finish = METRICS_NOW_TIME;
    const auto elapsed = METRICS_MICROSECONDS(begin, finish);

    const int vector_count = static_cast<int>(n_);
    const int dimension = static_cast<int>(schema_.dimension_);
    server::Metrics::GetInstance().AddVectorsPerSecondGaugeSet(vector_count, dimension, elapsed);
}
|
||||
|
||||
size_t MemVectors::Total() const {
|
||||
@ -97,6 +102,7 @@ Status MemManager::InsertVectors(const std::string& table_id_,
|
||||
const float* vectors_,
|
||||
IDNumbers& vector_ids_) {
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
|
||||
return InsertVectorsNoLock(table_id_, n_, vectors_, vector_ids_);
|
||||
}
|
||||
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
|
||||
#include "utils/Error.h"
|
||||
#include "server/ServerConfig.h"
|
||||
#include "SystemInfo.h"
|
||||
|
||||
namespace zilliz {
|
||||
namespace vecwise {
|
||||
@ -71,6 +72,16 @@ class MetricsBase{
|
||||
virtual void AddVectorsFailGaugeSet(double value) {};
|
||||
virtual void QueryVectorResponseSummaryObserve(double value, int count = 1) {};
|
||||
virtual void QueryVectorResponsePerSecondGaugeSet(double value) {};
|
||||
virtual void CPUUsagePercentSet() {};
|
||||
virtual void RAMUsagePercentSet() {};
|
||||
virtual void QueryResponsePerSecondGaugeSet(double value) {};
|
||||
virtual void GPUPercentGaugeSet() {};
|
||||
virtual void GPUMemoryUsageGaugeSet() {};
|
||||
virtual void AddVectorsPerSecondGaugeSet(int num_vector, int dim, double time) {};
|
||||
virtual void QueryIndexTypePerSecondSet(std::string type, double value) {};
|
||||
virtual void ConnectionGaugeIncrement() {};
|
||||
virtual void ConnectionGaugeDecrement() {};
|
||||
virtual void KeepingAliveCounterIncrement(double value = 1) {};
|
||||
};
|
||||
|
||||
|
||||
|
||||
@ -6,6 +6,8 @@
|
||||
|
||||
#include "PrometheusMetrics.h"
|
||||
#include "utils/Log.h"
|
||||
#include "SystemInfo.h"
|
||||
|
||||
|
||||
namespace zilliz {
|
||||
namespace vecwise {
|
||||
@ -32,8 +34,108 @@ PrometheusMetrics::Init() {
|
||||
}
|
||||
|
||||
return SERVER_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
PrometheusMetrics::CPUUsagePercentSet() {
|
||||
if(!startup_) return ;
|
||||
double usage_percent = server::SystemInfo::GetInstance().CPUPercent();
|
||||
CPU_usage_percent_.Set(usage_percent);
|
||||
}
|
||||
|
||||
void
|
||||
PrometheusMetrics::RAMUsagePercentSet() {
|
||||
if(!startup_) return ;
|
||||
double usage_percent = server::SystemInfo::GetInstance().MemoryPercent();
|
||||
RAM_usage_percent_.Set(usage_percent);
|
||||
}
|
||||
|
||||
void
|
||||
PrometheusMetrics::GPUPercentGaugeSet() {
|
||||
if(!startup_) return;
|
||||
int numDevide = server::SystemInfo::GetInstance().num_device();
|
||||
std::vector<unsigned int> values = server::SystemInfo::GetInstance().GPUPercent();
|
||||
// for (int i = 0; i < numDevide; ++i) {
|
||||
// GPU_percent_gauges_[i].Set(static_cast<double>(values[i]));
|
||||
// }
|
||||
if(numDevide >= 1) GPU0_percent_gauge_.Set(static_cast<double>(values[0]));
|
||||
if(numDevide >= 2) GPU1_percent_gauge_.Set(static_cast<double>(values[1]));
|
||||
if(numDevide >= 3) GPU2_percent_gauge_.Set(static_cast<double>(values[2]));
|
||||
if(numDevide >= 4) GPU3_percent_gauge_.Set(static_cast<double>(values[3]));
|
||||
if(numDevide >= 5) GPU4_percent_gauge_.Set(static_cast<double>(values[4]));
|
||||
if(numDevide >= 6) GPU5_percent_gauge_.Set(static_cast<double>(values[5]));
|
||||
if(numDevide >= 7) GPU6_percent_gauge_.Set(static_cast<double>(values[6]));
|
||||
if(numDevide >= 8) GPU7_percent_gauge_.Set(static_cast<double>(values[7]));
|
||||
|
||||
// to do
|
||||
}
|
||||
|
||||
void PrometheusMetrics::GPUMemoryUsageGaugeSet() {
|
||||
if(!startup_) return;
|
||||
int numDevide = server::SystemInfo::GetInstance().num_device();
|
||||
std::vector<unsigned long long> values = server::SystemInfo::GetInstance().GPUMemoryUsed();
|
||||
constexpr unsigned long long MtoB = 1024*1024;
|
||||
int numDevice = values.size();
|
||||
// for (int i = 0; i < numDevice; ++i) {
|
||||
// GPU_memory_usage_gauges_[i].Set(values[i]/MtoB);
|
||||
// }
|
||||
if(numDevice >=1) GPU0_memory_usage_gauge_.Set(values[0]/MtoB);
|
||||
if(numDevice >=2) GPU1_memory_usage_gauge_.Set(values[1]/MtoB);
|
||||
if(numDevice >=3) GPU2_memory_usage_gauge_.Set(values[2]/MtoB);
|
||||
if(numDevice >=4) GPU3_memory_usage_gauge_.Set(values[3]/MtoB);
|
||||
if(numDevice >=5) GPU4_memory_usage_gauge_.Set(values[4]/MtoB);
|
||||
if(numDevice >=6) GPU5_memory_usage_gauge_.Set(values[5]/MtoB);
|
||||
if(numDevice >=7) GPU6_memory_usage_gauge_.Set(values[6]/MtoB);
|
||||
if(numDevice >=8) GPU7_memory_usage_gauge_.Set(values[7]/MtoB);
|
||||
|
||||
// to do
|
||||
}
|
||||
void PrometheusMetrics::AddVectorsPerSecondGaugeSet(int num_vector, int dim, double time) {
|
||||
// MB/s
|
||||
if(!startup_) return;
|
||||
|
||||
long long MtoB = 1024*1024;
|
||||
long long size = num_vector * dim * 4;
|
||||
add_vectors_per_second_gauge_.Set(size/time/MtoB);
|
||||
|
||||
}
|
||||
void PrometheusMetrics::QueryIndexTypePerSecondSet(std::string type, double value) {
|
||||
if(!startup_) return;
|
||||
if(type == "IVF"){
|
||||
query_index_IVF_type_per_second_gauge_.Set(value);
|
||||
} else if(type == "IDMap"){
|
||||
query_index_IDMAP_type_per_second_gauge_.Set(value);
|
||||
}
|
||||
|
||||
}
|
||||
void PrometheusMetrics::ConnectionGaugeIncrement() {
|
||||
if(!startup_) return;
|
||||
connection_gauge_.Increment();
|
||||
}
|
||||
void PrometheusMetrics::ConnectionGaugeDecrement() {
|
||||
if(!startup_) return;
|
||||
connection_gauge_.Decrement();
|
||||
}
|
||||
|
||||
//void PrometheusMetrics::GpuPercentInit() {
|
||||
// int num_device = SystemInfo::GetInstance().num_device();
|
||||
// constexpr char device_number[] = "DeviceNum";
|
||||
// for(int i = 0; i < num_device; ++ i) {
|
||||
// GPU_percent_gauges_.emplace_back(GPU_percent_.Add({{device_number,std::to_string(i)}}));
|
||||
// }
|
||||
//
|
||||
//}
|
||||
//void PrometheusMetrics::GpuMemoryInit() {
|
||||
// int num_device = SystemInfo::GetInstance().num_device();
|
||||
// constexpr char device_number[] = "DeviceNum";
|
||||
// for(int i = 0; i < num_device; ++ i) {
|
||||
// GPU_memory_usage_gauges_.emplace_back(GPU_memory_usage_.Add({{device_number,std::to_string(i)}}));
|
||||
// }
|
||||
//}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -49,6 +49,8 @@ class PrometheusMetrics: public MetricsBase {
|
||||
std::shared_ptr<prometheus::Exposer> exposer_ptr_;
|
||||
std::shared_ptr<prometheus::Registry> registry_ = std::make_shared<prometheus::Registry>();
|
||||
bool startup_ = false;
|
||||
// void GpuPercentInit();
|
||||
// void GpuMemoryInit();
|
||||
public:
|
||||
|
||||
void AddGroupSuccessTotalIncrement(double value = 1.0) override { if(startup_) add_group_success_total_.Increment(value);};
|
||||
@ -104,51 +106,20 @@ class PrometheusMetrics: public MetricsBase {
|
||||
void AddVectorsFailGaugeSet(double value) override { if(startup_) add_vectors_fail_gauge_.Set(value);};
|
||||
void QueryVectorResponseSummaryObserve(double value, int count = 1) override { if (startup_) for(int i = 0 ; i < count ; ++i) query_vector_response_summary_.Observe(value);};
|
||||
void QueryVectorResponsePerSecondGaugeSet(double value) override {if (startup_) query_vector_response_per_second_gauge_.Set(value);};
|
||||
void CPUUsagePercentSet() override ;
|
||||
void RAMUsagePercentSet() override ;
|
||||
void QueryResponsePerSecondGaugeSet(double value) override {if(startup_) query_response_per_second_gauge.Set(value);};
|
||||
void GPUPercentGaugeSet() override ;
|
||||
void GPUMemoryUsageGaugeSet() override ;
|
||||
void AddVectorsPerSecondGaugeSet(int num_vector, int dim, double time) override ;
|
||||
void QueryIndexTypePerSecondSet(std::string type, double value) override ;
|
||||
void ConnectionGaugeIncrement() override ;
|
||||
void ConnectionGaugeDecrement() override ;
|
||||
void KeepingAliveCounterIncrement(double value = 1) override {if(startup_) keeping_alive_counter_.Increment(value);};
|
||||
|
||||
|
||||
|
||||
|
||||
// prometheus::Counter &connection_total() {return connection_total_; }
|
||||
//
|
||||
// prometheus::Counter &add_group_success_total() { return add_group_success_total_; }
|
||||
// prometheus::Counter &add_group_fail_total() { return add_group_fail_total_; }
|
||||
//
|
||||
// prometheus::Counter &get_group_success_total() { return get_group_success_total_;}
|
||||
// prometheus::Counter &get_group_fail_total() { return get_group_fail_total_;}
|
||||
//
|
||||
// prometheus::Counter &has_group_success_total() { return has_group_success_total_;}
|
||||
// prometheus::Counter &has_group_fail_total() { return has_group_fail_total_;}
|
||||
//
|
||||
// prometheus::Counter &get_group_files_success_total() { return get_group_files_success_total_;};
|
||||
// prometheus::Counter &get_group_files_fail_total() { return get_group_files_fail_total_;}
|
||||
//
|
||||
// prometheus::Counter &add_vectors_success_total() { return add_vectors_success_total_; }
|
||||
// prometheus::Counter &add_vectors_fail_total() { return add_vectors_fail_total_; }
|
||||
//
|
||||
// prometheus::Histogram &add_vectors_duration_histogram() { return add_vectors_duration_histogram_;}
|
||||
//
|
||||
// prometheus::Counter &search_success_total() { return search_success_total_; }
|
||||
// prometheus::Counter &search_fail_total() { return search_fail_total_; }
|
||||
//
|
||||
// prometheus::Histogram &search_duration_histogram() { return search_duration_histogram_; }
|
||||
// prometheus::Histogram &raw_files_size_histogram() { return raw_files_size_histogram_; }
|
||||
// prometheus::Histogram &index_files_size_histogram() { return index_files_size_histogram_; }
|
||||
//
|
||||
// prometheus::Histogram &build_index_duration_seconds_histogram() { return build_index_duration_seconds_histogram_; }
|
||||
//
|
||||
// prometheus::Histogram &all_build_index_duration_seconds_histogram() { return all_build_index_duration_seconds_histogram_; }
|
||||
//
|
||||
// prometheus::Gauge &cache_usage_gauge() { return cache_usage_gauge_; }
|
||||
//
|
||||
// prometheus::Counter &meta_visit_total() { return meta_visit_total_; }
|
||||
//
|
||||
// prometheus::Histogram &meta_visit_duration_seconds_histogram() { return meta_visit_duration_seconds_histogram_; }
|
||||
//
|
||||
// prometheus::Gauge &mem_usage_percent_gauge() { return mem_usage_percent_gauge_; }
|
||||
//
|
||||
// prometheus::Gauge &mem_usage_total_gauge() { return mem_usage_total_gauge_; }
|
||||
|
||||
|
||||
|
||||
|
||||
std::shared_ptr<prometheus::Exposer> &exposer_ptr() {return exposer_ptr_; }
|
||||
@ -273,7 +244,7 @@ class PrometheusMetrics: public MetricsBase {
|
||||
.Name("build_index_duration_microseconds")
|
||||
.Help("histogram of processing time for building index")
|
||||
.Register(*registry_);
|
||||
prometheus::Histogram &build_index_duration_seconds_histogram_ = build_index_duration_seconds_.Add({}, BucketBoundaries{2e6, 4e6, 6e6, 8e6, 1e7});
|
||||
prometheus::Histogram &build_index_duration_seconds_histogram_ = build_index_duration_seconds_.Add({}, BucketBoundaries{5e5, 2e6, 4e6, 6e6, 8e6, 1e7});
|
||||
|
||||
|
||||
//record processing time for all building index
|
||||
@ -414,6 +385,12 @@ class PrometheusMetrics: public MetricsBase {
|
||||
.Register(*registry_);
|
||||
prometheus::Gauge &query_vector_response_per_second_gauge_ = query_vector_response_per_second_.Add({});
|
||||
|
||||
prometheus::Family<prometheus::Gauge> &query_response_per_second_ = prometheus::BuildGauge()
|
||||
.Name("query_response_per_microsecond")
|
||||
.Help("the number of queries can be processed every microsecond")
|
||||
.Register(*registry_);
|
||||
prometheus::Gauge &query_response_per_second_gauge = query_response_per_second_.Add({});
|
||||
|
||||
prometheus::Family<prometheus::Gauge> &disk_store_IO_speed_ = prometheus::BuildGauge()
|
||||
.Name("disk_store_IO_speed_bytes_per_microseconds")
|
||||
.Help("disk_store_IO_speed")
|
||||
@ -433,6 +410,77 @@ class PrometheusMetrics: public MetricsBase {
|
||||
prometheus::Gauge &add_vectors_success_gauge_ = add_vectors_.Add({{"outcome", "success"}});
|
||||
prometheus::Gauge &add_vectors_fail_gauge_ = add_vectors_.Add({{"outcome", "fail"}});
|
||||
|
||||
prometheus::Family<prometheus::Gauge> &add_vectors_per_second_ = prometheus::BuildGauge()
|
||||
.Name("add_vectors_throughput_per_microsecond")
|
||||
.Help("add vectors throughput per microsecond")
|
||||
.Register(*registry_);
|
||||
prometheus::Gauge &add_vectors_per_second_gauge_ = add_vectors_per_second_.Add({});
|
||||
|
||||
prometheus::Family<prometheus::Gauge> &CPU_ = prometheus::BuildGauge()
|
||||
.Name("CPU_usage_percent")
|
||||
.Help("CPU usage percent by this this process")
|
||||
.Register(*registry_);
|
||||
prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({});
|
||||
|
||||
prometheus::Family<prometheus::Gauge> &RAM_ = prometheus::BuildGauge()
|
||||
.Name("RAM_usage_percent")
|
||||
.Help("RAM usage percent by this process")
|
||||
.Register(*registry_);
|
||||
prometheus::Gauge &RAM_usage_percent_ = RAM_.Add({});
|
||||
|
||||
//GPU Usage Percent
|
||||
prometheus::Family<prometheus::Gauge> &GPU_percent_ = prometheus::BuildGauge()
|
||||
.Name("Gpu_usage_percent")
|
||||
.Help("GPU_usage_percent ")
|
||||
.Register(*registry_);
|
||||
prometheus::Gauge &GPU0_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "0"}});
|
||||
prometheus::Gauge &GPU1_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "1"}});
|
||||
prometheus::Gauge &GPU2_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "2"}});
|
||||
prometheus::Gauge &GPU3_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "3"}});
|
||||
prometheus::Gauge &GPU4_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "4"}});
|
||||
prometheus::Gauge &GPU5_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "5"}});
|
||||
prometheus::Gauge &GPU6_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "6"}});
|
||||
prometheus::Gauge &GPU7_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "7"}});
|
||||
// std::vector<prometheus::Gauge> GPU_percent_gauges_;
|
||||
|
||||
|
||||
|
||||
|
||||
//GPU Memory used
|
||||
prometheus::Family<prometheus::Gauge> &GPU_memory_usage_ = prometheus::BuildGauge()
|
||||
.Name("GPU_memory_usage_total")
|
||||
.Help("GPU memory usage total ")
|
||||
.Register(*registry_);
|
||||
prometheus::Gauge &GPU0_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "0"}});
|
||||
prometheus::Gauge &GPU1_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "1"}});
|
||||
prometheus::Gauge &GPU2_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "2"}});
|
||||
prometheus::Gauge &GPU3_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "3"}});
|
||||
prometheus::Gauge &GPU4_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "4"}});
|
||||
prometheus::Gauge &GPU5_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "5"}});
|
||||
prometheus::Gauge &GPU6_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "6"}});
|
||||
prometheus::Gauge &GPU7_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "7"}});
|
||||
// std::vector<prometheus::Gauge> GPU_memory_usage_gauges_;
|
||||
|
||||
prometheus::Family<prometheus::Gauge> &query_index_type_per_second_ = prometheus::BuildGauge()
|
||||
.Name("query_index_throughtout_per_microsecond")
|
||||
.Help("query index throughtout per microsecond")
|
||||
.Register(*registry_);
|
||||
prometheus::Gauge &query_index_IVF_type_per_second_gauge_ = query_index_type_per_second_.Add({{"IndexType","IVF"}});
|
||||
prometheus::Gauge &query_index_IDMAP_type_per_second_gauge_ = query_index_type_per_second_.Add({{"IndexType","IDMAP"}});
|
||||
|
||||
prometheus::Family<prometheus::Gauge> &connection_ = prometheus::BuildGauge()
|
||||
.Name("connection_number")
|
||||
.Help("the number of connections")
|
||||
.Register(*registry_);
|
||||
prometheus::Gauge &connection_gauge_ = connection_.Add({});
|
||||
|
||||
prometheus::Family<prometheus::Counter> &keeping_alive_ = prometheus::BuildCounter()
|
||||
.Name("keeping_alive_seconds_total")
|
||||
.Help("total seconds of the serve alive")
|
||||
.Register(*registry_);
|
||||
prometheus::Counter &keeping_alive_counter_ = keeping_alive_.Add({});
|
||||
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
||||
207
cpp/src/metrics/SystemInfo.cpp
Normal file
207
cpp/src/metrics/SystemInfo.cpp
Normal file
@ -0,0 +1,207 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 上海赜睿信息科技有限公司(Zilliz) - All Rights Reserved
|
||||
* Unauthorized copying of this file, via any medium is strictly prohibited.
|
||||
* Proprietary and confidential.
|
||||
******************************************************************************/
|
||||
|
||||
#include "SystemInfo.h"
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include "nvml.h"
|
||||
//#include <mutex>
|
||||
//
|
||||
//std::mutex mutex;
|
||||
|
||||
|
||||
namespace zilliz {
|
||||
namespace vecwise {
|
||||
namespace server {
|
||||
|
||||
void SystemInfo::Init() {
|
||||
if(initialized_) return;
|
||||
|
||||
initialized_ = true;
|
||||
|
||||
// initialize CPU information
|
||||
FILE* file;
|
||||
struct tms time_sample;
|
||||
char line[128];
|
||||
last_cpu_ = times(&time_sample);
|
||||
last_sys_cpu_ = time_sample.tms_stime;
|
||||
last_user_cpu_ = time_sample.tms_utime;
|
||||
file = fopen("/proc/cpuinfo", "r");
|
||||
num_processors_ = 0;
|
||||
while(fgets(line, 128, file) != NULL){
|
||||
if (strncmp(line, "processor", 9) == 0) num_processors_++;
|
||||
}
|
||||
total_ram_ = GetPhysicalMemory();
|
||||
fclose(file);
|
||||
|
||||
//initialize GPU information
|
||||
nvmlReturn_t nvmlresult;
|
||||
nvmlresult = nvmlInit();
|
||||
if(NVML_SUCCESS != nvmlresult) {
|
||||
printf("System information initilization failed");
|
||||
return ;
|
||||
}
|
||||
nvmlresult = nvmlDeviceGetCount(&num_device_);
|
||||
if(NVML_SUCCESS != nvmlresult) {
|
||||
printf("Unable to get devidce number");
|
||||
return ;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
long long
|
||||
SystemInfo::ParseLine(char *line) {
|
||||
// This assumes that a digit will be found and the line ends in " Kb".
|
||||
int i = strlen(line);
|
||||
const char *p = line;
|
||||
while (*p < '0' || *p > '9') p++;
|
||||
line[i - 3] = '\0';
|
||||
i = atoi(p);
|
||||
return static_cast<long long>(i);
|
||||
}
|
||||
|
||||
unsigned long
|
||||
SystemInfo::GetPhysicalMemory() {
|
||||
struct sysinfo memInfo;
|
||||
sysinfo (&memInfo);
|
||||
unsigned long totalPhysMem = memInfo.totalram;
|
||||
//Multiply in next statement to avoid int overflow on right hand side...
|
||||
totalPhysMem *= memInfo.mem_unit;
|
||||
return totalPhysMem;
|
||||
}
|
||||
|
||||
unsigned long
|
||||
SystemInfo::GetProcessUsedMemory() {
|
||||
//Note: this value is in KB!
|
||||
FILE* file = fopen("/proc/self/status", "r");
|
||||
constexpr int64_t line_length = 128;
|
||||
long long result = -1;
|
||||
constexpr int64_t KB_SIZE = 1024;
|
||||
char line[line_length];
|
||||
|
||||
while (fgets(line, line_length, file) != NULL){
|
||||
if (strncmp(line, "VmRSS:", 6) == 0){
|
||||
result = ParseLine(line);
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(file);
|
||||
// return value in Byte
|
||||
return (result*KB_SIZE);
|
||||
|
||||
}
|
||||
|
||||
double
|
||||
SystemInfo::MemoryPercent() {
|
||||
if (!initialized_) Init();
|
||||
return GetProcessUsedMemory()*100/total_ram_;
|
||||
}
|
||||
|
||||
double
|
||||
SystemInfo::CPUPercent() {
|
||||
if (!initialized_) Init();
|
||||
struct tms time_sample;
|
||||
clock_t now;
|
||||
double percent;
|
||||
|
||||
now = times(&time_sample);
|
||||
if (now <= last_cpu_ || time_sample.tms_stime < last_sys_cpu_ ||
|
||||
time_sample.tms_utime < last_user_cpu_){
|
||||
//Overflow detection. Just skip this value.
|
||||
percent = -1.0;
|
||||
}
|
||||
else{
|
||||
percent = (time_sample.tms_stime - last_sys_cpu_) +
|
||||
(time_sample.tms_utime - last_user_cpu_);
|
||||
percent /= (now - last_cpu_);
|
||||
percent /= num_processors_;
|
||||
percent *= 100;
|
||||
}
|
||||
last_cpu_ = now;
|
||||
last_sys_cpu_ = time_sample.tms_stime;
|
||||
last_user_cpu_ = time_sample.tms_utime;
|
||||
|
||||
return percent;
|
||||
}
|
||||
|
||||
//std::unordered_map<int,std::vector<double>>
|
||||
//SystemInfo::GetGPUMemPercent(){
|
||||
// // return GPUID: MEM%
|
||||
//
|
||||
// //write GPU info to a file
|
||||
// system("nvidia-smi pmon -c 1 > GPUInfo.txt");
|
||||
// int pid = (int)getpid();
|
||||
//
|
||||
// //parse line
|
||||
// std::ifstream read_file;
|
||||
// read_file.open("GPUInfo.txt");
|
||||
// std::string line;
|
||||
// while(getline(read_file, line)){
|
||||
// std::vector<std::string> words = split(line);
|
||||
// // 0 1 2 3 4 5 6 7
|
||||
// //words stand for gpuindex, pid, type, sm, mem, enc, dec, command respectively
|
||||
// if(std::stoi(words[1]) != pid) continue;
|
||||
// int GPUindex = std::stoi(words[0]);
|
||||
// double sm_percent = std::stod(words[3]);
|
||||
// double mem_percent = std::stod(words[4]);
|
||||
//
|
||||
// }
|
||||
//
|
||||
//}
|
||||
|
||||
//std::vector<std::string>
|
||||
//SystemInfo::split(std::string input) {
|
||||
// std::vector<std::string> words;
|
||||
// input += " ";
|
||||
// int word_start = 0;
|
||||
// for (int i = 0; i < input.size(); ++i) {
|
||||
// if(input[i] != ' ') continue;
|
||||
// if(input[i] == ' ') {
|
||||
// word_start = i + 1;
|
||||
// continue;
|
||||
// }
|
||||
// words.push_back(input.substr(word_start,i-word_start));
|
||||
// }
|
||||
// return words;
|
||||
//}
|
||||
|
||||
std::vector<unsigned int>
|
||||
SystemInfo::GPUPercent() {
|
||||
// get GPU usage percent
|
||||
if(!initialized_) Init();
|
||||
std::vector<unsigned int> result;
|
||||
nvmlUtilization_t utilization;
|
||||
for (int i = 0; i < num_device_; ++i) {
|
||||
nvmlDevice_t device;
|
||||
nvmlDeviceGetHandleByIndex(i, &device);
|
||||
nvmlDeviceGetUtilizationRates(device, &utilization);
|
||||
result.push_back(utilization.gpu);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<unsigned long long>
|
||||
SystemInfo::GPUMemoryUsed() {
|
||||
// get GPU memory used
|
||||
if(!initialized_) Init();
|
||||
|
||||
std::vector<unsigned long long int> result;
|
||||
nvmlMemory_t nvmlMemory;
|
||||
for (int i = 0; i < num_device_; ++i) {
|
||||
nvmlDevice_t device;
|
||||
nvmlDeviceGetHandleByIndex(i, &device);
|
||||
nvmlDeviceGetMemoryInfo(device, &nvmlMemory);
|
||||
result.push_back(nvmlMemory.used);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
60
cpp/src/metrics/SystemInfo.h
Normal file
60
cpp/src/metrics/SystemInfo.h
Normal file
@ -0,0 +1,60 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 上海赜睿信息科技有限公司(Zilliz) - All Rights Reserved
|
||||
* Unauthorized copying of this file, via any medium is strictly prohibited.
|
||||
* Proprietary and confidential.
|
||||
******************************************************************************/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "sys/types.h"
|
||||
#include "sys/sysinfo.h"
|
||||
#include "stdlib.h"
|
||||
#include "stdio.h"
|
||||
#include "string.h"
|
||||
#include "sys/times.h"
|
||||
#include "sys/vtimes.h"
|
||||
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
|
||||
|
||||
namespace zilliz {
|
||||
namespace vecwise {
|
||||
namespace server {
|
||||
|
||||
// Singleton that caches host facts (processor count, total RAM, GPU device
// count) and samples CPU, memory and GPU usage on demand.
class SystemInfo {
 public:
    // Returns the single shared instance (function-local static).
    static SystemInfo &
    GetInstance() {
        static SystemInfo instance;
        return instance;
    }

    // Collects the cached facts; later calls are no-ops.
    void Init();

    // Number of GPU devices found by Init(); 0 before Init() or without GPUs.
    int num_device() const { return num_device_; };

    // Extracts the numeric value from a /proc status line.
    long long ParseLine(char* line);

    // Total physical RAM as reported by sysinfo().
    unsigned long GetPhysicalMemory();

    // VmRSS of this process, converted from KB to bytes.
    unsigned long GetProcessUsedMemory();

    // Process memory as a percentage of total RAM.
    double MemoryPercent();

    // Process CPU usage percentage since the previous call.
    double CPUPercent();

    // Per-device GPU utilization, indexed by device number.
    std::vector<unsigned int> GPUPercent();

    // Per-device GPU memory used, indexed by device number.
    std::vector<unsigned long long> GPUMemoryUsed();

 private:
    unsigned long total_ram_ = 0;

    // Counters from the previous CPU sample, used as the baseline by CPUPercent().
    clock_t last_cpu_ = clock_t();
    clock_t last_sys_cpu_ = clock_t();
    clock_t last_user_cpu_ = clock_t();

    int num_processors_ = 0;

    //number of GPU
    unsigned int num_device_ = 0;

    bool initialized_ = false;
};
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -8,6 +8,7 @@
|
||||
#include "megasearch_types.h"
|
||||
#include "megasearch_constants.h"
|
||||
#include "ServerConfig.h"
|
||||
#include "MegasearchThreadPoolServer.h"
|
||||
|
||||
#include <thrift/protocol/TBinaryProtocol.h>
|
||||
#include <thrift/protocol/TJSONProtocol.h>
|
||||
@ -76,7 +77,7 @@ MegasearchServer::StartService() {
|
||||
threadManager->threadFactory(threadFactory);
|
||||
threadManager->start();
|
||||
|
||||
s_server.reset(new TThreadPoolServer(processor,
|
||||
s_server.reset(new MegasearchThreadPoolServer(processor,
|
||||
server_transport,
|
||||
transport_factory,
|
||||
protocol_factory,
|
||||
|
||||
34
cpp/src/server/MegasearchThreadPoolServer.cpp
Normal file
34
cpp/src/server/MegasearchThreadPoolServer.cpp
Normal file
@ -0,0 +1,34 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 上海赜睿信息科技有限公司(Zilliz) - All Rights Reserved
|
||||
* Unauthorized copying of this file, via any medium is strictly prohibited.
|
||||
* Proprietary and confidential.
|
||||
******************************************************************************/
|
||||
#include "metrics/Metrics.h"
|
||||
#include "MegasearchThreadPoolServer.h"
|
||||
|
||||
namespace zilliz {
|
||||
namespace vecwise {
|
||||
namespace server {
|
||||
|
||||
void
|
||||
MegasearchThreadPoolServer::onClientConnected(const std::shared_ptr<apache::thrift::server::TConnectedClient> &pClient) {
|
||||
server::Metrics::GetInstance().ConnectionGaugeIncrement();
|
||||
TThreadPoolServer::onClientConnected(pClient);
|
||||
}
|
||||
|
||||
void
|
||||
MegasearchThreadPoolServer::onClientDisconnected(apache::thrift::server::TConnectedClient *pClient) {
|
||||
server::Metrics::GetInstance().ConnectionGaugeDecrement();
|
||||
TThreadPoolServer::onClientDisconnected(pClient);
|
||||
}
|
||||
zilliz::vecwise::server::MegasearchThreadPoolServer::MegasearchThreadPoolServer(const std::shared_ptr<apache::thrift::TProcessor> &processor,
|
||||
const std::shared_ptr<apache::thrift::transport::TServerTransport> &serverTransport,
|
||||
const std::shared_ptr<apache::thrift::transport::TTransportFactory> &transportFactory,
|
||||
const std::shared_ptr<apache::thrift::protocol::TProtocolFactory> &protocolFactory,
|
||||
const std::shared_ptr<apache::thrift::concurrency::ThreadManager> &threadManager)
|
||||
: TThreadPoolServer(processor, serverTransport, transportFactory, protocolFactory, threadManager) {
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
33
cpp/src/server/MegasearchThreadPoolServer.h
Normal file
33
cpp/src/server/MegasearchThreadPoolServer.h
Normal file
@ -0,0 +1,33 @@
|
||||
/*******************************************************************************
|
||||
* Copyright 上海赜睿信息科技有限公司(Zilliz) - All Rights Reserved
|
||||
* Unauthorized copying of this file, via any medium is strictly prohibited.
|
||||
* Proprietary and confidential.
|
||||
******************************************************************************/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <thrift/server/TThreadPoolServer.h>
|
||||
|
||||
|
||||
namespace zilliz {
|
||||
namespace vecwise {
|
||||
namespace server {
|
||||
|
||||
class MegasearchThreadPoolServer : public apache::thrift::server::TThreadPoolServer {
|
||||
public:
|
||||
MegasearchThreadPoolServer(
|
||||
const std::shared_ptr<apache::thrift::TProcessor>& processor,
|
||||
const std::shared_ptr<apache::thrift::transport::TServerTransport>& serverTransport,
|
||||
const std::shared_ptr<apache::thrift::transport::TTransportFactory>& transportFactory,
|
||||
const std::shared_ptr<apache::thrift::protocol::TProtocolFactory>& protocolFactory,
|
||||
const std::shared_ptr<apache::thrift::concurrency::ThreadManager>& threadManager
|
||||
= apache::thrift::concurrency::ThreadManager::newSimpleThreadManager());
|
||||
|
||||
protected:
|
||||
void onClientConnected(const std::shared_ptr<apache::thrift::server::TConnectedClient>& pClient) override ;
|
||||
void onClientDisconnected(apache::thrift::server::TConnectedClient* pClient) override ;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -173,6 +173,7 @@ Server::Start() {
|
||||
signal(SIGHUP, SignalUtil::HandleSignal);
|
||||
signal(SIGTERM, SignalUtil::HandleSignal);
|
||||
server::Metrics::GetInstance().Init();
|
||||
server::SystemInfo::GetInstance().Init();
|
||||
SERVER_LOG_INFO << "Vecwise server is running...";
|
||||
StartService();
|
||||
|
||||
|
||||
@ -32,6 +32,7 @@ set(unittest_libs
|
||||
civetweb
|
||||
dl
|
||||
z
|
||||
${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libnvidia-ml.so
|
||||
)
|
||||
|
||||
add_subdirectory(server)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user