From 2ab5e246ac5e1fed9cc696333698c6eed10d23a7 Mon Sep 17 00:00:00 2001 From: yu yunfeng Date: Fri, 31 May 2019 14:36:36 +0800 Subject: [PATCH] add metrics Former-commit-id: d7acd398be8aeee11de3f972d9bcf6e84df39358 --- cpp/CHANGELOG.md | 1 + cpp/src/db/DBImpl.inl | 20 ++++++++++-- cpp/src/db/MemManager.inl | 7 +++++ cpp/src/main.cpp | 2 +- cpp/src/metrics/MetricBase.h | 6 +++- cpp/src/metrics/Metrics.cpp | 1 - cpp/src/metrics/PrometheusMetrics.h | 44 +++++++++++++++++++++++---- cpp/src/server/Server.cpp | 4 +-- cpp/unittest/metrics/CMakeLists.txt | 3 +- cpp/unittest/metrics/metrics_test.cpp | 2 +- 10 files changed, 74 insertions(+), 16 deletions(-) diff --git a/cpp/CHANGELOG.md b/cpp/CHANGELOG.md index 80d1a70c8b..7ecac566bb 100644 --- a/cpp/CHANGELOG.md +++ b/cpp/CHANGELOG.md @@ -26,3 +26,4 @@ Please mark all change in change log and use the ticket from JIRA. - MS-30 - Use faiss v1.5.2 - MS-32 - Fix thrift error - MS-34 - Fix prometheus-cpp thirdparty +- MS-37 - Add query, cache usage, disk write speed and file data size metrics diff --git a/cpp/src/db/DBImpl.inl b/cpp/src/db/DBImpl.inl index 2170358a43..706f8a481b 100644 --- a/cpp/src/db/DBImpl.inl +++ b/cpp/src/db/DBImpl.inl @@ -71,17 +71,26 @@ Status DBImpl::InsertVectors(const std::string& table_id_, // server::Metrics::GetInstance().add_vector_duration_seconds_quantiles().Observe((average_time)); if (!status.ok()) { server::Metrics::GetInstance().AddVectorsFailTotalIncrement(n); + server::Metrics::GetInstance().AddVectorsFailGaugeSet(n); return status; } server::Metrics::GetInstance().AddVectorsSuccessTotalIncrement(n); + server::Metrics::GetInstance().AddVectorsSuccessGaugeSet(n); } template Status DBImpl::Query(const std::string &table_id, size_t k, size_t nq, const float *vectors, QueryResults &results) { - + auto start_time = METRICS_NOW_TIME; meta::DatesT dates = {meta::Meta::GetDate()}; - return Query(table_id, k, nq, vectors, dates, results); + Status result = Query(table_id, k, nq, vectors, dates, results); + auto end_time = METRICS_NOW_TIME; + auto total_time = METRICS_MICROSECONDS(start_time,end_time); + auto average_time = total_time / nq; + for (int i = 0; i < nq; ++i) { + server::Metrics::GetInstance().QueryResponseSummaryObserve(average_time); + } + return result; } template @@ -250,7 +259,12 @@ void DBImpl::BackgroundTimerTask(int interval) { if (shutting_down_.load(std::memory_order_acquire)) break; std::this_thread::sleep_for(std::chrono::seconds(interval)); - + int64_t cache_total = cache::CpuCacheMgr::GetInstance()->CacheUsage(); + LOG(DEBUG) << "Cache usage " << cache_total; + server::Metrics::GetInstance().CacheUsageGaugeSet(static_cast(cache_total)); + long size; + Size(size); + server::Metrics::GetInstance().DataFileSizeGaugeSet(size); TrySchedule(); } } diff --git a/cpp/src/db/MemManager.inl b/cpp/src/db/MemManager.inl index 35e7c70ada..528622795d 100644 --- a/cpp/src/db/MemManager.inl +++ b/cpp/src/db/MemManager.inl @@ -8,6 +8,7 @@ #include "MemManager.h" #include "Meta.h" #include "MetaConsts.h" +#include "metrics/Metrics.h" #include #include @@ -48,8 +49,14 @@ template Status MemVectors::Serialize(std::string& table_id) { table_id = schema_.table_id; auto size = ApproximateSize(); + auto start_time = METRICS_NOW_TIME; pEE_->Serialize(); + auto end_time = METRICS_NOW_TIME; + auto total_time = METRICS_MICROSECONDS(start_time, end_time); schema_.size = size; + + server::Metrics::GetInstance().DiskStoreIOSpeedGaugeSet(size/total_time); + schema_.file_type = (size >= options_.index_trigger_size) ? meta::TableFileSchema::TO_INDEX : meta::TableFileSchema::RAW; diff --git a/cpp/src/main.cpp b/cpp/src/main.cpp index 61614e3557..08ecb8c194 100644 --- a/cpp/src/main.cpp +++ b/cpp/src/main.cpp @@ -11,6 +11,7 @@ #include #include #include +#include "metrics/Metrics.h" #include "utils/SignalUtil.h" #include "utils/CommonUtil.h" @@ -25,7 +26,6 @@ using namespace zilliz::vecwise; int main(int argc, char *argv[]) { printf("Vecwise engine server start...\n"); - // zilliz::lib::gpu::InitMemoryAllocator(); signal(SIGINT, server::SignalUtil::HandleSignal); diff --git a/cpp/src/metrics/MetricBase.h b/cpp/src/metrics/MetricBase.h index d720b8de44..ccda229f2a 100644 --- a/cpp/src/metrics/MetricBase.h +++ b/cpp/src/metrics/MetricBase.h @@ -64,7 +64,11 @@ class MetricsBase{ virtual void IndexFileSizeGaugeSet(double value) {}; virtual void RawFileSizeGaugeSet(double value) {}; virtual void FaissDiskLoadIOSpeedGaugeSet(double value) {}; - + virtual void QueryResponseSummaryObserve(double value) {}; + virtual void DiskStoreIOSpeedGaugeSet(double value) {}; + virtual void DataFileSizeGaugeSet(double value) {}; + virtual void AddVectorsSuccessGaugeSet(double value) {}; + virtual void AddVectorsFailGaugeSet(double value) {}; }; diff --git a/cpp/src/metrics/Metrics.cpp b/cpp/src/metrics/Metrics.cpp index feb986b162..1bacf4ff0b 100644 --- a/cpp/src/metrics/Metrics.cpp +++ b/cpp/src/metrics/Metrics.cpp @@ -4,7 +4,6 @@ * Proprietary and confidential. ******************************************************************************/ -#pragma once #include "Metrics.h" #include "PrometheusMetrics.h" diff --git a/cpp/src/metrics/PrometheusMetrics.h b/cpp/src/metrics/PrometheusMetrics.h index 000aa31608..cebd48b5aa 100644 --- a/cpp/src/metrics/PrometheusMetrics.h +++ b/cpp/src/metrics/PrometheusMetrics.h @@ -97,7 +97,11 @@ class PrometheusMetrics: public MetricsBase { void RawFileSizeTotalIncrement(double value = 1) { if(startup_) raw_file_size_total_.Increment(value);}; void IndexFileSizeGaugeSet(double value) { if(startup_) index_file_size_gauge_.Set(value);}; void RawFileSizeGaugeSet(double value) { if(startup_) raw_file_size_gauge_.Set(value);}; - + void QueryResponseSummaryObserve(double value) {if(startup_) query_response_summary_.Observe(value);}; + void DiskStoreIOSpeedGaugeSet(double value) { if(startup_) disk_store_IO_speed_gauge_.Set(value);}; + void DataFileSizeGaugeSet(double value) { if(startup_) data_file_size_gauge_.Set(value);}; + void AddVectorsSuccessGaugeSet(double value) { if(startup_) add_vectors_success_gauge_.Set(value);}; + void AddVectorsFailGaugeSet(double value) { if(startup_) add_vectors_fail_gauge_.Set(value);}; @@ -295,11 +299,6 @@ class PrometheusMetrics: public MetricsBase { ////all form Cache.cpp //record cache usage, when insert/erase/clear/free - prometheus::Family &cache_usage_ = prometheus::BuildGauge() - .Name("cache_usage") - .Help("total bytes that cache used") - .Register(*registry_); - prometheus::Gauge &cache_usage_gauge_ = cache_usage_.Add({}); ////all from Meta.cpp @@ -386,6 +385,39 @@ class PrometheusMetrics: public MetricsBase { .Register(*registry_); prometheus::Counter &cache_access_total_ = cache_access_.Add({}); + // record cache usage and % + prometheus::Family &cache_usage_ = prometheus::BuildGauge() + .Name("cache_usage_bytes") + .Help("current cache usage by bytes") + .Register(*registry_); + prometheus::Gauge &cache_usage_gauge_ = cache_usage_.Add({}); + + // record query response + using Quantiles = std::vector; + prometheus::Family &query_response_ = prometheus::BuildSummary() + .Name("query_response_summary") + .Help("query response summary") + .Register(*registry_); + prometheus::Summary &query_response_summary_ = query_response_.Add({}, Quantiles{{0.95,0.00},{0.9,0.05},{0.8,0.1}}); + + prometheus::Family &disk_store_IO_speed_ = prometheus::BuildGauge() + .Name("disk_store_IO_speed_bytes_per_microseconds") + .Help("disk_store_IO_speed") + .Register(*registry_); + prometheus::Gauge &disk_store_IO_speed_gauge_ = disk_store_IO_speed_.Add({}); + + prometheus::Family &data_file_size_ = prometheus::BuildGauge() + .Name("data_file_size_bytes") + .Help("data file size by bytes") + .Register(*registry_); + prometheus::Gauge &data_file_size_gauge_ = data_file_size_.Add({}); + + prometheus::Family &add_vectors_ = prometheus::BuildGauge() + .Name("add_vectors") + .Help("current added vectors") + .Register(*registry_); + prometheus::Gauge &add_vectors_success_gauge_ = add_vectors_.Add({{"outcome", "success"}}); + prometheus::Gauge &add_vectors_fail_gauge_ = add_vectors_.Add({{"outcome", "fail"}}); }; diff --git a/cpp/src/server/Server.cpp b/cpp/src/server/Server.cpp index b27dd279f5..a9901dc93e 100644 --- a/cpp/src/server/Server.cpp +++ b/cpp/src/server/Server.cpp @@ -138,7 +138,7 @@ int Server::Start() { // server::Metrics::GetInstance().Init(); // server::Metrics::GetInstance().exposer_ptr()->RegisterCollectable(server::Metrics::GetInstance().registry_ptr()); - server::Metrics::GetInstance().Init(); +// server::Metrics::GetInstance().Init(); if (daemonized_) { Daemonize(); @@ -177,7 +177,7 @@ Server::Start() { signal(SIGINT, SignalUtil::HandleSignal); signal(SIGHUP, SignalUtil::HandleSignal); signal(SIGTERM, SignalUtil::HandleSignal); - + server::Metrics::GetInstance().Init(); SERVER_LOG_INFO << "Vecwise server is running..."; StartService(); diff --git a/cpp/unittest/metrics/CMakeLists.txt b/cpp/unittest/metrics/CMakeLists.txt index 954f34d141..2560467c5b 100644 --- a/cpp/unittest/metrics/CMakeLists.txt +++ b/cpp/unittest/metrics/CMakeLists.txt @@ -31,7 +31,8 @@ set(require_files ../../src/metrics/Metrics.cpp # ../../src/cache/CacheMgr.cpp -# ../../src/metrics/PrometheusMetrics.cpp + ../../src/metrics/PrometheusMetrics.cpp + ../../src/metrics/MetricBase.h ../../src/server/ServerConfig.cpp ../../src/utils/CommonUtil.cpp ../../src/utils/TimeRecorder.cpp diff --git a/cpp/unittest/metrics/metrics_test.cpp b/cpp/unittest/metrics/metrics_test.cpp index 31a57d5ac6..0efc36a3bf 100644 --- a/cpp/unittest/metrics/metrics_test.cpp +++ b/cpp/unittest/metrics/metrics_test.cpp @@ -32,7 +32,7 @@ TEST_F(DBTest, Metric_Tes) { // server::Metrics::GetInstance().exposer_ptr()->RegisterCollectable(server::Metrics::GetInstance().registry_ptr()); server::Metrics::GetInstance().Init(); // server::PrometheusMetrics::GetInstance().exposer_ptr()->RegisterCollectable(server::PrometheusMetrics::GetInstance().registry_ptr()); - zilliz::vecwise::cache::CpuCacheMgr::GetInstance()->SetCapacity(1*1024*1024*1024); + zilliz::vecwise::cache::CpuCacheMgr::GetInstance()->SetCapacity(4*1024*1024*1024); std::cout<CacheCapacity()<