diff --git a/.gitignore b/.gitignore
index 83486553b5..a63a6086fd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,6 +61,7 @@ cmake_build/
 .DS_Store
 *.sw[po]
 cwrapper_build
+cwrapper_dablooms_build
 **/cwrapper_rocksdb_build/
 **/.clangd/*
 **/compile_commands.json
diff --git a/Makefile b/Makefile
index aa1a80d911..fb0eac1b7b 100644
--- a/Makefile
+++ b/Makefile
@@ -132,6 +132,7 @@ build-go: standalone milvus
 build-cpp:
 	@(env bash $(PWD)/scripts/core_build.sh -f "$(CUSTOM_THIRDPARTY_PATH)")
 	@(env bash $(PWD)/scripts/cwrapper_build.sh -t Release -f "$(CUSTOM_THIRDPARTY_PATH)")
+	@(env bash $(PWD)/scripts/cwrapper_dablooms_build.sh -t Release -f "$(CUSTOM_THIRDPARTY_PATH)")
 	@go env -w CGO_CFLAGS="-I$(PWD)/internal/kv/rocksdb/cwrapper/output/include"
 	@go env -w CGO_LDFLAGS="-L$(PWD)/internal/kv/rocksdb/cwrapper/output/lib -l:librocksdb.a -lstdc++ -lm -lz"
 	@(env bash $(PWD)/scripts/cwrapper_rocksdb_build.sh -t Release -f "$(CUSTOM_THIRDPARTY_PATH)")
diff --git a/internal/util/dablooms/cwrapper/.gitignore b/internal/util/dablooms/cwrapper/.gitignore
new file mode 100644
index 0000000000..87a73e597c
--- /dev/null
+++ b/internal/util/dablooms/cwrapper/.gitignore
@@ -0,0 +1,4 @@
+output
+cmake-build-debug
+.idea
+cmake_build
diff --git a/internal/util/dablooms/cwrapper/CMakeLists.txt b/internal/util/dablooms/cwrapper/CMakeLists.txt
new file mode 100644
index 0000000000..f9daa06f77
--- /dev/null
+++ b/internal/util/dablooms/cwrapper/CMakeLists.txt
@@ -0,0 +1,39 @@
+# Copyright (C) 2019-2020 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under the License.
+
+cmake_minimum_required(VERSION 3.14...3.17)
+project(dablooms LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Static archive that the Go cgo wrapper links with -ldablooms.
+add_library(dablooms STATIC
+    dablooms.cpp
+    murmur.cpp
+)
+
+# dablooms.h and murmur.h sit next to the sources; PUBLIC also exports the
+# directory to consumers through INTERFACE_INCLUDE_DIRECTORIES.
+target_include_directories(dablooms
+    PUBLIC
+        ${CMAKE_CURRENT_SOURCE_DIR}
+)
+
+# CMAKE_INSTALL_PREFIX always has a platform default, so test whether the user
+# supplied one instead of testing whether the variable is set.
+if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}" CACHE PATH "Install prefix" FORCE)
+endif()
+
+# Install the archive directly into the prefix root.
+install(TARGETS dablooms ARCHIVE DESTINATION .)
diff --git a/internal/util/dablooms/cwrapper/LICENSE b/internal/util/dablooms/cwrapper/LICENSE
new file mode 100644
index 0000000000..89de354795
--- /dev/null
+++ b/internal/util/dablooms/cwrapper/LICENSE
@@ -0,0 +1,17 @@
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/internal/util/dablooms/cwrapper/build.sh b/internal/util/dablooms/cwrapper/build.sh
new file mode 100755
index 0000000000..3d10253e6d
--- /dev/null
+++ b/internal/util/dablooms/cwrapper/build.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+# Copyright (C) 2019-2020 Zilliz. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under the License.
+
+# Builds the dablooms static library with CMake and installs it into ./output.
+
+SOURCE=${BASH_SOURCE[0]}
+while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
+    DIR=$( cd -P "$( dirname "$SOURCE" )" && pwd )
+    SOURCE=$(readlink "$SOURCE")
+    [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
+done
+DIR=$( cd -P "$( dirname "$SOURCE" )" && pwd )
+
+CMAKE_BUILD=${DIR}/cmake_build
+OUTPUT_LIB=${DIR}/output
+
+BUILD_TYPE="Debug"
+CUSTOM_THIRDPARTY_PATH=""
+
+# Parse options before touching the filesystem so -h and bad flags have no side effects.
+while getopts "t:f:h" arg; do
+    case $arg in
+        f)
+            CUSTOM_THIRDPARTY_PATH=$OPTARG
+            ;;
+        t)
+            BUILD_TYPE=$OPTARG # BUILD_TYPE
+            ;;
+        h) # help
+            echo "-t: build type(default: Debug)
+-f: custom thirdparty path(default:)
+-h: help
+                "
+            exit 0
+            ;;
+        ?)
+            echo "ERROR! unknown argument"
+            exit 1
+            ;;
+    esac
+done
+echo "BUILD_TYPE: " $BUILD_TYPE
+echo "CUSTOM_THIRDPARTY_PATH: " $CUSTOM_THIRDPARTY_PATH
+
+mkdir -p "${CMAKE_BUILD}"
+# Start from an empty output directory so stale artifacts are never linked.
+rm -rf "${OUTPUT_LIB}"
+mkdir -p "${OUTPUT_LIB}"
+
+pushd "${CMAKE_BUILD}" || exit 1
+
+# An argument array keeps paths that contain spaces intact.
+CMAKE_CMD=(cmake
+    "-DCMAKE_INSTALL_PREFIX=${OUTPUT_LIB}"
+    "-DCMAKE_BUILD_TYPE=${BUILD_TYPE}"
+    "-DCUSTOM_THIRDPARTY_DOWNLOAD_PATH=${CUSTOM_THIRDPARTY_PATH}"
+    ..)
+
+echo "${CMAKE_CMD[@]}"
+"${CMAKE_CMD[@]}" || exit 1
+
+if [[ ! ${jobs+1} ]]; then
+    jobs=$(nproc)
+fi
+make -j "${jobs}" && make install
diff --git a/internal/util/dablooms/cwrapper/dablooms.cpp b/internal/util/dablooms/cwrapper/dablooms.cpp
new file mode 100644
index 0000000000..3a05bf4d97
--- /dev/null
+++ b/internal/util/dablooms/cwrapper/dablooms.cpp
@@ -0,0 +1,535 @@
+/* Copyright @2012 by Justin Hines at Bitly under a very liberal license. See LICENSE in the source distribution. */
+
+#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "murmur.h"
+#include "dablooms.h"
+
+#define DABLOOMS_VERSION "0.9.1"
+
+#define ERROR_TIGHTENING_RATIO 0.5
+#define SALT_CONSTANT 0x97c29b3a
+
+/* Returns the library version string. */
+const char *dablooms_version(void)
+{
+    return DABLOOMS_VERSION;
+}
+
+/* Releases a bitmap and its backing array; null is ignored. */
+void free_bitmap(bitmap_t *bitmap)
+{
+    if (bitmap != nullptr) {
+        free(bitmap->array);
+        free(bitmap);
+    }
+}
+
+/*
+ * Resizes the backing array to `new_size` bytes and zero-fills any newly
+ * added tail. Returns `bitmap` on success. When realloc fails the bitmap is
+ * left untouched and nullptr is returned; the caller still owns the original.
+ */
+bitmap_t *bitmap_resize(bitmap_t *bitmap, size_t new_size)
+{
+    size_t old_size = (bitmap->array == nullptr) ? 0 : bitmap->bytes;
+    char *new_array = (char *)realloc(bitmap->array, new_size);
+    if (new_array == nullptr) {
+        return nullptr;
+    }
+
+    bitmap->bytes = new_size;
+    bitmap->array = new_array;
+    if (new_size > old_size) {
+        memset(bitmap->array + old_size, 0, new_size - old_size);
+    }
+    return bitmap;
+}
+
+/* Create a new bitmap, not full featured, simple to give
+ * us a means of interacting with the 4 bit counters */
+bitmap_t *new_bitmap(size_t bytes)
+{
+    bitmap_t *bitmap;
+    if ((bitmap = (bitmap_t *)malloc(sizeof(bitmap_t))) == nullptr) {
+        return nullptr;
+    }
+
+    if ((bitmap->array = (char *)malloc(bytes)) == nullptr) {
+        free(bitmap);
+        return nullptr;
+    }
+
+    memset(bitmap->array, 0, bytes);
+    bitmap->bytes = bytes;
+    return bitmap;
+}
+
+/*
+ * Increments the four bit counter at `index`; two counters share one byte
+ * (odd index = low nibble, even index = high nibble). Returns -1 without
+ * modifying the counter when it is already saturated at 15, 0 otherwise.
+ */
+int bitmap_increment(bitmap_t *bitmap, unsigned int index, long offset)
+{
+    long access = index / 2 + offset;
+    uint8_t temp;
+    uint8_t n = bitmap->array[access];
+    if (index % 2 != 0) {
+        temp = (n & 0x0f);
+        n = (n & 0xf0) + ((n & 0x0f) + 0x01);
+    } else {
+        temp = (n & 0xf0) >> 4;
+        n = (n & 0x0f) + ((n & 0xf0) + 0x10);
+    }
+
+    if (temp == 0x0f) {
+        /* 4 bit counter overflow: leave the stored value alone */
+        return -1;
+    }
+
+    bitmap->array[access] = n;
+    return 0;
+}
+
+/*
+ * Decrements the four bit counter at `index`. Returns -1 without modifying
+ * the counter when it is already zero, 0 otherwise.
+ */
+int bitmap_decrement(bitmap_t *bitmap, unsigned int index, long offset)
+{
+    long access = index / 2 + offset;
+    uint8_t temp;
+    uint8_t n = bitmap->array[access];
+
+    if (index % 2 != 0) {
+        temp = (n & 0x0f);
+        n = (n & 0xf0) + ((n & 0x0f) - 0x01);
+    } else {
+        temp = (n & 0xf0) >> 4;
+        n = (n & 0x0f) + ((n & 0xf0) - 0x10);
+    }
+
+    if (temp == 0x00) {
+        /* decrementing zero: the counter is left as it is */
+        return -1;
+    }
+
+    bitmap->array[access] = n;
+    return 0;
+}
+
+/* Returns non-zero when the four bit counter at `index` is non-zero. */
+int bitmap_check(bitmap_t *bitmap, unsigned int index, long offset)
+{
+    long access = index / 2 + offset;
+    if (index % 2 != 0) {
+        return bitmap->array[access] & 0x0f;
+    } else {
+        return bitmap->array[access] & 0xf0;
+    }
+}
+
+/*
+ * Perform the actual hashing for `key`
+ *
+ * Only call the hash once to get a pair of initial values (h1 and
+ * h2). Use these values to generate all hashes in a quick loop.
+ *
+ * See paper by Kirsch, Mitzenmacher [2006]
+ * http://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf
+ */
+void hash_func(counting_bloom_t *bloom, const char *key, size_t key_len, uint32_t *hashes)
+{
+    uint32_t checksum[4];
+
+    MurmurHash3_x64_128(key, key_len, SALT_CONSTANT, checksum);
+    uint32_t h1 = checksum[0];
+    uint32_t h2 = checksum[1];
+
+    /* nfuncs is a size_t; the multiplier is narrowed so the arithmetic stays 32 bit */
+    for (size_t i = 0; i < bloom->nfuncs; i++) {
+        hashes[i] = (h1 + (uint32_t)i * h2) % bloom->counts_per_func;
+    }
+}
+
+/*
+ * Allocates one counting bloom filter sized for `capacity` elements at
+ * `error_rate`. `offset` is where its region (header first) begins inside
+ * the shared bitmap. The header and bitmap pointers are left null for the
+ * caller to wire up. Returns nullptr on allocation failure.
+ */
+counting_bloom_t *counting_bloom_init(unsigned int capacity, double error_rate, long offset)
+{
+    counting_bloom_t *bloom;
+
+    if ((bloom = (counting_bloom_t *)malloc(sizeof(counting_bloom_t))) == nullptr) {
+        return nullptr;
+    }
+    bloom->header = nullptr;
+    bloom->bitmap = nullptr;
+    bloom->capacity = capacity;
+    bloom->error_rate = error_rate;
+    bloom->offset = offset + sizeof(counting_bloom_header_t);
+    bloom->nfuncs = (size_t) ceil(log(1 / error_rate) / log(2));
+    bloom->counts_per_func = (unsigned int) ceil(capacity * fabs(log(error_rate)) / (bloom->nfuncs * pow(log(2), 2)));
+    bloom->size = bloom->nfuncs * bloom->counts_per_func;
+    /* rounding-up integer divide by 2 of bloom->size */
+    bloom->num_bytes = ((bloom->size + 1) / 2) + sizeof(counting_bloom_header_t);
+    bloom->hashes = (uint32_t *)calloc(bloom->nfuncs, sizeof(uint32_t));
+    if (bloom->hashes == nullptr) {
+        free(bloom);
+        return nullptr;
+    }
+
+    return bloom;
+}
+
+/*
+ * Adds `s` to the filter by bumping one counter per hash function. Returns
+ * -1 if any counter was already saturated, 0 otherwise.
+ */
+int counting_bloom_add(counting_bloom_t *bloom, const char *s, size_t len)
+{
+    unsigned int index, i, offset;
+    uint32_t *hashes = bloom->hashes;
+
+    hash_func(bloom, s, len, hashes);
+
+    bool error = false;
+    for (i = 0; i < bloom->nfuncs; i++) {
+        offset = i * bloom->counts_per_func;
+        index = hashes[i] + offset;
+        if (bitmap_increment(bloom->bitmap, index, bloom->offset) == -1) {
+            error = true;
+        }
+    }
+    bloom->header->count++;
+
+    return error ? -1 : 0;
+}
+
+/*
+ * Removes `s` by decrementing one counter per hash function. Returns -1 if
+ * any counter was already zero, 0 otherwise.
+ */
+int counting_bloom_remove(counting_bloom_t *bloom, const char *s, size_t len)
+{
+    unsigned int index, i, offset;
+    uint32_t *hashes = bloom->hashes;
+
+    hash_func(bloom, s, len, hashes);
+
+    bool error = false;
+    for (i = 0; i < bloom->nfuncs; i++) {
+        offset = i * bloom->counts_per_func;
+        index = hashes[i] + offset;
+        if (bitmap_decrement(bloom->bitmap, index, bloom->offset) == -1) {
+            error = true;
+        }
+    }
+    bloom->header->count--;
+
+    return error ? -1 : 0;
+}
+
+/* Returns 1 when every counter for `s` is non-zero (possibly present), 0 otherwise. */
+int counting_bloom_check(counting_bloom_t *bloom, const char *s, size_t len)
+{
+    unsigned int index, i, offset;
+    uint32_t *hashes = bloom->hashes;
+
+    hash_func(bloom, s, len, hashes);
+
+    for (i = 0; i < bloom->nfuncs; i++) {
+        offset = i * bloom->counts_per_func;
+        index = hashes[i] + offset;
+        if (!(bitmap_check(bloom->bitmap, index, bloom->offset))) {
+            return 0;
+        }
+    }
+    return 1;
+}
+
+/*
+ * Frees the scaling filter, every counting filter it owns and the bitmap
+ * (including one handed to new_scaling_bloom_from_bitmap). Null is ignored.
+ */
+int free_scaling_bloom(scaling_bloom_t *bloom)
+{
+    if (bloom == nullptr) {
+        return 0;
+    }
+
+    for (unsigned int i = 0; i < bloom->num_blooms; i++) {
+        free(bloom->blooms[i]->hashes);
+        free(bloom->blooms[i]);
+    }
+    free(bloom->blooms);
+    free_bitmap(bloom->bitmap);
+    free(bloom);
+    return 0;
+}
+
+/*
+ * Appends a new counting bloom filter to `bloom`. Unless `extern_bitmap` is
+ * set the shared bitmap is grown to hold it; otherwise the filter is mapped
+ * onto the existing bitmap contents. Returns the new filter, or nullptr on
+ * allocation failure, in which case `bloom` keeps its previous filters.
+ */
+counting_bloom_t *new_counting_bloom_from_scale(scaling_bloom_t *bloom, bool extern_bitmap = false)
+{
+    long offset;
+    double error_rate;
+    counting_bloom_t *cur_bloom;
+    counting_bloom_t **blooms;
+
+    error_rate = bloom->error_rate * (pow(ERROR_TIGHTENING_RATIO, bloom->num_blooms + 1));
+
+    /* keep the old array pointer until realloc has succeeded so it is not leaked */
+    blooms = (counting_bloom_t **)realloc(bloom->blooms, (bloom->num_blooms + 1) * sizeof(counting_bloom_t *));
+    if (blooms == nullptr) {
+        return nullptr;
+    }
+    bloom->blooms = blooms;
+
+    if ((cur_bloom = counting_bloom_init(bloom->capacity, error_rate, bloom->num_bytes)) == nullptr) {
+        return nullptr;
+    }
+
+    if (!extern_bitmap &&
+        bitmap_resize(bloom->bitmap, bloom->num_bytes + cur_bloom->num_bytes) == nullptr) {
+        free(cur_bloom->hashes);
+        free(cur_bloom);
+        return nullptr;
+    }
+
+    bloom->blooms[bloom->num_blooms] = cur_bloom;
+    bloom->num_blooms++;
+
+    if (!extern_bitmap) {
+        /* reset header pointer, as realloc may have moved */
+        bloom->header = (scaling_bloom_header_t *) bloom->bitmap->array;
+        /* Set the pointers for these header structs to the right location since realloc may have moved */
+        for (unsigned int i = 0; i < bloom->num_blooms; i++) {
+            offset = bloom->blooms[i]->offset - sizeof(counting_bloom_header_t);
+            bloom->blooms[i]->header = (counting_bloom_header_t *) (bloom->bitmap->array + offset);
+        }
+    } else {
+        offset = cur_bloom->offset - sizeof(counting_bloom_header_t);
+        cur_bloom->header = (counting_bloom_header_t *) (bloom->bitmap->array + offset);
+    }
+
+    bloom->num_bytes += cur_bloom->num_bytes;
+    cur_bloom->bitmap = bloom->bitmap;
+
+    return cur_bloom;
+}
+
+/* Zeroes the in-memory sequence number and returns its previous value. */
+uint64_t scaling_bloom_clear_seqnums(scaling_bloom_t *bloom)
+{
+    uint64_t seqnum = bloom->header->mem_seqnum;
+    bloom->header->mem_seqnum = 0;
+    return seqnum;
+}
+
+/*
+ * Adds `s` under `id`. A new counting filter is started when `id` exceeds
+ * max_id and the selected filter is at capacity. Returns 1 on success and
+ * -1 when a counter overflowed or a new filter could not be allocated.
+ */
+int scaling_bloom_add(scaling_bloom_t *bloom, const char *s, size_t len, uint64_t id)
+{
+    int i;
+    uint64_t seqnum;
+
+    counting_bloom_t *cur_bloom = nullptr;
+    for (i = bloom->num_blooms - 1; i >= 0; i--) {
+        cur_bloom = bloom->blooms[i];
+        if (id >= cur_bloom->header->id) {
+            break;
+        }
+    }
+    if (cur_bloom == nullptr) {
+        /* no counting filter exists yet, nothing to add to */
+        return -1;
+    }
+
+    seqnum = scaling_bloom_clear_seqnums(bloom);
+
+    if ((id > bloom->header->max_id) && (cur_bloom->header->count >= cur_bloom->capacity)) {
+        cur_bloom = new_counting_bloom_from_scale(bloom);
+        if (cur_bloom == nullptr) {
+            /* restore the sequence number that was cleared above */
+            bloom->header->mem_seqnum = seqnum;
+            return -1;
+        }
+        cur_bloom->header->count = 0;
+        cur_bloom->header->id = bloom->header->max_id + 1;
+    }
+    if (bloom->header->max_id < id) {
+        bloom->header->max_id = id;
+    }
+    bool error = false;
+    if (counting_bloom_add(cur_bloom, s, len) == -1) {
+        error = true;
+    }
+
+    bloom->header->mem_seqnum = seqnum + 1;
+
+    return error ? -1 : 1;
+}
+
+/*
+ * Removes `s` from the newest counting filter whose id is <= `id`. Returns
+ * 1 on success, -1 when a counter was already zero and 0 when no filter
+ * covers `id`.
+ */
+int scaling_bloom_remove(scaling_bloom_t *bloom, const char *s, size_t len, uint64_t id)
+{
+    counting_bloom_t *cur_bloom;
+    int i;
+    uint64_t seqnum;
+
+    for (i = bloom->num_blooms - 1; i >= 0; i--) {
+        cur_bloom = bloom->blooms[i];
+        if (id >= cur_bloom->header->id) {
+            seqnum = scaling_bloom_clear_seqnums(bloom);
+
+            bool error = counting_bloom_remove(cur_bloom, s, len) == -1;
+
+            bloom->header->mem_seqnum = seqnum + 1;
+            return error ? -1 : 1;
+        }
+    }
+    return 0;
+}
+
+/* Returns 1 when any counting filter reports `s` as possibly present, 0 otherwise. */
+int scaling_bloom_check(scaling_bloom_t *bloom, const char *s, size_t len)
+{
+    int i;
+    counting_bloom_t *cur_bloom;
+    for (i = bloom->num_blooms - 1; i >= 0; i--) {
+        cur_bloom = bloom->blooms[i];
+        if (counting_bloom_check(cur_bloom, s, len)) {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Allocates the scaling filter shell with no counting filters. When `bitmap`
+ * is null a fresh header-sized bitmap is created, otherwise the given bitmap
+ * is adopted. Returns nullptr on allocation failure.
+ */
+scaling_bloom_t *scaling_bloom_init(unsigned int capacity, double error_rate, bitmap_t* bitmap = nullptr)
+{
+    scaling_bloom_t *bloom;
+
+    if ((bloom = (scaling_bloom_t *)malloc(sizeof(scaling_bloom_t))) == nullptr) {
+        return nullptr;
+    }
+
+    if (bitmap == nullptr) {
+        if ((bloom->bitmap = new_bitmap(sizeof(scaling_bloom_header_t))) == nullptr) {
+            free(bloom);
+            return nullptr;
+        }
+    } else {
+        bloom->bitmap = bitmap;
+    }
+
+    bloom->header = (scaling_bloom_header_t *) bloom->bitmap->array;
+    bloom->capacity = capacity;
+    bloom->error_rate = error_rate;
+    bloom->num_blooms = 0;
+    bloom->num_bytes = sizeof(scaling_bloom_header_t);
+    bloom->blooms = nullptr;
+
+    return bloom;
+}
+
+/*
+ * Creates an empty scaling bloom filter with one initial counting filter.
+ * Returns nullptr on allocation failure.
+ */
+scaling_bloom_t *new_scaling_bloom(unsigned int capacity, double error_rate)
+{
+    scaling_bloom_t *bloom;
+    counting_bloom_t *cur_bloom;
+
+    if ((bloom = scaling_bloom_init(capacity, error_rate)) == nullptr) {
+        return nullptr;
+    }
+
+    if (!(cur_bloom = new_counting_bloom_from_scale(bloom))) {
+        free_scaling_bloom(bloom);
+        return nullptr;
+    }
+    cur_bloom->header->count = 0;
+    cur_bloom->header->id = 0;
+
+    bloom->header->mem_seqnum = 1;
+    return bloom;
+}
+
+/*
+ * Rebuilds a scaling bloom filter on top of an existing `bitmap`; on success
+ * the returned filter owns `bitmap`. Returns nullptr with `bitmap` untouched
+ * when `bitmap` is null or the filter shell cannot be allocated. If a
+ * counting filter cannot be allocated or the bitmap size does not match a
+ * whole number of filters, the filter and `bitmap` are freed.
+ */
+scaling_bloom_t *new_scaling_bloom_from_bitmap(unsigned int capacity, double error_rate, bitmap_t* bitmap)
+{
+    scaling_bloom_t *bloom;
+    counting_bloom_t *cur_bloom;
+
+    /* scaling_bloom_init would allocate its own bitmap for null and the size read below would crash */
+    if (bitmap == nullptr) {
+        return nullptr;
+    }
+
+    if ((bloom = scaling_bloom_init(capacity, error_rate, bitmap)) == nullptr) {
+        return nullptr;
+    }
+
+    /* signed and wide enough for bitmaps larger than INT_MAX bytes */
+    long long size = (long long)bitmap->bytes - (long long)sizeof(scaling_bloom_header_t);
+    while (size) {
+        cur_bloom = new_counting_bloom_from_scale(bloom, true);
+        // leave count and id as they were set in the file
+        if (cur_bloom == nullptr) {
+            free_scaling_bloom(bloom);
+            return nullptr;
+        }
+        size -= (long long)cur_bloom->num_bytes;
+        if (size < 0) {
+            free_scaling_bloom(bloom);
+            return nullptr;
+        }
+    }
+
+    return bloom;
+}
+
+/* Sums the filter struct, bitmap bytes, counting filter structs and pointers, and hash buffers; 0 for null. */
+size_t bloom_size(scaling_bloom_t *bloom) {
+    size_t rst = 0;
+    if (bloom != nullptr) {
+        rst = sizeof(scaling_bloom_t);
+        rst += bloom->num_bytes;
+        rst += bloom->num_blooms * (sizeof(counting_bloom_t) + sizeof(void*));
+        for (unsigned int i = 0; i < bloom->num_blooms; i++) {
+            rst += bloom->blooms[i]->nfuncs * sizeof(uint32_t);
+        }
+    }
+    return rst;
+}
diff --git a/internal/util/dablooms/cwrapper/dablooms.h b/internal/util/dablooms/cwrapper/dablooms.h
new file mode 100644
index 0000000000..09a63d172b
--- /dev/null
+++ b/internal/util/dablooms/cwrapper/dablooms.h
@@ -0,0 +1,81 @@
+/* Copyright @2012 by Justin Hines at Bitly under a very liberal license. See LICENSE in the source distribution. 
*/
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef __BLOOM_H__
+#define __BLOOM_H__
+#include <stdint.h>
+#include <stdlib.h>
+
+const char *dablooms_version(void);
+
+typedef struct {
+    size_t bytes;   /* length of `array` in bytes */
+    char *array;    /* backing storage for the 4 bit counters and headers */
+} bitmap_t;
+
+/* Resizes `bitmap` to `new_size` bytes; the signature matches the definition in dablooms.cpp. */
+bitmap_t *bitmap_resize(bitmap_t *bitmap, size_t new_size);
+bitmap_t *new_bitmap(size_t bytes);
+
+int bitmap_increment(bitmap_t *bitmap, unsigned int index, long offset);
+int bitmap_decrement(bitmap_t *bitmap, unsigned int index, long offset);
+int bitmap_check(bitmap_t *bitmap, unsigned int index, long offset);
+
+void free_bitmap(bitmap_t *bitmap);
+
+typedef struct {
+    uint64_t id;
+    uint32_t count;
+    uint32_t _pad;
+} counting_bloom_header_t;
+
+typedef struct {
+    counting_bloom_header_t *header;
+    unsigned int capacity;
+    long offset;
+    unsigned int counts_per_func;
+    uint32_t *hashes;
+    size_t nfuncs;
+    size_t size;
+    size_t num_bytes;
+    double error_rate;
+    bitmap_t *bitmap;
+} counting_bloom_t;
+
+int counting_bloom_add(counting_bloom_t *bloom, const char *s, size_t len);
+int counting_bloom_remove(counting_bloom_t *bloom, const char *s, size_t len);
+int counting_bloom_check(counting_bloom_t *bloom, const char *s, size_t len);
+
+typedef struct {
+    uint64_t max_id;
+    uint64_t mem_seqnum;
+    uint64_t reserved;
+} scaling_bloom_header_t;
+
+typedef struct {
+    scaling_bloom_header_t *header;
+    unsigned int capacity;
+    unsigned int num_blooms;
+    size_t num_bytes;
+    double error_rate;
+    counting_bloom_t **blooms;
+    bitmap_t *bitmap;
+} scaling_bloom_t;
+
+scaling_bloom_t *new_scaling_bloom(unsigned int capacity, double error_rate);
+scaling_bloom_t *new_scaling_bloom_from_bitmap(unsigned int capacity, double error_rate, bitmap_t* bitmap);
+int free_scaling_bloom(scaling_bloom_t *bloom);
+int scaling_bloom_add(scaling_bloom_t *bloom, const char *s, size_t len, uint64_t id);
+int scaling_bloom_remove(scaling_bloom_t *bloom, const char *s, size_t len, uint64_t id);
+int scaling_bloom_check(scaling_bloom_t *bloom, const char *s, size_t len);
+size_t bloom_size(scaling_bloom_t *bloom);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/internal/util/dablooms/cwrapper/murmur.cpp b/internal/util/dablooms/cwrapper/murmur.cpp
new file mode 100644
index 0000000000..fcf1dc1bcc
--- /dev/null
+++ b/internal/util/dablooms/cwrapper/murmur.cpp
@@ -0,0 +1,131 @@
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+
+// Note - The x86 and x64 versions do _not_ produce the same results, as the
+// algorithms are optimized for their respective platforms. You can still
+// compile and run any of them on any platform, but your performance with the
+// non-native version will be less than optimal.
+
+#include <string.h>
+
+#include "murmur.h"
+
+#define FORCE_INLINE inline static
+
+FORCE_INLINE uint64_t rotl64 ( uint64_t x, int8_t r )
+{
+    return (x << r) | (x >> (64 - r));
+}
+
+#define ROTL64(x,y) rotl64(x,y)
+
+#define BIG_CONSTANT(x) (x##LLU)
+
+// Reads the i-th 64-bit word of the input. memcpy is used instead of a
+// uint64_t pointer cast because the key buffer is only byte aligned.
+FORCE_INLINE uint64_t getblock ( const uint8_t * p, int i )
+{
+    uint64_t block;
+    memcpy(&block, p + (size_t)i * sizeof(block), sizeof(block));
+    return block;
+}
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+FORCE_INLINE uint64_t fmix64(uint64_t k)
+{
+    k ^= k >> 33;
+    k *= BIG_CONSTANT(0xff51afd7ed558ccd);
+    k ^= k >> 33;
+    k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
+    k ^= k >> 33;
+
+    return k;
+}
+
+//-----------------------------------------------------------------------------
+// Computes the 128-bit x64 MurmurHash3 of the `len` bytes at `key` with the
+// given `seed` and stores the 16-byte result at `out`.
+
+void MurmurHash3_x64_128 ( const void * key, const int len,
+                           const uint32_t seed, void * out )
+{
+    const uint8_t * data = (const uint8_t*)key;
+    const int nblocks = len / 16;
+
+    uint64_t h1 = seed;
+    uint64_t h2 = seed;
+
+    uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
+    uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
+
+    int i;
+
+    //----------
+    // body
+
+    for(i = 0; i < nblocks; i++) {
+        uint64_t k1 = getblock(data,i*2+0);
+        uint64_t k2 = getblock(data,i*2+1);
+
+        k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+
+        h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
+
+        k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+
+        h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
+    }
+
+    //----------
+    // tail
+
+    const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
+
+    uint64_t k1 = 0;
+    uint64_t k2 = 0;
+
+    // every case deliberately falls through to the ones below it
+    switch(len & 15) {
+    case 15: k2 ^= ((uint64_t)tail[14]) << 48;
+    case 14: k2 ^= ((uint64_t)tail[13]) << 40;
+    case 13: k2 ^= ((uint64_t)tail[12]) << 32;
+    case 12: k2 ^= ((uint64_t)tail[11]) << 24;
+    case 11: k2 ^= ((uint64_t)tail[10]) << 16;
+    case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;
+    case  9: k2 ^= ((uint64_t)tail[ 8]) << 0;
+             k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
+
+    case  8: k1 ^= ((uint64_t)tail[ 7]) << 56;
+    case  7: k1 ^= ((uint64_t)tail[ 6]) << 48;
+    case  6: k1 ^= ((uint64_t)tail[ 5]) << 40;
+    case  5: k1 ^= ((uint64_t)tail[ 4]) << 32;
+    case  4: k1 ^= ((uint64_t)tail[ 3]) << 24;
+    case  3: k1 ^= ((uint64_t)tail[ 2]) << 16;
+    case  2: k1 ^= ((uint64_t)tail[ 1]) << 8;
+    case  1: k1 ^= ((uint64_t)tail[ 0]) << 0;
+             k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
+    }
+
+    //----------
+    // finalization
+
+    h1 ^= len; h2 ^= len;
+
+    h1 += h2;
+    h2 += h1;
+
+    h1 = fmix64(h1);
+    h2 = fmix64(h2);
+
+    h1 += h2;
+    h2 += h1;
+
+    // `out` is an untyped caller buffer, so copy the two words out bytewise
+    memcpy(out, &h1, sizeof(h1));
+    memcpy((uint8_t*)out + sizeof(h1), &h2, sizeof(h2));
+}
+
+//-----------------------------------------------------------------------------
diff --git a/internal/util/dablooms/cwrapper/murmur.h b/internal/util/dablooms/cwrapper/murmur.h
new file mode 100644
index 0000000000..c7547dbe74
--- /dev/null
+++ b/internal/util/dablooms/cwrapper/murmur.h
@@ -0,0 +1,12 @@
+//-----------------------------------------------------------------------------
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code. 
+
+#ifndef _MURMURHASH3_H_
+#define _MURMURHASH3_H_
+
+#include <stdint.h>
+
+void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
+
+#endif // _MURMURHASH3_H_
diff --git a/internal/util/dablooms/dablooms.go b/internal/util/dablooms/dablooms.go
new file mode 100644
index 0000000000..3e42f65a62
--- /dev/null
+++ b/internal/util/dablooms/dablooms.go
@@ -0,0 +1,76 @@
+// Copyright (C) 2019-2020 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License.
+
+package dablooms
+
+/*
+#cgo CFLAGS: -I${SRCDIR}/cwrapper
+
+#cgo LDFLAGS: -L${SRCDIR}/cwrapper/output -ldablooms -lstdc++ -lm
+#include <stdlib.h>
+#include <dablooms.h>
+*/
+import "C"
+
+import (
+	"unsafe"
+)
+
+// ScalingBloom wraps a C scaling_bloom_t. Call Destroy to release it.
+type ScalingBloom struct {
+	cfilter *C.scaling_bloom_t
+}
+
+// NewScalingBloom creates a filter for the given capacity and error rate.
+// It returns nil when the C library fails to allocate the filter.
+func NewScalingBloom(capacity uint64, errorRate float64) *ScalingBloom {
+	cfilter := C.new_scaling_bloom(C.uint(capacity), C.double(errorRate))
+	if cfilter == nil {
+		return nil
+	}
+	return &ScalingBloom{cfilter: cfilter}
+}
+
+// Destroy frees the C filter. Calling it again is a no-op.
+func (sb *ScalingBloom) Destroy() {
+	if sb.cfilter == nil {
+		return
+	}
+	C.free_scaling_bloom(sb.cfilter)
+	sb.cfilter = nil
+}
+
+// cKey returns a C view of key. An empty key yields a nil pointer with
+// length 0 because &key[0] would panic.
+func cKey(key []byte) (*C.char, C.size_t) {
+	if len(key) == 0 {
+		return nil, 0
+	}
+	return (*C.char)(unsafe.Pointer(&key[0])), C.size_t(len(key))
+}
+
+// Add inserts key under id and reports whether the C call returned 1.
+func (sb *ScalingBloom) Add(key []byte, id int64) bool {
+	ptr, n := cKey(key)
+	return C.scaling_bloom_add(sb.cfilter, ptr, n, C.uint64_t(id)) == 1
+}
+
+// Remove deletes key under id and reports whether the C call returned 1.
+func (sb *ScalingBloom) Remove(key []byte, id int64) bool {
+	ptr, n := cKey(key)
+	return C.scaling_bloom_remove(sb.cfilter, ptr, n, C.uint64_t(id)) == 1
+}
+
+// Check reports whether key may be present in the filter.
+func (sb *ScalingBloom) Check(key []byte) bool {
+	ptr, n := cKey(key)
+	return C.scaling_bloom_check(sb.cfilter, ptr, n) == 1
+}
diff --git a/internal/util/dablooms/dablooms_test.go b/internal/util/dablooms/dablooms_test.go
new file mode 100644
index 0000000000..094decc920
--- /dev/null
+++ b/internal/util/dablooms/dablooms_test.go
@@ -0,0 +1,86 @@
+// Copyright (C) 2019-2020 Zilliz. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software distributed under the License
+// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+// or implied. See the License for the specific language governing permissions and limitations under the License.
+
+package dablooms
+
+import (
+	"fmt"
+	"strconv"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+type stats struct {
+	TruePositives  int64
+	TrueNegatives  int64
+	FalsePositives int64
+	FalseNegatives int64
+}
+
+var Capacity uint64 = 1000000
+var ErrorRate float64 = .05
+
+// PrintResults prints the counters and the false positive rate computed from them.
+func PrintResults(stats *stats) {
+	falsePositiveRate := float64(stats.FalsePositives) / float64(stats.FalsePositives+stats.TrueNegatives)
+	fmt.Printf("True positives: %7d\n", stats.TruePositives)
+	fmt.Printf("True negatives: %7d\n", stats.TrueNegatives)
+	fmt.Printf("False positives: %7d\n", stats.FalsePositives)
+	fmt.Printf("False negatives: %7d\n", stats.FalseNegatives)
+	fmt.Printf("False positive rate: %f\n", falsePositiveRate)
+
+	if falsePositiveRate > ErrorRate {
+		fmt.Printf("False positive rate too high\n")
+	}
+}
+
+// TestDablooms_Correctness adds every even key, then checks that all added
+// keys are found (no false negatives) and reports the false positive rate
+// observed on the odd keys, which were never added.
+func TestDablooms_Correctness(t *testing.T) {
+	sb := NewScalingBloom(Capacity, ErrorRate)
+	if !assert.NotNil(t, sb) {
+		return
+	}
+	defer sb.Destroy()
+
+	start := time.Now()
+	for i := 0; i < int(Capacity*2); i += 2 {
+		sb.Add([]byte(strconv.Itoa(i)), int64(i))
+	}
+	// Seconds() keeps the fractional part that integer nanosecond division dropped.
+	fmt.Printf("The time cost for add: %fs\n", time.Since(start).Seconds())
+
+	results := &stats{}
+
+	start = time.Now()
+	for i := 0; i < int(Capacity*2); i++ {
+		positive := sb.Check([]byte(strconv.Itoa(i)))
+		switch {
+		case i%2 == 0 && positive:
+			results.TruePositives++
+		case i%2 == 0:
+			results.FalseNegatives++
+		case positive:
+			results.FalsePositives++
+		default:
+			results.TrueNegatives++
+		}
+	}
+	fmt.Printf("Time cost for check: %fs\n", time.Since(start).Seconds())
+
+	PrintResults(results)
+
+	// Every key that was added must be reported as present.
+	assert.False(t, results.FalseNegatives > 0)
+}
diff --git a/scripts/cwrapper_dablooms_build.sh b/scripts/cwrapper_dablooms_build.sh
new file mode 100755
index 0000000000..c11aec4f17
--- /dev/null
+++ b/scripts/cwrapper_dablooms_build.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Builds the dablooms static library out of tree and installs it into
+# internal/util/dablooms/cwrapper/output.
+
+SOURCE=${BASH_SOURCE[0]}
+while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
+    DIR=$( cd -P "$( dirname "$SOURCE" )" && pwd )
+    SOURCE=$(readlink "$SOURCE")
+    [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
+done
+DIR=$( cd -P "$( dirname "$SOURCE" )" && pwd )
+
+CMAKE_BUILD=${DIR}/../cwrapper_dablooms_build
+OUTPUT_LIB=${DIR}/../internal/util/dablooms/cwrapper/output
+SRC_DIR=${DIR}/../internal/util/dablooms/cwrapper
+
+BUILD_TYPE="Debug"
+CUSTOM_THIRDPARTY_PATH=""
+
+# Parse options before touching the filesystem so -h and bad flags have no side effects.
+while getopts "t:f:h" arg; do
+    case $arg in
+        f)
+            CUSTOM_THIRDPARTY_PATH=$OPTARG
+            ;;
+        t)
+            BUILD_TYPE=$OPTARG # BUILD_TYPE
+            ;;
+        h) # help
+            echo "-t: build type(default: Debug)
+-f: custom thirdparty path(default: \"\")
+-h: help
+                "
+            exit 0
+            ;;
+        ?)
+            echo "ERROR! unknown argument"
+            exit 1
+            ;;
+    esac
+done
+echo "BUILD_TYPE: " $BUILD_TYPE
+echo "CUSTOM_THIRDPARTY_PATH: " $CUSTOM_THIRDPARTY_PATH
+
+mkdir -p "${CMAKE_BUILD}"
+# Start from an empty output directory so stale artifacts are never linked.
+rm -rf "${OUTPUT_LIB}"
+mkdir -p "${OUTPUT_LIB}"
+
+pushd "${CMAKE_BUILD}" || exit 1
+
+# An argument array keeps paths that contain spaces intact.
+CMAKE_CMD=(cmake
+    "-DCMAKE_INSTALL_PREFIX=${OUTPUT_LIB}"
+    "-DCMAKE_BUILD_TYPE=${BUILD_TYPE}"
+    "-DCUSTOM_THIRDPARTY_DOWNLOAD_PATH=${CUSTOM_THIRDPARTY_PATH}"
+    "${SRC_DIR}")
+
+echo "${CMAKE_CMD[@]}"
+"${CMAKE_CMD[@]}" || exit 1
+
+if [[ ! ${jobs+1} ]]; then
+    jobs=$(nproc)
+fi
+make -j "${jobs}" && make install