milvus/core/src/index/thirdparty/faiss/IndexBinaryIVF.cpp
Jin Hai dab74700b2
Delete and WAL feature branch merge (#1436)
* add read/write lock

* change compact to ddl queue

* add api to get vector data

* add flush / merge / compact lock

* add api to get vector data

* add data size for table info

* add db recovery test

* add data_size check

* change file name to uppercase

Signed-off-by: jinhai <hai.jin@zilliz.com>

* update wal flush_merge_compact_mutex_

* update wal flush_merge_compact_mutex_

* change requirement

* change requirement

* upd requirement

* add logging

* add logging

* add logging

* add logging

* add logging

* add logging

* add logging

* add logging

* add logging

* delete part

* add all size checks

* fix bug

* update faiss get_vector_by_id

* add get_vector case

* update get vector by id

* update server

* fix DBImpl

* attempting to fix #1268

* lint

* update unit test

* fix #1259

* issue 1271 fix wal config

* update

* fix cases

Signed-off-by: del.zhenwu <zhenxiang.li@zilliz.com>

* update read / write error message

* update read / write error message

* [skip ci] get vectors by id from raw files instead of faiss

* [skip ci] update FilesByType meta

* update

* fix ci error

* update

* lint

* Hide partition_name parameter

* Remove douban pip source

Signed-off-by: zhenwu <zw@zilliz.com>

* Update epsilon value in test cases

Signed-off-by: zhenwu <zw@zilliz.com>

* Add default partition

* Caiyd crud (#1313)

* fix clang format

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* fix unittest build error

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* add faiss_bitset_test

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* avoid user directly operate partition table

* fix has table bug

* Caiyd crud (#1323)

* fix clang format

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* fix unittest build error

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* use compile option -O3

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* update faiss_bitset_test.cpp

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* change open flags

* change OngoingFileChecker to static instance

* mark ongoing files when applying deletes

* update clean up with ttl

* fix centos ci

* update

* lint

* update partition

Signed-off-by: zhenwu <zw@zilliz.com>

* update delete and flush to include partitions

* update

* Update cases

Signed-off-by: zhenwu <zw@zilliz.com>

* Fix test cases crud (#1350)

* fix order

* add wal case

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* fix wal case

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* fix wal case

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* fix wal case

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* fix invalid operation issue

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* fix invalid operation issue

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* fix bug

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* fix bug

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* crud fix

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* crud fix

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* add table info test cases

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix cases

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix cases

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix cases

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix cases

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix cases

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>
Signed-off-by: JinHai-CN <hai.jin@zilliz.com>

* merge cases

Signed-off-by: zhenwu <zw@zilliz.com>

* Shengjun (#1349)

* Add GPU sharing solution on native Kubernetes  (#1102)

* run hadolint with reviewdog

* add LICENSE in Dockerfile

* run hadolint with reviewdog

* Reporter of reviewdog command is "github-pr-check"

* format Dockerfile

* ignore DL3007 in hadolint

* clean up old docker images

* Add GPU sharing solution on native Kubernetes

* nightly test mailer

* Fix http server bug (#1096)

* refactoring(create_table done)

* refactoring

* refactor server delivery (insert done)

* refactoring server module (count_table done)

* server refactor done

* cmake pass

* refactor server module done.

* set grpc response status correctly

* format done.

* fix redefine ErrorMap()

* optimize insert reducing ids data copy

* optimize grpc request with reducing data copy

* clang format

* [skip ci] Refactor server module done. update changelog. prepare for PR

* remove explicit and change int32_t to int64_t

* add web server

* [skip ci] add license in web module

* modify header include & comment oatpp environment config

* add port configure & create table in handler

* modify web url

* simple url completion done & add swagger

* make sure web url

* web functionality done. debugging

* add web unittest

* web test pass

* add web server port

* add web server port in template

* update unittest cmake file

* change web server default port to 19121

* rename method in web module & unittest pass

* add search case in unittest for web module

* rename some variables

* fix bug

* unittest pass

* web prepare

* fix cmd bug(check server status)

* update changelog

* add web port validate & default set

* clang-format pass

* add web port test in unittest

* add CORS & redirect root to swagger ui

* add web status

* web table method func cascade test pass

* add config url in web module

* modify thirdparty cmake to avoid building oatpp test

* clang format

* update changelog

* add constants in web module

* reserve Config.cpp

* fix constants reference bug

* replace web server with async module

* modify component to support async

* format

* developing controller & add test client into unittest

* add web port into demo/server_config

* modify thirdparty cmake to allow build test

* remove  unnecessary comment

* add endpoint info in controller

* finish web test(bug here)

* clang format

* add web test cpp to lint exclusions

* check null field in GetConfig

* add macro RETURN STATUS DTo

* fix cmake conflict

* fix crash when exit server

* remove surplus comments & add http param check

* add uri /docs to direct swagger

* format

* change cmd to system

* add default value & unittest in web module

* add macros to judge if GPU supported

* add macros in unit & add default in index dto & print error message when bind http port fail

* format (fix #788)

* fix cors bug (not completed)

* comment cors

* change web framework to simple api

* comments optimize

* change to simple API

* remove comments in controller.hpp

* remove EP_COMMON_CMAKE_ARGS in oatpp and oatpp-swagger

* add ep cmake args to sqlite

* clang-format

* change a format

* test pass

* change name to

* fix compiler issue(oatpp-swagger depend on oatpp)

* add & in start_server.h

* specify lib location with oatpp and oatpp-swagger

* add comments

* add swagger definition

* [skip ci] change http method options status code

* remove oatpp swagger(fix #970)

* remove comments

* check Start web behavior

* add default to cpu_cache_capacity

* remove swagger component.hpp & /docs url

* remove /docs info

* remove /docs in unittest

* remove space in test rpc

* remove repeated info in CHANGELOG

* change cache_insert_data default value as a constant

* [skip ci] Fix some broken links (#960)

* [skip ci] Fix broken link

* [skip ci] Fix broken link

* [skip ci] Fix broken link

* [skip ci] Fix broken links

* fix issue 373 (#964)

* fix issue 373

* Adjustment format

* Adjustment format

* Adjustment format

* change readme

* #966 update NOTICE.md (#967)

* remove comments

* check Start web behavior

* add default to cpu_cache_capacity

* remove swagger component.hpp & /docs url

* remove /docs info

* remove /docs in unittest

* remove space in test rpc

* remove repeated info in CHANGELOG

* change cache_insert_data default value as a constant

* adjust web port config place

* rename web_port variable

* change gpu resources invoke way to cmd()

* set advanced config name add DEFAULT

* change config setting to cmd

* modify ..

* optimize code

* assign TableDto's count default value 0 (fix #995)

* check if table exists when show partitions (fix #1028)

* check table exists when drop partition (fix #1029)

* check if partition name is legal (fix #1022)

* modify status code when partition tag is illegal

* update changelog

* add info to /system url

* add binary index and add bin uri & handler method(not completed)

* optimize http insert and search time(fix #1066) | add binary vectors support(fix #1067)

* fix test partition bug

* fix test bug when check insert records

* add binary vectors test

* add default for offset and page_size

* fix unittest bug

* [skip ci] remove comments

* optimize web code for PR comments

* add new folder named utils

* check offset and pagesize (fix #1082)

* improve error message if offset or page_size is not legal (fix #1075)

* add log into web module

* update changelog

* check gpu sources setting when assign repeated value (fix #990)

* update changelog

* clang-format pass

* add default handler in http handler

* [skip ci] improve error msg when check gpu resources

* change check offset way

* remove func IsIntStr

* add case

* change int32 to int64 when check number str

* add log in web module (doing)

* update test case

* add log in web controller

Co-authored-by: jielinxu <52057195+jielinxu@users.noreply.github.com>
Co-authored-by: JackLCL <53512883+JackLCL@users.noreply.github.com>
Co-authored-by: Cai Yudong <yudong.cai@zilliz.com>

* Filtering for specific paths in Jenkins CI  (#1107)

* run hadolint with reviewdog

* add LICENSE in Dockerfile

* run hadolint with reviewdog

* Reporter of reviewdog command is "github-pr-check"

* format Dockerfile

* ignore DL3007 in hadolint

* clean up old docker images

* Add GPU sharing solution on native Kubernetes

* nightly test mailer

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Fix Filtering for specific paths in Jenkins CI bug (#1109)

* run hadolint with reviewdog

* add LICENSE in Dockerfile

* run hadolint with reviewdog

* Reporter of reviewdog command is "github-pr-check"

* format Dockerfile

* ignore DL3007 in hadolint

* clean up old docker images

* Add GPU sharing solution on native Kubernetes

* nightly test mailer

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Fix Filtering for specific paths in Jenkins CI bug (#1110)

* run hadolint with reviewdog

* add LICENSE in Dockerfile

* run hadolint with reviewdog

* Reporter of reviewdog command is "github-pr-check"

* format Dockerfile

* ignore DL3007 in hadolint

* clean up old docker images

* Add GPU sharing solution on native Kubernetes

* nightly test mailer

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Don't skip ci when triggered by a time (#1113)

* run hadolint with reviewdog

* add LICENSE in Dockerfile

* run hadolint with reviewdog

* Reporter of reviewdog command is "github-pr-check"

* format Dockerfile

* ignore DL3007 in hadolint

* clean up old docker images

* Add GPU sharing solution on native Kubernetes

* nightly test mailer

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Don't skip ci when triggered by a time

* Don't skip ci when triggered by a time

* Set default sending to Milvus Dev mail group  (#1121)

* run hadolint with reviewdog

* add LICENSE in Dockerfile

* run hadolint with reviewdog

* Reporter of reviewdog command is "github-pr-check"

* format Dockerfile

* ignore DL3007 in hadolint

* clean up old docker images

* Add GPU sharing solution on native Kubernetes

* nightly test mailer

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Test filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* Filtering for specific paths in Jenkins CI

* No skip ci when triggered by a time

* Don't skip ci when triggered by a time

* Set default sending to Milvus Dev

* Support hnsw (#1131)

* add hnsw

* add config

* format...

* format..

* Remove test.template (#1129)

* Update framework

* remove files

* Remove files

* Remove ann-acc cases && Update java-sdk cases

* change cn to en

* [skip ci] remove doc test

* [skip ci] change cn to en

* Case stability

* Add mail notification when test failed

* Add main notification

* Add main notification

* gen milvus instance from utils

* Disable case with multiprocess

* Add mail notification when nightly test failed

* add milvus handler param

* add http handler

* Remove test.template

Co-authored-by: quicksilver <zhifeng.zhang@zilliz.com>

* Add doc for the RESTful API / Update contributor number in Milvus readme (#1100)

* [skip ci] Update contributor number.

* [skip ci] Add RESTful API doc.

* [skip ci] Some updates.

* [skip ci] Change port to 19121.

* [skip ci] Update README.md.

Update the descriptions for OPTIONS.

* Update README.md

Fix a typo.

* #1105 update error message when creating IVFSQ8H index without GPU resources (#1117)

* [skip ci] Update README (#1104)

* remove Nvidia owned files from faiss (#1136)

* #1135 remove Nvidia owned files from faiss

* Revert "#1135 remove Nvidia owned files from faiss"

This reverts commit 3bc007c28c8df5861fdd0452fd64c0e2e719eda2.

* #1135 remove Nvidia API implementation

* #1135 remove Nvidia owned files from faiss

* Update CODE_OF_CONDUCT.md (#1163)

* Improve codecov (#1095)

* Optimize config test. Dir src/config 99% lines covered

* add unittest coverage

* optimize cache&config unittest

* code format

* format

* format code

* fix merge conflict

* cover src/utils unittest

*  '#831 fix exe_path judge error'

* #831 fix exe_path judge error

* add some unittest coverage

* add some unittest coverage

* improve coverage of src/wrapper

* improve src/wrapper coverage

* *test optimize db/meta unittest

* fix bug

* *test optimize mysqlMetaImpl unittest

* *style: format code

* import server& scheduler unittest coverage

* handover next work

* *test: add some test_meta test case

* *format code

* *fix: fix typo

* feat(codecov): improve code coverage for src/db(#872)

* feat(codecov): improve code coverage for src/db/engine(#872)

* feat(codecov): improve code coverage(#872)

* fix config unittest bug

* feat(codecov): improve code coverage core/db/engine(#872)

* feat(codecov): improve code coverage core/knowhere

* feat(codecov): improve code coverage core/knowhere

* feat(codecov): improve code coverage

* feat(codecov): fix cpu test some error

* feat(codecov): improve code coverage

* feat(codecov): rename some fiu

* fix(db/meta): fix switch/case default action

* feat(codecov): improve code coverage(#872)
* fix error caused by merge code
* format code

* feat(codecov): improve code coverage & format code(#872)

* feat(codecov): fix test error(#872)

* feat(codecov): fix unittest test_mem(#872)

* feat(codecov): fix unittest(#872)

* feat(codecov): fix unittest for resource manager(#872)

* feat(codecov): code format (#872)

* feat(codecov): trigger ci(#872)

* fix(RequestScheduler): remove a wrong sleep statement

* test(test_rpc): fix rpc test

* Fix format issue

* Remove unused comments

* Fix unit test error

Co-authored-by: ABNER-1 <ABNER-1@users.noreply.github.com>
Co-authored-by: Jin Hai <hai.jin@zilliz.com>

* Support run dev test with http handler in python SDK (#1116)

* refactoring(create_table done)

* refactoring

* refactor server delivery (insert done)

* refactoring server module (count_table done)

* server refactor done

* cmake pass

* refactor server module done.

* set grpc response status correctly

* format done.

* fix redefine ErrorMap()

* optimize insert reducing ids data copy

* optimize grpc request with reducing data copy

* clang format

* [skip ci] Refactor server module done. update changelog. prepare for PR

* remove explicit and change int32_t to int64_t

* add web server

* [skip ci] add license in web module

* modify header include & comment oatpp environment config

* add port configure & create table in handler

* modify web url

* simple url completion done & add swagger

* make sure web url

* web functionality done. debugging

* add web unittest

* web test pass

* add web server port

* add web server port in template

* update unittest cmake file

* change web server default port to 19121

* rename method in web module & unittest pass

* add search case in unittest for web module

* rename some variables

* fix bug

* unittest pass

* web prepare

* fix cmd bug(check server status)

* update changelog

* add web port validate & default set

* clang-format pass

* add web port test in unittest

* add CORS & redirect root to swagger ui

* add web status

* web table method func cascade test pass

* add config url in web module

* modify thirdparty cmake to avoid building oatpp test

* clang format

* update changelog

* add constants in web module

* reserve Config.cpp

* fix constants reference bug

* replace web server with async module

* modify component to support async

* format

* developing controller & add test client into unittest

* add web port into demo/server_config

* modify thirdparty cmake to allow build test

* remove  unnecessary comment

* add endpoint info in controller

* finish web test(bug here)

* clang format

* add web test cpp to lint exclusions

* check null field in GetConfig

* add macro RETURN STATUS DTo

* fix cmake conflict

* fix crash when exit server

* remove surplus comments & add http param check

* add uri /docs to direct swagger

* format

* change cmd to system

* add default value & unittest in web module

* add macros to judge if GPU supported

* add macros in unit & add default in index dto & print error message when bind http port fail

* format (fix #788)

* fix cors bug (not completed)

* comment cors

* change web framework to simple api

* comments optimize

* change to simple API

* remove comments in controller.hpp

* remove EP_COMMON_CMAKE_ARGS in oatpp and oatpp-swagger

* add ep cmake args to sqlite

* clang-format

* change a format

* test pass

* change name to

* fix compiler issue(oatpp-swagger depend on oatpp)

* add & in start_server.h

* specify lib location with oatpp and oatpp-swagger

* add comments

* add swagger definition

* [skip ci] change http method options status code

* remove oatpp swagger(fix #970)

* remove comments

* check Start web behavior

* add default to cpu_cache_capacity

* remove swagger component.hpp & /docs url

* remove /docs info

* remove /docs in unittest

* remove space in test rpc

* remove repeated info in CHANGELOG

* change cache_insert_data default value as a constant

* [skip ci] Fix some broken links (#960)

* [skip ci] Fix broken link

* [skip ci] Fix broken link

* [skip ci] Fix broken link

* [skip ci] Fix broken links

* fix issue 373 (#964)

* fix issue 373

* Adjustment format

* Adjustment format

* Adjustment format

* change readme

* #966 update NOTICE.md (#967)

* remove comments

* check Start web behavior

* add default to cpu_cache_capacity

* remove swagger component.hpp & /docs url

* remove /docs info

* remove /docs in unittest

* remove space in test rpc

* remove repeated info in CHANGELOG

* change cache_insert_data default value as a constant

* adjust web port config place

* rename web_port variable

* change gpu resources invoke way to cmd()

* set advanced config name add DEFAULT

* change config setting to cmd

* modify ..

* optimize code

* assign TableDto's count default value 0 (fix #995)

* check if table exists when show partitions (fix #1028)

* check table exists when drop partition (fix #1029)

* check if partition name is legal (fix #1022)

* modify status code when partition tag is illegal

* update changelog

* add info to /system url

* add binary index and add bin uri & handler method(not completed)

* optimize http insert and search time(fix #1066) | add binary vectors support(fix #1067)

* fix test partition bug

* fix test bug when check insert records

* add binary vectors test

* add default for offset and page_size

* fix unittest bug

* [skip ci] remove comments

* optimize web code for PR comments

* add new folder named utils

* check offset and pagesize (fix #1082)

* improve error message if offset or page_size is not legal (fix #1075)

* add log into web module

* update changelog

* check gpu sources setting when assign repeated value (fix #990)

* update changelog

* clang-format pass

* add default handler in http handler

* [skip ci] improve error msg when check gpu resources

* change check offset way

* remove func IsIntStr

* add case

* change int32 to int64 when check number str

* add log in web module (doing)

* update test case

* add log in web controller

* remove surplus dot

* add preload into /system/

* change get_milvus() to get_milvus(args['handler'])

* support load table into memory with http server (fix #1115)

* [skip ci] comment surplus dto in VectorDto

Co-authored-by: jielinxu <52057195+jielinxu@users.noreply.github.com>
Co-authored-by: JackLCL <53512883+JackLCL@users.noreply.github.com>
Co-authored-by: Cai Yudong <yudong.cai@zilliz.com>

* Fix #1140 (#1162)

* fix

Signed-off-by: Nicky <nicky.xj.lin@gmail.com>

* update...

Signed-off-by: Nicky <nicky.xj.lin@gmail.com>

* fix2

Signed-off-by: Nicky <nicky.xj.lin@gmail.com>

* fix3

Signed-off-by: Nicky <nicky.xj.lin@gmail.com>

* update changelog

Signed-off-by: Nicky <nicky.xj.lin@gmail.com>

* Update INSTALL.md (#1175)

* Update INSTALL.md

1. Change image tag and Milvus source code to latest.
2. Fix a typo

Signed-off-by: Lu Wang <yamasite@qq.com>

* Update INSTALL.md

Signed-off-by: lu.wang <yamasite@qq.com>

* add Tanimoto ground truth (#1138)

* add milvus ground truth

* add milvus groundtruth

* [skip ci] add milvus ground truth

* [skip ci]add tanimoto ground truth

* fix mix case bug (#1208)

* fix mix case bug

Signed-off-by: del.zhenwu <zhenxiang.li@zilliz.com>

* Remove case.md

Signed-off-by: del.zhenwu <zhenxiang.li@zilliz.com>

* Update README.md (#1206)

Add LFAI mailing lists.

Signed-off-by: Lutkin Wang <yamasite@qq.com>

* Add design.md to store links to design docs (#1219)

* Update README.md

Add link to Milvus design docs

Signed-off-by: Lutkin Wang <yamasite@qq.com>

* Create design.md

Signed-off-by: Lutkin Wang <yamasite@qq.com>

* Update design.md

Signed-off-by: Lutkin Wang <yamasite@qq.com>

* Add troubleshooting info about libmysqlpp.so.3 error (#1225)

* Update INSTALL.md

Signed-off-by: Lutkin Wang <yamasite@qq.com>

* Update INSTALL.md

Signed-off-by: Lutkin Wang <yamasite@qq.com>

* Update README.md (#1233)

Signed-off-by: Lutkin Wang <yamasite@qq.com>

* #1240 Update license declaration of each file (#1241)

* #1240 Update license declaration of each file

Signed-off-by: jinhai <hai.jin@zilliz.com>

* #1240 Update CHANGELOG

Signed-off-by: jinhai <hai.jin@zilliz.com>

* Update README.md (#1258)

Add Jenkins master badge.

Signed-off-by: Lutkin Wang <yamasite@qq.com>

* Update INSTALL.md (#1265)

Fix indentation.

* support CPU profiling (#1251)

* #1250 support CPU profiling

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* #1250 fix code coverage

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* Fix HNSW crash (#1262)

* fix

Signed-off-by: xiaojun.lin <xiaojun.lin@zilliz.com>

* update.

Signed-off-by: xiaojun.lin <xiaojun.lin@zilliz.com>

* Add troubleshooting information for INSTALL.md and enhance readability (#1274)

* Update INSTALL.md

1. Add new troubleshooting message;
2. Enhance readability.

Signed-off-by: Lutkin Wang <yamasite@qq.com>

* Update INSTALL.md

Signed-off-by: Lutkin Wang <yamasite@qq.com>

* Update INSTALL.md

Signed-off-by: Lutkin Wang <yamasite@qq.com>

* Update INSTALL.md

Add CentOS link.

Signed-off-by: Lutkin Wang <yamasite@qq.com>

* Create COMMUNITY.md (#1292)

Signed-off-by: Lutkin Wang <yamasite@qq.com>

* fix gtest

* add copyright

* fix gtest

* MERGE_NOT_YET

* fix lint

Co-authored-by: quicksilver <zhifeng.zhang@zilliz.com>
Co-authored-by: BossZou <40255591+BossZou@users.noreply.github.com>
Co-authored-by: jielinxu <52057195+jielinxu@users.noreply.github.com>
Co-authored-by: JackLCL <53512883+JackLCL@users.noreply.github.com>
Co-authored-by: Cai Yudong <yudong.cai@zilliz.com>
Co-authored-by: Tinkerrr <linxiaojun.cn@outlook.com>
Co-authored-by: del-zhenwu <56623710+del-zhenwu@users.noreply.github.com>
Co-authored-by: Lutkin Wang <yamasite@qq.com>
Co-authored-by: shengjh <46514371+shengjh@users.noreply.github.com>
Co-authored-by: ABNER-1 <ABNER-1@users.noreply.github.com>
Co-authored-by: Jin Hai <hai.jin@zilliz.com>
Co-authored-by: shiyu22 <cshiyu22@gmail.com>

* #1302 Get all record IDs in a segment by given a segment id

* Remove query time ranges

Signed-off-by: zhenwu <zw@zilliz.com>

* #1295 let wal enable by default

* fix cases

Signed-off-by: zhenwu <zw@zilliz.com>

* fix partition cases

Signed-off-by: zhenwu <zw@zilliz.com>

* [skip ci] update test_db

* update

* fix case bug

Signed-off-by: zhenwu <zw@zilliz.com>

* lint

* fix test case failures

* remove some code

* Caiyd crud 1 (#1377)

* fix clang format

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* fix unittest build error

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* fix build issue when enable profiling

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* fix hastable bug

* update bloom filter

* update

* benchmark

* update benchmark

* update

* update

* remove wal record size

Signed-off-by: shengjun.li <shengjun.li@zilliz.com>

* remove wal record size config

Signed-off-by: shengjun.li <shengjun.li@zilliz.com>

* update apply deletes: switch to binary search

* update sdk_simple

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* update apply deletes: switch to binary search

* add test_search_by_id

Signed-off-by: zhenwu <zw@zilliz.com>

* add more log

* flush error with multi same ids

Signed-off-by: zhenwu <zw@zilliz.com>

* modify wal config

Signed-off-by: shengjun.li <shengjun.li@zilliz.com>

* update

* add binary search_by_id

* fix case bug

Signed-off-by: zhenwu <zw@zilliz.com>

* update cases

Signed-off-by: zhenwu <zw@zilliz.com>

* fix unit test #1395

* improve merge performance

* add uids_ for VectorIndex to improve search performance

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* fix error

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* update

* fix search

* fix record num

Signed-off-by: shengjun.li <shengjun.li@zilliz.com>

* refine code

* refine code

* Add get_vector_ids test cases (#1407)

* fix order

* add wal case

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* fix wal case

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* fix wal case

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* fix wal case

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* fix invalid operation issue

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* fix invalid operation issue

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* fix bug

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* fix bug

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* crud fix

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* crud fix

Signed-off-by: sahuang <xiaohaix@student.unimelb.edu.au>

* add table info test cases

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix cases

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix cases

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix cases

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix cases

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix cases

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>
Signed-off-by: JinHai-CN <hai.jin@zilliz.com>

* add to compact case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* add to compact case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* add to compact case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* add case and debug compact

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* test pdb

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* test pdb

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* test pdb

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix cases

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* update table_info case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* update table_info case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* update table_info case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* update get vector ids case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* update get vector ids case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* update get vector ids case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* update get vector ids case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* update case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* update case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* update case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* update case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* update case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* pdb test

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* pdb test

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* add tests for get_vector_ids

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix case

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* add binary and ip

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix binary index

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* fix pdb

Signed-off-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>

* #1408 fix search result in-correct after DeleteById

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* add one case

* delete failed segment

* update serialize

* update serialize

* fix case

Signed-off-by: zhenwu <zw@zilliz.com>

* update

* update case assertion

Signed-off-by: zhenwu <zw@zilliz.com>

* [skip ci] update config

* change bloom filter msync flag to async

* #1319 add more timing debug info

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* update

* update

* add normalize

Signed-off-by: zhenwu <zw@zilliz.com>

* add normalize

Signed-off-by: zhenwu <zw@zilliz.com>

* add normalize

Signed-off-by: zhenwu <zw@zilliz.com>

* Fix compiling error

Signed-off-by: jinhai <hai.jin@zilliz.com>

* support ip (#1383)

* support ip

Signed-off-by: xiaojun.lin <xiaojun.lin@zilliz.com>

* IP result distance sort by descend

Signed-off-by: Nicky <nicky.xj.lin@gmail.com>

* update

Signed-off-by: Nicky <nicky.xj.lin@gmail.com>

* format

Signed-off-by: xiaojun.lin <xiaojun.lin@zilliz.com>

* get table lsn

* Remove unused third party

Signed-off-by: jinhai <hai.jin@zilliz.com>

* Refine code

Signed-off-by: jinhai <hai.jin@zilliz.com>

* #1319 fix clang format

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* fix wal applied lsn

Signed-off-by: shengjun.li <shengjun.li@zilliz.com>

* validate partition tag

* #1319 improve search performance

Signed-off-by: yudong.cai <yudong.cai@zilliz.com>

* build error

Co-authored-by: Zhiru Zhu <youny626@hotmail.com>
Co-authored-by: groot <yihua.mo@zilliz.com>
Co-authored-by: Xiaohai Xu <xiaohaix@student.unimelb.edu.au>
Co-authored-by: shengjh <46514371+shengjh@users.noreply.github.com>
Co-authored-by: del-zhenwu <56623710+del-zhenwu@users.noreply.github.com>
Co-authored-by: shengjun.li <49774184+shengjun1985@users.noreply.github.com>
Co-authored-by: Cai Yudong <yudong.cai@zilliz.com>
Co-authored-by: quicksilver <zhifeng.zhang@zilliz.com>
Co-authored-by: BossZou <40255591+BossZou@users.noreply.github.com>
Co-authored-by: jielinxu <52057195+jielinxu@users.noreply.github.com>
Co-authored-by: JackLCL <53512883+JackLCL@users.noreply.github.com>
Co-authored-by: Tinkerrr <linxiaojun.cn@outlook.com>
Co-authored-by: Lutkin Wang <yamasite@qq.com>
Co-authored-by: ABNER-1 <ABNER-1@users.noreply.github.com>
Co-authored-by: shiyu22 <cshiyu22@gmail.com>
2020-02-29 16:11:31 +08:00

920 lines
28 KiB
C++

/**
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
// Copyright 2004-present Facebook. All Rights Reserved
// -*- c++ -*-
#include <faiss/Index.h>
#include <faiss/IndexFlat.h>
#include <faiss/IndexBinaryIVF.h>
#include <cstdio>
#include <memory>
#include <cmath>
#include <faiss/utils/hamming.h>
#include <faiss/utils/jaccard.h>
#include <faiss/utils/utils.h>
#include <faiss/utils/Heap.h>
#include <faiss/impl/AuxIndexStructures.h>
#include <faiss/impl/FaissAssert.h>
namespace faiss {
/**
 * Build an empty binary IVF index on top of an existing coarse quantizer.
 *
 * @param quantizer  coarse quantizer used to assign vectors to lists; the
 *                   index starts with own_fields == false
 * @param d          vector dimension, must match quantizer->d
 * @param nlist      number of inverted lists to allocate
 */
IndexBinaryIVF::IndexBinaryIVF(IndexBinary *quantizer, size_t d, size_t nlist)
    : IndexBinary(d),
      invlists(new ArrayInvertedLists(nlist, code_size)),
      own_invlists(true),
      nprobe(1),
      max_codes(0),
      maintain_direct_map(false),
      quantizer(quantizer),
      nlist(nlist),
      own_fields(false),
      clustering_index(nullptr)
{
    FAISS_THROW_IF_NOT (d == quantizer->d);

    // The index is usable right away when the quantizer is trained and
    // already contains exactly one entry per inverted list.
    const bool quantizer_ready =
        quantizer->is_trained && (quantizer->ntotal == nlist);
    is_trained = quantizer_ready;

    cp.niter = 10;
}
/**
 * Same as the three-argument constructor, but forwards an explicit metric
 * to the IndexBinary base class.
 *
 * @param quantizer  coarse quantizer used to assign vectors to lists; the
 *                   index starts with own_fields == false
 * @param d          vector dimension, must match quantizer->d
 * @param nlist      number of inverted lists to allocate
 * @param metric     metric type passed to IndexBinary
 */
IndexBinaryIVF::IndexBinaryIVF(IndexBinary *quantizer, size_t d, size_t nlist, MetricType metric)
    : IndexBinary(d, metric),
      invlists(new ArrayInvertedLists(nlist, code_size)),
      own_invlists(true),
      nprobe(1),
      max_codes(0),
      maintain_direct_map(false),
      quantizer(quantizer),
      nlist(nlist),
      own_fields(false),
      clustering_index(nullptr)
{
    FAISS_THROW_IF_NOT (d == quantizer->d);

    // The index is usable right away when the quantizer is trained and
    // already contains exactly one entry per inverted list.
    const bool quantizer_ready =
        quantizer->is_trained && (quantizer->ntotal == nlist);
    is_trained = quantizer_ready;

    cp.niter = 10;
}
/**
 * Default constructor: no quantizer, no inverted lists and zero lists.
 * Nothing is owned, so the destructor will not free anything unless the
 * ownership flags are changed afterwards.
 */
IndexBinaryIVF::IndexBinaryIVF()
    : invlists(nullptr),
      own_invlists(false),
      nprobe(1),
      max_codes(0),
      maintain_direct_map(false),
      quantizer(nullptr),
      nlist(0),
      own_fields(false),
      clustering_index(nullptr)
{
}
/**
 * Add n vectors without explicit ids; forwards to add_with_ids() with a
 * null id array.
 *
 * @param n  number of vectors
 * @param x  encoded vectors
 */
void IndexBinaryIVF::add(idx_t n, const uint8_t *x) {
    const idx_t *no_explicit_ids = nullptr;
    add_with_ids(n, x, no_explicit_ids);
}
/**
 * Add n vectors with caller-provided ids; forwards to add_core() without a
 * precomputed list assignment.
 *
 * @param n     number of vectors
 * @param x     encoded vectors
 * @param xids  ids of the vectors, or nullptr to number them from ntotal
 */
void IndexBinaryIVF::add_with_ids(idx_t n, const uint8_t *x, const idx_t *xids) {
    const idx_t *no_precomputed_assignment = nullptr;
    add_core(n, x, xids, no_precomputed_assignment);
}
/**
 * Store n encoded vectors in their inverted lists.
 *
 * @param n                number of vectors
 * @param x                encoded vectors, code_size bytes each
 * @param xids             optional ids; when null, ids are ntotal + position
 * @param precomputed_idx  optional list number per vector; when null the
 *                         quantizer is asked to assign them
 *
 * Vectors whose list number is negative are skipped. Throws if the index is
 * not trained, or if a direct map is maintained and explicit ids are given.
 */
void IndexBinaryIVF::add_core(idx_t n, const uint8_t *x, const idx_t *xids,
                              const idx_t *precomputed_idx) {
    FAISS_THROW_IF_NOT(is_trained);
    assert(invlists);
    FAISS_THROW_IF_NOT_MSG(!(maintain_direct_map && xids),
                           "cannot have direct map and add with ids");

    // Use the caller's assignment when available, otherwise compute one into
    // a buffer that lives until the end of this call.
    std::unique_ptr<idx_t[]> assigned;
    const idx_t *list_nos = precomputed_idx;
    if (!list_nos) {
        assigned.reset(new idx_t[n]);
        quantizer->assign(n, x, assigned.get());
        list_nos = assigned.get();
    }

    long n_add = 0;
    const uint8_t *code = x;
    for (size_t pos = 0; pos < n; pos++, code += code_size) {
        const idx_t list_no = list_nos[pos];
        if (list_no < 0) {
            continue;
        }

        const idx_t id = xids ? xids[pos] : ntotal + pos;
        const size_t offset = invlists->add_entry(list_no, id, code);

        if (maintain_direct_map) {
            // list number in the high 32 bits, offset in the low 32 bits
            direct_map.push_back(list_no << 32 | offset);
        }
        n_add++;
    }

    if (verbose) {
        printf("IndexBinaryIVF::add_with_ids: added %ld / %ld vectors\n",
               n_add, n);
    }
    ntotal += n_add;
}
/**
 * Enable or disable the direct map (id -> packed list number / offset).
 *
 * @param new_maintain_direct_map  desired state; a no-op when it equals the
 *                                 current state
 *
 * When enabling, every stored id must lie in [0, ntotal); otherwise an
 * exception is thrown. When disabling, the map is cleared.
 */
void IndexBinaryIVF::make_direct_map(bool new_maintain_direct_map) {
    // nothing to do
    if (new_maintain_direct_map == maintain_direct_map)
        return;

    if (new_maintain_direct_map) {
        // -1 marks ids for which no entry was found in any list
        direct_map.resize(ntotal, -1);
        for (size_t key = 0; key < nlist; key++) {
            size_t list_size = invlists->list_size(key);
            const idx_t *idlist = invlists->get_ids(key);

            for (size_t ofs = 0; ofs < list_size; ofs++) {
                FAISS_THROW_IF_NOT_MSG(0 <= idlist[ofs] && idlist[ofs] < ntotal,
                                       "direct map supported only for sequential ids");
                // list number in the high 32 bits, offset in the low 32 bits
                direct_map[idlist[ofs]] = key << 32 | ofs;
            }
        }
    } else {
        direct_map.clear();
    }
    maintain_direct_map = new_maintain_direct_map;
}
/**
 * k-nearest-neighbour search: coarse quantization followed by a scan of the
 * nprobe selected inverted lists per query.
 *
 * @param n          number of query vectors
 * @param x          encoded queries
 * @param k          number of results per query
 * @param distances  output distances, passed through to search_preassigned()
 * @param labels     output labels, passed through to search_preassigned()
 * @param bitset     optional filter forwarded to search_preassigned()
 *
 * Quantization and scan times are accumulated in indexIVF_stats.
 */
void IndexBinaryIVF::search(idx_t n, const uint8_t *x, idx_t k, int32_t *distances, idx_t *labels,
                            ConcurrentBitsetPtr bitset) const {
    const size_t n_probes_total = n * nprobe;
    std::unique_ptr<idx_t[]> probed_lists(new idx_t[n_probes_total]);
    std::unique_ptr<int32_t[]> probed_dis(new int32_t[n_probes_total]);

    // Stage 1: pick the nprobe closest lists for every query.
    const double quantize_start = getmillisecs();
    quantizer->search(n, x, nprobe, probed_dis.get(), probed_lists.get());
    indexIVF_stats.quantization_time += getmillisecs() - quantize_start;

    // Stage 2: scan the selected lists.
    const double scan_start = getmillisecs();
    invlists->prefetch_lists(probed_lists.get(), n_probes_total);

    search_preassigned(n, x, k, probed_lists.get(), probed_dis.get(),
                       distances, labels, false, nullptr, bitset);
    indexIVF_stats.search_time += getmillisecs() - scan_start;
}
/**
 * Fetch the stored vector for a single id.
 *
 * @param n       number of ids; must be 1
 * @param xid     ids to look up (only xid[0] is used)
 * @param x       output buffer
 * @param bitset  optional filter; when it tests true for the id, the output
 *                is filled with 0xFF bytes instead of the stored code
 *
 * Builds the direct map on first use.
 * NOTE(review): the filtered branch writes d bytes while reconstruct() copies
 * code_size bytes -- confirm the buffer size callers are expected to provide.
 */
void IndexBinaryIVF::get_vector_by_id(idx_t n, const idx_t *xid, uint8_t *x, ConcurrentBitsetPtr bitset) {
    if (!maintain_direct_map) {
        make_direct_map(true);
    }

    /* only get vector by 1 id */
    FAISS_ASSERT(n == 1);

    const idx_t id = xid[0];
    const bool filtered_out = bitset && bitset->test(id);
    if (filtered_out) {
        memset(x, UINT8_MAX, d * sizeof(uint8_t));
        return;
    }
    reconstruct(id, x);
}
/**
 * Search using stored vectors, identified by id, as the queries.
 *
 * @param n          number of query ids
 * @param xid        ids of the stored vectors to use as queries
 * @param k          number of results per query
 * @param distances  output distances, as for search()
 * @param labels     output labels, as for search()
 * @param bitset     optional filter forwarded to search()
 *
 * Builds the direct map on first use. The reconstructed queries are packed
 * at a stride of code_size bytes, which is the layout search() reads
 * (reconstruct() writes code_size bytes per vector). The buffer is held by a
 * unique_ptr so it is released even if reconstruct() or search() throws.
 */
void IndexBinaryIVF::search_by_id (idx_t n, const idx_t *xid, idx_t k, int32_t *distances, idx_t *labels,
                                   ConcurrentBitsetPtr bitset) {
    if (!maintain_direct_map) {
        make_direct_map(true);
    }

    std::unique_ptr<uint8_t[]> queries(new uint8_t[n * code_size]);
    for (idx_t i = 0; i < n; ++i) {
        reconstruct(xid[i], queries.get() + i * code_size);
    }

    search(n, queries.get(), k, distances, labels, bitset);
}
/**
 * Copy the stored code of vector `key` into `recons` using the direct map.
 *
 * @param key     id of the vector; must index a populated direct-map entry
 * @param recons  output buffer receiving code_size bytes
 *
 * Throws if the direct map is not initialized, if `key` is outside the map,
 * or if the entry for `key` is unset (negative).
 */
void IndexBinaryIVF::reconstruct(idx_t key, uint8_t *recons) const {
    FAISS_THROW_IF_NOT_MSG(direct_map.size() == ntotal,
                           "direct map is not initialized");
    // Reject ids that would index outside the map instead of reading
    // out-of-bounds memory.
    FAISS_THROW_IF_NOT_MSG(key >= 0 && key < (idx_t) direct_map.size(),
                           "invalid key");

    const idx_t entry = direct_map[key];
    // make_direct_map() leaves -1 in slots that no inverted list refers to.
    FAISS_THROW_IF_NOT_MSG(entry >= 0, "-1 entry in direct map");

    // list number in the high 32 bits, offset in the low 32 bits
    idx_t list_no = entry >> 32;
    idx_t offset = entry & 0xffffffff;
    reconstruct_from_offset(list_no, offset, recons);
}
/**
 * Reconstruct the vectors whose ids fall in [i0, i0 + ni) by scanning every
 * inverted list.
 *
 * @param i0      first id of the range
 * @param ni      number of ids in the range
 * @param recons  output buffer; the vector with id `id` is written at
 *                recons + (id - i0) * d
 */
void IndexBinaryIVF::reconstruct_n(idx_t i0, idx_t ni, uint8_t *recons) const {
    FAISS_THROW_IF_NOT(ni == 0 || (i0 >= 0 && i0 + ni <= ntotal));

    for (idx_t list_no = 0; list_no < nlist; list_no++) {
        const size_t entries = invlists->list_size(list_no);
        const Index::idx_t *list_ids = invlists->get_ids(list_no);

        for (idx_t offset = 0; offset < entries; offset++) {
            const idx_t id = list_ids[offset];
            const bool outside_range = id < i0 || id >= i0 + ni;
            if (outside_range) {
                continue;
            }

            uint8_t *dest = recons + (id - i0) * d;
            reconstruct_from_offset(list_no, offset, dest);
        }
    }
}
/**
 * Search and also return the stored codes of the results.
 *
 * @param n          number of queries
 * @param x          encoded queries
 * @param k          number of results per query
 * @param distances  output distances, n * k entries
 * @param labels     output ids, n * k entries
 * @param recons     output codes; result (i, j) is written at
 *                   recons + (i * k + j) * d
 *
 * Missing results (negative label) get their slot filled with 0xFF bytes.
 */
void IndexBinaryIVF::search_and_reconstruct(idx_t n, const uint8_t *x, idx_t k,
                                            int32_t *distances, idx_t *labels,
                                            uint8_t *recons) const {
    std::unique_ptr<idx_t[]> probed_lists(new idx_t[n * nprobe]);
    std::unique_ptr<int32_t[]> probed_dis(new int32_t[n * nprobe]);

    quantizer->search(n, x, nprobe, probed_dis.get(), probed_lists.get());

    invlists->prefetch_lists(probed_lists.get(), n * nprobe);

    // With store_pairs enabled the labels come back as packed
    // (list_no << 32 | offset) pairs, which is what we need to locate codes.
    search_preassigned(n, x, k, probed_lists.get(), probed_dis.get(),
                       distances, labels, /* store_pairs */true);

    for (idx_t q = 0; q < n; ++q) {
        for (idx_t r = 0; r < k; ++r) {
            const idx_t slot = q * k + r;
            const idx_t packed = labels[slot];
            uint8_t *dest = recons + slot * d;

            if (packed < 0) {
                // No result for this slot: mark the code with all-ones bytes.
                memset(dest, -1, sizeof(*dest) * d);
                continue;
            }

            int list_no = packed >> 32;
            int offset = packed & 0xffffffff;

            // Replace the packed pair by the real id of the entry.
            labels[slot] = invlists->get_single_id(list_no, offset);

            reconstruct_from_offset(list_no, offset, dest);
        }
    }
}
/**
 * Copy the code stored at (list_no, offset) into `recons`.
 *
 * @param list_no  inverted list holding the entry
 * @param offset   position of the entry inside the list
 * @param recons   output buffer receiving code_size bytes
 */
void IndexBinaryIVF::reconstruct_from_offset(idx_t list_no, idx_t offset,
                                             uint8_t *recons) const {
    const uint8_t *stored_code = invlists->get_single_code(list_no, offset);
    memcpy(recons, stored_code, code_size);
}
void IndexBinaryIVF::reset() {
direct_map.clear();
invlists->reset();
ntotal = 0;
}
size_t IndexBinaryIVF::remove_ids(const IDSelector& sel) {
FAISS_THROW_IF_NOT_MSG(!maintain_direct_map,
"direct map remove not implemented");
std::vector<idx_t> toremove(nlist);
#pragma omp parallel for
for (idx_t i = 0; i < nlist; i++) {
idx_t l0 = invlists->list_size (i), l = l0, j = 0;
const idx_t *idsi = invlists->get_ids(i);
while (j < l) {
if (sel.is_member(idsi[j])) {
l--;
invlists->update_entry(
i, j,
invlists->get_single_id(i, l),
invlists->get_single_code(i, l));
} else {
j++;
}
}
toremove[i] = l0 - l;
}
// this will not run well in parallel on ondisk because of possible shrinks
size_t nremove = 0;
for (idx_t i = 0; i < nlist; i++) {
if (toremove[i] > 0) {
nremove += toremove[i];
invlists->resize(
i, invlists->list_size(i) - toremove[i]);
}
}
ntotal -= nremove;
return nremove;
}
/**
 * Train the coarse quantizer, unless it is already trained and holds exactly
 * nlist entries.
 *
 * @param n  number of training vectors
 * @param x  encoded training vectors
 *
 * Training converts the binary vectors to floats with binary_to_real(), runs
 * Clustering on them, converts the centroids back with real_to_binary() and
 * adds them to the quantizer. A Jaccard flat index is used for clustering
 * when the metric is Jaccard or Tanimoto, an L2 one otherwise, unless
 * clustering_index is set.
 */
void IndexBinaryIVF::train(idx_t n, const uint8_t *x) {
    if (verbose) {
        printf("Training quantizer\n");
    }

    const bool quantizer_ready =
        quantizer->is_trained && (quantizer->ntotal == nlist);
    if (quantizer_ready) {
        if (verbose) {
            printf("IVF quantizer does not need training.\n");
        }
        is_trained = true;
        return;
    }

    if (verbose) {
        printf("Training quantizer on %ld vectors in %dD\n", n, d);
    }

    Clustering clus(d, nlist, cp);
    quantizer->reset();

    // Clustering operates on floats: expand the binary training set.
    std::unique_ptr<float[]> real_vectors(new float[n * d]);
    binary_to_real(n * d, x, real_vectors.get());

    const bool jaccard_like =
        metric_type == METRIC_Jaccard || metric_type == METRIC_Tanimoto;
    IndexFlat index_tmp(d, jaccard_like ? METRIC_Jaccard : METRIC_L2);

    if (clustering_index && verbose) {
        printf("using clustering_index of dimension %d to do the clustering\n",
               clustering_index->d);
    }

    clus.train(n, real_vectors.get(), clustering_index ? *clustering_index : index_tmp);

    // Convert the float centroids back to binary codes for the quantizer.
    std::unique_ptr<uint8_t[]> binary_centroids(new uint8_t[clus.k * code_size]);
    real_to_binary(d * clus.k, clus.centroids.data(), binary_centroids.get());

    quantizer->add(clus.k, binary_centroids.get());
    quantizer->is_trained = true;

    is_trained = true;
}
/**
 * Move all entries of `other` into this index.
 *
 * @param other   index to merge from; its ntotal is set to 0 afterwards
 * @param add_id  value forwarded to InvertedLists::merge_from()
 *
 * Both indexes must agree on d, nlist, code_size and dynamic type, and
 * neither may maintain a direct map.
 */
void IndexBinaryIVF::merge_from(IndexBinaryIVF &other, idx_t add_id) {
    // minimal sanity checks
    FAISS_THROW_IF_NOT(other.d == d);
    FAISS_THROW_IF_NOT(other.nlist == nlist);
    FAISS_THROW_IF_NOT(other.code_size == code_size);

    const bool any_direct_map =
        maintain_direct_map || other.maintain_direct_map;
    FAISS_THROW_IF_NOT_MSG(!any_direct_map,
                           "direct map copy not implemented");

    const bool same_dynamic_type = typeid (*this) == typeid (other);
    FAISS_THROW_IF_NOT_MSG(same_dynamic_type,
                           "can only merge indexes of the same type");

    invlists->merge_from (other.invlists, add_id);

    // The entries now belong to this index only.
    ntotal += other.ntotal;
    other.ntotal = 0;
}
/**
 * Swap in a different inverted-list container.
 *
 * @param il   new container; must have the same nlist and code_size
 * @param own  whether this index takes ownership of `il`
 *
 * The previous container is deleted only if it was owned.
 */
void IndexBinaryIVF::replace_invlists(InvertedLists *il, bool own) {
    FAISS_THROW_IF_NOT(il->nlist == nlist && il->code_size == code_size);

    InvertedLists *previous = invlists;
    if (own_invlists) {
        delete previous;
    }

    invlists = il;
    own_invlists = own;
}
namespace {
using idx_t = Index::idx_t;
/**
 * Inverted-list scanner that ranks codes by Hamming distance to the query.
 *
 * @tparam HammingComputer  distance computer providing set() and hamming()
 * @tparam store_pairs      when true, results are labelled with the packed
 *                          pair (list_no << 32 | offset) instead of the id
 */
template<class HammingComputer, bool store_pairs>
struct IVFBinaryScannerL2: BinaryInvertedListScanner {

    HammingComputer hc;
    size_t code_size;
    idx_t list_no;   // list currently scanned, set by set_list()

    IVFBinaryScannerL2 (size_t code_size): code_size (code_size)
    {}

    /// Select the query all following distance computations refer to.
    void set_query (const uint8_t *query_vector) override {
        hc.set (query_vector, code_size);
    }

    /// Remember which list is scanned; the coarse distance is not used.
    void set_list (idx_t list_no, uint8_t /* coarse_dis */) override {
        this->list_no = list_no;
    }

    /// Hamming distance between the current query and one code.
    uint32_t distance_to_code (const uint8_t *code) const override {
        return hc.hamming (code);
    }

    /**
     * Scan n consecutive codes and push every code closer than the current
     * heap top into the (simi, idxi) max-heap of size k.
     *
     * Entries for which bitset->test(id) is true are skipped.
     * NOTE(review): ids is dereferenced whenever a bitset is given, so it
     * must not be null in that case -- confirm callers guarantee this.
     *
     * @return number of heap updates
     */
    size_t scan_codes (size_t n,
                       const uint8_t *codes,
                       const idx_t *ids,
                       int32_t *simi, idx_t *idxi,
                       size_t k,
                       ConcurrentBitsetPtr bitset) const override
    {
        using C = CMax<int32_t, idx_t>;

        size_t updates = 0;
        for (size_t j = 0; j < n; j++, codes += code_size) {
            if (bitset && bitset->test(ids[j])) {
                continue;
            }
            uint32_t dis = hc.hamming (codes);
            if (dis < simi[0]) {
                heap_pop<C> (k, simi, idxi);
                idx_t id = store_pairs ? (list_no << 32 | j) : ids[j];
                heap_push<C> (k, simi, idxi, dis, id);
                updates++;
            }
        }
        return updates;
    }
};
template<class JaccardComputer, bool store_pairs>
struct IVFBinaryScannerJaccard: BinaryInvertedListScanner {
JaccardComputer hc;
size_t code_size;
IVFBinaryScannerJaccard (size_t code_size): code_size (code_size)
{}
void set_query (const uint8_t *query_vector) override {
hc.set (query_vector, code_size);
}
idx_t list_no;
void set_list (idx_t list_no, uint8_t /* coarse_dis */) override {
this->list_no = list_no;
}
uint32_t distance_to_code (const uint8_t *code) const override {
}
size_t scan_codes (size_t n,
const uint8_t *codes,
const idx_t *ids,
int32_t *simi, idx_t *idxi,
size_t k,
ConcurrentBitsetPtr bitset = nullptr) const override
{
using C = CMax<float, idx_t>;
float* psimi = (float*)simi;
size_t nup = 0;
for (size_t j = 0; j < n; j++) {
if(!bitset || !bitset->test(ids[j])){
float dis = hc.jaccard (codes);
if (dis < psimi[0]) {
heap_pop<C> (k, psimi, idxi);
idx_t id = store_pairs ? (list_no << 32 | j) : ids[j];
heap_push<C> (k, psimi, idxi, dis, id);
nup++;
}
}
codes += code_size;
}
return nup;
}
};
/**
 * Allocate a Hamming scanner specialised for the given code size.
 *
 * Code sizes 4, 8, 16, 20, 32 and 64 get a dedicated HammingComputer; other
 * sizes fall back to the multiple-of-8, multiple-of-4 or default computer.
 *
 * @param code_size  size of one code in bytes
 * @return newly allocated scanner; the caller takes ownership
 */
template <bool store_pairs>
BinaryInvertedListScanner *select_IVFBinaryScannerL2 (size_t code_size) {
    switch (code_size) {
        case 4:
            return new IVFBinaryScannerL2<HammingComputer4, store_pairs> (code_size);
        case 8:
            return new IVFBinaryScannerL2<HammingComputer8, store_pairs> (code_size);
        case 16:
            return new IVFBinaryScannerL2<HammingComputer16, store_pairs> (code_size);
        case 20:
            return new IVFBinaryScannerL2<HammingComputer20, store_pairs> (code_size);
        case 32:
            return new IVFBinaryScannerL2<HammingComputer32, store_pairs> (code_size);
        case 64:
            return new IVFBinaryScannerL2<HammingComputer64, store_pairs> (code_size);
        default:
            break;
    }

    // No dedicated computer: choose by divisibility of the code size.
    if (code_size % 8 == 0) {
        return new IVFBinaryScannerL2<HammingComputerM8,
                                      store_pairs> (code_size);
    }
    if (code_size % 4 == 0) {
        return new IVFBinaryScannerL2<HammingComputerM4,
                                      store_pairs> (code_size);
    }
    return new IVFBinaryScannerL2<HammingComputerDefault,
                                  store_pairs> (code_size);
}
/**
 * Allocate a Jaccard scanner specialised for the given code size.
 *
 * Code sizes 16, 32, 64 and 128 get a dedicated JaccardComputer; every other
 * size uses JaccardComputerDefault.
 *
 * @param code_size  size of one code in bytes
 * @return newly allocated scanner; the caller takes ownership
 */
template <bool store_pairs>
BinaryInvertedListScanner *select_IVFBinaryScannerJaccard (size_t code_size) {
    switch (code_size) {
        case 16:
            return new IVFBinaryScannerJaccard<JaccardComputer16, store_pairs> (code_size);
        case 32:
            return new IVFBinaryScannerJaccard<JaccardComputer32, store_pairs> (code_size);
        case 64:
            return new IVFBinaryScannerJaccard<JaccardComputer64, store_pairs> (code_size);
        case 128:
            return new IVFBinaryScannerJaccard<JaccardComputer128, store_pairs> (code_size);
        default:
            return new IVFBinaryScannerJaccard<JaccardComputerDefault,
                                               store_pairs>(code_size);
    }
}
void search_knn_hamming_heap(const IndexBinaryIVF& ivf,
size_t n,
const uint8_t *x,
idx_t k,
const idx_t *keys,
const int32_t * coarse_dis,
int32_t *distances, idx_t *labels,
bool store_pairs,
const IVFSearchParameters *params,
ConcurrentBitsetPtr bitset = nullptr)
{
long nprobe = params ? params->nprobe : ivf.nprobe;
long max_codes = params ? params->max_codes : ivf.max_codes;
MetricType metric_type = ivf.metric_type;
// almost verbatim copy from IndexIVF::search_preassigned
size_t nlistv = 0, ndis = 0, nheap = 0;
using HeapForIP = CMin<int32_t, idx_t>;
using HeapForL2 = CMax<int32_t, idx_t>;
#pragma omp parallel if(n > 1) reduction(+: nlistv, ndis, nheap)
{
std::unique_ptr<BinaryInvertedListScanner> scanner
(ivf.get_InvertedListScanner (store_pairs));
#pragma omp for
for (size_t i = 0; i < n; i++) {
const uint8_t *xi = x + i * ivf.code_size;
scanner->set_query(xi);
const idx_t * keysi = keys + i * nprobe;
int32_t * simi = distances + k * i;
idx_t * idxi = labels + k * i;
if (metric_type == METRIC_INNER_PRODUCT) {
heap_heapify<HeapForIP> (k, simi, idxi);
} else {
heap_heapify<HeapForL2> (k, simi, idxi);
}
size_t nscan = 0;
for (size_t ik = 0; ik < nprobe; ik++) {
idx_t key = keysi[ik]; /* select the list */
if (key < 0) {
// not enough centroids for multiprobe
continue;
}
FAISS_THROW_IF_NOT_FMT
(key < (idx_t) ivf.nlist,
"Invalid key=%ld at ik=%ld nlist=%ld\n",
key, ik, ivf.nlist);
scanner->set_list (key, coarse_dis[i * nprobe + ik]);
nlistv++;
size_t list_size = ivf.invlists->list_size(key);
InvertedLists::ScopedCodes scodes (ivf.invlists, key);
std::unique_ptr<InvertedLists::ScopedIds> sids;
const Index::idx_t * ids = nullptr;
if (!store_pairs) {
sids.reset (new InvertedLists::ScopedIds (ivf.invlists, key));
ids = sids->get();
}
nheap += scanner->scan_codes (list_size, scodes.get(),
ids, simi, idxi, k, bitset);
nscan += list_size;
if (max_codes && nscan >= max_codes)
break;
}
ndis += nscan;
if (metric_type == METRIC_INNER_PRODUCT) {
heap_reorder<HeapForIP> (k, simi, idxi);
} else {
heap_reorder<HeapForL2> (k, simi, idxi);
}
} // parallel for
} // parallel
indexIVF_stats.nq += n;
indexIVF_stats.nlist += nlistv;
indexIVF_stats.ndis += ndis;
indexIVF_stats.nheap_updates += nheap;
}
void search_knn_jaccard_heap(const IndexBinaryIVF& ivf,
size_t n,
const uint8_t *x,
idx_t k,
const idx_t *keys,
const float * coarse_dis,
float *distances, idx_t *labels,
bool store_pairs,
const IVFSearchParameters *params,
ConcurrentBitsetPtr bitset = nullptr)
{
long nprobe = params ? params->nprobe : ivf.nprobe;
long max_codes = params ? params->max_codes : ivf.max_codes;
MetricType metric_type = ivf.metric_type;
// almost verbatim copy from IndexIVF::search_preassigned
size_t nlistv = 0, ndis = 0, nheap = 0;
using HeapForJaccard = CMax<float, idx_t>;
#pragma omp parallel if(n > 1) reduction(+: nlistv, ndis, nheap)
{
std::unique_ptr<BinaryInvertedListScanner> scanner
(ivf.get_InvertedListScannerJaccard (store_pairs));
#pragma omp for
for (size_t i = 0; i < n; i++) {
const uint8_t *xi = x + i * ivf.code_size;
scanner->set_query(xi);
const idx_t * keysi = keys + i * nprobe;
float * simi = distances + k * i;
idx_t * idxi = labels + k * i;
heap_heapify<HeapForJaccard> (k, simi, idxi);
size_t nscan = 0;
for (size_t ik = 0; ik < nprobe; ik++) {
idx_t key = keysi[ik]; /* select the list */
if (key < 0) {
// not enough centroids for multiprobe
continue;
}
FAISS_THROW_IF_NOT_FMT
(key < (idx_t) ivf.nlist,
"Invalid key=%ld at ik=%ld nlist=%ld\n",
key, ik, ivf.nlist);
scanner->set_list (key, (int32_t)coarse_dis[i * nprobe + ik]);
nlistv++;
size_t list_size = ivf.invlists->list_size(key);
InvertedLists::ScopedCodes scodes (ivf.invlists, key);
std::unique_ptr<InvertedLists::ScopedIds> sids;
const Index::idx_t * ids = nullptr;
if (!store_pairs) {
sids.reset (new InvertedLists::ScopedIds (ivf.invlists, key));
ids = sids->get();
}
nheap += scanner->scan_codes (list_size, scodes.get(),
ids, (int32_t*)simi, idxi, k, bitset);
nscan += list_size;
if (max_codes && nscan >= max_codes)
break;
}
ndis += nscan;
heap_reorder<HeapForJaccard> (k, simi, idxi);
} // parallel for
} // parallel
indexIVF_stats.nq += n;
indexIVF_stats.nlist += nlistv;
indexIVF_stats.ndis += ndis;
indexIVF_stats.nheap_updates += nheap;
}
template<class HammingComputer, bool store_pairs>
void search_knn_hamming_count(const IndexBinaryIVF& ivf,
size_t nx,
const uint8_t *x,
const idx_t *keys,
int k,
int32_t *distances,
idx_t *labels,
const IVFSearchParameters *params,
ConcurrentBitsetPtr bitset = nullptr) {
const int nBuckets = ivf.d + 1;
std::vector<int> all_counters(nx * nBuckets, 0);
std::unique_ptr<idx_t[]> all_ids_per_dis(new idx_t[nx * nBuckets * k]);
long nprobe = params ? params->nprobe : ivf.nprobe;
long max_codes = params ? params->max_codes : ivf.max_codes;
std::vector<HCounterState<HammingComputer>> cs;
for (size_t i = 0; i < nx; ++i) {
cs.push_back(HCounterState<HammingComputer>(
all_counters.data() + i * nBuckets,
all_ids_per_dis.get() + i * nBuckets * k,
x + i * ivf.code_size,
ivf.d,
k
));
}
size_t nlistv = 0, ndis = 0;
#pragma omp parallel for reduction(+: nlistv, ndis)
for (size_t i = 0; i < nx; i++) {
const idx_t * keysi = keys + i * nprobe;
HCounterState<HammingComputer>& csi = cs[i];
size_t nscan = 0;
for (size_t ik = 0; ik < nprobe; ik++) {
idx_t key = keysi[ik]; /* select the list */
if (key < 0) {
// not enough centroids for multiprobe
continue;
}
FAISS_THROW_IF_NOT_FMT (
key < (idx_t) ivf.nlist,
"Invalid key=%ld at ik=%ld nlist=%ld\n",
key, ik, ivf.nlist);
nlistv++;
size_t list_size = ivf.invlists->list_size(key);
InvertedLists::ScopedCodes scodes (ivf.invlists, key);
const uint8_t *list_vecs = scodes.get();
const Index::idx_t *ids = store_pairs
? nullptr
: ivf.invlists->get_ids(key);
for (size_t j = 0; j < list_size; j++) {
if(!bitset || !bitset->test(ids[j])){
const uint8_t * yj = list_vecs + ivf.code_size * j;
idx_t id = store_pairs ? (key << 32 | j) : ids[j];
csi.update_counter(yj, id);
}
}
if (ids)
ivf.invlists->release_ids (key, ids);
nscan += list_size;
if (max_codes && nscan >= max_codes)
break;
}
ndis += nscan;
int nres = 0;
for (int b = 0; b < nBuckets && nres < k; b++) {
for (int l = 0; l < csi.counters[b] && nres < k; l++) {
labels[i * k + nres] = csi.ids_per_dis[b * k + l];
distances[i * k + nres] = b;
nres++;
}
}
while (nres < k) {
labels[i * k + nres] = -1;
distances[i * k + nres] = std::numeric_limits<int32_t>::max();
++nres;
}
}
indexIVF_stats.nq += nx;
indexIVF_stats.nlist += nlistv;
indexIVF_stats.ndis += ndis;
}
/**
 * Dispatch search_knn_hamming_count() to the HammingComputer matching
 * ivf.code_size.
 *
 * Code sizes 4, 8, 16, 20, 32 and 64 use a dedicated computer; other sizes
 * fall back to the multiple-of-8, multiple-of-4 or default computer. All
 * arguments are forwarded unchanged.
 */
template<bool store_pairs>
void search_knn_hamming_count_1 (
    const IndexBinaryIVF& ivf,
    size_t nx,
    const uint8_t *x,
    const idx_t *keys,
    int k,
    int32_t *distances,
    idx_t *labels,
    const IVFSearchParameters *params,
    ConcurrentBitsetPtr bitset = nullptr) {
    switch (ivf.code_size) {
        case 4:
            search_knn_hamming_count<HammingComputer4, store_pairs>(
                ivf, nx, x, keys, k, distances, labels, params, bitset);
            return;
        case 8:
            search_knn_hamming_count<HammingComputer8, store_pairs>(
                ivf, nx, x, keys, k, distances, labels, params, bitset);
            return;
        case 16:
            search_knn_hamming_count<HammingComputer16, store_pairs>(
                ivf, nx, x, keys, k, distances, labels, params, bitset);
            return;
        case 20:
            search_knn_hamming_count<HammingComputer20, store_pairs>(
                ivf, nx, x, keys, k, distances, labels, params, bitset);
            return;
        case 32:
            search_knn_hamming_count<HammingComputer32, store_pairs>(
                ivf, nx, x, keys, k, distances, labels, params, bitset);
            return;
        case 64:
            search_knn_hamming_count<HammingComputer64, store_pairs>(
                ivf, nx, x, keys, k, distances, labels, params, bitset);
            return;
        default:
            break;
    }

    // No dedicated computer: choose by divisibility of the code size.
    if (ivf.code_size % 8 == 0) {
        search_knn_hamming_count<HammingComputerM8, store_pairs>
            (ivf, nx, x, keys, k, distances, labels, params, bitset);
    } else if (ivf.code_size % 4 == 0) {
        search_knn_hamming_count<HammingComputerM4, store_pairs>
            (ivf, nx, x, keys, k, distances, labels, params, bitset);
    } else {
        search_knn_hamming_count<HammingComputerDefault, store_pairs>
            (ivf, nx, x, keys, k, distances, labels, params, bitset);
    }
}
} // namespace
/**
 * Create a Hamming-distance scanner for this index's code size.
 *
 * @param store_pairs  whether results are labelled with packed
 *                     (list_no, offset) pairs instead of ids
 * @return newly allocated scanner; the caller takes ownership
 */
BinaryInvertedListScanner *IndexBinaryIVF::get_InvertedListScanner
      (bool store_pairs) const
{
    return store_pairs
        ? select_IVFBinaryScannerL2<true> (code_size)
        : select_IVFBinaryScannerL2<false> (code_size);
}
/**
 * Create a Jaccard-distance scanner for this index's code size.
 *
 * @param store_pairs  whether results are labelled with packed
 *                     (list_no, offset) pairs instead of ids
 * @return newly allocated scanner; the caller takes ownership
 */
BinaryInvertedListScanner *IndexBinaryIVF::get_InvertedListScannerJaccard
      (bool store_pairs) const
{
    return store_pairs
        ? select_IVFBinaryScannerJaccard<true> (code_size)
        : select_IVFBinaryScannerJaccard<false> (code_size);
}
/**
 * Search the inverted lists that were already selected for each query.
 *
 * @param n            number of queries
 * @param x            encoded queries
 * @param k            number of results per query
 * @param idx          list numbers to visit, nprobe per query
 * @param coarse_dis   coarse distances matching `idx`
 * @param distances    output distances, n * k entries. For Jaccard/Tanimoto
 *                     the buffer receives the raw bytes of float values.
 * @param labels       output labels, n * k entries
 * @param store_pairs  label results with (list_no << 32 | offset) instead of
 *                     the stored id
 * @param params       optional overrides for nprobe / max_codes
 * @param bitset       optional filter forwarded to the scan routines
 *
 * Jaccard/Tanimoto is only implemented for use_heap == true; with
 * use_heap == false the outputs are left untouched.
 */
void IndexBinaryIVF::search_preassigned(idx_t n, const uint8_t *x, idx_t k,
                                        const idx_t *idx,
                                        const int32_t * coarse_dis,
                                        int32_t *distances, idx_t *labels,
                                        bool store_pairs,
                                        const IVFSearchParameters *params,
                                        ConcurrentBitsetPtr bitset
                                        ) const {

    if (metric_type == METRIC_Jaccard || metric_type == METRIC_Tanimoto) {
        if (use_heap) {
            // Size the temporary buffers with the nprobe the search routine
            // will actually index with (params overrides the member), so a
            // larger params->nprobe cannot read past the copy.
            const size_t probes = params ? params->nprobe : nprobe;
            const size_t n_coarse = static_cast<size_t>(n) * probes;
            const size_t n_results = static_cast<size_t>(k) * static_cast<size_t>(n);

            // RAII buffers: released even if the search throws.
            std::unique_ptr<float[]> D(new float[n_results]);
            std::unique_ptr<float[]> c_dis(new float[n_coarse]);

            // Byte-wise copy of the coarse distances into a float-typed buffer.
            memcpy(c_dis.get(), coarse_dis, sizeof(float) * n_coarse);
            search_knn_jaccard_heap (*this, n, x, k, idx, c_dis.get(),
                                     D.get(), labels, store_pairs,
                                     params, bitset);
            if (metric_type == METRIC_Tanimoto) {
                // Convert each Jaccard distance to a Tanimoto distance.
                for (size_t i = 0; i < n_results; i++) {
                    D[i] = -log2(1-D[i]);
                }
            }
            // Hand the float results back through the int32_t-typed buffer.
            memcpy(distances, D.get(), sizeof(float) * n_results);
        } else {
            //not implemented
        }
    } else {
        if (use_heap) {
            search_knn_hamming_heap (*this, n, x, k, idx, coarse_dis,
                                     distances, labels, store_pairs,
                                     params, bitset);
        } else {
            if (store_pairs) {
                search_knn_hamming_count_1<true>
                    (*this, n, x, idx, k, distances, labels, params, bitset);
            } else {
                search_knn_hamming_count_1<false>
                    (*this, n, x, idx, k, distances, labels, params, bitset);
            }
        }
    }
}
/**
 * Release the inverted lists and the quantizer, each only when the
 * corresponding ownership flag is set.
 */
IndexBinaryIVF::~IndexBinaryIVF() {
    if (own_invlists)
        delete invlists;

    if (own_fields)
        delete quantizer;
}
} // namespace faiss